33 {
44 "cell_type" : " code" ,
55 "execution_count" : null ,
6- "metadata" : {
7- "collapsed" : false
8- },
6+ "metadata" : {},
97 "outputs" : [],
108 "source" : [
119 " %matplotlib inline"
4240 {
4341 "cell_type" : " markdown" ,
4442 "metadata" : {},
45- "source" : [
46- " 基于经验的k-means初始化方法\n " ,
47- " \n " ,
48- " 评估k-均值初始化的能力,以使算法收敛稳健,如通过聚类惯性的相对标准偏差(即到最近聚类中心的平方距离之和)测量的。\n " ,
49- " \n " ,
50- " 第一个图显示了最佳初始化参数(``KMeans`` or ``MiniBatchKMeans``)和init方法(``init=\" random\" `` or ``init=\" kmeans++\" ``)的选择。\n " ,
51- " \n " ,
52- " 第二个图显示了使用``init=\" random\" `` and ``n_init=1``的``MiniBatchKMeans``一次运行结果。这种运行导致一个坏的收敛(局部最优)。\n " ,
53- " \n " ,
54- " 用于评估的数据集是符合高斯分布的2D网格数据。"
55- ]
43+ "source" : []
5644 },
5745 {
5846 "cell_type" : " code" ,
5947 "execution_count" : 3 ,
60- "metadata" : {
61- "collapsed" : false
62- },
48+ "metadata" : {},
6349 "outputs" : [
6450 {
6551 "name" : " stdout" ,
6652 "output_type" : " stream" ,
6753 "text" : [
68- " Automatically created module for IPython interactive environment\n Evaluation of KMeans with k-means++ init\n "
69- ]
70- },
71- {
72- "name" : " stdout" ,
73- "output_type" : " stream" ,
74- "text" : [
75- " Evaluation of KMeans with random init\n "
76- ]
77- },
78- {
79- "name" : " stdout" ,
80- "output_type" : " stream" ,
81- "text" : [
82- " Evaluation of MiniBatchKMeans with k-means++ init\n "
83- ]
84- },
85- {
86- "name" : " stdout" ,
87- "output_type" : " stream" ,
88- "text" : [
54+ " Automatically created module for IPython interactive environment\n " ,
55+ " Evaluation of KMeans with k-means++ init\n " ,
56+ " Evaluation of KMeans with random init\n " ,
57+ " Evaluation of MiniBatchKMeans with k-means++ init\n " ,
8958 " Evaluation of MiniBatchKMeans with random init\n "
9059 ]
9160 },
11483 " print(__doc__)\n " ,
11584 " \n " ,
11685 " # Author: Olivier Grisel <olivier.grisel@ensta.org>\n " ,
117- " # License: BSD 3 clause\n " ,
118- " \n " ,
119- " import numpy as np\n " ,
120- " import matplotlib.pyplot as plt\n " ,
121- " import matplotlib.cm as cm\n " ,
122- " \n " ,
123- " from sklearn.utils import shuffle\n " ,
124- " from sklearn.utils import check_random_state\n " ,
125- " from sklearn.cluster import MiniBatchKMeans\n " ,
126- " from sklearn.cluster import KMeans\n " ,
127- " \n " ,
128- " random_state = np.random.RandomState(0)\n " ,
129- " \n " ,
130- " # Number of run (with randomly generated dataset) for each strategy so as\n " ,
131- " # to be able to compute an estimate of the standard deviation\n " ,
132- " n_runs = 5\n " ,
133- " \n " ,
134- " # k-means models can do several random inits so as to be able to trade\n " ,
135- " # CPU time for convergence robustness\n " ,
136- " n_init_range = np.array([1, 5, 10, 15, 20])\n " ,
137- " \n " ,
138- " # Datasets generation parameters\n " ,
139- " n_samples_per_center = 100\n " ,
140- " grid_size = 3\n " ,
141- " scale = 0.1\n " ,
142- " n_clusters = grid_size ** 2\n " ,
143- " \n " ,
144- " \n " ,
145- " def make_data(random_state, n_samples_per_center, grid_size, scale):\n " ,
146- " random_state = check_random_state(random_state)\n " ,
147- " centers = np.array([[i, j]\n " ,
148- " for i in range(grid_size)\n " ,
149- " for j in range(grid_size)])\n " ,
150- " n_clusters_true, n_features = centers.shape\n " ,
151- " \n " ,
152- " noise = random_state.normal(\n " ,
153- " scale=scale, size=(n_samples_per_center, centers.shape[1]))\n " ,
154- " \n " ,
155- " X = np.concatenate([c + noise for c in centers])\n " ,
156- " y = np.concatenate([[i] * n_samples_per_center\n " ,
157- " for i in range(n_clusters_true)])\n " ,
158- " return shuffle(X, y, random_state=random_state)\n " ,
159- " \n " ,
160- " # Part 1: Quantitative evaluation of various init methods\n " ,
161- " \n " ,
162- " plt.figure()\n " ,
163- " plots = []\n " ,
164- " legends = []\n " ,
165- " \n " ,
166- " cases = [\n " ,
167- " (KMeans, 'k-means++', {}),\n " ,
168- " (KMeans, 'random', {}),\n " ,
169- " (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),\n " ,
170- " (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),\n " ,
171- " ]\n " ,
172- " \n " ,
173- " for factory, init, params in cases:\n " ,
174- " print(\" Evaluation of %s with %s init\" % (factory.__name__, init))\n " ,
175- " inertia = np.empty((len(n_init_range), n_runs))\n " ,
176- " \n " ,
177- " for run_id in range(n_runs):\n " ,
178- " X, y = make_data(run_id, n_samples_per_center, grid_size, scale)\n " ,
179- " for i, n_init in enumerate(n_init_range):\n " ,
180- " km = factory(n_clusters=n_clusters, init=init, random_state=run_id,\n " ,
181- " n_init=n_init, **params).fit(X)\n " ,
182- " inertia[i, run_id] = km.inertia_\n " ,
183- " p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))\n " ,
184- " plots.append(p[0])\n " ,
185- " legends.append(\" %s with %s init\" % (factory.__name__, init))\n " ,
186- " \n " ,
187- " plt.xlabel('n_init')\n " ,
188- " plt.ylabel('inertia')\n " ,
189- " plt.legend(plots, legends)\n " ,
190- " plt.title(\" Mean inertia for various k-means init across %d runs\" % n_runs)\n " ,
191- " \n " ,
192- " # Part 2: Qualitative visual inspection of the convergence\n " ,
193- " \n " ,
194- " X, y = make_data(random_state, n_samples_per_center, grid_size, scale)\n " ,
195- " km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,\n " ,
196- " random_state=random_state).fit(X)\n " ,
197- " \n " ,
198- " plt.figure()\n " ,
199- " for k in range(n_clusters):\n " ,
200- " my_members = km.labels_ == k\n " ,
201- " color = cm.nipy_spectral(float(k) / n_clusters, 1)\n " ,
202- " plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)\n " ,
203- " cluster_center = km.cluster_centers_[k]\n " ,
204- " plt.plot(cluster_center[0], cluster_center[1], 'o',\n " ,
205- " markerfacecolor=color, markeredgecolor='k', markersize=6)\n " ,
206- " plt.title(\" Example cluster allocation with a single random init\\ n\"\n " ,
207- " \" with MiniBatchKMeans\" )\n " ,
208- " \n " ,
209- " plt.show()"
86+ " # License: BSD 3 clause\n "
21087 ]
21188 },
21289 {
21390 "cell_type" : " code" ,
21491 "execution_count" : null ,
21592 "metadata" : {},
21693 "outputs" : [],
217- "source" : [
218- " "
219- ]
94+ "source" : []
22095 }
22196 ],
22297 "metadata" : {
228103 "language_info" : {
229104 "codemirror_mode" : {
230105 "name" : " ipython" ,
231- "version" : 3.0
106+ "version" : 3
232107 },
233108 "file_extension" : " .py" ,
234109 "mimetype" : " text/x-python" ,
235110 "name" : " python" ,
236111 "nbconvert_exporter" : " python" ,
237112 "pygments_lexer" : " ipython3" ,
238- "version" : " 3.6.6 "
113+ "version" : " 3.8.8 "
239114 }
240115 },
241116 "nbformat" : 4 ,
242- "nbformat_minor" : 0
117+ "nbformat_minor" : 1
243118}