33 {
44 "cell_type" : " code" ,
55 "execution_count" : null ,
6- "metadata" : {
7- "collapsed" : false
8- },
6+ "metadata" : {},
97 "outputs" : [],
108 "source" : [
119 " %matplotlib inline"
4240 {
4341 "cell_type" : " markdown" ,
4442 "metadata" : {},
45- "source" : [
46- " 基于经验的k-means初始化方法\n " ,
47- " \n " ,
48- " 评估k-均值初始化的能力,以使算法收敛稳健,如通过聚类惯性的相对标准偏差(即到最近聚类中心的平方距离之和)测量的。\n " ,
49- " \n " ,
50- " 第一个图显示了最佳初始化参数(``KMeans`` or ``MiniBatchKMeans``)和init方法(``init=\" random\" `` or ``init=\" kmeans++\" ``)的选择。\n " ,
51- " \n " ,
52- " 第二个图显示了使用``init=\" random\" `` and ``n_init=1``的``MiniBatchKMeans``一次运行结果。这种运行导致一个坏的收敛(局部最优)。\n " ,
53- " \n " ,
54- " 用于评估的数据集是符合高斯分布的2D网格数据。"
55- ]
43+ "source" : []
5644 },
5745 {
5846 "cell_type" : " code" ,
5947 "execution_count" : 3 ,
60- "metadata" : {
61- "collapsed" : false
62- },
48+ "metadata" : {},
6349 "outputs" : [
6450 {
6551 "name" : " stdout" ,
6652 "output_type" : " stream" ,
6753 "text" : [
68- " Automatically created module for IPython interactive environment\n Evaluation of KMeans with k-means++ init\n "
69- ]
70- },
71- {
72- "name" : " stdout" ,
73- "output_type" : " stream" ,
74- "text" : [
75- " Evaluation of KMeans with random init\n "
76- ]
77- },
78- {
79- "name" : " stdout" ,
80- "output_type" : " stream" ,
81- "text" : [
82- " Evaluation of MiniBatchKMeans with k-means++ init\n "
83- ]
84- },
85- {
86- "name" : " stdout" ,
87- "output_type" : " stream" ,
88- "text" : [
54+ " Automatically created module for IPython interactive environment\n " ,
55+ " Evaluation of KMeans with k-means++ init\n " ,
56+ " Evaluation of KMeans with random init\n " ,
57+ " Evaluation of MiniBatchKMeans with k-means++ init\n " ,
8958 " Evaluation of MiniBatchKMeans with random init\n "
9059 ]
9160 },
11483 " print(__doc__)\n " ,
11584 " \n " ,
11685 " # Author: Olivier Grisel <olivier.grisel@ensta.org>\n " ,
117- " # License: BSD 3 clause\n " ,
118- " \n " ,
119- " import numpy as np\n " ,
120- " import matplotlib.pyplot as plt\n " ,
121- " import matplotlib.cm as cm\n " ,
122- " \n " ,
123- " from sklearn.utils import shuffle\n " ,
124- " from sklearn.utils import check_random_state\n " ,
125- " from sklearn.cluster import MiniBatchKMeans\n " ,
126- " from sklearn.cluster import KMeans\n " ,
127- " \n " ,
128- " random_state = np.random.RandomState(0)\n " ,
129- " \n " ,
130- " # Number of run (with randomly generated dataset) for each strategy so as\n " ,
131- " # to be able to compute an estimate of the standard deviation\n " ,
132- " n_runs = 5\n " ,
133- " \n " ,
134- " # k-means models can do several random inits so as to be able to trade\n " ,
135- " # CPU time for convergence robustness\n " ,
136- " n_init_range = np.array([1, 5, 10, 15, 20])\n " ,
137- " \n " ,
138- " # Datasets generation parameters\n " ,
139- " n_samples_per_center = 100\n " ,
140- " grid_size = 3\n " ,
141- " scale = 0.1\n " ,
142- " n_clusters = grid_size ** 2\n " ,
143- " \n " ,
144- " \n " ,
145- " def make_data(random_state, n_samples_per_center, grid_size, scale):\n " ,
146- " random_state = check_random_state(random_state)\n " ,
147- " centers = np.array([[i, j]\n " ,
148- " for i in range(grid_size)\n " ,
149- " for j in range(grid_size)])\n " ,
150- " n_clusters_true, n_features = centers.shape\n " ,
151- " \n " ,
152- " noise = random_state.normal(\n " ,
153- " scale=scale, size=(n_samples_per_center, centers.shape[1]))\n " ,
154- " \n " ,
155- " X = np.concatenate([c + noise for c in centers])\n " ,
156- " y = np.concatenate([[i] * n_samples_per_center\n " ,
157- " for i in range(n_clusters_true)])\n " ,
158- " return shuffle(X, y, random_state=random_state)\n " ,
159- " \n " ,
160- " # Part 1: Quantitative evaluation of various init methods\n " ,
161- " \n " ,
162- " plt.figure()\n " ,
163- " plots = []\n " ,
164- " legends = []\n " ,
165- " \n " ,
166- " cases = [\n " ,
167- " (KMeans, 'k-means++', {}),\n " ,
168- " (KMeans, 'random', {}),\n " ,
169- " (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),\n " ,
170- " (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),\n " ,
171- " ]\n " ,
172- " \n " ,
173- " for factory, init, params in cases:\n " ,
174- " print(\" Evaluation of %s with %s init\" % (factory.__name__, init))\n " ,
175- " inertia = np.empty((len(n_init_range), n_runs))\n " ,
176- " \n " ,
177- " for run_id in range(n_runs):\n " ,
178- " X, y = make_data(run_id, n_samples_per_center, grid_size, scale)\n " ,
179- " for i, n_init in enumerate(n_init_range):\n " ,
180- " km = factory(n_clusters=n_clusters, init=init, random_state=run_id,\n " ,
181- " n_init=n_init, **params).fit(X)\n " ,
182- " inertia[i, run_id] = km.inertia_\n " ,
183- " p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))\n " ,
184- " plots.append(p[0])\n " ,
185- " legends.append(\" %s with %s init\" % (factory.__name__, init))\n " ,
186- " \n " ,
187- " plt.xlabel('n_init')\n " ,
188- " plt.ylabel('inertia')\n " ,
189- " plt.legend(plots, legends)\n " ,
190- " plt.title(\" Mean inertia for various k-means init across %d runs\" % n_runs)\n " ,
191- " \n " ,
192- " # Part 2: Qualitative visual inspection of the convergence\n " ,
193- " \n " ,
194- " X, y = make_data(random_state, n_samples_per_center, grid_size, scale)\n " ,
195- " km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,\n " ,
196- " random_state=random_state).fit(X)\n " ,
197- " \n " ,
198- " plt.figure()\n " ,
199- " for k in range(n_clusters):\n " ,
200- " my_members = km.labels_ == k\n " ,
201- " color = cm.nipy_spectral(float(k) / n_clusters, 1)\n " ,
202- " plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)\n " ,
203- " cluster_center = km.cluster_centers_[k]\n " ,
204- " plt.plot(cluster_center[0], cluster_center[1], 'o',\n " ,
205- " markerfacecolor=color, markeredgecolor='k', markersize=6)\n " ,
206- " plt.title(\" Example cluster allocation with a single random init\\ n\"\n " ,
207- " \" with MiniBatchKMeans\" )\n " ,
208- " \n " ,
209- " plt.show()"
86+ " # License: BSD 3 clause\n "
21087 ]
21188 },
21289 {
21390 "cell_type" : " code" ,
21491 "execution_count" : null ,
21592 "metadata" : {},
21693 "outputs" : [],
217- "source" : [
218- " "
219- ]
94+ "source" : []
22095 }
22196 ],
22297 "metadata" : {
228103 "language_info" : {
229104 "codemirror_mode" : {
230105 "name" : " ipython" ,
231- "version" : 3.0
106+ "version" : 3
232107 },
233108 "file_extension" : " .py" ,
234109 "mimetype" : " text/x-python" ,
235110 "name" : " python" ,
236111 "nbconvert_exporter" : " python" ,
237112 "pygments_lexer" : " ipython3" ,
238- "version" : " 3.6.6 "
113+ "version" : " 3.8.8 "
239114 }
240115 },
241116 "nbformat" : 4 ,
242- "nbformat_minor" : 0
117+ "nbformat_minor" : 1
243118}