|
6 | 6 | "name": "09 Hierarchical Clustering.ipynb",
|
7 | 7 | "provenance": [],
|
8 | 8 | "collapsed_sections": [],
|
9 | | - "authorship_tag": "ABX9TyPnLpjY7aB4VdaauAzHSYHL", |
| 9 | + "authorship_tag": "ABX9TyOcNJnzlGscARY2g2fgeEkQ", |
10 | 10 | "include_colab_link": true
|
11 | 11 | },
|
12 | 12 | "kernelspec": {
|
|
36 | 36 | "# Hierarchical Clustering \n",
|
37 | 37 | "\n",
|
38 | 38 | "# Agglomerative Clustering is a method of clustering \n",
|
39 | | - "# which builds a hierarchy of clusters by merging together small clusters" |
| 39 | + "# which builds a hierarchy of clusters by merging together small clusters\n", |
| 40 | + "\n", |
| 41 | + "# Silhouette Score\n", |
| 42 | + "# The closer the silhouette score is to 1, the better separated the clusters are\n", |
| 43 | + "# Silhouette Coefficient score is a metric used to calculate the goodness of a clustering technique \n", |
| 44 | + "# Its value ranges from -1 to 1.\n", |
| 45 | + "# 1: Means clusters are well apart from each other and clearly distinguished.\n", |
| 46 | + "# 0: Means clusters overlap, i.e. the distance between clusters is not significant.\n", |
| 47 | + "# -1: Means samples have been assigned to the wrong clusters." |
40 | 48 | ],
|
41 | 49 | "execution_count": 1,
|
42 | 50 | "outputs": []
|
|
56 | 64 | "from sklearn.cluster import AgglomerativeClustering\n",
|
57 | 65 | "from sklearn.datasets import load_iris\n",
|
58 | 66 | "import numpy as np\n",
|
59 | | - "import pandas as pd" |
| 67 | + "import pandas as pd\n", |
| 68 | + "from sklearn.metrics import silhouette_score" |
60 | 69 | ],
|
61 | 70 | "execution_count": 2,
|
62 | 71 | "outputs": []
|
|
70 | 79 | "base_uri": "https://localhost:8080/",
|
71 | 80 | "height": 204
|
72 | 81 | },
|
73 | | - "outputId": "b160b91f-2581-4d91-d255-86e0c327d994" |
| 82 | + "outputId": "0e1ad085-0af3-4ea2-b6c9-11c4a8bafb6c" |
74 | 83 | },
|
75 | 84 | "source": [
|
76 | 85 | "# Loading the Dataset\n",
|
|
184 | 193 | "base_uri": "https://localhost:8080/",
|
185 | 194 | "height": 107
|
186 | 195 | },
|
187 | | - "outputId": "edba7d29-6f17-45ef-8198-686fb3cf55a9" |
| 196 | + "outputId": "cc3304da-9a95-4ff0-9361-29866fb1f068" |
188 | 197 | },
|
189 | 198 | "source": [
|
190 | 199 | "# Preparing Data\n",
|
|
221 | 230 | "base_uri": "https://localhost:8080/",
|
222 | 231 | "height": 125
|
223 | 232 | },
|
224 | | - "outputId": "dbd5dec4-fecf-4888-9854-ceb6d4102fc1" |
| 233 | + "outputId": "7d690fac-4ee5-4b4b-d187-18c53fc805b8" |
225 | 234 | },
|
226 | 235 | "source": [
|
227 | 236 | "# Now we will separate the target variable from the original dataset \n",
|
|
270 | 279 | "base_uri": "https://localhost:8080/",
|
271 | 280 | "height": 395
|
272 | 281 | },
|
273 | | - "outputId": "face4433-b7bb-438f-c25e-3b76bfdf7e38" |
| 282 | + "outputId": "6d6a1938-d925-4fc0-a686-0a8d1a3a8720" |
274 | 283 | },
|
275 | 284 | "source": [
|
276 | 285 | "# Filtering Setosa\n",
|
|
324 | 333 | "base_uri": "https://localhost:8080/",
|
325 | 334 | "height": 485
|
326 | 335 | },
|
327 | | - "outputId": "57852746-b1cf-44af-fdc7-3e4a9719fc93" |
| 336 | + "outputId": "083498f6-b5bc-4968-8cbb-a7c20cbeab12" |
328 | 337 | },
|
329 | 338 | "source": [
|
330 | 339 | "# Filtering Setosa for 2D Plot \n",
|
|
397 | 406 | "base_uri": "https://localhost:8080/",
|
398 | 407 | "height": 395
|
399 | 408 | },
|
400 | | - "outputId": "075f81a4-595c-4c0c-e8b7-31721acb2488" |
| 409 | + "outputId": "e58b70b5-3623-44dd-fe31-417d81bb5275" |
401 | 410 | },
|
402 | 411 | "source": [
|
403 | 412 | "# Filtering Versicolour\n",
|
|
451 | 460 | "base_uri": "https://localhost:8080/",
|
452 | 461 | "height": 485
|
453 | 462 | },
|
454 | | - "outputId": "8d22f3ae-c703-4303-cc75-c55cf5b264a6" |
| 463 | + "outputId": "483a8bf5-ae7f-4951-caa2-063cbbbc9cee" |
455 | 464 | },
|
456 | 465 | "source": [
|
457 | 466 | "# Filtering Versicolour for 2D Plot \n",
|
|
523 | 532 | "base_uri": "https://localhost:8080/",
|
524 | 533 | "height": 395
|
525 | 534 | },
|
526 | | - "outputId": "cfed997b-a096-4ebb-c2b3-05edfafa362c" |
| 535 | + "outputId": "79cbabbb-0739-45b8-fd36-9a2d71bda44b" |
527 | 536 | },
|
528 | 537 | "source": [
|
529 | 538 | "# Filtering Virginica\n",
|
|
577 | 586 | "base_uri": "https://localhost:8080/",
|
578 | 587 | "height": 485
|
579 | 588 | },
|
580 | | - "outputId": "ad81832b-49ce-4768-b791-71f0f34ae15f" |
| 589 | + "outputId": "66e8d27b-86aa-4c81-9a11-27e28117b09d" |
581 | 590 | },
|
582 | 591 | "source": [
|
583 | 592 | "# Filtering Virginica for 2D Plot\n",
|
|
648 | 657 | "base_uri": "https://localhost:8080/",
|
649 | 658 | "height": 286
|
650 | 659 | },
|
651 | | - "outputId": "c1f60208-731a-425e-9286-d06b52ba9777" |
| 660 | + "outputId": "d2bd50fd-12dd-48b8-aacd-c51aba490576" |
652 | 661 | },
|
653 | 662 | "source": [
|
654 | 663 | "# Visualise Classes all at once\n",
|
|
675 | 684 | "output_type": "execute_result",
|
676 | 685 | "data": {
|
677 | 686 | "text/plain": [
|
678 | | - "<matplotlib.legend.Legend at 0x7f27d345e470>" |
| 687 | + "<matplotlib.legend.Legend at 0x7ffa7d6145f8>" |
679 | 688 | ]
|
680 | 689 | },
|
681 | 690 | "metadata": {
|
|
707 | 716 | "base_uri": "https://localhost:8080/",
|
708 | 717 | "height": 204
|
709 | 718 | },
|
710 | | - "outputId": "be9c9cac-ac1f-4696-89d0-eed6c4c2b764" |
| 719 | + "outputId": "d2a9b743-a4df-4e58-9299-6ffb7acfab9e" |
711 | 720 | },
|
712 | 721 | "source": [
|
713 | 722 | "# Plotting of Dendrogram\n",
|
|
817 | 826 | "base_uri": "https://localhost:8080/",
|
818 | 827 | "height": 395
|
819 | 828 | },
|
820 | | - "outputId": "db224ba6-bde1-4c05-ea6b-b6f32c907a80" |
| 829 | + "outputId": "06710af3-fec8-4c5c-9c12-80a2c04c4a99" |
821 | 830 | },
|
822 | 831 | "source": [
|
823 | 832 | "# We finally plot a Dendrogram \n",
|
|
869 | 878 | "base_uri": "https://localhost:8080/",
|
870 | 879 | "height": 392
|
871 | 880 | },
|
872 | | - "outputId": "63f4452f-5967-453c-9fad-432b744367a1" |
| 881 | + "outputId": "bfc6af2f-4431-43b1-dfb7-ca24c0715e2b" |
873 | 882 | },
|
874 | 883 | "source": [
|
875 | 884 | "# Single Linkage - Nearest Point\n",
|
|
919 | 928 | "base_uri": "https://localhost:8080/",
|
920 | 929 | "height": 395
|
921 | 930 | },
|
922 | | - "outputId": "de9efac7-202b-4b5c-8f36-55b476c8b14c" |
| 931 | + "outputId": "8445cacc-3dc7-43ec-9f59-7d7b5960bc80" |
923 | 932 | },
|
924 | 933 | "source": [
|
925 | 934 | "# Complete Linkage - Farthest Point \n",
|
|
969 | 978 | "base_uri": "https://localhost:8080/",
|
970 | 979 | "height": 392
|
971 | 980 | },
|
972 | | - "outputId": "7e098ef1-7d30-4d24-f784-126bf6f3f82e" |
| 981 | + "outputId": "567b5a53-d4d9-43a6-b364-7c3c0331d5f2" |
973 | 982 | },
|
974 | 983 | "source": [
|
975 | 984 | "# Average Linkage - Average Distance between all points\n",
|
|
1019 | 1028 | "base_uri": "https://localhost:8080/",
|
1020 | 1029 | "height": 71
|
1021 | 1030 | },
|
1022 | | - "outputId": "6b697d9a-f7ba-43e7-b25f-31144f3be908" |
| 1031 | + "outputId": "4440bfae-dd1d-435e-d414-f7a8952b7c82" |
1023 | 1032 | },
|
1024 | 1033 | "source": [
|
1025 | 1034 | "# Building an Agglomerative Clustering Model\n",
|
|
1057 | 1066 | "base_uri": "https://localhost:8080/",
|
1058 | 1067 | "height": 71
|
1059 | 1068 | },
|
1060 | | - "outputId": "a2336119-c81d-41c2-ed90-0728b8e97d2f" |
| 1069 | + "outputId": "d3c38bfd-fc5d-4c4f-83a8-855e3daf9d9f" |
1061 | 1070 | },
|
1062 | 1071 | "source": [
|
1063 | 1072 | "# Fitting Model\n",
|
|
1095 | 1104 | "base_uri": "https://localhost:8080/",
|
1096 | 1105 | "height": 143
|
1097 | 1106 | },
|
1098 | | - "outputId": "12a9c91b-5ecd-4d1c-bddd-710622ede31e" |
| 1107 | + "outputId": "425d233f-d1c6-4c05-9e5d-f2bf90e5b941" |
1099 | 1108 | },
|
1100 | 1109 | "source": [
|
1101 | 1110 | "# Predicting Output Class\n",
|
|
1131 | 1140 | "colab_type": "code",
|
1132 | 1141 | "colab": {
|
1133 | 1142 | "base_uri": "https://localhost:8080/",
|
1134 | | - "height": 394 |
| 1143 | + "height": 430 |
1135 | 1144 | },
|
1136 | | - "outputId": "3b95e094-51d6-455f-ba63-1ea4d194f9fc" |
| 1145 | + "outputId": "c142809b-4d35-4c4d-aa6c-8575e940389f" |
1137 | 1146 | },
|
1138 | 1147 | "source": [
|
1139 | 1148 | "# Visualizing Output\n",
|
|
1144 | 1153 | "unique, counts = np.unique(pred1, return_counts=True)\n",
|
1145 | 1154 | "print('Hierarchical Clustering Output Cluster')\n",
|
1146 | 1155 | "print(dict(zip(unique, counts)))\n",
|
| 1156 | + "# Silhouette Score\n", |
| 1157 | + "print('Silhouette Score for 3 Clusters')\n", |
| 1158 | + "print(silhouette_score(iris_X,pred1))\n", |
1147 | 1159 | "print('\\n')\n",
|
1148 | 1160 | "\n",
|
1149 | 1161 | "# In the above output we got value labels: ‘0’, ‘1’ and ‘2’\n",
|
|
1176 | 1188 | "{0: 50, 1: 50, 2: 50}\n",
|
1177 | 1189 | "Hierarchical Clustering Output Cluster\n",
|
1178 | 1190 | "{0: 64, 1: 50, 2: 36}\n",
|
| 1191 | + "Silhouette Score for 3 Clusters\n", |
| 1192 | + "0.5543236611296415\n", |
1179 | 1193 | "\n",
|
1180 | 1194 | "\n"
|
1181 | 1195 | ],
|
|
1185 | 1199 | "output_type": "execute_result",
|
1186 | 1200 | "data": {
|
1187 | 1201 | "text/plain": [
|
1188 | | - "<matplotlib.legend.Legend at 0x7f27cffcdb70>" |
| 1202 | + "<matplotlib.legend.Legend at 0x7ffa7a180d30>" |
1189 | 1203 | ]
|
1190 | 1204 | },
|
1191 | 1205 | "metadata": {
|
|
0 commit comments