1
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
7
+
8
+ using namespace std ;
9
+
10
/**
 * A single data sample: an identifier, the id of the cluster it is currently
 * assigned to, and its coordinates.
 */
class Point {

private:
    int pointId;
    int clusterId;               // 0 means "not assigned to any cluster yet"
    std::vector<double> values;  // one entry per dimension

public:
    /**
     * Builds a point from one line of text.
     *
     * @param id    Identifier stored for this point.
     * @param line  Whitespace-separated numbers. Extraction stops at the
     *              first token that cannot be read as a double, so the
     *              point keeps only the values that precede it.
     */
    Point(int id, const std::string& line) : pointId(id), clusterId(0) {
        std::stringstream is(line);
        double val;
        while (is >> val) {
            values.push_back(val);
        }
    }

    /** @return Number of coordinates parsed from the input line. */
    int getDimensions() const {
        return static_cast<int>(values.size());
    }

    /** @return Id of the cluster this point is assigned to (0 = none). */
    int getCluster() const {
        return clusterId;
    }

    /** @return The identifier given at construction. */
    int getID() const {
        return pointId;
    }

    /** Records the id of the cluster this point now belongs to. */
    void setCluster(int val) {
        clusterId = val;
    }

    /**
     * @param pos  Zero-based coordinate index.
     * @return The coordinate at that index.
     * @throws std::out_of_range if pos is negative or not less than
     *         getDimensions(); checked access is used so that a short row
     *         fails loudly instead of reading past the end of the vector.
     */
    double getVal(int pos) const {
        return values.at(static_cast<std::vector<double>::size_type>(pos));
    }
};
50
+
51
+ class Cluster {
52
+
53
+ private:
54
+ int clusterId;
55
+ vector<double > centroid;
56
+ vector<Point> points;
57
+
58
+ public:
59
+ Cluster (int clusterId, Point centroid){
60
+ this ->clusterId = clusterId;
61
+ for (int i=0 ; i<centroid.getDimensions (); i++){
62
+ this ->centroid .push_back (centroid.getVal (i));
63
+ }
64
+ this ->addPoint (centroid);
65
+ }
66
+
67
+ void addPoint (Point p){
68
+ p.setCluster (this ->clusterId );
69
+ points.push_back (p);
70
+ }
71
+
72
+ bool removePoint (int pointId){
73
+ int size = points.size ();
74
+
75
+ for (int i = 0 ; i < size; i++)
76
+ {
77
+ if (points[i].getID () == pointId)
78
+ {
79
+ points.erase (points.begin () + i);
80
+ return true ;
81
+ }
82
+ }
83
+ return false ;
84
+ }
85
+
86
+ int getId (){
87
+ return clusterId;
88
+ }
89
+
90
+ Point getPoint (int pos){
91
+ return points[pos];
92
+ }
93
+
94
+ int getSize (){
95
+ return points.size ();
96
+ }
97
+
98
+ double getCentroidByPos (int pos) {
99
+ return centroid[pos];
100
+ }
101
+
102
+ void setCentroidByPos (int pos, double val){
103
+ this ->centroid [pos] = val;
104
+ }
105
+ };
106
+
107
+ class KMeans {
108
+ private:
109
+ int K, iters, dimensions, total_points;
110
+ vector<Cluster> clusters;
111
+
112
+ int getNearestClusterId (Point point){
113
+ double sum = 0.0 , min_dist;
114
+ int NearestClusterId;
115
+
116
+ for (int i = 0 ; i < dimensions; i++)
117
+ {
118
+ sum += pow (clusters[0 ].getCentroidByPos (i) - point.getVal (i), 2.0 );
119
+ }
120
+
121
+ min_dist = sqrt (sum);
122
+ NearestClusterId = clusters[0 ].getId ();
123
+
124
+ for (int i = 1 ; i < K; i++)
125
+ {
126
+ double dist;
127
+ sum = 0.0 ;
128
+
129
+ for (int j = 0 ; j < dimensions; j++)
130
+ {
131
+ sum += pow (clusters[i].getCentroidByPos (j) - point.getVal (j), 2.0 );
132
+ }
133
+
134
+ dist = sqrt (sum);
135
+
136
+ if (dist < min_dist)
137
+ {
138
+ min_dist = dist;
139
+ NearestClusterId = clusters[i].getId ();
140
+ }
141
+ }
142
+
143
+ return NearestClusterId;
144
+ }
145
+
146
+ public:
147
+ KMeans (int K, int iterations){
148
+ this ->K = K;
149
+ this ->iters = iterations;
150
+ }
151
+
152
+ void run (vector<Point>& all_points){
153
+
154
+ total_points = all_points.size ();
155
+ dimensions = all_points[0 ].getDimensions ();
156
+
157
+
158
+ // Initializing Clusters
159
+ vector<int > used_pointIds;
160
+
161
+ for (int i=1 ; i<=K; i++)
162
+ {
163
+ while (true )
164
+ {
165
+ int index = rand () % total_points;
166
+
167
+ if (find (used_pointIds.begin (), used_pointIds.end (), index) == used_pointIds.end ())
168
+ {
169
+ used_pointIds.push_back (index);
170
+ all_points[index].setCluster (i);
171
+ Cluster cluster (i, all_points[index]);
172
+ clusters.push_back (cluster);
173
+ break ;
174
+ }
175
+ }
176
+ }
177
+ cout<<" Clusters initialized = " <<clusters.size ()<<endl<<endl;
178
+
179
+
180
+ cout<<" Running K-Means Clustering.." <<endl;
181
+
182
+ int iter = 1 ;
183
+ while (true )
184
+ {
185
+ cout<<" Iter - " <<iter<<" /" <<iters<<endl;
186
+ bool done = true ;
187
+
188
+ // Add all points to their nearest cluster
189
+ for (int i = 0 ; i < total_points; i++)
190
+ {
191
+ int currentClusterId = all_points[i].getCluster ();
192
+ int nearestClusterId = getNearestClusterId (all_points[i]);
193
+
194
+ if (currentClusterId != nearestClusterId)
195
+ {
196
+ if (currentClusterId != 0 ){
197
+ for (int j=0 ; j<K; j++){
198
+ if (clusters[j].getId () == currentClusterId){
199
+ clusters[j].removePoint (all_points[i].getID ());
200
+ }
201
+ }
202
+ }
203
+
204
+ for (int j=0 ; j<K; j++){
205
+ if (clusters[j].getId () == nearestClusterId){
206
+ clusters[j].addPoint (all_points[i]);
207
+ }
208
+ }
209
+ all_points[i].setCluster (nearestClusterId);
210
+ done = false ;
211
+ }
212
+ }
213
+
214
+ // Recalculating the center of each cluster
215
+ for (int i = 0 ; i < K; i++)
216
+ {
217
+ int ClusterSize = clusters[i].getSize ();
218
+
219
+ for (int j = 0 ; j < dimensions; j++)
220
+ {
221
+ double sum = 0.0 ;
222
+ if (ClusterSize > 0 )
223
+ {
224
+ for (int p = 0 ; p < ClusterSize; p++)
225
+ sum += clusters[i].getPoint (p).getVal (j);
226
+ clusters[i].setCentroidByPos (j, sum / ClusterSize);
227
+ }
228
+ }
229
+ }
230
+
231
+ if (done || iter >= iters)
232
+ {
233
+ cout << " Clustering completed in iteration : " <<iter<<endl<<endl;
234
+ break ;
235
+ }
236
+ iter++;
237
+ }
238
+
239
+
240
+ // Print pointIds in each cluster
241
+ for (int i=0 ; i<K; i++){
242
+ cout<<" Points in cluster " <<clusters[i].getId ()<<" : " ;
243
+ for (int j=0 ; j<clusters[i].getSize (); j++){
244
+ cout<<clusters[i].getPoint (j).getID ()<<" " ;
245
+ }
246
+ cout<<endl<<endl;
247
+ }
248
+ cout<<" ========================" <<endl<<endl;
249
+
250
+ // Write cluster centers to file
251
+ ofstream outfile;
252
+ outfile.open (" clusters.txt" );
253
+ if (outfile.is_open ()){
254
+ for (int i=0 ; i<K; i++){
255
+ cout<<" Cluster " <<clusters[i].getId ()<<" centroid : " ;
256
+ for (int j=0 ; j<dimensions; j++){
257
+ cout<<clusters[i].getCentroidByPos (j)<<" " ; // Output to console
258
+ outfile<<clusters[i].getCentroidByPos (j)<<" " ; // Output to file
259
+ }
260
+ cout<<endl;
261
+ outfile<<endl;
262
+ }
263
+ outfile.close ();
264
+ }
265
+ else {
266
+ cout<<" Error: Unable to write to clusters.txt" ;
267
+ }
268
+
269
+ }
270
+ };
271
+
272
+ int main (int argc, char **argv){
273
+
274
+ // Need 2 arguments (except filename) to run, else exit
275
+ if (argc != 3 ){
276
+ cout<<" Error: command-line argument count mismatch." ;
277
+ return 1 ;
278
+ }
279
+
280
+ // Fetching number of clusters
281
+ int K = atoi (argv[2 ]);
282
+
283
+ // Open file for fetching points
284
+ string filename = argv[1 ];
285
+ ifstream infile (filename.c_str ());
286
+
287
+ if (!infile.is_open ()){
288
+ cout<<" Error: Failed to open file." <<endl;
289
+ return 1 ;
290
+ }
291
+
292
+ // Fetching points from file
293
+ int pointId = 1 ;
294
+ vector<Point> all_points;
295
+ string line;
296
+
297
+ while (getline (infile, line)){
298
+ Point point (pointId, line);
299
+ all_points.push_back (point);
300
+ pointId++;
301
+ }
302
+ infile.close ();
303
+ cout<<" \n Data fetched successfully!" <<endl<<endl;
304
+
305
+ // Return if number of clusters > number of points
306
+ if (all_points.size () < K){
307
+ cout<<" Error: Number of clusters greater than number of points." <<endl;
308
+ return 1 ;
309
+ }
310
+
311
+ // Running K-Means Clustering
312
+ int iters = 100 ;
313
+
314
+ KMeans kmeans (K, iters);
315
+ kmeans.run (all_points);
316
+
317
+ return 0 ;
318
+ }
0 commit comments