Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5fc59f3

Browse files
Initial commit
0 parents commit 5fc59f3

File tree

8 files changed

+351
-0
lines changed

8 files changed

+351
-0
lines changed

‎README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# K-Means clustering in C++
2+
3+
#### This is a C++ implementation of the simple K-Means clustering algorithm.
4+
5+
K-means clustering is a type of unsupervised learning, which is used when you have unlabeled data (i.e., data without defined categories or groups). The goal of this algorithm is to find groups in the data, with the number of groups represented by the variable K. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity.
6+
7+
## Instructions:
8+
The program accepts any number of points with any number of dimensions, as long as every point has the same number of dimensions. Prepare the input file (for example "input.txt") accordingly.
9+
10+
* Download the binary file "kmeans" from the repository.
11+
* Make a file "input.txt" with all the point coordinates. The format should be as shown below (the example has 2-dimensional coordinates):
12+
13+
![Input File Syntax](image/input.png)
14+
* Run the kmeans binary with the input file name and number of clusters as command line arguments, as shown below:
15+
16+
![Bash Output](image/cmd.png)
17+
* The output is the center point (centroid) of each cluster, one cluster per line, saved in the "clusters.txt" file as shown below:
18+
19+
![Clusters File Syntax](image/clusters.png)
20+
21+
-------------------------------

‎clusters.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
0.0333333 0.166667
2+
8 9.25

‎image/clusters.png

5.17 KB
Loading[フレーム]

‎image/cmd.png

19 KB
Loading[フレーム]

‎image/input.png

7.98 KB
Loading[フレーム]

‎input.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
9 9
2+
1 1
3+
-1 -1
4+
3 3
5+
10 10
6+
-2 -2
7+
7 8
8+
0.2 0
9+
-1 0
10+
6 10

‎kmeans

261 KB
Binary file not shown.

‎kmeans.cpp

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
7+
8+
using namespace std;
9+
10+
// A single data point: an id, the id of the cluster it is currently
// assigned to, and its coordinates.
class Point{

private:
    int pointId, clusterId;
    int dimensions;
    std::vector<double> values;

public:
    // Builds a point from one line of whitespace-separated numbers.
    //   id   - identifier stored for this point
    //   line - text to parse; reading stops at the first token that is not
    //          a number, so an empty line yields a point with 0 dimensions
    // The point starts with cluster id 0, i.e. not assigned to any cluster.
    Point(int id, const std::string& line)
        : pointId(id), clusterId(0), dimensions(0) {
        std::stringstream is(line);
        double val;
        while(is >> val){
            values.push_back(val);
        }
        // Derive the count from the container so the two can never disagree.
        dimensions = static_cast<int>(values.size());
    }

    // Number of coordinates parsed from the input line.
    int getDimensions() const {
        return dimensions;
    }

    // Id of the cluster this point is assigned to (0 = none yet).
    int getCluster() const {
        return clusterId;
    }

    // Identifier given at construction.
    int getID() const {
        return pointId;
    }

    // Records the id of the cluster this point now belongs to.
    void setCluster(int val){
        clusterId = val;
    }

    // Coordinate at index pos. No bounds check is performed: pos must be
    // in [0, getDimensions()).
    double getVal(int pos) const {
        return values[pos];
    }
};
50+
51+
class Cluster{
52+
53+
private:
54+
int clusterId;
55+
vector<double> centroid;
56+
vector<Point> points;
57+
58+
public:
59+
Cluster(int clusterId, Point centroid){
60+
this->clusterId = clusterId;
61+
for(int i=0; i<centroid.getDimensions(); i++){
62+
this->centroid.push_back(centroid.getVal(i));
63+
}
64+
this->addPoint(centroid);
65+
}
66+
67+
void addPoint(Point p){
68+
p.setCluster(this->clusterId);
69+
points.push_back(p);
70+
}
71+
72+
bool removePoint(int pointId){
73+
int size = points.size();
74+
75+
for(int i = 0; i < size; i++)
76+
{
77+
if(points[i].getID() == pointId)
78+
{
79+
points.erase(points.begin() + i);
80+
return true;
81+
}
82+
}
83+
return false;
84+
}
85+
86+
int getId(){
87+
return clusterId;
88+
}
89+
90+
Point getPoint(int pos){
91+
return points[pos];
92+
}
93+
94+
int getSize(){
95+
return points.size();
96+
}
97+
98+
double getCentroidByPos(int pos) {
99+
return centroid[pos];
100+
}
101+
102+
void setCentroidByPos(int pos, double val){
103+
this->centroid[pos] = val;
104+
}
105+
};
106+
107+
class KMeans{
108+
private:
109+
int K, iters, dimensions, total_points;
110+
vector<Cluster> clusters;
111+
112+
int getNearestClusterId(Point point){
113+
double sum = 0.0, min_dist;
114+
int NearestClusterId;
115+
116+
for(int i = 0; i < dimensions; i++)
117+
{
118+
sum += pow(clusters[0].getCentroidByPos(i) - point.getVal(i), 2.0);
119+
}
120+
121+
min_dist = sqrt(sum);
122+
NearestClusterId = clusters[0].getId();
123+
124+
for(int i = 1; i < K; i++)
125+
{
126+
double dist;
127+
sum = 0.0;
128+
129+
for(int j = 0; j < dimensions; j++)
130+
{
131+
sum += pow(clusters[i].getCentroidByPos(j) - point.getVal(j), 2.0);
132+
}
133+
134+
dist = sqrt(sum);
135+
136+
if(dist < min_dist)
137+
{
138+
min_dist = dist;
139+
NearestClusterId = clusters[i].getId();
140+
}
141+
}
142+
143+
return NearestClusterId;
144+
}
145+
146+
public:
147+
KMeans(int K, int iterations){
148+
this->K = K;
149+
this->iters = iterations;
150+
}
151+
152+
void run(vector<Point>& all_points){
153+
154+
total_points = all_points.size();
155+
dimensions = all_points[0].getDimensions();
156+
157+
158+
//Initializing Clusters
159+
vector<int> used_pointIds;
160+
161+
for(int i=1; i<=K; i++)
162+
{
163+
while(true)
164+
{
165+
int index = rand() % total_points;
166+
167+
if(find(used_pointIds.begin(), used_pointIds.end(), index) == used_pointIds.end())
168+
{
169+
used_pointIds.push_back(index);
170+
all_points[index].setCluster(i);
171+
Cluster cluster(i, all_points[index]);
172+
clusters.push_back(cluster);
173+
break;
174+
}
175+
}
176+
}
177+
cout<<"Clusters initialized = "<<clusters.size()<<endl<<endl;
178+
179+
180+
cout<<"Running K-Means Clustering.."<<endl;
181+
182+
int iter = 1;
183+
while(true)
184+
{
185+
cout<<"Iter - "<<iter<<"/"<<iters<<endl;
186+
bool done = true;
187+
188+
// Add all points to their nearest cluster
189+
for(int i = 0; i < total_points; i++)
190+
{
191+
int currentClusterId = all_points[i].getCluster();
192+
int nearestClusterId = getNearestClusterId(all_points[i]);
193+
194+
if(currentClusterId != nearestClusterId)
195+
{
196+
if(currentClusterId != 0){
197+
for(int j=0; j<K; j++){
198+
if(clusters[j].getId() == currentClusterId){
199+
clusters[j].removePoint(all_points[i].getID());
200+
}
201+
}
202+
}
203+
204+
for(int j=0; j<K; j++){
205+
if(clusters[j].getId() == nearestClusterId){
206+
clusters[j].addPoint(all_points[i]);
207+
}
208+
}
209+
all_points[i].setCluster(nearestClusterId);
210+
done = false;
211+
}
212+
}
213+
214+
// Recalculating the center of each cluster
215+
for(int i = 0; i < K; i++)
216+
{
217+
int ClusterSize = clusters[i].getSize();
218+
219+
for(int j = 0; j < dimensions; j++)
220+
{
221+
double sum = 0.0;
222+
if(ClusterSize > 0)
223+
{
224+
for(int p = 0; p < ClusterSize; p++)
225+
sum += clusters[i].getPoint(p).getVal(j);
226+
clusters[i].setCentroidByPos(j, sum / ClusterSize);
227+
}
228+
}
229+
}
230+
231+
if(done || iter >= iters)
232+
{
233+
cout << "Clustering completed in iteration : " <<iter<<endl<<endl;
234+
break;
235+
}
236+
iter++;
237+
}
238+
239+
240+
//Print pointIds in each cluster
241+
for(int i=0; i<K; i++){
242+
cout<<"Points in cluster "<<clusters[i].getId()<<" : ";
243+
for(int j=0; j<clusters[i].getSize(); j++){
244+
cout<<clusters[i].getPoint(j).getID()<<" ";
245+
}
246+
cout<<endl<<endl;
247+
}
248+
cout<<"========================"<<endl<<endl;
249+
250+
//Write cluster centers to file
251+
ofstream outfile;
252+
outfile.open("clusters.txt");
253+
if(outfile.is_open()){
254+
for(int i=0; i<K; i++){
255+
cout<<"Cluster "<<clusters[i].getId()<<" centroid : ";
256+
for(int j=0; j<dimensions; j++){
257+
cout<<clusters[i].getCentroidByPos(j)<<" "; //Output to console
258+
outfile<<clusters[i].getCentroidByPos(j)<<" "; //Output to file
259+
}
260+
cout<<endl;
261+
outfile<<endl;
262+
}
263+
outfile.close();
264+
}
265+
else{
266+
cout<<"Error: Unable to write to clusters.txt";
267+
}
268+
269+
}
270+
};
271+
272+
int main(int argc, char **argv){
273+
274+
//Need 2 arguments (except filename) to run, else exit
275+
if(argc != 3){
276+
cout<<"Error: command-line argument count mismatch.";
277+
return 1;
278+
}
279+
280+
//Fetching number of clusters
281+
int K = atoi(argv[2]);
282+
283+
//Open file for fetching points
284+
string filename = argv[1];
285+
ifstream infile(filename.c_str());
286+
287+
if(!infile.is_open()){
288+
cout<<"Error: Failed to open file."<<endl;
289+
return 1;
290+
}
291+
292+
//Fetching points from file
293+
int pointId = 1;
294+
vector<Point> all_points;
295+
string line;
296+
297+
while(getline(infile, line)){
298+
Point point(pointId, line);
299+
all_points.push_back(point);
300+
pointId++;
301+
}
302+
infile.close();
303+
cout<<"\nData fetched successfully!"<<endl<<endl;
304+
305+
//Return if number of clusters > number of points
306+
if(all_points.size() < K){
307+
cout<<"Error: Number of clusters greater than number of points."<<endl;
308+
return 1;
309+
}
310+
311+
//Running K-Means Clustering
312+
int iters = 100;
313+
314+
KMeans kmeans(K, iters);
315+
kmeans.run(all_points);
316+
317+
return 0;
318+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /