Algorithms_in_C  1.0.0
Set of algorithms implemented in C.
K-Means Clustering Algorithm
Collaboration diagram for K-Means Clustering Algorithm:

Data Structures

struct  observation
 
struct  cluster
 

Typedefs

typedef struct observation observation
 
typedef struct cluster cluster
 

Functions

int calculateNearst (observation *o, cluster clusters[], int k)
 
void calculateCentroid (observation observations[], size_t size, cluster *centroid)
 
cluster * kMeans (observation observations[], size_t size, int k)
 

Detailed Description

Function Documentation

◆ calculateCentroid()

void calculateCentroid ( observation  observations[],
size_t  size,
cluster * centroid 
)

Calculate the centroid of the observations and assign it to the cluster variable

Parameters
observations: an array of observations whose centroid is calculated
size: size of the observations array
centroid: a reference to the cluster object in which the centroid information is stored
99 {
100  size_t i = 0;
101  centroid->x = 0;
102  centroid->y = 0;
103  centroid->count = size;
104  for (; i < size; i++)
105  {
106  centroid->x += observations[i].x;
107  centroid->y += observations[i].y;
108  observations[i].group = 0;
109  }
110  centroid->x /= centroid->count;
111  centroid->y /= centroid->count;
112 }
double y
ordinate of centroid of this cluster
Definition: k_means_clustering.c:55
double x
abscissa of centroid of this cluster
Definition: k_means_clustering.c:54
size_t count
count of observations present in this cluster
Definition: k_means_clustering.c:56
double x
abscissa of 2D data point
Definition: k_means_clustering.c:40
int group
the group no in which this observation would go
Definition: k_means_clustering.c:42
double y
ordinate of 2D data point
Definition: k_means_clustering.c:41

◆ calculateNearst()

int calculateNearst ( observation * o,
cluster  clusters[],
int  k 
)

Returns the index of centroid nearest to given observation

Parameters
o: the observation
clusters: array of clusters holding the centroids' coordinates
k: size of the clusters array
Returns
the index of nearest centroid for given observation
70 {
71  double minD = DBL_MAX;
72  double dist = 0;
73  int index = -1;
74  int i = 0;
75  for (; i < k; i++)
76  {
77  /* Calculate Squared Distance*/
78  dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
79  (clusters[i].y - o->y) * (clusters[i].y - o->y);
80  if (dist < minD)
81  {
82  minD = dist;
83  index = i;
84  }
85  }
86  return index;
87 }

◆ kMeans()

cluster* kMeans ( observation  observations[],
size_t  size,
int  k 
)

–K Means Algorithm–

  1. Assign each observation to one of k groups creating a random initial clustering
  2. Find the centroid of observations for each cluster to form new centroids
  3. Find the centroid which is nearest for each observation among the calculated centroids
  4. Assign the observation to its nearest centroid to create a new clustering.
  5. Repeat steps 2, 3 and 4 until the clustering no longer changes, i.e. the current clustering is the same as the previous one.
Parameters
observations: an array of observations to cluster
size: size of the observations array
k: number of clusters to be made
Returns
pointer to cluster object
135 {
136  cluster* clusters = NULL;
137  if (k <= 1)
138  {
139  /*
140  If we have to cluster them only in one group
141  then calculate centroid of observations and
142  that will be a single cluster
143  */
144  clusters = (cluster*)malloc(sizeof(cluster));
145  memset(clusters, 0, sizeof(cluster));
146  calculateCentroid(observations, size, clusters);
147  }
148  else if (k < size)
149  {
150  clusters = malloc(sizeof(cluster) * k);
151  memset(clusters, 0, k * sizeof(cluster));
152  /* STEP 1 */
153  for (size_t j = 0; j < size; j++)
154  {
155  observations[j].group = rand() % k;
156  }
157  size_t changed = 0;
158  size_t minAcceptedError =
159  size /
160  10000; // Do until 99.99 percent points are in correct cluster
161  int t = 0;
162  do
163  {
164  /* Initialize clusters */
165  for (int i = 0; i < k; i++)
166  {
167  clusters[i].x = 0;
168  clusters[i].y = 0;
169  clusters[i].count = 0;
170  }
171  /* STEP 2*/
172  for (size_t j = 0; j < size; j++)
173  {
174  t = observations[j].group;
175  clusters[t].x += observations[j].x;
176  clusters[t].y += observations[j].y;
177  clusters[t].count++;
178  }
179  for (int i = 0; i < k; i++)
180  {
181  clusters[i].x /= clusters[i].count;
182  clusters[i].y /= clusters[i].count;
183  }
184  /* STEP 3 and 4 */
185  changed = 0; // this variable stores change in clustering
186  for (size_t j = 0; j < size; j++)
187  {
188  t = calculateNearst(observations + j, clusters, k);
189  if (t != observations[j].group)
190  {
191  changed++;
192  observations[j].group = t;
193  }
194  }
195  } while (changed > minAcceptedError); // Keep on grouping until we have
196  // got almost best clustering
197  }
198  else
199  {
200  /* If no of clusters is more than observations
201  each observation can be its own cluster
202  */
203  clusters = (cluster*)malloc(sizeof(cluster) * k);
204  memset(clusters, 0, k * sizeof(cluster));
205  for (int j = 0; j < size; j++)
206  {
207  clusters[j].x = observations[j].x;
208  clusters[j].y = observations[j].y;
209  clusters[j].count = 1;
210  observations[j].group = j;
211  }
212  }
213  return clusters;
214 }
int calculateNearst(observation *o, cluster clusters[], int k)
Definition: k_means_clustering.c:69
void calculateCentroid(observation observations[], size_t size, cluster *centroid)
Definition: k_means_clustering.c:97
#define malloc(bytes)
This macro replace the standard malloc function with malloc_dbg.
Definition: malloc_dbg.h:18
Definition: k_means_clustering.c:53
Here is the call graph for this function: