From 0b426c012415965d25359c3a061d2db0cdf50659 Mon Sep 17 00:00:00 2001
From: Krishna Vedala <7001608+kvedala@users.noreply.github.com>
Date: Sat, 8 Aug 2020 12:27:46 -0400
Subject: [PATCH] [doc fix] fix documentations in k_means (#592)
* fix documentations
* clang-tidy fixes for 814f9077b7a254e14d447871efb26f0fdb0a7671
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
---
machine_learning/k_means_clustering.c | 451 +++++++++++++++-----------
1 file changed, 253 insertions(+), 198 deletions(-)
diff --git a/machine_learning/k_means_clustering.c b/machine_learning/k_means_clustering.c
index 11b85505..d762a7b4 100644
--- a/machine_learning/k_means_clustering.c
+++ b/machine_learning/k_means_clustering.c
@@ -35,10 +35,11 @@
* the name observation is used to denote
* a random point in plane
*/
-typedef struct observation {
- double x; /**< abscissa of 2D data point */
- double y; /**< ordinate of 2D data point */
- int group; /**< the group no in which this observation would go */
+typedef struct observation
+{
+ double x; /**< abscissa of 2D data point */
+ double y; /**< ordinate of 2D data point */
+ int group; /**< the group no in which this observation would go */
} observation;
/*! @struct cluster
@@ -48,13 +49,14 @@ typedef struct observation {
* stores the count of observations
* belonging to this cluster
*/
-typedef struct cluster {
- double x; /**< abscissa centroid of this cluster */
- double y; /**< ordinate of centroid of this cluster */
- size_t count; /**< count of observations present in this cluster */
+typedef struct cluster
+{
+ double x; /**< abscissa centroid of this cluster */
+ double y; /**< ordinate of centroid of this cluster */
+ size_t count; /**< count of observations present in this cluster */
} cluster;
-/*! @fn calculateNearest
+/*!
* Returns the index of centroid nearest to
* given observation
*
@@ -64,24 +66,27 @@ typedef struct cluster {
*
* @returns the index of nearest centroid for given observation
*/
-int calculateNearst(observation* o, cluster clusters[], int k) {
- double minD = DBL_MAX;
- double dist = 0;
- int index = -1;
- int i = 0;
- for (; i < k; i++) {
- /* Calculate Squared Distance*/
- dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
- (clusters[i].y - o->y) * (clusters[i].y - o->y);
- if (dist < minD) {
- minD = dist;
- index = i;
+int calculateNearst(observation* o, cluster clusters[], int k)
+{
+ double minD = DBL_MAX;
+ double dist = 0;
+ int index = -1;
+ int i = 0;
+ for (; i < k; i++)
+ {
+ /* Calculate Squared Distance*/
+ dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
+ (clusters[i].y - o->y) * (clusters[i].y - o->y);
+ if (dist < minD)
+ {
+ minD = dist;
+ index = i;
+ }
}
- }
- return index;
+ return index;
}
-/*! @fn calculateCentroid
+/*!
* Calculate centoid and assign it to the cluster variable
*
* @param observations an array of observations whose centroid is calculated
@@ -90,22 +95,24 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
* centroid
*/
void calculateCentroid(observation observations[], size_t size,
- cluster* centroid) {
- size_t i = 0;
- centroid->x = 0;
- centroid->y = 0;
- centroid->count = size;
- for (; i < size; i++) {
- centroid->x += observations[i].x;
- centroid->y += observations[i].y;
- observations[i].group = 0;
- }
- centroid->x /= centroid->count;
- centroid->y /= centroid->count;
+ cluster* centroid)
+{
+ size_t i = 0;
+ centroid->x = 0;
+ centroid->y = 0;
+ centroid->count = size;
+ for (; i < size; i++)
+ {
+ centroid->x += observations[i].x;
+ centroid->y += observations[i].y;
+ observations[i].group = 0;
+ }
+ centroid->x /= centroid->count;
+ centroid->y /= centroid->count;
}
-/*! @fn kMeans
- * --K Means Algorithm--
+/*!
+ * --K Means Algorithm--
* 1. Assign each observation to one of k groups
* creating a random initial clustering
* 2. Find the centroid of observations for each
@@ -117,77 +124,93 @@ void calculateCentroid(observation observations[], size_t size,
* 5. Repeat step 2,3,4 until there is no change
* the current clustering and is same as last
* clustering.
+ *
* @param observations an array of observations to cluster
* @param size size of observations array
* @param k no of clusters to be made
*
* @returns pointer to cluster object
*/
-cluster* kMeans(observation observations[], size_t size, int k) {
- cluster* clusters = NULL;
- if (k <= 1) {
- /*
- If we have to cluster them only in one group
- then calculate centroid of observations and
- that will be a ingle cluster
- */
- clusters = (cluster*)malloc(sizeof(cluster));
- memset(clusters, 0, sizeof(cluster));
- calculateCentroid(observations, size, clusters);
- } else if (k < size) {
- clusters = malloc(sizeof(cluster) * k);
- memset(clusters, 0, k * sizeof(cluster));
- /* STEP 1 */
- for (size_t j = 0; j < size; j++) {
- observations[j].group = rand() % k;
+cluster* kMeans(observation observations[], size_t size, int k)
+{
+ cluster* clusters = NULL;
+ if (k <= 1)
+ {
+ /*
+ If we have to cluster them only in one group
+ then calculate centroid of observations and
+ that will be a ingle cluster
+ */
+ clusters = (cluster*)malloc(sizeof(cluster));
+ memset(clusters, 0, sizeof(cluster));
+ calculateCentroid(observations, size, clusters);
}
- size_t changed = 0;
- size_t minAcceptedError =
- size / 10000; // Do until 99.99 percent points are in correct cluster
- int t = 0;
- do {
- /* Initialize clusters */
- for (int i = 0; i < k; i++) {
- clusters[i].x = 0;
- clusters[i].y = 0;
- clusters[i].count = 0;
- }
- /* STEP 2*/
- for (size_t j = 0; j < size; j++) {
- t = observations[j].group;
- clusters[t].x += observations[j].x;
- clusters[t].y += observations[j].y;
- clusters[t].count++;
- }
- for (int i = 0; i < k; i++) {
- clusters[i].x /= clusters[i].count;
- clusters[i].y /= clusters[i].count;
- }
- /* STEP 3 and 4 */
- changed = 0; // this variable stores change in clustering
- for (size_t j = 0; j < size; j++) {
- t = calculateNearst(observations + j, clusters, k);
- if (t != observations[j].group) {
- changed++;
- observations[j].group = t;
+ else if (k < size)
+ {
+ clusters = malloc(sizeof(cluster) * k);
+ memset(clusters, 0, k * sizeof(cluster));
+ /* STEP 1 */
+ for (size_t j = 0; j < size; j++)
+ {
+ observations[j].group = rand() % k;
}
- }
- } while (changed > minAcceptedError); // Keep on grouping until we have
- // got almost best clustering
- } else {
- /* If no of clusters is more than observations
- each observation can be its own cluster
- */
- clusters = (cluster*)malloc(sizeof(cluster) * k);
- memset(clusters, 0, k * sizeof(cluster));
- for (int j = 0; j < size; j++) {
- clusters[j].x = observations[j].x;
- clusters[j].y = observations[j].y;
- clusters[j].count = 1;
- observations[j].group = j;
+ size_t changed = 0;
+ size_t minAcceptedError =
+ size /
+ 10000; // Do until 99.99 percent points are in correct cluster
+ int t = 0;
+ do
+ {
+ /* Initialize clusters */
+ for (int i = 0; i < k; i++)
+ {
+ clusters[i].x = 0;
+ clusters[i].y = 0;
+ clusters[i].count = 0;
+ }
+ /* STEP 2*/
+ for (size_t j = 0; j < size; j++)
+ {
+ t = observations[j].group;
+ clusters[t].x += observations[j].x;
+ clusters[t].y += observations[j].y;
+ clusters[t].count++;
+ }
+ for (int i = 0; i < k; i++)
+ {
+ clusters[i].x /= clusters[i].count;
+ clusters[i].y /= clusters[i].count;
+ }
+ /* STEP 3 and 4 */
+ changed = 0; // this variable stores change in clustering
+ for (size_t j = 0; j < size; j++)
+ {
+ t = calculateNearst(observations + j, clusters, k);
+ if (t != observations[j].group)
+ {
+ changed++;
+ observations[j].group = t;
+ }
+ }
+ } while (changed > minAcceptedError); // Keep on grouping until we have
+ // got almost best clustering
}
- }
- return clusters;
+ else
+ {
+ /* If no of clusters is more than observations
+ each observation can be its own cluster
+ */
+ clusters = (cluster*)malloc(sizeof(cluster) * k);
+ memset(clusters, 0, k * sizeof(cluster));
+ for (int j = 0; j < size; j++)
+ {
+ clusters[j].x = observations[j].x;
+ clusters[j].y = observations[j].y;
+ clusters[j].count = 1;
+ observations[j].group = j;
+ }
+ }
+ return clusters;
}
/**
@@ -195,73 +218,96 @@ cluster* kMeans(observation observations[], size_t size, int k) {
* @}
*/
-/*! @fn printEPS
+/*!
* A function to print observations and clusters
* The code is taken from
- * @link http://rosettacode.org/wiki/K-means%2B%2B_clustering
- * its C implementation
+ * http://rosettacode.org/wiki/K-means%2B%2B_clustering.
* Even the K Means code is also inspired from it
*
- * Note: To print in a file use pipeline operator ( ./a.out > image.eps )
+ * @note To print in a file use pipeline operator
+ * ```sh
+ * ./k_means_clustering > image.eps
+ * ```
*
* @param observations observations array
* @param len size of observation array
* @param cent clusters centroid's array
* @param k size of cent array
*/
-void printEPS(observation pts[], size_t len, cluster cent[], int k) {
- int W = 400, H = 400;
- double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
- double scale = 0, cx = 0, cy = 0;
- double* colors = (double*)malloc(sizeof(double) * (k * 3));
- int i;
- size_t j;
- double kd = k * 1.0;
- for (i = 0; i < k; i++) {
- *(colors + 3 * i) = (3 * (i + 1) % k) / kd;
- *(colors + 3 * i + 1) = (7 * i % k) / kd;
- *(colors + 3 * i + 2) = (9 * i % k) / kd;
- }
-
- for (j = 0; j < len; j++) {
- if (max_x < pts[j].x) max_x = pts[j].x;
- if (min_x > pts[j].x) min_x = pts[j].x;
- if (max_y < pts[j].y) max_y = pts[j].y;
- if (min_y > pts[j].y) min_y = pts[j].y;
- }
- scale = W / (max_x - min_x);
- if (scale > (H / (max_y - min_y))) {
- scale = H / (max_y - min_y);
- };
- cx = (max_x + min_x) / 2;
- cy = (max_y + min_y) / 2;
-
- printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
- H + 10);
- printf(
- "/l {rlineto} def /m {rmoveto} def\n"
- "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
- "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
- " gsave 1 setgray fill grestore gsave 3 setlinewidth"
- " 1 setgray stroke grestore 0 setgray stroke }def\n");
- for (int i = 0; i < k; i++) {
- printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1),
- *(colors + 3 * i + 2));
- for (j = 0; j < len; j++) {
- if (pts[j].group != i) continue;
- printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
- (pts[j].y - cy) * scale + H / 2);
+void printEPS(observation pts[], size_t len, cluster cent[], int k)
+{
+ int W = 400, H = 400;
+ double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
+ double scale = 0, cx = 0, cy = 0;
+ double* colors = (double*)malloc(sizeof(double) * (k * 3));
+ int i;
+ size_t j;
+ double kd = k * 1.0;
+ for (i = 0; i < k; i++)
+ {
+ *(colors + 3 * i) = (3 * (i + 1) % k) / kd;
+ *(colors + 3 * i + 1) = (7 * i % k) / kd;
+ *(colors + 3 * i + 2) = (9 * i % k) / kd;
}
- printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
- (cent[i].y - cy) * scale + H / 2);
- }
- printf("\n%%%%EOF");
- // free accquired memory
- free(colors);
+ for (j = 0; j < len; j++)
+ {
+ if (max_x < pts[j].x)
+ {
+ max_x = pts[j].x;
+ }
+ if (min_x > pts[j].x)
+ {
+ min_x = pts[j].x;
+ }
+ if (max_y < pts[j].y)
+ {
+ max_y = pts[j].y;
+ }
+ if (min_y > pts[j].y)
+ {
+ min_y = pts[j].y;
+ }
+ }
+ scale = W / (max_x - min_x);
+ if (scale > (H / (max_y - min_y)))
+ {
+ scale = H / (max_y - min_y);
+ };
+ cx = (max_x + min_x) / 2;
+ cy = (max_y + min_y) / 2;
+
+ printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
+ H + 10);
+ printf(
+ "/l {rlineto} def /m {rmoveto} def\n"
+ "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
+ "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
+ " gsave 1 setgray fill grestore gsave 3 setlinewidth"
+ " 1 setgray stroke grestore 0 setgray stroke }def\n");
+ for (int i = 0; i < k; i++)
+ {
+ printf("%g %g %g setrgbcolor\n", *(colors + 3 * i),
+ *(colors + 3 * i + 1), *(colors + 3 * i + 2));
+ for (j = 0; j < len; j++)
+ {
+ if (pts[j].group != i)
+ {
+ continue;
+ }
+ printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
+ (pts[j].y - cy) * scale + H / 2);
+ }
+ printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
+ (cent[i].y - cy) * scale + H / 2);
+ }
+ printf("\n%%%%EOF");
+
+ // free accquired memory
+ free(colors);
}
-/*! @fn test
+/*!
* A function to test the kMeans function
* Generates 100000 points in a circle of
* radius 20.0 with center at (0,0)
@@ -270,29 +316,33 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
*
+ * @returns None
*/
-static void test() {
- size_t size = 100000L;
- observation* observations = (observation*)malloc(sizeof(observation) * size);
- double maxRadius = 20.00;
- double radius = 0;
- double ang = 0;
- size_t i = 0;
- for (; i < size; i++) {
- radius = maxRadius * ((double)rand() / RAND_MAX);
- ang = 2 * M_PI * ((double)rand() / RAND_MAX);
- observations[i].x = radius * cos(ang);
- observations[i].y = radius * sin(ang);
- }
- int k = 5; // No of clusters
- cluster* clusters = kMeans(observations, size, k);
- printEPS(observations, size, clusters, k);
- // Free the accquired memory
- free(observations);
- free(clusters);
+static void test()
+{
+ size_t size = 100000L;
+ observation* observations =
+ (observation*)malloc(sizeof(observation) * size);
+ double maxRadius = 20.00;
+ double radius = 0;
+ double ang = 0;
+ size_t i = 0;
+ for (; i < size; i++)
+ {
+ radius = maxRadius * ((double)rand() / RAND_MAX);
+ ang = 2 * M_PI * ((double)rand() / RAND_MAX);
+ observations[i].x = radius * cos(ang);
+ observations[i].y = radius * sin(ang);
+ }
+ int k = 5; // No of clusters
+ cluster* clusters = kMeans(observations, size, k);
+ printEPS(observations, size, clusters, k);
+ // Free the accquired memory
+ free(observations);
+ free(clusters);
}
-/*! @fn test2
+/*!
* A function to test the kMeans function
* Generates 1000000 points in a circle of
* radius 20.0 with center at (0,0)
@@ -301,35 +351,40 @@ static void test() {
*
+ * @returns None
*/
-void test2() {
- size_t size = 1000000L;
- observation* observations = (observation*)malloc(sizeof(observation) * size);
- double maxRadius = 20.00;
- double radius = 0;
- double ang = 0;
- size_t i = 0;
- for (; i < size; i++) {
- radius = maxRadius * ((double)rand() / RAND_MAX);
- ang = 2 * M_PI * ((double)rand() / RAND_MAX);
- observations[i].x = radius * cos(ang);
- observations[i].y = radius * sin(ang);
- }
- int k = 11; // No of clusters
- cluster* clusters = kMeans(observations, size, k);
- printEPS(observations, size, clusters, k);
- // Free the accquired memory
- free(observations);
- free(clusters);
+void test2()
+{
+ size_t size = 1000000L;
+ observation* observations =
+ (observation*)malloc(sizeof(observation) * size);
+ double maxRadius = 20.00;
+ double radius = 0;
+ double ang = 0;
+ size_t i = 0;
+ for (; i < size; i++)
+ {
+ radius = maxRadius * ((double)rand() / RAND_MAX);
+ ang = 2 * M_PI * ((double)rand() / RAND_MAX);
+ observations[i].x = radius * cos(ang);
+ observations[i].y = radius * sin(ang);
+ }
+ int k = 11; // No of clusters
+ cluster* clusters = kMeans(observations, size, k);
+ printEPS(observations, size, clusters, k);
+ // Free the accquired memory
+ free(observations);
+ free(clusters);
}
-/*! @fn main
+/*!
* This function calls the test
* function
*/
-int main() {
- srand(time(NULL));
- test();
- /* test2(); */
- return 0;
+int main()
+{
+ srand(time(NULL));
+ test();
+ /* test2(); */
+ return 0;
}