/** * @file k_means_clustering.cpp * @brief K Means Clustering Algorithm implemented * @details * This file has K Means algorithm implemmented * It prints test output in eps format * * Note: * Though the code for clustering works for all the * 2D data points and can be extended for any size vector * by making the required changes, but note that * the output method i.e. printEPS is only good for * polar data points i.e. in a circle and both test * use the same. * @author [Lakhan Nad](https://github.com/Lakhan-Nad) */ #define _USE_MATH_DEFINES /* required for MS Visual C */ #include /* DBL_MAX, DBL_MIN */ #include /* PI, sin, cos */ #include /* printf */ #include /* rand */ #include /* memset */ #include /* time */ /*! * @addtogroup machine_learning Machine Learning Algorithms * @{ * @addtogroup k_means K-Means Clustering Algorithm * @{ */ /*! @struct observation * a class to store points in 2d plane * the name observation is used to denote * a random point in plane */ typedef struct observation { double x; /**< abscissa of 2D data point */ double y; /**< ordinate of 2D data point */ int group; /**< the group no in which this observation would go */ } observation; /*! @struct cluster * this class stores the coordinates * of centroid of all the points * in that cluster it also * stores the count of observations * belonging to this cluster */ typedef struct cluster { double x; /**< abscissa centroid of this cluster */ double y; /**< ordinate of centroid of this cluster */ size_t count; /**< count of observations present in this cluster */ } cluster; /*! @fn calculateNearest * Returns the index of centroid nearest to * given observation * * @param o observation * @param clusters array of cluster having centroids coordinates * @param k size of clusters array * * @returns the index of nearest centroid for given observation */ int calculateNearst(observation* o, cluster clusters[], int k) { double minD = DBL_MAX; double dist = 0; int index = -1; int i = 0; for (; i < k; i++) { /* Calculate Squared Distance*/ dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) + (clusters[i].y - o->y) * (clusters[i].y - o->y); if (dist < minD) { minD = dist; index = i; } } return index; } /*! @fn calculateCentroid * Calculate centoid and assign it to the cluster variable * * @param observations an array of observations whose centroid is calculated * @param size size of the observations array * @param centroid a reference to cluster object to store information of * centroid */ void calculateCentroid(observation observations[], size_t size, cluster* centroid) { size_t i = 0; centroid->x = 0; centroid->y = 0; centroid->count = size; for (; i < size; i++) { centroid->x += observations[i].x; centroid->y += observations[i].y; observations[i].group = 0; } centroid->x /= centroid->count; centroid->y /= centroid->count; } /*! @fn kMeans * --K Means Algorithm-- * 1. Assign each observation to one of k groups * creating a random initial clustering * 2. Find the centroid of observations for each * cluster to form new centroids * 3. Find the centroid which is nearest for each * observation among the calculated centroids * 4. Assign the observation to its nearest centroid * to create a new clustering. * 5. Repeat step 2,3,4 until there is no change * the current clustering and is same as last * clustering. * @param observations an array of observations to cluster * @param size size of observations array * @param k no of clusters to be made * * @returns pointer to cluster object */ cluster* kMeans(observation observations[], size_t size, int k) { cluster* clusters = NULL; if (k <= 1) { /* If we have to cluster them only in one group then calculate centroid of observations and that will be a ingle cluster */ clusters = (cluster*)malloc(sizeof(cluster)); memset(clusters, 0, sizeof(cluster)); calculateCentroid(observations, size, clusters); } else if (k < size) { clusters = malloc(sizeof(cluster) * k); memset(clusters, 0, k * sizeof(cluster)); /* STEP 1 */ for (size_t j = 0; j < size; j++) { observations[j].group = rand() % k; } size_t changed = 0; size_t minAcceptedError = size / 10000; // Do until 99.99 percent points are in correct cluster int t = 0; do { /* Initialize clusters */ for (int i = 0; i < k; i++) { clusters[i].x = 0; clusters[i].y = 0; clusters[i].count = 0; } /* STEP 2*/ for (size_t j = 0; j < size; j++) { t = observations[j].group; clusters[t].x += observations[j].x; clusters[t].y += observations[j].y; clusters[t].count++; } for (int i = 0; i < k; i++) { clusters[i].x /= clusters[i].count; clusters[i].y /= clusters[i].count; } /* STEP 3 and 4 */ changed = 0; // this variable stores change in clustering for (size_t j = 0; j < size; j++) { t = calculateNearst(observations + j, clusters, k); if (t != observations[j].group) { changed++; observations[j].group = t; } } } while (changed > minAcceptedError); // Keep on grouping until we have // got almost best clustering } else { /* If no of clusters is more than observations each observation can be its own cluster */ clusters = (cluster*)malloc(sizeof(cluster) * k); memset(clusters, 0, k * sizeof(cluster)); for (int j = 0; j < size; j++) { clusters[j].x = observations[j].x; clusters[j].y = observations[j].y; clusters[j].count = 1; observations[j].group = j; } } return clusters; } /** * @} * @} */ /*! @fn printEPS * A function to print observations and clusters * The code is taken from * @link http://rosettacode.org/wiki/K-means%2B%2B_clustering * its C implementation * Even the K Means code is also inspired from it * * Note: To print in a file use pipeline operator ( ./a.out > image.eps ) * * @param observations observations array * @param len size of observation array * @param cent clusters centroid's array * @param k size of cent array */ void printEPS(observation pts[], size_t len, cluster cent[], int k) { int W = 400, H = 400; double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN; double scale = 0, cx = 0, cy = 0; double* colors = (double*)malloc(sizeof(double) * (k * 3)); int i; size_t j; double kd = k * 1.0; for (i = 0; i < k; i++) { *(colors + 3 * i) = (3 * (i + 1) % k) / kd; *(colors + 3 * i + 1) = (7 * i % k) / kd; *(colors + 3 * i + 2) = (9 * i % k) / kd; } for (j = 0; j < len; j++) { if (max_x < pts[j].x) max_x = pts[j].x; if (min_x > pts[j].x) min_x = pts[j].x; if (max_y < pts[j].y) max_y = pts[j].y; if (min_y > pts[j].y) min_y = pts[j].y; } scale = W / (max_x - min_x); if (scale > (H / (max_y - min_y))) { scale = H / (max_y - min_y); }; cx = (max_x + min_x) / 2; cy = (max_y + min_y) / 2; printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10, H + 10); printf( "/l {rlineto} def /m {rmoveto} def\n" "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n" "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath " " gsave 1 setgray fill grestore gsave 3 setlinewidth" " 1 setgray stroke grestore 0 setgray stroke }def\n"); for (int i = 0; i < k; i++) { printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1), *(colors + 3 * i + 2)); for (j = 0; j < len; j++) { if (pts[j].group != i) continue; printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2, (pts[j].y - cy) * scale + H / 2); } printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2, (cent[i].y - cy) * scale + H / 2); } printf("\n%%%%EOF"); // free accquired memory free(colors); } /*! @fn test * A function to test the kMeans function * Generates 100000 points in a circle of * radius 20.0 with center at (0,0) * and cluster them into 5 clusters * * Output for 100000 points divided in 5 clusters */ static void test() { size_t size = 100000L; observation* observations = (observation*)malloc(sizeof(observation) * size); double maxRadius = 20.00; double radius = 0; double ang = 0; size_t i = 0; for (; i < size; i++) { radius = maxRadius * ((double)rand() / RAND_MAX); ang = 2 * M_PI * ((double)rand() / RAND_MAX); observations[i].x = radius * cos(ang); observations[i].y = radius * sin(ang); } int k = 5; // No of clusters cluster* clusters = kMeans(observations, size, k); printEPS(observations, size, clusters, k); // Free the accquired memory free(observations); free(clusters); } /*! @fn test2 * A function to test the kMeans function * Generates 1000000 points in a circle of * radius 20.0 with center at (0,0) * and cluster them into 11 clusters * * Output for 1000000 points divided in 11 clusters */ void test2() { size_t size = 1000000L; observation* observations = (observation*)malloc(sizeof(observation) * size); double maxRadius = 20.00; double radius = 0; double ang = 0; size_t i = 0; for (; i < size; i++) { radius = maxRadius * ((double)rand() / RAND_MAX); ang = 2 * M_PI * ((double)rand() / RAND_MAX); observations[i].x = radius * cos(ang); observations[i].y = radius * sin(ang); } int k = 11; // No of clusters cluster* clusters = kMeans(observations, size, k); printEPS(observations, size, clusters, k); // Free the accquired memory free(observations); free(clusters); } /*! @fn main * This function calls the test * function */ int main() { srand(time(NULL)); test(); /* test2(); */ return 0; }