[doc fix] fix documentations in k_means (#592)

* fix documentations

* clang-tidy fixes for 814f9077b7

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
This commit is contained in:
Krishna Vedala 2020-08-08 12:27:46 -04:00 committed by GitHub
parent 05d9af45f3
commit 0b426c0124
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -35,7 +35,8 @@
* the name observation is used to denote * the name observation is used to denote
* a random point in plane * a random point in plane
*/ */
typedef struct observation { typedef struct observation
{
double x; /**< abscissa of 2D data point */ double x; /**< abscissa of 2D data point */
double y; /**< ordinate of 2D data point */ double y; /**< ordinate of 2D data point */
int group; /**< the group no in which this observation would go */ int group; /**< the group no in which this observation would go */
@ -48,13 +49,14 @@ typedef struct observation {
* stores the count of observations * stores the count of observations
* belonging to this cluster * belonging to this cluster
*/ */
typedef struct cluster { typedef struct cluster
{
double x; /**< abscissa centroid of this cluster */ double x; /**< abscissa centroid of this cluster */
double y; /**< ordinate of centroid of this cluster */ double y; /**< ordinate of centroid of this cluster */
size_t count; /**< count of observations present in this cluster */ size_t count; /**< count of observations present in this cluster */
} cluster; } cluster;
/*! @fn calculateNearest /*!
* Returns the index of centroid nearest to * Returns the index of centroid nearest to
* given observation * given observation
* *
@ -64,16 +66,19 @@ typedef struct cluster {
* *
* @returns the index of nearest centroid for given observation * @returns the index of nearest centroid for given observation
*/ */
int calculateNearst(observation* o, cluster clusters[], int k) { int calculateNearst(observation* o, cluster clusters[], int k)
{
double minD = DBL_MAX; double minD = DBL_MAX;
double dist = 0; double dist = 0;
int index = -1; int index = -1;
int i = 0; int i = 0;
for (; i < k; i++) { for (; i < k; i++)
{
/* Calculate Squared Distance*/ /* Calculate Squared Distance*/
dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) + dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
(clusters[i].y - o->y) * (clusters[i].y - o->y); (clusters[i].y - o->y) * (clusters[i].y - o->y);
if (dist < minD) { if (dist < minD)
{
minD = dist; minD = dist;
index = i; index = i;
} }
@ -81,7 +86,7 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
return index; return index;
} }
/*! @fn calculateCentroid /*!
* Calculate centroid and assign it to the cluster variable * Calculate centroid and assign it to the cluster variable
* *
* @param observations an array of observations whose centroid is calculated * @param observations an array of observations whose centroid is calculated
@ -90,12 +95,14 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
* centroid * centroid
*/ */
void calculateCentroid(observation observations[], size_t size, void calculateCentroid(observation observations[], size_t size,
cluster* centroid) { cluster* centroid)
{
size_t i = 0; size_t i = 0;
centroid->x = 0; centroid->x = 0;
centroid->y = 0; centroid->y = 0;
centroid->count = size; centroid->count = size;
for (; i < size; i++) { for (; i < size; i++)
{
centroid->x += observations[i].x; centroid->x += observations[i].x;
centroid->y += observations[i].y; centroid->y += observations[i].y;
observations[i].group = 0; observations[i].group = 0;
@ -104,7 +111,7 @@ void calculateCentroid(observation observations[], size_t size,
centroid->y /= centroid->count; centroid->y /= centroid->count;
} }
/*! @fn kMeans /*!
* --K Means Algorithm-- * --K Means Algorithm--
* 1. Assign each observation to one of k groups * 1. Assign each observation to one of k groups
* creating a random initial clustering * creating a random initial clustering
@ -117,15 +124,18 @@ void calculateCentroid(observation observations[], size_t size,
* 5. Repeat step 2,3,4 until there is no change * 5. Repeat step 2,3,4 until there is no change
* in the current clustering, i.e. it is the same as the last * in the current clustering, i.e. it is the same as the last
* clustering. * clustering.
*
* @param observations an array of observations to cluster * @param observations an array of observations to cluster
* @param size size of observations array * @param size size of observations array
* @param k no of clusters to be made * @param k no of clusters to be made
* *
* @returns pointer to cluster object * @returns pointer to cluster object
*/ */
cluster* kMeans(observation observations[], size_t size, int k) { cluster* kMeans(observation observations[], size_t size, int k)
{
cluster* clusters = NULL; cluster* clusters = NULL;
if (k <= 1) { if (k <= 1)
{
/* /*
If we have to cluster them only in one group If we have to cluster them only in one group
then calculate centroid of observations and then calculate centroid of observations and
@ -134,53 +144,66 @@ cluster* kMeans(observation observations[], size_t size, int k) {
clusters = (cluster*)malloc(sizeof(cluster)); clusters = (cluster*)malloc(sizeof(cluster));
memset(clusters, 0, sizeof(cluster)); memset(clusters, 0, sizeof(cluster));
calculateCentroid(observations, size, clusters); calculateCentroid(observations, size, clusters);
} else if (k < size) { }
else if (k < size)
{
clusters = malloc(sizeof(cluster) * k); clusters = malloc(sizeof(cluster) * k);
memset(clusters, 0, k * sizeof(cluster)); memset(clusters, 0, k * sizeof(cluster));
/* STEP 1 */ /* STEP 1 */
for (size_t j = 0; j < size; j++) { for (size_t j = 0; j < size; j++)
{
observations[j].group = rand() % k; observations[j].group = rand() % k;
} }
size_t changed = 0; size_t changed = 0;
size_t minAcceptedError = size_t minAcceptedError =
size / 10000; // Do until 99.99 percent points are in correct cluster size /
10000; // Do until 99.99 percent points are in correct cluster
int t = 0; int t = 0;
do { do
{
/* Initialize clusters */ /* Initialize clusters */
for (int i = 0; i < k; i++) { for (int i = 0; i < k; i++)
{
clusters[i].x = 0; clusters[i].x = 0;
clusters[i].y = 0; clusters[i].y = 0;
clusters[i].count = 0; clusters[i].count = 0;
} }
/* STEP 2*/ /* STEP 2*/
for (size_t j = 0; j < size; j++) { for (size_t j = 0; j < size; j++)
{
t = observations[j].group; t = observations[j].group;
clusters[t].x += observations[j].x; clusters[t].x += observations[j].x;
clusters[t].y += observations[j].y; clusters[t].y += observations[j].y;
clusters[t].count++; clusters[t].count++;
} }
for (int i = 0; i < k; i++) { for (int i = 0; i < k; i++)
{
clusters[i].x /= clusters[i].count; clusters[i].x /= clusters[i].count;
clusters[i].y /= clusters[i].count; clusters[i].y /= clusters[i].count;
} }
/* STEP 3 and 4 */ /* STEP 3 and 4 */
changed = 0; // this variable stores change in clustering changed = 0; // this variable stores change in clustering
for (size_t j = 0; j < size; j++) { for (size_t j = 0; j < size; j++)
{
t = calculateNearst(observations + j, clusters, k); t = calculateNearst(observations + j, clusters, k);
if (t != observations[j].group) { if (t != observations[j].group)
{
changed++; changed++;
observations[j].group = t; observations[j].group = t;
} }
} }
} while (changed > minAcceptedError); // Keep on grouping until we have } while (changed > minAcceptedError); // Keep on grouping until we have
// got almost best clustering // got almost best clustering
} else { }
else
{
/* If no of clusters is more than observations /* If no of clusters is more than observations
each observation can be its own cluster each observation can be its own cluster
*/ */
clusters = (cluster*)malloc(sizeof(cluster) * k); clusters = (cluster*)malloc(sizeof(cluster) * k);
memset(clusters, 0, k * sizeof(cluster)); memset(clusters, 0, k * sizeof(cluster));
for (int j = 0; j < size; j++) { for (int j = 0; j < size; j++)
{
clusters[j].x = observations[j].x; clusters[j].x = observations[j].x;
clusters[j].y = observations[j].y; clusters[j].y = observations[j].y;
clusters[j].count = 1; clusters[j].count = 1;
@ -195,21 +218,24 @@ cluster* kMeans(observation observations[], size_t size, int k) {
* @} * @}
*/ */
/*! @fn printEPS /*!
* A function to print observations and clusters * A function to print observations and clusters
* The code is taken from * The code is taken from
* @link http://rosettacode.org/wiki/K-means%2B%2B_clustering * http://rosettacode.org/wiki/K-means%2B%2B_clustering.
* its C implementation
* Even the K Means code is also inspired from it * Even the K Means code is also inspired from it
* *
* Note: To print to a file use the redirection operator ( ./a.out > image.eps ) * @note To print to a file use the redirection operator
* ```sh
* ./k_means_clustering > image.eps
* ```
* *
* @param observations observations array * @param observations observations array
* @param len size of observation array * @param len size of observation array
* @param cent clusters centroid's array * @param cent clusters centroid's array
* @param k size of cent array * @param k size of cent array
*/ */
void printEPS(observation pts[], size_t len, cluster cent[], int k) { void printEPS(observation pts[], size_t len, cluster cent[], int k)
{
int W = 400, H = 400; int W = 400, H = 400;
double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN; double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
double scale = 0, cx = 0, cy = 0; double scale = 0, cx = 0, cy = 0;
@ -217,20 +243,35 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
int i; int i;
size_t j; size_t j;
double kd = k * 1.0; double kd = k * 1.0;
for (i = 0; i < k; i++) { for (i = 0; i < k; i++)
{
*(colors + 3 * i) = (3 * (i + 1) % k) / kd; *(colors + 3 * i) = (3 * (i + 1) % k) / kd;
*(colors + 3 * i + 1) = (7 * i % k) / kd; *(colors + 3 * i + 1) = (7 * i % k) / kd;
*(colors + 3 * i + 2) = (9 * i % k) / kd; *(colors + 3 * i + 2) = (9 * i % k) / kd;
} }
for (j = 0; j < len; j++) { for (j = 0; j < len; j++)
if (max_x < pts[j].x) max_x = pts[j].x; {
if (min_x > pts[j].x) min_x = pts[j].x; if (max_x < pts[j].x)
if (max_y < pts[j].y) max_y = pts[j].y; {
if (min_y > pts[j].y) min_y = pts[j].y; max_x = pts[j].x;
}
if (min_x > pts[j].x)
{
min_x = pts[j].x;
}
if (max_y < pts[j].y)
{
max_y = pts[j].y;
}
if (min_y > pts[j].y)
{
min_y = pts[j].y;
}
} }
scale = W / (max_x - min_x); scale = W / (max_x - min_x);
if (scale > (H / (max_y - min_y))) { if (scale > (H / (max_y - min_y)))
{
scale = H / (max_y - min_y); scale = H / (max_y - min_y);
}; };
cx = (max_x + min_x) / 2; cx = (max_x + min_x) / 2;
@ -244,11 +285,16 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath " "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
" gsave 1 setgray fill grestore gsave 3 setlinewidth" " gsave 1 setgray fill grestore gsave 3 setlinewidth"
" 1 setgray stroke grestore 0 setgray stroke }def\n"); " 1 setgray stroke grestore 0 setgray stroke }def\n");
for (int i = 0; i < k; i++) { for (int i = 0; i < k; i++)
printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1), {
*(colors + 3 * i + 2)); printf("%g %g %g setrgbcolor\n", *(colors + 3 * i),
for (j = 0; j < len; j++) { *(colors + 3 * i + 1), *(colors + 3 * i + 2));
if (pts[j].group != i) continue; for (j = 0; j < len; j++)
{
if (pts[j].group != i)
{
continue;
}
printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2, printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
(pts[j].y - cy) * scale + H / 2); (pts[j].y - cy) * scale + H / 2);
} }
@ -261,7 +307,7 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
free(colors); free(colors);
} }
/*! @fn test /*!
* A function to test the kMeans function * A function to test the kMeans function
* Generates 100000 points in a circle of * Generates 100000 points in a circle of
* radius 20.0 with center at (0,0) * radius 20.0 with center at (0,0)
@ -270,15 +316,19 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
* <img alt="Output for 100000 points divided in 5 clusters" src= * <img alt="Output for 100000 points divided in 5 clusters" src=
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png" * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
* width="400px" height="400px"> * width="400px" height="400px">
* @returns None
*/ */
static void test() { static void test()
{
size_t size = 100000L; size_t size = 100000L;
observation* observations = (observation*)malloc(sizeof(observation) * size); observation* observations =
(observation*)malloc(sizeof(observation) * size);
double maxRadius = 20.00; double maxRadius = 20.00;
double radius = 0; double radius = 0;
double ang = 0; double ang = 0;
size_t i = 0; size_t i = 0;
for (; i < size; i++) { for (; i < size; i++)
{
radius = maxRadius * ((double)rand() / RAND_MAX); radius = maxRadius * ((double)rand() / RAND_MAX);
ang = 2 * M_PI * ((double)rand() / RAND_MAX); ang = 2 * M_PI * ((double)rand() / RAND_MAX);
observations[i].x = radius * cos(ang); observations[i].x = radius * cos(ang);
@ -292,7 +342,7 @@ static void test() {
free(clusters); free(clusters);
} }
/*! @fn test2 /*!
* A function to test the kMeans function * A function to test the kMeans function
* Generates 1000000 points in a circle of * Generates 1000000 points in a circle of
* radius 20.0 with center at (0,0) * radius 20.0 with center at (0,0)
@ -301,15 +351,19 @@ static void test() {
* <img alt="Output for 1000000 points divided in 11 clusters" src= * <img alt="Output for 1000000 points divided in 11 clusters" src=
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png" * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
* width="400px" height="400px"> * width="400px" height="400px">
* @returns None
*/ */
void test2() { void test2()
{
size_t size = 1000000L; size_t size = 1000000L;
observation* observations = (observation*)malloc(sizeof(observation) * size); observation* observations =
(observation*)malloc(sizeof(observation) * size);
double maxRadius = 20.00; double maxRadius = 20.00;
double radius = 0; double radius = 0;
double ang = 0; double ang = 0;
size_t i = 0; size_t i = 0;
for (; i < size; i++) { for (; i < size; i++)
{
radius = maxRadius * ((double)rand() / RAND_MAX); radius = maxRadius * ((double)rand() / RAND_MAX);
ang = 2 * M_PI * ((double)rand() / RAND_MAX); ang = 2 * M_PI * ((double)rand() / RAND_MAX);
observations[i].x = radius * cos(ang); observations[i].x = radius * cos(ang);
@ -323,11 +377,12 @@ void test2() {
free(clusters); free(clusters);
} }
/*! @fn main /*!
* This function calls the test * This function calls the test
* function * function
*/ */
int main() { int main()
{
srand(time(NULL)); srand(time(NULL));
test(); test();
/* test2(); */ /* test2(); */