mirror of
https://github.moeyy.xyz/https://github.com/TheAlgorithms/C.git
synced 2023-10-11 15:56:24 +08:00
[doc fix] fix documentations in k_means (#592)
* fix documentations
* clang-tidy fixes for 814f9077b7
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
This commit is contained in:
parent
05d9af45f3
commit
0b426c0124
@ -35,7 +35,8 @@
|
|||||||
* the name observation is used to denote
|
* the name observation is used to denote
|
||||||
* a random point in plane
|
* a random point in plane
|
||||||
*/
|
*/
|
||||||
typedef struct observation {
|
typedef struct observation
|
||||||
|
{
|
||||||
double x; /**< abscissa of 2D data point */
|
double x; /**< abscissa of 2D data point */
|
||||||
double y; /**< ordinate of 2D data point */
|
double y; /**< ordinate of 2D data point */
|
||||||
int group; /**< the group no in which this observation would go */
|
int group; /**< the group no in which this observation would go */
|
||||||
@ -48,13 +49,14 @@ typedef struct observation {
|
|||||||
* stores the count of observations
|
* stores the count of observations
|
||||||
* belonging to this cluster
|
* belonging to this cluster
|
||||||
*/
|
*/
|
||||||
typedef struct cluster {
|
typedef struct cluster
|
||||||
|
{
|
||||||
double x; /**< abscissa centroid of this cluster */
|
double x; /**< abscissa centroid of this cluster */
|
||||||
double y; /**< ordinate of centroid of this cluster */
|
double y; /**< ordinate of centroid of this cluster */
|
||||||
size_t count; /**< count of observations present in this cluster */
|
size_t count; /**< count of observations present in this cluster */
|
||||||
} cluster;
|
} cluster;
|
||||||
|
|
||||||
/*! @fn calculateNearest
|
/*!
|
||||||
* Returns the index of centroid nearest to
|
* Returns the index of centroid nearest to
|
||||||
* given observation
|
* given observation
|
||||||
*
|
*
|
||||||
@ -64,16 +66,19 @@ typedef struct cluster {
|
|||||||
*
|
*
|
||||||
* @returns the index of nearest centroid for given observation
|
* @returns the index of nearest centroid for given observation
|
||||||
*/
|
*/
|
||||||
int calculateNearst(observation* o, cluster clusters[], int k) {
|
int calculateNearst(observation* o, cluster clusters[], int k)
|
||||||
|
{
|
||||||
double minD = DBL_MAX;
|
double minD = DBL_MAX;
|
||||||
double dist = 0;
|
double dist = 0;
|
||||||
int index = -1;
|
int index = -1;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < k; i++) {
|
for (; i < k; i++)
|
||||||
|
{
|
||||||
/* Calculate Squared Distance*/
|
/* Calculate Squared Distance*/
|
||||||
dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
|
dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
|
||||||
(clusters[i].y - o->y) * (clusters[i].y - o->y);
|
(clusters[i].y - o->y) * (clusters[i].y - o->y);
|
||||||
if (dist < minD) {
|
if (dist < minD)
|
||||||
|
{
|
||||||
minD = dist;
|
minD = dist;
|
||||||
index = i;
|
index = i;
|
||||||
}
|
}
|
||||||
@ -81,7 +86,7 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
|
|||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! @fn calculateCentroid
|
/*!
|
||||||
* Calculate centroid and assign it to the cluster variable
|
* Calculate centroid and assign it to the cluster variable
|
||||||
*
|
*
|
||||||
* @param observations an array of observations whose centroid is calculated
|
* @param observations an array of observations whose centroid is calculated
|
||||||
@ -90,12 +95,14 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
|
|||||||
* centroid
|
* centroid
|
||||||
*/
|
*/
|
||||||
void calculateCentroid(observation observations[], size_t size,
|
void calculateCentroid(observation observations[], size_t size,
|
||||||
cluster* centroid) {
|
cluster* centroid)
|
||||||
|
{
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
centroid->x = 0;
|
centroid->x = 0;
|
||||||
centroid->y = 0;
|
centroid->y = 0;
|
||||||
centroid->count = size;
|
centroid->count = size;
|
||||||
for (; i < size; i++) {
|
for (; i < size; i++)
|
||||||
|
{
|
||||||
centroid->x += observations[i].x;
|
centroid->x += observations[i].x;
|
||||||
centroid->y += observations[i].y;
|
centroid->y += observations[i].y;
|
||||||
observations[i].group = 0;
|
observations[i].group = 0;
|
||||||
@ -104,7 +111,7 @@ void calculateCentroid(observation observations[], size_t size,
|
|||||||
centroid->y /= centroid->count;
|
centroid->y /= centroid->count;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! @fn kMeans
|
/*!
|
||||||
* --K Means Algorithm--
|
* --K Means Algorithm--
|
||||||
* 1. Assign each observation to one of k groups
|
* 1. Assign each observation to one of k groups
|
||||||
* creating a random initial clustering
|
* creating a random initial clustering
|
||||||
@ -117,15 +124,18 @@ void calculateCentroid(observation observations[], size_t size,
|
|||||||
* 5. Repeat step 2,3,4 until there is no change
|
* 5. Repeat step 2,3,4 until there is no change
|
||||||
* the current clustering and is same as last
|
* the current clustering and is same as last
|
||||||
* clustering.
|
* clustering.
|
||||||
|
*
|
||||||
* @param observations an array of observations to cluster
|
* @param observations an array of observations to cluster
|
||||||
* @param size size of observations array
|
* @param size size of observations array
|
||||||
* @param k no of clusters to be made
|
* @param k no of clusters to be made
|
||||||
*
|
*
|
||||||
* @returns pointer to cluster object
|
* @returns pointer to cluster object
|
||||||
*/
|
*/
|
||||||
cluster* kMeans(observation observations[], size_t size, int k) {
|
cluster* kMeans(observation observations[], size_t size, int k)
|
||||||
|
{
|
||||||
cluster* clusters = NULL;
|
cluster* clusters = NULL;
|
||||||
if (k <= 1) {
|
if (k <= 1)
|
||||||
|
{
|
||||||
/*
|
/*
|
||||||
If we have to cluster them only in one group
|
If we have to cluster them only in one group
|
||||||
then calculate centroid of observations and
|
then calculate centroid of observations and
|
||||||
@ -134,53 +144,66 @@ cluster* kMeans(observation observations[], size_t size, int k) {
|
|||||||
clusters = (cluster*)malloc(sizeof(cluster));
|
clusters = (cluster*)malloc(sizeof(cluster));
|
||||||
memset(clusters, 0, sizeof(cluster));
|
memset(clusters, 0, sizeof(cluster));
|
||||||
calculateCentroid(observations, size, clusters);
|
calculateCentroid(observations, size, clusters);
|
||||||
} else if (k < size) {
|
}
|
||||||
|
else if (k < size)
|
||||||
|
{
|
||||||
clusters = malloc(sizeof(cluster) * k);
|
clusters = malloc(sizeof(cluster) * k);
|
||||||
memset(clusters, 0, k * sizeof(cluster));
|
memset(clusters, 0, k * sizeof(cluster));
|
||||||
/* STEP 1 */
|
/* STEP 1 */
|
||||||
for (size_t j = 0; j < size; j++) {
|
for (size_t j = 0; j < size; j++)
|
||||||
|
{
|
||||||
observations[j].group = rand() % k;
|
observations[j].group = rand() % k;
|
||||||
}
|
}
|
||||||
size_t changed = 0;
|
size_t changed = 0;
|
||||||
size_t minAcceptedError =
|
size_t minAcceptedError =
|
||||||
size / 10000; // Do until 99.99 percent points are in correct cluster
|
size /
|
||||||
|
10000; // Do until 99.99 percent points are in correct cluster
|
||||||
int t = 0;
|
int t = 0;
|
||||||
do {
|
do
|
||||||
|
{
|
||||||
/* Initialize clusters */
|
/* Initialize clusters */
|
||||||
for (int i = 0; i < k; i++) {
|
for (int i = 0; i < k; i++)
|
||||||
|
{
|
||||||
clusters[i].x = 0;
|
clusters[i].x = 0;
|
||||||
clusters[i].y = 0;
|
clusters[i].y = 0;
|
||||||
clusters[i].count = 0;
|
clusters[i].count = 0;
|
||||||
}
|
}
|
||||||
/* STEP 2*/
|
/* STEP 2*/
|
||||||
for (size_t j = 0; j < size; j++) {
|
for (size_t j = 0; j < size; j++)
|
||||||
|
{
|
||||||
t = observations[j].group;
|
t = observations[j].group;
|
||||||
clusters[t].x += observations[j].x;
|
clusters[t].x += observations[j].x;
|
||||||
clusters[t].y += observations[j].y;
|
clusters[t].y += observations[j].y;
|
||||||
clusters[t].count++;
|
clusters[t].count++;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < k; i++) {
|
for (int i = 0; i < k; i++)
|
||||||
|
{
|
||||||
clusters[i].x /= clusters[i].count;
|
clusters[i].x /= clusters[i].count;
|
||||||
clusters[i].y /= clusters[i].count;
|
clusters[i].y /= clusters[i].count;
|
||||||
}
|
}
|
||||||
/* STEP 3 and 4 */
|
/* STEP 3 and 4 */
|
||||||
changed = 0; // this variable stores change in clustering
|
changed = 0; // this variable stores change in clustering
|
||||||
for (size_t j = 0; j < size; j++) {
|
for (size_t j = 0; j < size; j++)
|
||||||
|
{
|
||||||
t = calculateNearst(observations + j, clusters, k);
|
t = calculateNearst(observations + j, clusters, k);
|
||||||
if (t != observations[j].group) {
|
if (t != observations[j].group)
|
||||||
|
{
|
||||||
changed++;
|
changed++;
|
||||||
observations[j].group = t;
|
observations[j].group = t;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (changed > minAcceptedError); // Keep on grouping until we have
|
} while (changed > minAcceptedError); // Keep on grouping until we have
|
||||||
// got almost best clustering
|
// got almost best clustering
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
/* If no of clusters is more than observations
|
/* If no of clusters is more than observations
|
||||||
each observation can be its own cluster
|
each observation can be its own cluster
|
||||||
*/
|
*/
|
||||||
clusters = (cluster*)malloc(sizeof(cluster) * k);
|
clusters = (cluster*)malloc(sizeof(cluster) * k);
|
||||||
memset(clusters, 0, k * sizeof(cluster));
|
memset(clusters, 0, k * sizeof(cluster));
|
||||||
for (int j = 0; j < size; j++) {
|
for (int j = 0; j < size; j++)
|
||||||
|
{
|
||||||
clusters[j].x = observations[j].x;
|
clusters[j].x = observations[j].x;
|
||||||
clusters[j].y = observations[j].y;
|
clusters[j].y = observations[j].y;
|
||||||
clusters[j].count = 1;
|
clusters[j].count = 1;
|
||||||
@ -195,21 +218,24 @@ cluster* kMeans(observation observations[], size_t size, int k) {
|
|||||||
* @}
|
* @}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*! @fn printEPS
|
/*!
|
||||||
* A function to print observations and clusters
|
* A function to print observations and clusters
|
||||||
* The code is taken from
|
* The code is taken from
|
||||||
* @link http://rosettacode.org/wiki/K-means%2B%2B_clustering
|
* http://rosettacode.org/wiki/K-means%2B%2B_clustering.
|
||||||
* its C implementation
|
|
||||||
* Even the K Means code is also inspired from it
|
* Even the K Means code is also inspired from it
|
||||||
*
|
*
|
||||||
* Note: To print in a file use pipeline operator ( ./a.out > image.eps )
|
* @note To print in a file use pipeline operator
|
||||||
|
* ```sh
|
||||||
|
* ./k_means_clustering > image.eps
|
||||||
|
* ```
|
||||||
*
|
*
|
||||||
* @param observations observations array
|
* @param observations observations array
|
||||||
* @param len size of observation array
|
* @param len size of observation array
|
||||||
* @param cent clusters centroid's array
|
* @param cent clusters centroid's array
|
||||||
* @param k size of cent array
|
* @param k size of cent array
|
||||||
*/
|
*/
|
||||||
void printEPS(observation pts[], size_t len, cluster cent[], int k) {
|
void printEPS(observation pts[], size_t len, cluster cent[], int k)
|
||||||
|
{
|
||||||
int W = 400, H = 400;
|
int W = 400, H = 400;
|
||||||
double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
|
double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
|
||||||
double scale = 0, cx = 0, cy = 0;
|
double scale = 0, cx = 0, cy = 0;
|
||||||
@ -217,20 +243,35 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
|
|||||||
int i;
|
int i;
|
||||||
size_t j;
|
size_t j;
|
||||||
double kd = k * 1.0;
|
double kd = k * 1.0;
|
||||||
for (i = 0; i < k; i++) {
|
for (i = 0; i < k; i++)
|
||||||
|
{
|
||||||
*(colors + 3 * i) = (3 * (i + 1) % k) / kd;
|
*(colors + 3 * i) = (3 * (i + 1) % k) / kd;
|
||||||
*(colors + 3 * i + 1) = (7 * i % k) / kd;
|
*(colors + 3 * i + 1) = (7 * i % k) / kd;
|
||||||
*(colors + 3 * i + 2) = (9 * i % k) / kd;
|
*(colors + 3 * i + 2) = (9 * i % k) / kd;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 0; j < len; j++) {
|
for (j = 0; j < len; j++)
|
||||||
if (max_x < pts[j].x) max_x = pts[j].x;
|
{
|
||||||
if (min_x > pts[j].x) min_x = pts[j].x;
|
if (max_x < pts[j].x)
|
||||||
if (max_y < pts[j].y) max_y = pts[j].y;
|
{
|
||||||
if (min_y > pts[j].y) min_y = pts[j].y;
|
max_x = pts[j].x;
|
||||||
|
}
|
||||||
|
if (min_x > pts[j].x)
|
||||||
|
{
|
||||||
|
min_x = pts[j].x;
|
||||||
|
}
|
||||||
|
if (max_y < pts[j].y)
|
||||||
|
{
|
||||||
|
max_y = pts[j].y;
|
||||||
|
}
|
||||||
|
if (min_y > pts[j].y)
|
||||||
|
{
|
||||||
|
min_y = pts[j].y;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
scale = W / (max_x - min_x);
|
scale = W / (max_x - min_x);
|
||||||
if (scale > (H / (max_y - min_y))) {
|
if (scale > (H / (max_y - min_y)))
|
||||||
|
{
|
||||||
scale = H / (max_y - min_y);
|
scale = H / (max_y - min_y);
|
||||||
};
|
};
|
||||||
cx = (max_x + min_x) / 2;
|
cx = (max_x + min_x) / 2;
|
||||||
@ -244,11 +285,16 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
|
|||||||
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
|
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
|
||||||
" gsave 1 setgray fill grestore gsave 3 setlinewidth"
|
" gsave 1 setgray fill grestore gsave 3 setlinewidth"
|
||||||
" 1 setgray stroke grestore 0 setgray stroke }def\n");
|
" 1 setgray stroke grestore 0 setgray stroke }def\n");
|
||||||
for (int i = 0; i < k; i++) {
|
for (int i = 0; i < k; i++)
|
||||||
printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1),
|
{
|
||||||
*(colors + 3 * i + 2));
|
printf("%g %g %g setrgbcolor\n", *(colors + 3 * i),
|
||||||
for (j = 0; j < len; j++) {
|
*(colors + 3 * i + 1), *(colors + 3 * i + 2));
|
||||||
if (pts[j].group != i) continue;
|
for (j = 0; j < len; j++)
|
||||||
|
{
|
||||||
|
if (pts[j].group != i)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
|
printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
|
||||||
(pts[j].y - cy) * scale + H / 2);
|
(pts[j].y - cy) * scale + H / 2);
|
||||||
}
|
}
|
||||||
@ -261,7 +307,7 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
|
|||||||
free(colors);
|
free(colors);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! @fn test
|
/*!
|
||||||
* A function to test the kMeans function
|
* A function to test the kMeans function
|
||||||
* Generates 100000 points in a circle of
|
* Generates 100000 points in a circle of
|
||||||
* radius 20.0 with center at (0,0)
|
* radius 20.0 with center at (0,0)
|
||||||
@ -270,15 +316,19 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
|
|||||||
* <img alt="Output for 100000 points divided in 5 clusters" src=
|
* <img alt="Output for 100000 points divided in 5 clusters" src=
|
||||||
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
|
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
|
||||||
* width="400px" height="400px">
|
* width="400px" height="400px">
|
||||||
|
* @returns None
|
||||||
*/
|
*/
|
||||||
static void test() {
|
static void test()
|
||||||
|
{
|
||||||
size_t size = 100000L;
|
size_t size = 100000L;
|
||||||
observation* observations = (observation*)malloc(sizeof(observation) * size);
|
observation* observations =
|
||||||
|
(observation*)malloc(sizeof(observation) * size);
|
||||||
double maxRadius = 20.00;
|
double maxRadius = 20.00;
|
||||||
double radius = 0;
|
double radius = 0;
|
||||||
double ang = 0;
|
double ang = 0;
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
for (; i < size; i++) {
|
for (; i < size; i++)
|
||||||
|
{
|
||||||
radius = maxRadius * ((double)rand() / RAND_MAX);
|
radius = maxRadius * ((double)rand() / RAND_MAX);
|
||||||
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
|
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
|
||||||
observations[i].x = radius * cos(ang);
|
observations[i].x = radius * cos(ang);
|
||||||
@ -292,7 +342,7 @@ static void test() {
|
|||||||
free(clusters);
|
free(clusters);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! @fn test2
|
/*!
|
||||||
* A function to test the kMeans function
|
* A function to test the kMeans function
|
||||||
* Generates 1000000 points in a circle of
|
* Generates 1000000 points in a circle of
|
||||||
* radius 20.0 with center at (0,0)
|
* radius 20.0 with center at (0,0)
|
||||||
@ -301,15 +351,19 @@ static void test() {
|
|||||||
* <img alt="Output for 1000000 points divided in 11 clusters" src=
|
* <img alt="Output for 1000000 points divided in 11 clusters" src=
|
||||||
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
|
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
|
||||||
* width="400px" height="400px">
|
* width="400px" height="400px">
|
||||||
|
* @returns None
|
||||||
*/
|
*/
|
||||||
void test2() {
|
void test2()
|
||||||
|
{
|
||||||
size_t size = 1000000L;
|
size_t size = 1000000L;
|
||||||
observation* observations = (observation*)malloc(sizeof(observation) * size);
|
observation* observations =
|
||||||
|
(observation*)malloc(sizeof(observation) * size);
|
||||||
double maxRadius = 20.00;
|
double maxRadius = 20.00;
|
||||||
double radius = 0;
|
double radius = 0;
|
||||||
double ang = 0;
|
double ang = 0;
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
for (; i < size; i++) {
|
for (; i < size; i++)
|
||||||
|
{
|
||||||
radius = maxRadius * ((double)rand() / RAND_MAX);
|
radius = maxRadius * ((double)rand() / RAND_MAX);
|
||||||
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
|
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
|
||||||
observations[i].x = radius * cos(ang);
|
observations[i].x = radius * cos(ang);
|
||||||
@ -323,11 +377,12 @@ void test2() {
|
|||||||
free(clusters);
|
free(clusters);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! @fn main
|
/*!
|
||||||
* This function calls the test
|
* This function calls the test
|
||||||
* function
|
* function
|
||||||
*/
|
*/
|
||||||
int main() {
|
int main()
|
||||||
|
{
|
||||||
srand(time(NULL));
|
srand(time(NULL));
|
||||||
test();
|
test();
|
||||||
/* test2(); */
|
/* test2(); */
|
||||||
|
Loading…
Reference in New Issue
Block a user