mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
0d01a4a0aa
K-means clustering implemented with TensorFlow, Google's widely used machine learning library.
142 lines
5.8 KiB
Python
142 lines
5.8 KiB
Python
import tensorflow as tf
|
|
from random import choice, shuffle
|
|
from numpy import array
|
|
|
|
|
|
def TFKMeansCluster(vectors, noofclusters, noofiterations=100):
    """
    K-Means Clustering using TensorFlow.

    Built on TensorFlow's graph-mode API (tf.Graph, tf.Session,
    tf.placeholder). A fresh graph is created on every call so that repeated
    calls do not crowd the default graph with stale ops and Variables, and
    the session is closed before the function returns.

    'vectors' should be a n*k 2-D NumPy array, where n is the number
    of vectors of dimensionality k. Any numeric 2-D array-like is accepted;
    it is copied into a float64 array internally.
    'noofclusters' should be an integer with 0 < noofclusters < n.
    'noofiterations' is the fixed number of Expectation/Maximization passes
    to run (default 100); no convergence-based stopping criterion is used.

    Returns a tuple (centroids, assignments): 'centroids' is a list of
    'noofclusters' arrays of length k, and 'assignments' is a list of n
    integer cluster indices, one per input vector.

    Raises AssertionError if 'noofclusters' is not in the valid range.
    """
    noofclusters = int(noofclusters)
    assert 0 < noofclusters < len(vectors), (
        "noofclusters must be positive and smaller than the number of vectors"
    )

    # Normalise to float64 so the centroid Variables always match the
    # float64 placeholder used to update them, whatever dtype the caller
    # passed in (ints, float32, nested lists, ...).
    vectors = array(vectors, dtype="float64")
    n_vectors = len(vectors)

    # Find out the dimensionality
    dim = len(vectors[0])

    # Will help select random centroids from among the available vectors
    vector_indices = list(range(n_vectors))
    shuffle(vector_indices)

    # GRAPH OF COMPUTATION
    # A new graph per call keeps the default graph free of leftovers from
    # previous calls. The session is bound to that graph explicitly and is
    # used as a context manager so its resources are always released.
    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        # CONSTRUCTING THE ELEMENTS OF COMPUTATION

        # One Variable per centroid, initialised to a randomly chosen
        # data point.
        centroids = [
            tf.Variable(vectors[vector_indices[i]])
            for i in range(noofclusters)
        ]
        # Nodes that overwrite a centroid Variable with a fed value.
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = [
            tf.assign(centroid, centroid_value) for centroid in centroids
        ]

        # One Variable per input vector holding its cluster index
        # (initialised to 0).
        assignments = [tf.Variable(0) for _ in range(n_vectors)]
        # Nodes that overwrite an assignment Variable with a fed value.
        assignment_value = tf.placeholder("int32")
        cluster_assigns = [
            tf.assign(assignment, assignment_value)
            for assignment in assignments
        ]

        # Node computing the mean along the 0th dimension, i.e. over the
        # list of vectors fed in.
        mean_input = tf.placeholder("float", [None, dim])
        mean_op = tf.reduce_mean(mean_input, 0)

        # Node computing the Euclidean distance between two fed vectors.
        # tf.subtract is the current name of the op formerly called tf.sub.
        v1 = tf.placeholder("float", [dim])
        v2 = tf.placeholder("float", [dim])
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2)))

        # Node picking the index of the nearest centroid from a fed list
        # of distances.
        centroid_distances = tf.placeholder("float", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)

        # INITIALIZING STATE VARIABLES
        # The initializer must be created after every Variable above has
        # been constructed so that all of them are included.
        sess.run(tf.global_variables_initializer())

        # CLUSTERING ITERATIONS
        # A fixed number of Expectation-Maximization passes is performed
        # instead of using a stopping criterion.
        for _ in range(noofiterations):
            # EXPECTATION STEP
            # Centroids do not move during this step, so read their values
            # once instead of once per (vector, centroid) pair.
            centroid_values = sess.run(centroids)
            for vector_n, vect in enumerate(vectors):
                # This list must not be named 'centroid_distances', since
                # that is the input placeholder of the assignment node.
                distances = [
                    sess.run(euclid_dist, feed_dict={v1: vect, v2: location})
                    for location in centroid_values
                ]
                assignment = sess.run(
                    cluster_assignment,
                    feed_dict={centroid_distances: distances},
                )
                sess.run(
                    cluster_assigns[vector_n],
                    feed_dict={assignment_value: assignment},
                )

            # MAXIMIZATION STEP
            # Assignments are fixed during this step, so fetch them once.
            current_assignments = sess.run(assignments)
            for cluster_n in range(noofclusters):
                # Collect all the vectors assigned to this cluster
                assigned_vects = [
                    vect
                    for vect, owner in zip(vectors, current_assignments)
                    if owner == cluster_n
                ]
                if not assigned_vects:
                    # An empty cluster has no mean (and an empty array does
                    # not fit the [None, dim] placeholder); keep the
                    # centroid where it is.
                    continue
                # Compute new centroid location
                new_location = sess.run(
                    mean_op, feed_dict={mean_input: array(assigned_vects)}
                )
                # Assign value to appropriate variable
                sess.run(
                    cent_assigns[cluster_n],
                    feed_dict={centroid_value: new_location},
                )

        # Return centroids and assignments as plain evaluated values
        centroids = sess.run(centroids)
        assignments = sess.run(assignments)
        return centroids, assignments
|
|
|