2018-10-18 05:28:57 +08:00
|
|
|
from random import shuffle
|
2023-04-02 01:43:11 +08:00
|
|
|
|
|
|
|
import tensorflow as tf
|
2017-07-30 03:12:32 +08:00
|
|
|
from numpy import array
|
|
|
|
|
|
|
|
|
2023-04-02 01:43:11 +08:00
|
|
|
def tf_k_means_cluster(vectors, noofclusters, noofiterations=100):
    """
    K-Means Clustering using TensorFlow (1.x graph/session API).

    'vectors' should be a n*k 2-D NumPy array, where n is the number
    of vectors of dimensionality k.
    'noofclusters' should be an integer smaller than n.
    'noofiterations' is the fixed number of Expectation-Maximization rounds
    to run; no convergence criterion is evaluated.

    Returns a tuple (centroids, assignments): the final values of the
    centroid Variables (one per cluster) and of the assignment Variables
    (one cluster index per input vector), as evaluated by the session.

    Raises AssertionError if 'noofclusters' is not smaller than the number
    of vectors.
    """
    noofclusters = int(noofclusters)
    # Raised explicitly (instead of using an ``assert`` statement) so the
    # check is not stripped when Python runs with -O; the exception type is
    # kept for callers that already rely on it.
    if not noofclusters < len(vectors):
        raise AssertionError(
            "noofclusters must be smaller than the number of vectors"
        )

    # Find out the dimensionality
    dim = len(vectors[0])

    # Will help select random centroids from among the available vectors
    vector_indices = list(range(len(vectors)))
    shuffle(vector_indices)

    # GRAPH OF COMPUTATION
    # We initialize a new graph and set it as the default during each run
    # of this algorithm. This ensures that as this function is called
    # multiple times, the default graph doesn't keep getting crowded with
    # unused ops and Variables from previous function calls.
    graph = tf.Graph()

    # SESSION OF COMPUTATION
    # The session is used as a context manager so that its resources are
    # released when the function returns or raises.
    with graph.as_default(), tf.Session() as sess:
        ## CONSTRUCTING THE ELEMENTS OF COMPUTATION

        ## First lets ensure we have a Variable vector for each centroid,
        ## initialized to one of the vectors from the available data points.
        ## The initial value is converted to float64 so that the Variable's
        ## dtype always matches the float64 placeholder assigned to it below.
        centroids = [
            tf.Variable(array(vectors[vector_indices[i]], dtype="float64"))
            for i in range(noofclusters)
        ]
        ## These nodes will assign the centroid Variables the appropriate
        ## values
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = [
            tf.assign(centroid, centroid_value) for centroid in centroids
        ]

        ## Variables for cluster assignments of individual vectors
        ## (initialized to 0 at first)
        assignments = [tf.Variable(0) for _ in range(len(vectors))]
        ## These nodes will assign an assignment Variable the appropriate
        ## value
        assignment_value = tf.placeholder("int32")
        cluster_assigns = [
            tf.assign(assignment, assignment_value)
            for assignment in assignments
        ]

        ## Now lets construct the node that will compute the mean
        # The placeholder for the input
        mean_input = tf.placeholder("float", [None, dim])
        # The Node/op takes the input and computes a mean along the 0th
        # dimension, i.e. the list of input vectors
        mean_op = tf.reduce_mean(mean_input, 0)

        ## Node for computing Euclidean distances
        # Placeholders for input
        v1 = tf.placeholder("float", [dim])
        v2 = tf.placeholder("float", [dim])
        # tf.subtract is the current name of the op formerly called tf.sub
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2)))

        ## This node will figure out which cluster to assign a vector to,
        ## based on Euclidean distances of the vector from the centroids.
        # Placeholder for input
        centroid_distances = tf.placeholder("float", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)

        ## INITIALIZING STATE VARIABLES

        ## This will help initialization of all Variables defined with respect
        ## to the graph. The Variable-initializer should be defined after
        ## all the Variables have been constructed, so that each of them
        ## will be included in the initialization.
        init_op = tf.global_variables_initializer()

        # Initialize all variables
        sess.run(init_op)

        ## CLUSTERING ITERATIONS

        # Now perform the Expectation-Maximization steps of K-Means
        # clustering. To keep things simple, we will only do a set number of
        # iterations, instead of using a Stopping Criterion.
        for _ in range(noofiterations):
            ## EXPECTATION STEP
            ## Based on the centroid locations till last iteration, compute
            ## the _expected_ centroid assignments.

            # The centroids do not change during this step, so read their
            # values once instead of once per (vector, centroid) pair.
            centroid_locations = sess.run(centroids)

            # Iterate over each vector
            for vector_n, vect in enumerate(vectors):
                # Compute Euclidean distance between this vector and each
                # centroid. Remember that this list cannot be named
                # 'centroid_distances', since that is the input to the
                # cluster assignment node.
                distances = [
                    sess.run(euclid_dist, feed_dict={v1: vect, v2: location})
                    for location in centroid_locations
                ]
                # Now use the cluster assignment node, with the distances
                # as the input
                assignment = sess.run(
                    cluster_assignment,
                    feed_dict={centroid_distances: distances},
                )
                # Now assign the value to the appropriate state variable
                sess.run(
                    cluster_assigns[vector_n],
                    feed_dict={assignment_value: assignment},
                )

            ## MAXIMIZATION STEP
            # Based on the expected state computed from the Expectation Step,
            # move each centroid to the mean of its assigned vectors, which
            # minimizes the within-cluster Sum-of-Squares.

            # The assignments do not change during this step, so read them
            # once instead of once per (vector, cluster) pair.
            current_assignments = sess.run(assignments)

            for cluster_n in range(noofclusters):
                # Collect all the vectors assigned to this cluster
                assigned_vects = [
                    vect
                    for vect, owner in zip(vectors, current_assignments)
                    if owner == cluster_n
                ]
                # A cluster that lost all of its vectors has no mean; keep
                # its previous centroid rather than feeding an empty array
                # into the [None, dim] placeholder.
                if not assigned_vects:
                    continue
                # Compute new centroid location
                new_location = sess.run(
                    mean_op, feed_dict={mean_input: array(assigned_vects)}
                )
                # Assign value to appropriate variable
                sess.run(
                    cent_assigns[cluster_n],
                    feed_dict={centroid_value: new_location},
                )

        # Return centroids and assignments
        final_centroids = sess.run(centroids)
        final_assignments = sess.run(assignments)
        return final_centroids, final_assignments
|