[PYTHON/TENSORFLOW] K-평균 군집화(K-Means Clustering) 알고리즘 사용하기

■ K-평균 군집화(K-Means Clustering) 알고리즘을 사용하는 방법을 보여준다.

▶ kmean.py


import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import tensorflow as tf

def DisplayPartition(xValueList, yValueList, assignmentValueNDArray):
    """Show a scatter plot of the points, colored by cluster assignment.

    Args:
        xValueList: x coordinates of the points.
        yValueList: y coordinates of the points, parallel to xValueList.
        assignmentValueNDArray: integer cluster index of each point,
            parallel to the coordinate lists.

    The palette holds four colors. Cluster indices are wrapped onto the
    palette, so clusters whose indices differ by a multiple of the palette
    size share a color instead of raising IndexError.
    """
    colorList = ["red", "blue", "green", "yellow"]
    # Map every cluster index to a color name; the modulo keeps indices past
    # the end of the palette valid.
    labelList = [colorList[assignment % len(colorList)] for assignment in assignmentValueNDArray]
    dataFrame = pd.DataFrame(dict(x = xValueList, y = yValueList, color = labelList))
    _, axexSubPlot = pp.subplots()
    axexSubPlot.scatter(dataFrame["x"], dataFrame["y"], c = dataFrame["color"])
    pp.show()

# Driver script: generates random 2-D points, clusters them with a K-means
# graph built on the TensorFlow 1.x API, and plots the result.

vectorCount           = 2000
clusterCount          = 4
sampleCountPerCluster = 500  # Not referenced below; kept so the name stays defined.
stepCount             = 1000
xValueList            = []
yValueList            = []
vectorList            = []

# Generate random data: every point is drawn from one of two normal
# distributions, picked by comparing a uniform random number with 0.5.
for _ in range(vectorCount):
    if np.random.random() > 0.5:
        xValueList.append(np.random.normal(0.4, 0.7))
        yValueList.append(np.random.normal(0.2, 0.8))
    else:
        xValueList.append(np.random.normal(0.6, 0.4))
        yValueList.append(np.random.normal(0.8, 0.5))

vectorList = list(zip(xValueList, yValueList))

# All points as a single (vectorCount, 2) constant.
vectorTensor = tf.constant(vectorList)

vectorListCount = tf.shape(vectorTensor)[0]

# Pick clusterCount distinct points as the initial centroids by shuffling the
# point indices and taking the first clusterCount of them.
randomIndexTensor = tf.random_shuffle(tf.range(0, vectorListCount))

begin = [0,]
size  = [clusterCount,]

centroidIndexTensor     = tf.slice(randomIndexTensor, begin, size)
centroidVariable        = tf.Variable(tf.gather(vectorTensor, centroidIndexTensor))

# Broadcast points (1, N, 2) against centroids (K, 1, 2) to get every
# point-to-centroid difference, then reduce over the coordinate axis.
expandedVectorTensor    = tf.expand_dims(vectorTensor, 0)
expandedCentroidVector  = tf.expand_dims(centroidVariable, 1)
subtractedVectorTensor  = tf.subtract(expandedVectorTensor, expandedCentroidVector)
# Squared distance is enough here: argmin is unaffected by the missing sqrt.
euclideanDistanceTensor = tf.reduce_sum(tf.square(subtractedVectorTensor), 2)
# Index of the nearest centroid for each point.
assignmentTensor        = tf.cast(tf.argmin(euclideanDistanceTensor, 0), tf.int32)

partitionList = tf.dynamic_partition(vectorTensor, assignmentTensor, clusterCount)

# The new centroid of each cluster is the mean of the points assigned to it.
# The means of ALL clusters are stacked into one (clusterCount, 2) tensor and
# written back to the variable, so that each step actually moves the centroids.
# NOTE(review): a cluster that ends up with no points yields a NaN mean;
# confirm whether that case needs handling for the data used.
meanTensorList = [tf.expand_dims(tf.reduce_mean(partition, 0), 0) for partition in partitionList]
updatedCentroidTensor = tf.assign(centroidVariable, tf.concat(meanTensorList, 0))

initializeOperation = tf.global_variables_initializer()

# The context manager releases the session's resources when the work is done.
with tf.Session() as sess:
    sess.run(initializeOperation)

    for _ in range(stepCount):
        sess.run(updatedCentroidTensor)

    # Fetch the results in a separate run so they reflect the final centroids
    # rather than racing with the update op inside the same run call.
    centroidValueNDArray, assignmentValueNDArray = sess.run([centroidVariable, assignmentTensor])

DisplayPartition(xValueList, yValueList, assignmentValueNDArray)

pp.plot(xValueList, yValueList, "o", label = "Input Data")
pp.legend()
pp.show()

import matplotlib.pyplot as pp

import numpy as np

import pandas as pd

import tensorflow as tf

def DisplayPartition(xValueList, yValueList, assignmentValueNDArray):
    """Show a scatter plot of the points, colored by cluster assignment.

    Args:
        xValueList: x coordinates of the points.
        yValueList: y coordinates of the points, parallel to xValueList.
        assignmentValueNDArray: integer cluster index of each point,
            parallel to the coordinate lists.

    The palette holds four colors. Cluster indices are wrapped onto the
    palette, so clusters whose indices differ by a multiple of the palette
    size share a color instead of raising IndexError.
    """
    colorList = ["red", "blue", "green", "yellow"]
    # Map every cluster index to a color name; the modulo keeps indices past
    # the end of the palette valid.
    labelList = [colorList[assignment % len(colorList)] for assignment in assignmentValueNDArray]
    dataFrame = pd.DataFrame(dict(x = xValueList, y = yValueList, color = labelList))
    _, axexSubPlot = pp.subplots()
    axexSubPlot.scatter(dataFrame["x"], dataFrame["y"], c = dataFrame["color"])
    pp.show()

# Driver script: generates random 2-D points, clusters them with a K-means
# graph built on the TensorFlow 1.x API, and plots the result.

vectorCount = 2000
clusterCount = 4
sampleCountPerCluster = 500  # Not referenced below; kept so the name stays defined.
stepCount = 1000
xValueList = []
yValueList = []
vectorList = []

# Generate random data: every point is drawn from one of two normal
# distributions, picked by comparing a uniform random number with 0.5.
for _ in range(vectorCount):
    if np.random.random() > 0.5:
        xValueList.append(np.random.normal(0.4, 0.7))
        yValueList.append(np.random.normal(0.2, 0.8))
    else:
        xValueList.append(np.random.normal(0.6, 0.4))
        yValueList.append(np.random.normal(0.8, 0.5))

vectorList = list(zip(xValueList, yValueList))

# All points as a single (vectorCount, 2) constant.
vectorTensor = tf.constant(vectorList)

vectorListCount = tf.shape(vectorTensor)[0]

# Pick clusterCount distinct points as the initial centroids by shuffling the
# point indices and taking the first clusterCount of them.
randomIndexTensor = tf.random_shuffle(tf.range(0, vectorListCount))

begin = [0,]
size = [clusterCount,]

centroidIndexTensor = tf.slice(randomIndexTensor, begin, size)
centroidVariable = tf.Variable(tf.gather(vectorTensor, centroidIndexTensor))

# Broadcast points (1, N, 2) against centroids (K, 1, 2) to get every
# point-to-centroid difference, then reduce over the coordinate axis.
expandedVectorTensor = tf.expand_dims(vectorTensor, 0)
expandedCentroidVector = tf.expand_dims(centroidVariable, 1)
subtractedVectorTensor = tf.subtract(expandedVectorTensor, expandedCentroidVector)
# Squared distance is enough here: argmin is unaffected by the missing sqrt.
euclideanDistanceTensor = tf.reduce_sum(tf.square(subtractedVectorTensor), 2)
# Index of the nearest centroid for each point.
assignmentTensor = tf.cast(tf.argmin(euclideanDistanceTensor, 0), tf.int32)

partitionList = tf.dynamic_partition(vectorTensor, assignmentTensor, clusterCount)

# The new centroid of each cluster is the mean of the points assigned to it.
# The means of ALL clusters are stacked into one (clusterCount, 2) tensor and
# written back to the variable, so that each step actually moves the centroids.
# NOTE(review): a cluster that ends up with no points yields a NaN mean;
# confirm whether that case needs handling for the data used.
meanTensorList = [tf.expand_dims(tf.reduce_mean(partition, 0), 0) for partition in partitionList]
updatedCentroidTensor = tf.assign(centroidVariable, tf.concat(meanTensorList, 0))

initializeOperation = tf.global_variables_initializer()

# The context manager releases the session's resources when the work is done.
with tf.Session() as sess:
    sess.run(initializeOperation)

    for _ in range(stepCount):
        sess.run(updatedCentroidTensor)

    # Fetch the results in a separate run so they reflect the final centroids
    # rather than racing with the update op inside the same run call.
    centroidValueNDArray, assignmentValueNDArray = sess.run([centroidVariable, assignmentTensor])

DisplayPartition(xValueList, yValueList, assignmentValueNDArray)

pp.plot(xValueList, yValueList, "o", label = "Input Data")
pp.legend()
pp.show()