K-means algorithm 구현하기¶

by JunPyo Park

sklearn package 에 내장되어 있는 국민 데이터셋인 Iris 데이터셋을 사용하여 k-means 알고리즘을 구현해 보았습니다.
k=3 인 경우에 해당하는 코드를 먼저 작성하였고 그 다음 임의의 k 값에 대해 작동하도록 함수를 만들어 보았습니다.
함수 내부에서 Jagota index를 계산하여 k에 따른 Jagota index의 변화를 마지막에 출력하였습니다.

Jagota Index?¶

import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
import pandas as pd
from sklearn import datasets
from jupyterthemes import jtplot
import matplotlib.pyplot as plt
jtplot.style('grade3')

Load Data¶

iris = datasets.load_iris() # load iris dataset

X = iris.data[:, :2] # select two features
X.shape

(150, 2)

Data Visualization¶

trace = go.Scatter(
    x = X[:,0],
    y = X[:,1],
    name = 'Data_Point',
    mode = 'markers'
)

data = [trace]
fig = dict(data=data)
py.iplot(fig, filename='basic-scatter')

Choose Initial Random Points (For the Case k=3)¶

# The number of clusters and data
k = 3
m = X.shape[0]

# ramdomly initialize mean points
mu = X[np.random.randint(0,m,k),:]
pre_mu = mu.copy()

print("Initial 3 Random Point")
print(mu)

Initial 3 Random Point
[[ 5.9  3. ]
 [ 5.4  3.9]
 [ 5.7  3. ]]

trace1 = go.Scatter(
    x = mu[:,0],
    y = mu[:,1],
    name = 'Initial_Random_Point',
    mode = 'markers',
    marker = dict(
        size = 10,
        color = 'rgba(255, 182, 193, .9)',
        line = dict(
            width = 2,
        )
    )
)

data = [trace, trace1]

fig = dict(data=data)
py.iplot(fig, filename='add_initial_point')

Running K-means Algorithm¶

y = np.empty([m,1])
tolerance = 1e-15 # Setting the tolerance(stopping criteria), I arbitrarily put this value
iter_num = 0

while True:
    iter_num += 1
    for i in range(m):
        d0 = np.linalg.norm(X[i,:] - mu[0,:],2) # use 2-norm(Euclidean Distance) as a distance measure
        d1 = np.linalg.norm(X[i,:] - mu[1,:],2)
        d2 = np.linalg.norm(X[i,:] - mu[2,:],2)

        y[i] = np.argmin([d0, d1, d2]) # Choose the closest cluster
    
    err = 0
    for i in range(k): # Calculate the error
        mu[i,:] = np.mean(X[np.where(y == i)[0]], axis=0)
        err += np.linalg.norm(pre_mu[i,:] - mu[i,:],2)
    
    pre_mu = mu.copy()
    
    if err < tolerance:
        print("Iteration:", iter_num) # if stopping criteria is satisfied, break the loop and print iteration number
        break
        
    if iter_num > 9999:
        break

Iteration: 9

color_list = ['rgba(0, 230, 138, .9)','rgba(255, 102, 102, .9)', 'rgba(51, 133, 255, .9)' ]

data = []
for i in np.arange(3):
    trace = go.Scatter(
        x = X[np.where(y==i)[0]][:,0],
        y = X[np.where(y==i)[0]][:,1],
        name = 'Cluster_' + str(i+1),
        mode = 'markers',
        marker = dict(
            size = 10,
            color = color_list[i],
        )
    )
    data.append(trace)

fig = dict(data=data)
py.iplot(fig, filename='clustering_result')

Making the function for arbitrary k¶

def k_means(X,k):
    m = X.shape[0]

    # ramdomly initialize mean points
    mu = X[np.random.randint(0,m,k),:]
    pre_mu = mu.copy()
    y = np.empty([m,1])
    tolerance = 1e-15 # Setting the tolerance(stopping criteria), I arbitrarily put this value
    iter_num = 0

    while True:
        iter_num += 1
        for i in range(m):
            distance = []
            for j in range(k):
                distance.append(np.linalg.norm(X[i,:] - mu[j,:],2)) # use 2-norm(Euclidean Distance) as a distance measure
            y[i] = np.argmin(distance) # Choose the closest cluster

        err = 0
        for i in range(k): # Calculate the error
            mu[i,:] = np.mean(X[np.where(y == i)[0]], axis=0)
            err += np.linalg.norm(pre_mu[i,:] - mu[i,:],2)

        pre_mu = mu.copy()

        if err < tolerance:
            print("Iteration:", iter_num) # if stopping criteria is satisfied, break the loop and print iteration number
            data = []
            for i in np.arange(k):
                trace = go.Scatter(
                    x = X[np.where(y==i)[0]][:,0],
                    y = X[np.where(y==i)[0]][:,1],
                    name = 'Cluster_' + str(i+1),
                    mode = 'markers',
                    marker = dict(
                        size = 10,
                    )
                )
                data.append(trace)
            
            jagota_measure=0
            y = pd.Series(y[:,0])
            for i in range(m): # calculating Jagota Measure(Q measure)
                num = len(y.where(y==y[i]).dropna())
                jagota_measure += np.linalg.norm(X[i,:] - mu[int(y[i]),:],2) / num
            
            fig = dict(data=data)
            return(fig,jagota_measure)
            
            break

        if iter_num > 9999:
            break

Result for k=3¶

py.iplot(k_means(X,3)[0])

Iteration: 8

Jagota Measure for k=3

k_means(X,3)[1]

Iteration: 10

1.2582208225490474

Result for k=5¶

py.iplot(k_means(X,5))

Iteration: 7

Jagota Measure for k=5

k_means(X,5)[1]

Iteration: 7

1.7060376729133013

Plot Q measure for different k¶

x = np.arange(6)+2
y = []
for k in x:
    y.append(k_means(X,k)[1])

Iteration: 8
Iteration: 9
Iteration: 7
Iteration: 12
Iteration: 8
Iteration: 8

plt.figure(figsize=(14,7))
plt.title('Q measure for k-means clustering')
plt.xlabel('k')
plt.ylabel('Jagota Measure(=Q)')
plt.plot(x,y)
plt.show()