Shlukování

In [7]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
import math;

X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8 ],
              [8, 8],
              [9,9],
              [1, 0.6],
              [9,11]])

plt.scatter(X[:,0], X[:,1], s=150)
plt.show()

K-means

In [8]:
class K_Means:
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self,data):

        self.centroids = {}

        for i in range(self.k):
            self.centroids[i] = data[i]

        for i in range(self.max_iter):
            self.classifications = {}

            for i in range(self.k):
                self.classifications[i] = []

            for featureset in data:
                distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification],axis=0)
In [9]:
clust = K_Means()
clust.fit(X);
print(clust.classifications[0])
print(clust.classifications[1])
[array([ 1.,  2.]), array([ 1.5,  1.8]), array([ 1. ,  0.6])]
[array([ 5.,  8.]), array([ 8.,  8.]), array([ 9.,  9.]), array([  9.,  11.])]
In [10]:
plt.scatter([x[0] for x in clust.classifications[0]],[x[1] for x in clust.classifications[0]], s=150)
plt.scatter([x[0] for x in clust.classifications[1]],[x[1] for x in clust.classifications[1]], s=150, marker='o')
plt.show()
In [11]:
X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8 ],
              [8, 8],
              [9, 9],
              [1, 0.6],
              [9,111]])

plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
In [12]:
clust = K_Means()
clust.fit(X);
plt.scatter([x[0] for x in clust.classifications[0]],[x[1] for x in clust.classifications[0]], s=150)
plt.scatter([x[0] for x in clust.classifications[1]],[x[1] for x in clust.classifications[1]], s=150, marker='o')
plt.show()

K-medoids

In [13]:
class K_Medoids:
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self,data):

        self.medoids = {}

        for i in range(self.k):
            self.medoids[i] = data[i]

        for i in range(self.max_iter):
            self.classifications = {}

            for i in range(self.k):
                self.classifications[i] = []

            for featureset in data:
                distances = [np.linalg.norm(featureset-self.medoids[medoid]) for medoid in self.medoids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            for classification in self.classifications:
                                
                class_medoid = self.classifications[classification][0]
                class_dist   = sum([np.linalg.norm(class_medoid-item) for item in self.classifications[classification]])
                
                for item1 in self.classifications[classification][1:]:
                    new_dist = sum([np.linalg.norm(item1-item) for item in self.classifications[classification]])
                    if new_dist < class_dist:
                        class_medoid = item1;
                        
                self.medoids[classification] = class_medoid;
In [14]:
clust = K_Medoids()
clust.fit(X);
print(clust.classifications[0])
print(clust.classifications[1])
[array([ 1.,  2.]), array([ 1.5,  1.8]), array([ 1. ,  0.6])]
[array([ 5.,  8.]), array([ 8.,  8.]), array([ 9.,  9.]), array([   9.,  111.])]
In [15]:
plt.scatter([x[0] for x in clust.classifications[0]],[x[1] for x in clust.classifications[0]], s=150)
plt.scatter([x[0] for x in clust.classifications[1]],[x[1] for x in clust.classifications[1]], s=150, marker='o')
plt.show()

Fuzzy c-means

In [16]:
class fuuzy_c_Means:
    def __init__(self, k=2, tol=0.001, max_iter=100):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.m = 2

    def deg(self,dists,index):        
        suma = 0
        for i in range(self.k):
            suma += (dists[index]/dists[i])**(2/(self.m -1))
        return 1/suma
    
    def wavg(self,data,index):
        
        total = 0
        suma  = 0
        
        for item,featureset in zip(self.classifications,data):
            total += item[index]**self.m
            suma  += featureset * item[index]**self.m
          
        
        return suma/total;
    

    def fit(self,data):
        
        self.centroids = {}

        self.classifications = [[] for item in data]

        for i in range(self.k):
            self.centroids[i] = data[4*i]+1

        for i in range(self.max_iter):

         
            for j,featureset in enumerate(data):
                dists = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
                                
                degrees  = [self.deg(dists,i) for i in range(self.k)]
                self.classifications[j] = degrees


            for kk in range(self.k):
                self.centroids[kk] = self.wavg(data,kk)
In [17]:
clust = fuuzy_c_Means()
clust.fit(X);
print(clust.classifications)
print(clust.centroids)
[[0.99841696054780082, 0.0015830394521991712], [0.99857127547585633, 0.0014287245241435991], [0.99904175955394481, 0.00095824044605512357], [0.99777006817421243, 0.0022299318257876441], [0.99622495273040401, 0.0037750472695960015], [0.99763750800881856, 0.0023624919911815717], [8.9294434303703944e-10, 0.99999999910705561]]
{0: array([ 4.24657498,  4.89793723]), 1: array([   8.99991135,  110.9968275 ])}
In [32]:
my_data = np.genfromtxt('s1.txt', delimiter=',')[:1500,:]
In [33]:
my_data
Out[33]:
array([[ 664159.,  550946.],
       [ 665845.,  557965.],
       [ 597173.,  575538.],
       ..., 
       [ 846720.,  162060.],
       [ 871544.,  144135.],
       [ 849003.,  115373.]])
In [34]:
np.shape(my_data)
Out[34]:
(1500, 2)
In [35]:
plt.scatter(my_data[:,0], my_data[:,1], s=150)
plt.show()
In [36]:
clust = K_Means()
clust.k = 3
clust.fit(my_data)
In [37]:
plt.scatter([x[0] for x in clust.classifications[0]],[x[1] for x in clust.classifications[0]], s=150)
plt.scatter([x[0] for x in clust.classifications[1]],[x[1] for x in clust.classifications[1]], s=150, marker='o')
plt.scatter([x[0] for x in clust.classifications[2]],[x[1] for x in clust.classifications[2]], s=150, marker='x')
plt.show()
In [39]:
clust.k = 4
clust.fit(my_data)
In [41]:
plt.scatter([x[0] for x in clust.classifications[0]],[x[1] for x in clust.classifications[0]], s=150)
plt.scatter([x[0] for x in clust.classifications[1]],[x[1] for x in clust.classifications[1]], s=150, marker='o')
plt.scatter([x[0] for x in clust.classifications[2]],[x[1] for x in clust.classifications[2]], s=150, marker='x')
plt.scatter([x[0] for x in clust.classifications[3]],[x[1] for x in clust.classifications[3]], s=150, marker='s')
plt.show()