The idea behind k-means is fairly simple; put colloquially, it is "birds of a feather flock together". I spent a little time implementing k-means in Python. The algorithm has some inherent weaknesses: it is sensitive to the choice of the initial centroids, for which many people have proposed the k-means++ improvement, and there is no complete theory for choosing the value of k itself, for which the classic approaches are the silhouette coefficient and bisecting clustering. An implementation of bisecting k-means is included at the end. The code mainly follows the book Machine Learning in Action:
#encoding:utf-8
'''
Created on 2015-09-21
@author: ZHOUMEIXU204
'''
path = u"D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码 python\\机器学习实战代码\\machinelearninginaction\\Ch10\\"
import numpy as np

def loadDataSet(fileName):                      # read a tab-delimited file into a list of float lists
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))     # wrap in list() so this also works under Python 3
        dataMat.append(fltLine)
    return dataMat

def distEclud(vecA, vecB):                      # Euclidean distance between two vectors
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))

def randCent(dataSet, k):                       # build k random centroids within the bounds of the data
    n = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k, n)))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids

dataMat = np.mat(loadDataSet(path + 'testSet.txt'))
print(dataMat[:, 0])
# np.inf compares greater than every number (and -np.inf less than every number),
# which makes it a safe starting value when searching for a minimum distance.
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))   # column 0: cluster index, column 1: squared distance
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf; minIndex = -1
            for j in range(k):                  # assign point i to its nearest centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        for cent in range(k):                   # recompute each centroid as the mean of its points
            # np.nonzero returns a tuple of index arrays, one per axis; [0] keeps the row indices.
            # e.g. np.nonzero([1, 2, 3, 0, 0, 4, 0])[0] gives array([0, 1, 2, 5])
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment

myCentroids, clustAssing = kMeans(dataMat, 4)
print(myCentroids, clustAssing)
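
# The introduction mentions k-means++ as a remedy for k-means' sensitivity to the
# initial centroids. The book's code only provides randCent, so what follows is a
# minimal sketch of k-means++ seeding (the name kMeansPlusPlusCent is my own, not
# from the book): the first centroid is drawn uniformly at random, and each later
# one is drawn with probability proportional to the squared distance to the nearest
# centroid chosen so far. It plugs into kMeans through the createCent parameter.
def kMeansPlusPlusCent(dataSet, k, distMeas=distEclud):
    m = np.shape(dataSet)[0]
    centroids = dataSet[np.random.randint(m), :].copy()     # first centroid: uniform draw
    for _ in range(1, k):
        # squared distance from every point to its nearest already-chosen centroid
        dist2 = np.array([min(distMeas(centroids[c, :], dataSet[i, :]) ** 2
                              for c in range(np.shape(centroids)[0]))
                          for i in range(m)])
        probs = dist2 / dist2.sum()                         # sampling weights proportional to D(x)^2
        idx = np.random.choice(m, p=probs)
        centroids = np.vstack((centroids, dataSet[idx, :]))
    return np.mat(centroids)

# usage sketch: myCentroids, clustAssing = kMeans(dataMat, 4, createCent=kMeansPlusPlusCent)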
# bisecting k-means: repeatedly split the cluster whose split yields the lowest total SSE
def biKmeans(dataSet, k, distMeas=distEclud):
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]        # start with a single cluster: the global mean
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :]) ** 2
    while (len(centList) < k):
        lowestSSE = np.inf
        for i in range(len(centList)):                      # try splitting each existing cluster in two
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClusAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = np.sum(splitClusAss[:, 1])           # SSE of the two new sub-clusters
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit:", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCenToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClusAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # relabel the winning split: sub-cluster 1 gets a brand-new index, sub-cluster 0 keeps the old one
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCenToSplit
        print("the bestCentToSplit is:", bestCenToSplit)
        print("the len of bestClustAss is:", len(bestClustAss))
        centList[bestCenToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCenToSplit)[0], :] = bestClustAss
    return centList, clusterAssment

print(u"Bisecting k-means results:")
dataMat3 = np.mat(loadDataSet(path + 'testSet2.txt'))
centList, myNewAssments = biKmeans(dataMat3, 3)
print(centList)
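
# The introduction also mentions the silhouette coefficient as a classic way to pick k.
# Below is a minimal sketch (silhouetteScore is my own name, not from the book): for each
# point, a = mean distance to its own cluster and b = mean distance to the nearest other
# cluster, and its silhouette is (b - a) / max(a, b); the mean over all points scores the
# clustering, so you would run kMeans for several k and keep the k with the highest score.
# Singleton clusters are not handled specially here.
def silhouetteScore(dataSet, labels, distMeas=distEclud):
    m = np.shape(dataSet)[0]
    labels = np.asarray(labels).flatten()
    total = 0.0
    for i in range(m):
        meanDist = {}                           # mean distance from point i to each cluster
        for c in np.unique(labels):
            idx = np.nonzero(labels == c)[0]
            idx = idx[idx != i]                 # exclude the point itself
            meanDist[c] = np.mean([distMeas(dataSet[i, :], dataSet[j, :]) for j in idx]) if len(idx) else 0.0
        a = meanDist[labels[i]]                                     # cohesion: own cluster
        b = min(d for c, d in meanDist.items() if c != labels[i])   # separation: nearest other cluster
        total += (b - a) / max(a, b)
    return total / m

# usage sketch:
# for k in range(2, 6):
#     cents, assign = kMeans(dataMat3, k)
#     print(k, silhouetteScore(dataMat3, assign[:, 0]))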
That is all for this article. I hope it helps with your study, and please continue to support 服务器之家.
原文链接:http://blog.csdn.net/luoyexuge/article/details/49105177