机器学习实战——k—均值聚类算法

时间:2022-10-24 19:37:48
from numpy import *
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

 
#读数据,list类型
def loadDataSet(filename):
dataMat=[]
fr=open(filename)
for line in fr.readlines():
curLine=line.strip().split('\t')
fltLine=list(map(float,curLine))
dataMat.append(fltLine)
return dataMat #list类型,必须转换为mat

#计算距离
def disEclud(vecA,vecB):
return sqrt(sum(power(vecA-vecB,2)))

#随机产生质心
def randCent(dataSet,k):
n=shape(dataSet)[1]
centroids=mat(zeros((k,n)))
for j in range(n):
minJ=min(dataSet[:,j])
rangeJ=float(max(dataSet[:,j])-minJ)
centroids[:,j]=minJ+rangeJ*random.rand(k,1)
return centroids

#普通k均值聚类
def kMeans(dataSet,k,distMeas=disEclud,createCent=randCent):
m=shape(dataSet)[0]
clusterAssment=mat(zeros((m,2))) #[索引,距离]
centroids=createCent(dataSet,k) #质心
clusterChanged=True
while clusterChanged:
clusterChanged=False
for i in range(m):
minDist=inf;minIndex=-1 #inf:正无穷
for j in range(k):
distJI=distMeas(centroids[j,:],dataSet[i,:])
if distJI<minDist:
minDist=distJI;minIndex=j
if clusterAssment[i,0]!=minIndex:clusterChanged=True
clusterAssment[i,:]=minIndex,minDist**2
print(centroids)
for cent in range(k):
ptsInClust=dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]
centroids[cent,:]=mean(ptsInClust,axis=0)
return centroids,clusterAssment

#二分法k均值聚类,可消除局部最优问题
def biKmeans(dataSet,k,distMeas=disEclud):
m=shape(dataSet)[0]
clusterAssment=mat(zeros((m,2)))
centroid0=mean(dataSet,axis=0).tolist()[0]
centList=[centroid0]
for j in range(m):
clusterAssment[j,1]=distMeas(mat(centroid0),dataSet[j,:])**2
while(len(centList)<k):
lowestSSE=inf
for i in range(len(centList)):
ptsInCurrCluster=dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
centroidMat,splitClustAss=kMeans(ptsInCurrCluster,2,distMeas)
sseSplit=sum(splitClustAss[:,1])
sseNotSplit=sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
print('sseSplit,and notSplit:',sseSplit,sseNotSplit)
if sseNotSplit+sseSplit<lowestSSE:
bestCentToSplit=i
bestNewCents=centroidMat
bestClustAss=splitClustAss.copy()
lowestSSE=sseSplit+sseNotSplit
bestClustAss[nonzero(bestClustAss[:,0].A==1)[0],0]=len(centList)
bestClustAss[nonzero(bestClustAss[:,0].A==0)[0],0]=bestCentToSplit
print('the bestCentToSplit is:',bestCentToSplit)
print('the len of bestClustAss is:',len(bestClustAss))
centList[bestCentToSplit]=bestNewCents[0,:]
centList.append(bestNewCents[1,:])
clusterAssment[nonzero(clusterAssment[:,0].A==bestCentToSplit)[0],:]=bestClustAss
return mat(np.array(centList)),clusterAssment

#优化算法,随机根据数据自动产生k值聚类,无需给定k
def nonk(datSet,k=2): minlen=True x=datSet[:,0].max()-datSet[:,0].min()    y=datSet[:,1].max()-datSet[:,1].min()    distance=(x**2+y**2)/30    while minlen: Disk = [] #cenmat,splitclu=kMeans(datSet,k)        cenmat, splitclu = biKmeans(datSet, k)        for i in range(k): d=splitclu[nonzero(splitclu[:,0].A==i)[0],1].max()            Disk.append(d)        if max(Disk)>distance: k+=1            minlen=True else:minlen=False print('k=%d,dis=%f' %(k,distance))    return cenmat,splitclu,k
#数据可视化,仅针对2维数据:
def pict():
#datMat=mat(loadDataSet('testSet.txt'))
datMat = mat(file2matrix('result.txt'))
datMat=datMat.getA()
#myce, clu = kMeans(datMat, k)
#myce,clu=biKmeans(datMat,k)
myce, clu,k = nonk(datMat,)
print(clu)
myce=myce.getA()
clu=clu.getA()
fig = plt.figure()
ax = fig.add_subplot(111)
for i in list(range(k)):
indx=where(clu[:,0]==i)
ax.scatter(datMat[indx, 0], datMat[indx, 1], 20 * (i+1))
ax.scatter(myce[:, 0], myce[:, 1],marker='+')
plt.show()
return myce,clu

随机产生a个二维数据,进行k均值聚类:
def Mypict(a):
#datMat = mat(loadDataSet('testSet.txt'))
#datMat = datMat.getA()
# myce, clu = kMeans(datMat, k)
# myce,clu=biKmeans(datMat,k)
#a=random.randint(0,199)
datMat=random.rand(a,2)
myce, clu, k = nonk(datMat )
#print(clu)
myce = myce.getA()
clu = clu.getA()
fig = plt.figure()
ax = fig.add_subplot(111)
for i in list(range(k)):
indx = where(clu[:, 0] == i)
ax.scatter(datMat[indx, 0], datMat[indx, 1], 20 * (i + 1))
ax.scatter(myce[:, 0], myce[:, 1], marker='+')
plt.show()
print('k=%d,a=%d' %(k,a))