伪代码:
将所有点看成一个簇
当簇数目小于k时
    对于每一个簇
        计算总误差
        在给定的簇上面进行k-均值聚类(k=2)
        计算将该簇一分为二之后的总误差
    选择使得误差最小的那个簇进行划分操作
def biKmeans(dataSet, k):
    """Cluster ``dataSet`` into ``k`` groups using bisecting k-means.

    All points start in one cluster. While fewer than ``k`` clusters exist,
    every current cluster is tentatively split in two with ``kMeans`` and the
    split giving the lowest total error (error of the split cluster plus the
    error of all untouched clusters) is committed.

    Args:
        dataSet: 2-D collection of points, one row per point. It is indexed
            as ``dataSet[rows, :]`` so it must support NumPy-style indexing.
        k: Desired number of clusters. For ``k <= 1`` no split is performed
            and the single global centroid is returned.

    Returns:
        A ``(centroids, clusterAssment)`` tuple. ``centroids`` is an
        ``np.mat`` with one centroid per row. ``clusterAssment`` is an
        ``m x 2`` matrix: column 0 holds the cluster index of each point and
        column 1 holds the error recorded for it (initially the squared
        ``distEclud`` distance to the global centroid; after a split,
        whatever ``kMeans`` reports in its second column).

    Note:
        Relies on ``distEclud`` and ``kMeans`` from the enclosing module.
        ``kMeans(points, 2)`` is assumed to return ``(centroid matrix,
        assignment matrix)`` laid out like ``clusterAssment`` -- confirm
        against its definition.
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))

    # Start with a single cluster whose centroid is the mean of all points.
    centroid0 = np.mean(dataSet, axis=0).tolist()
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distEclud(centroid0, dataSet[j, :]) ** 2

    while len(centList) < k:
        lowestSSE = np.inf
        for i in range(len(centList)):
            # Rows currently assigned to cluster i.
            ptsInCurrCluster = dataSet[
                np.nonzero(clusterAssment[:, 0].A == i)[0], :
            ]
            centroidMat, splitClusterAss = kMeans(ptsInCurrCluster, 2)
            sseSplit = np.sum(splitClusterAss[:, 1])
            sseNotSplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1]
            )
            candidateSSE = sseSplit + sseNotSplit
            if candidateSSE < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClusterAss.copy()
                lowestSSE = candidateSSE

        # Relabel the two halves of the winning split. Label 1 must be
        # rewritten first: doing 0 -> bestCentToSplit first could produce new
        # 1s (when bestCentToSplit == 1) that would then be relabelled again.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit

        # One half keeps the old index, the other gets a brand-new index.
        centList[bestCentToSplit] = bestNewCents[0, :].A[0]
        centList.append(bestNewCents[1, :].A[0])

        # Overwrite the assignments of the points that belonged to the
        # cluster that was just split.
        clusterAssment[
            np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :
        ] = bestClustAss

    # Centroids may be plain (possibly nested) lists -- the initial one when
    # no split happened -- or 1-D arrays produced by a split. Flatten each to
    # a flat list so np.mat always receives a 2-D structure; previously this
    # step called .tolist() on a list and crashed for k <= 1.
    centList = [np.asarray(cent).ravel().tolist() for cent in centList]
    return np.mat(centList), clusterAssment