R语言 模糊c均值(FCM)算法程序(转)

时间:2022-01-24 23:04:02
FCM <- function(x, K, mybeta = 2, nstart = 1, iter_max = 100, eps = 1e-06) {
## FCM

## INPUTS
## x: input matrix n*d, n d-dim samples
## K: number of desired clusters
## Optional :
## mybeta : beta, exponent for u (defaut 2).
## nstart: how many random sets should be chosen(defaut 1)
## iter_max : The maximum number of iterations allowed. (default 100)

##
## OUTPUTS
## u: The fuzzy membership matrix = maxtrix of size n*K;
## g: matrix of size K*d of the centers of the clusters
## J: objective function
## histJ: all the objective function values in the iter process

## modified time: 2015-02-07

FCM_onetime
<- function(x, init_centers, mybeta = 2, iter_max = 100, eps = 1e-06) {
n
= dim(x)[1]
d
= dim(x)[2]
g
= init_centers
K
= dim(g)[1]
histJ
= c()
pasfini
= 1
Jold
= Inf
D
= matrix(0, n, K)
for (j in 1:K) {
D[, j]
= rowSums(sweep(x, 2, g[j, ], "-")^2)
}
iter
= 1
J_old
= Inf
while (pasfini) {
s
= (1/(D + eps))^(1/(mybeta - 1))
u
= s/(s %*% matrix(1, K, K))
t1
= t(u^mybeta) %*% x
t2
= t(u^mybeta) %*% matrix(1, n, d)
V
= t1/t2
g
= V
D
= matrix(0, n, K)
for (j in 1:K) {
D[, j]
= rowSums(sweep(x, 2, g[j, ], "-")^2)
}
J
= sum(u^mybeta * D)
pasfini
= abs(J - Jold) > 0.001 && (iter < iter_max)
Jold
= J
histJ
= c(histJ, J)
iter
= iter + 1
}
cluster_id
= apply(u, 1, which.max)
re
= list(u, J, histJ, g, cluster_id)
names(re)
= c("u", "J", "histJ", "g", "cluster_id")
return(re)
}
x
= as.matrix(x)
seeds
= 1:nrow(x)
id
= sample(seeds, K)
g
= as.matrix(x[id, ])
re_best
= FCM_onetime(x = x, init_centers = g, mybeta = mybeta, iter_max = iter_max, eps = eps)
if (nstart > 1) {
minJ
= 0
i
= 2
while (i <= nstart) {
init_centers_id
= sample(seeds, K)
init_centers
= as.matrix(x[init_centers_id, ])
run
= FCM_onetime(x, init_centers = init_centers, mybeta = mybeta, iter_max = iter_max)
if (run$J <= re_best$J) {
re_best
= run
}
i
= i + 1
}
}
return(re_best)
}

 

# 对于模糊聚类均值的公式及其推到,大致如下:

#主要代码参见下面:(其中使用kmeans作比较。然后通过svm分类测验训练)
#
设置伪随机种子
set.seed(100)

# 生产数据样本
simple.data = function (n=200, nclass=2)
{
require(clusterGeneration)
require(mvtnorm)
# Center of Gaussians
xpos = seq(-nclass*2, nclass*2, length=nclass)
ypos
= runif(nclass, min=-2*nclass, max=2*nclass)

func
= function(i,xpos,ypos,n) {
# Create a random covariance matrix
cov = genPositiveDefMat(2, covMethod="eigen",
rangeVar
=c(1, 10), lambdaLow=1, ratioLambda=10)
# 保存随机数据
data = rmvnorm(n=n, mean=c(xpos[i], ypos[i]), sigma=cov$Sigma)
# 保存每一次的结果
list(means=cbind(xpos[i], ypos[i]), covars=cov$Sigma, data=data,class=rep(i*1.0, n))
}
# do call 合并列表 为 数据框
strL=do.call(rbind,lapply(1:nclass,func,xpos,ypos,n))
data
=list()
data$means
=do.call(rbind,strL[,1])
data$covars
= as.list(strL[,2])
data$data
=do.call(rbind,strL[,3])
data$
class=do.call(c,strL[,4])
# 返回
data
}

# 第一次随机产生u值 nr点个数 k 类别数
random.uij = function(k,nr)
{
#
u = matrix(data=round(runif(k*nr,10,20)),nrow=k,ncol=nr,
dimnames
=list(paste('u',1:k,sep=""),paste('x',1:nr,sep='')))
tempu
= function(x)
{
ret
= round(x/sum(x),4)
# 保证每一列之和为1
ret[1] = 1-sum(ret[-1])
ret
}
apply(u,
2,tempu)
}

# 计算 点矩阵 到 中心的距离
dist_cc_dd = function(cc,dd)
{
# cc 为 中心点 dd 为样本点值
temp = function(cc,dd)
{
# 计算每一个中心点与每一个点的距离
temp1 = function(index)
{
sqrt(sum(index
^2))
}
# 结果向量以列存放,后面的结果需要转置,按行存储
apply(dd-cc,2,temp1)
}
# 将结果转置
t(apply(cc,1,temp,dd))
}

# 模糊均值聚类
fuzzy.cmeans = function(data,u,m=3)
{
# 简单的判断,可以不要
if (is.array(data) || is.matrix(data))
{
data
= as.data.frame(data)
}

# nr = nrow(data)
#
nc = ncol(data)

# while (J > lim && step < steps)
#
{
#
step = step + 1
# uij 的 m 次幂
um = u^m
rowsum
= apply(um,1,sum)
# 求中心点 ci
cc = as.matrix(um/rowsum) %*% as.matrix(data)
# rownames(cc)=paste('c',1:k,sep='')
# colnames(cc)=paste('x',1:nc,sep='')
# 计算 J 值
distance = dist_cc_dd(cc,t(data))
J
= sum(distance^2 * um)
# cc_temp = matrix(rep(cc,each=nr),ncol=2)
# dd_temp = NULL
# lapply(1:k,function(i){dd_temp <<- rbind(dd_temp,data)})
# dist = apply((dd_temp-cc_temp)^2,1,sum)
# um_temp = as.vector(t(um))
# J = um_temp %*% dist


# 计算幂次系数,后面需要使用m != 1
t = -2 / (m - 1)
# 根据公式 计算
tmp = distance^t
colsum
= apply(tmp,2,sum)
mat
= rep(1,nrow(cc)) %*% t(colsum)
# 计算 uij,如此u的每一列之和为0
u = tmp / mat
# }
#
u
# 保存一次迭代的结果值
list(U = u,C = cc,J = J)
}

# 设置初始化参数
n = 100
k
= 4
dat
= simple.data(n,k)
nr
= nrow(dat$data)
m
= 3
limit
= 1e-4
max_itr
=50
# 随机初始化 uij
u = random.uij(k,nr)
results
= list()
data
=dat$data

# 迭代计算 收敛值
for (i in 1 : max_itr)
{
results[[i]]
= fuzzy.cmeans(dat$data,u,m)
if (i != 1 && abs((results[[i]]$J - results[[i-1]]$J)) < limit)
{
break
}
u
= results[[i]]$U
}

# 做散点图
require(ggplot2)
data
=as.data.frame(dat$data,stringsAsFactors=FALSE)
data
=cbind(data,dat$class)
nc
= ncol(data)
colnames(data)
=paste('x',1:nc,sep='')
# par(mar=rep(2,4))
p=ggplot(data,aes(x=x1,y=x2,color=factor(x3)))
p
+geom_point()+xlab('x轴')+ylab('y轴')+ggtitle('scatter points')

# plot(dat$data,col=factor(dat$class))
#
points(results[[i]]$C,pch=19,col=1:uniqe(dat$class))
#
Sys.sleep(1)

# 计算聚类与原始类的误差比率
tclass=apply(results[[i]]$U,2,function(x){which(x==max(x))})
tclass[tclass
==2]=5
tclass[tclass
==3]=6
tclass[tclass
==4]=7
tclass[tclass
==5]=4
tclass[tclass
==6]=2
tclass[tclass
==7]=3

freq
=table(dat$class,tclass)
(sum(freq)
-sum(diag(freq))) / sum(freq)

# 训练 svm model
svm_test = function()
{
library(e1071)
svm.fit
= svm(dat$data,dat$class)
r.fit
= predict(svm.fit, dat$data)
diff.
class = round(as.numeric(r.fit)) - as.numeric(dat$class)
i.misclass
= which(abs(diff.class) > 0)
n.misclass
= length(i.misclass)
f.misclass
= n.misclass/length(dat$class)
}
# 同一数据,使用 kmeans 聚类
kmeans_test = function()
{

k.fit
= kmeans(x=dat$data,4)
tclass
=k.fit$cluster
tclass[tclass
==2]=5
tclass[tclass
==3]=6
tclass[tclass
==4]=7
tclass[tclass
==5]=3
tclass[tclass
==6]=4
tclass[tclass
==7]=2
freq
=table(dat$class,tclass)
(sum(freq)
-sum(diag(freq))) / sum(freq)
}

# kmeans 和 fuzzy c means