# 利用Python实现高斯混合模型(GMM) / Implementing a Gaussian mixture model (GMM) in Python
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2018/7/19 10:17
# @Author : DaiPuwei
# @FileName:
# @Software: PyCharm
# @E-mail :771830171@
# @Blog :/qq_30091945
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer
class GMM:
    """Gaussian mixture model (GMM) whose parameters are fitted with EM.

    Attributes:
        Data: Training samples as a 2-D float array, one sample per row.
        K: Number of Gaussian components.
        weights: Array of K mixing weights.
        means: List of K mean vectors.
        covars: List of K covariance matrices (a list of arrays, not one
            stacked array).
        posibility: Set by ``GMM_EM``; per-sample posterior probabilities.
        prediction: Set by ``GMM_EM``; per-sample most likely component.

    NOTE(review): the names ``posibility`` and ``prediction`` were not
    legible in the reviewed source and are assumed -- confirm against callers.
    """

    def __init__(self, Data, K, weights=None, means=None, covars=None):
        """Store the data and set the initial mixture parameters.

        :param Data: training data, one sample per row
        :param K: number of Gaussian components
        :param weights: initial mixing weights; random and normalised to sum
            to one when omitted
        :param means: initial mean vectors; uniform random in [0, 1) when
            omitted
        :param covars: initial covariance matrices; uniform random in [0, 1)
            when omitted
        """
        self.Data = np.asarray(Data, dtype=float)
        self.K = K
        n_features = np.shape(self.Data)[1]

        # Caller-supplied parameters are copied so that the in-place updates
        # performed by GMM_EM never modify the caller's objects.
        if weights is not None:
            self.weights = np.array(weights, dtype=float)
        else:
            self.weights = np.random.rand(self.K)
            self.weights /= np.sum(self.weights)  # normalise to sum to one

        if means is not None:
            self.means = [np.array(m, dtype=float) for m in means]
        else:
            self.means = [np.random.rand(n_features) for _ in range(self.K)]

        if covars is not None:
            self.covars = [np.array(c, dtype=float) for c in covars]
        else:
            # A random matrix is neither symmetric nor positive definite;
            # Gaussian() copes by regularising and taking |det|, and the
            # first M-step replaces it with a proper covariance estimate.
            self.covars = [np.random.rand(n_features, n_features)
                           for _ in range(self.K)]

    def Gaussian(self, x, mean, cov):
        """Evaluate a multivariate Gaussian density at a single point.

        :param x: input sample (1-D array)
        :param mean: mean vector
        :param cov: covariance matrix
        :return: density of ``x`` as a scalar
        """
        dim = np.shape(cov)[0]
        # 0.001 * I is added so a singular covariance can still be inverted.
        regularised = cov + np.eye(dim) * 0.001
        covdet = np.linalg.det(regularised)
        covinv = np.linalg.inv(regularised)
        xdiff = (x - mean).reshape((1, dim))
        norm = np.power(np.power(2 * np.pi, dim) * np.abs(covdet), 0.5)
        return 1.0 / norm * np.exp(-0.5 * xdiff.dot(covinv).dot(xdiff.T))[0][0]

    def _weighted_densities(self):
        """Return an (n_samples, K) array of weights[k] * Gaussian(x_n; k)."""
        return np.array([
            [self.weights[k] * self.Gaussian(x, self.means[k], self.covars[k])
             for k in range(self.K)]
            for x in self.Data
        ])

    def GMM_EM(self, tol=1e-8, max_iter=None):
        """Fit the mixture parameters with the EM algorithm.

        Iterates E- and M-steps until the change in log-likelihood is no
        larger than ``tol`` (or is NaN), or ``max_iter`` iterations have run.
        Updates ``weights``, ``means`` and ``covars`` in place and stores the
        results in ``posibility`` and ``prediction``.

        A sample whose density is zero under every component, or a component
        that receives zero total responsibility, produces NaN values; this is
        not guarded against.

        :param tol: convergence threshold on the absolute log-likelihood change
        :param max_iter: optional cap on the number of iterations; ``None``
            means iterate until convergence
        :return: None
        """
        n_samples = np.shape(self.Data)[0]
        previous = -np.inf
        iteration = 0
        densities = self._weighted_densities()

        while max_iter is None or iteration < max_iter:
            iteration += 1

            # E-step: posterior probability of each component for each sample.
            gammas = densities / densities.sum(axis=1, keepdims=True)

            # M-step: re-estimate each component from its responsibilities.
            for k in range(self.K):
                resp_k = gammas[:, k]
                nk = resp_k.sum()  # effective number of samples in component k
                self.weights[k] = nk / n_samples
                self.means[k] = resp_k.dot(self.Data) / nk
                xdiffs = self.Data - self.means[k]
                self.covars[k] = (
                    (xdiffs * resp_k[:, np.newaxis]).T.dot(xdiffs) / nk
                )

            # Mixture log-likelihood: log of the per-sample sum over
            # components. The densities are reused by the next E-step.
            densities = self._weighted_densities()
            loglikelyhood = np.sum(np.log(densities.sum(axis=1)))

            # Written as "not >" so that a NaN difference also stops the loop.
            if not abs(loglikelyhood - previous) > tol:
                break
            previous = loglikelyhood

        # Posteriors under the final parameters, normalised per sample.
        gammas = densities / densities.sum(axis=1, keepdims=True)
        self.posibility = list(gammas)
        self.prediction = [np.argmax(row) for row in gammas]
def run_main():
    """Cluster the Iris data set with a 3-component GMM and show the result.

    Loads and normalises the data, plots the true labels, fits the model,
    then prints and plots the predicted component of each sample.
    """
    # Load the Iris data set.
    iris = load_iris()
    label = np.array(iris.target)
    data = np.array(iris.data)
    print("Iris数据集的标签:\n", label)

    # Preprocess: Normalizer rescales each sample.
    data = Normalizer().fit_transform(data)

    # Configure matplotlib so Chinese text and the minus sign render correctly.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Visualise the first two features, coloured by the true label.
    plt.scatter(data[:, 0], data[:, 1], c=label)
    plt.title("Iris数据集显示")
    plt.show()

    # Fit the GMM.
    K = 3
    gmm = GMM(data, K)
    gmm.GMM_EM()
    # NOTE(review): the attribute name was not legible in the reviewed source;
    # `prediction` is assumed to be what GMM_EM stores -- confirm.
    y_pre = gmm.prediction
    print("GMM预测结果:\n", y_pre)
    # NOTE(review): component indices are arbitrary, so comparing them directly
    # with the class labels presumably understates or misstates accuracy.
    print("GMM正确率为:\n", accuracy_score(label, y_pre))
    plt.scatter(data[:, 0], data[:, 1], c=y_pre)
    plt.title("GMM结果显示")
    plt.show()
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    run_main()