基于朴素贝叶斯算法——进行文档分类

时间:2022-02-17 17:28:37

使用贝叶斯进行文档分类

贝叶斯的核心思想:选择具有最高概率的决策
应用贝叶斯准则得到:
p(ci|x,y) = p(x,y|ci) p(ci) / p(x,y)
如果 p(c1|x,y)>p(c2|x,y) ,那么属于类别 c1
如果 p(c1|x,y)<p(c2|x,y) ,那么属于类别 c2

准备数据:从文本中构建词汇量

def loadDataSet():
    """Return the toy posting data set and its class labels.

    parameters: none

    return: postingList: list of example documents (each a list of words)
            classVec: class label for each document (1 = abusive, 0 = normal)
    """
    raw_posts = [
        "my dog has flea promblems help please",
        "maybe not take him to dog park stupid",
        "my dalmation is so cute I love him",
        "stop posting stupid worthless garbage",
        "mr licks ate my steak how to stop him",
        "quit buying worthless dog food stupid",
    ]
    postingList = [post.split() for post in raw_posts]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Build the vocabulary: the list of all distinct words in the data set.

    parameters: dataSet: list of documents (each a list of words)

    return: vocabList: list of every unique word seen in dataSet
    """
    vocab = set()
    for document in dataSet:
        vocab.update(document)
    return list(vocab)

def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 word-presence vector over the vocabulary.

    parameters: vocabList: the full vocabulary list
                inputSet: the words of one document

    return: returnVec: list of 0/1 flags, one slot per vocabulary word
    """
    # First-occurrence index map — same answer as vocabList.index(word).
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in position:
            returnVec[position[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

训练算法:从词向量计算概率

def trainNB0(trainMatrix, trainCategory):
    """Naive-Bayes trainer: estimate class-conditional word probabilities.

    parameters: trainMatrix: 0-1 document matrix (one row per document)
                trainCategory: 0/1 class-label vector, one entry per document

    return: p0Vect, p1Vect: log conditional word-probability vectors for
                            class 0 and class 1 respectively
            pAbusive: prior probability that a document belongs to class 1
    """
    # Bug fix: `np` was never imported at module level in this script
    # (only inside testingNB), so calling trainNB0 raised NameError.
    import numpy as np

    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / numTrainDocs
    # Laplace smoothing: start counts at 1 (denominators at 2) so a word
    # unseen in one class does not zero out the whole probability product.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2
    p1Denom = 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    # Work in log space so products of many tiny probabilities don't underflow.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

测试算法:根据现实情况修改分类器

def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Classify a 0-1 word vector with the trained naive-Bayes model.

    parameters: vec2Classify: 0-1 word vector of the document to classify
                p0Vect, p1Vect: log conditional probability vectors for
                                class 0 and class 1 (from trainNB0)
                pClass1: prior probability of class 1

    return: predicted class, 1 or 0
    """
    # Bug fix: `np` was never imported at module level in this script
    # (only inside testingNB), so calling classifyNB raised NameError.
    import numpy as np

    # Log-space Bayes rule: sum of per-word log likelihoods plus log prior.
    p1 = np.sum(vec2Classify * p1Vect) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vect) + np.log(1 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    """Train naive Bayes on the toy posts and classify two sample documents.

    parameters: none

    return: none (prints the classification of each test entry)
    """
    import numpy as np
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

最后的测试

# Entry point: run the toy-dataset demo when executed as a script.
if __name__ == "__main__":
    testingNB()

基于朴素贝叶斯算法——进行文档分类

测试算法:使用朴素贝叶斯进行交叉验证

import bayes
import numpy as np
# Bug fix: the `imp` module was deprecated since Python 3.4 and removed in
# 3.12; `importlib.reload` is the supported replacement.
from importlib import reload

# Re-import bayes so edits to the module are picked up in an interactive session.
reload(bayes)

def textParse(bigString):
    """Split a raw text blob into lowercase tokens longer than two characters.

    parameters: bigString: the full text of one e-mail (str)

    return: list of lowercase tokens with length > 2
    """
    import re
    # Bug fix: the original pattern r'\W*' can match the empty string, which
    # makes re.split return single characters (and raises ValueError on
    # modern Python). r'\W+' splits on runs of non-word characters.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out cross-validation of naive Bayes on the spam/ham e-mail corpus.

    Loads 25 spam and 25 ham e-mails, holds out 10 random documents for
    testing, trains on the remaining 40, and prints the error rate.

    parameters: none

    return: none (prints the error rate)
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Bug fix: the original open() calls leaked 50 file handles;
        # `with` guarantees each file is closed.
        with open("email/spam/%d.txt" % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open("email/ham/%d.txt" % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Hold out 10 randomly chosen documents for testing.
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is:", errorCount / len(testSet))

# Run the spam cross-validation experiment when this script is executed.
spamTest()

输出:

基于朴素贝叶斯算法——进行文档分类

以上过程重复多次,比如说10次,然后求平均值,获取平均错误率