Python 朴素贝叶斯简单实现

时间:2022-01-13 21:24:51
import math
import re

# Toy training corpus: s[i] is a message and r[i] is its label
# (truthy = spam, falsy = not spam).
s = ['this is yes', 'that is no']
r = [0, 1]


def tokenize(message):
    """Split a message into its set of distinct lowercase tokens.

    A token is a maximal run of ASCII letters and digits. The raw token
    list is also printed, which is the trace output of this demo script.

    Args:
        message: Text to split.

    Returns:
        A set of lowercase tokens (duplicates collapsed).
    """
    message = message.lower()
    all_words = re.findall(r'[a-z0-9]+', message)
    print(all_words)
    return set(all_words)


word_set = tokenize(' '.join(s))
print(word_set)


def count_words(training_set, labels=None):
    """Count, per word, how many messages of each class contain it.

    Args:
        training_set: Iterable of message strings.
        labels: Iterable of labels aligned with ``training_set``
            (truthy = spam). Defaults to the module-level ``r`` so the
            single-argument call keeps working.

    Returns:
        A dict mapping word -> ``[spam_count, non_spam_count]``.
    """
    if labels is None:
        labels = r
    counts = {}
    for message, is_spam in zip(training_set, labels):
        print(message, is_spam)
        for word in tokenize(message):
            # Index 0 holds the spam count, index 1 the non-spam count.
            pair = counts.setdefault(word, [0, 0])
            pair[0 if is_spam else 1] += 1
    return counts


counts = count_words(s)
print(counts)


def word_probablities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-class word counts into smoothed conditional probabilities.

    Args:
        counts: Mapping word -> ``[spam_count, non_spam_count]``.
        total_spams: Number of spam messages in the training data.
        total_non_spams: Number of non-spam messages in the training data.
        k: Additive smoothing pseudo-count.

    Returns:
        A list of ``(word, P(word | spam), P(word | not spam))`` tuples.
    """
    return [
        (
            w,
            (spam + k) / (total_spams + 2 * k),
            (non_spam + k) / (total_non_spams + 2 * k),
        )
        for w, (spam, non_spam) in counts.items()
    ]


word_prob = word_probablities(counts, 1, 1)
print(word_prob)


def spam_message(word_prob, message):
    """Estimate the probability that ``message`` is spam.

    Every vocabulary word contributes: its probability if it occurs in
    the message, and the complement if it does not. Words of the message
    that are not in the vocabulary are ignored. Both classes are given
    the same prior.

    Args:
        word_prob: Triples as returned by ``word_probablities``.
        message: Text to classify.

    Returns:
        A float in [0, 1]: the spam likelihood divided by the sum of the
        spam and non-spam likelihoods.
    """
    message_words = tokenize(message)
    log_prob_spam = 0.0
    log_prob_non_spam = 0.0
    for word, prob_spam, prob_non_spam in word_prob:
        if word in message_words:
            # Vocabulary word present in the message.
            log_prob_spam += math.log(prob_spam)
            log_prob_non_spam += math.log(prob_non_spam)
        else:
            # Vocabulary word absent from the message.
            log_prob_spam += math.log(1.0 - prob_spam)
            log_prob_non_spam += math.log(1.0 - prob_non_spam)
    # Shift by the larger log-likelihood before exponentiating: the ratio
    # is unchanged, but the two terms can no longer both underflow to 0.0
    # (which would make the division fail on large vocabularies).
    shift = max(log_prob_spam, log_prob_non_spam)
    prob_is_spam = math.exp(log_prob_spam - shift)
    prob_is_non_spam = math.exp(log_prob_non_spam - shift)
    return prob_is_spam / (prob_is_spam + prob_is_non_spam)


print(spam_message(word_prob, "no is ns"))