import re
import math

# Tiny labeled corpus: s[i] is a message, r[i] is its label
# (1 = spam, 0 = not spam).
s = ['this is yes', 'that is no']
r = [0, 1]

# Precompiled token pattern, hoisted so it is built once, not per call.
_WORD_RE = re.compile(r'[a-z0-9]+')


def tokenize(message):
    """Split *message* into a set of lowercase word tokens.

    Returns a set, so repeated words in one message count only once
    (Bernoulli-style presence/absence features).
    """
    return set(_WORD_RE.findall(message.lower()))


def count_words(training_set):
    """Count how often each word appears in spam vs. non-spam messages.

    Args:
        training_set: iterable of message strings, aligned index-by-index
            with the global label list ``r`` (1 = spam, 0 = not spam).

    Returns:
        dict mapping word -> [spam_count, non_spam_count].
    """
    counts = {}
    # BUGFIX: the original ignored ``training_set`` and always read the
    # global ``s``; zip the parameter with the labels instead.
    for message, is_spam in zip(training_set, r):
        for word in tokenize(message):
            if word not in counts:
                counts[word] = [0, 0]
            counts[word][0 if is_spam else 1] += 1
    return counts


def word_probablities(counts, total_spams, total_non_spams, k=0.5):
    """Turn raw per-class counts into smoothed word probabilities.

    Applies additive smoothing with pseudocount ``k`` so no probability
    is exactly 0 or 1 — that would make the log/complement math in
    ``spam_message`` blow up.

    Returns:
        list of (word, P(word | spam), P(word | not spam)) triples.
    """
    return [
        (w,
         (spam + k) / (total_spams + 2 * k),
         (non_spam + k) / (total_non_spams + 2 * k))
        for w, (spam, non_spam) in counts.items()
    ]


def spam_message(word_prob, message):
    """Score a new message; return P(spam | message).

    Accumulates log-probabilities (instead of multiplying) to avoid
    floating-point underflow with many words. A vocabulary word absent
    from the message contributes its complement probability.
    """
    message_words = tokenize(message)
    log_prob_spam = 0.0
    log_prob_non_spam = 0.0
    for word, prob_spam, prob_non_spam in word_prob:
        if word in message_words:
            # Word present: use P(word | class).
            log_prob_spam += math.log(prob_spam)
            log_prob_non_spam += math.log(prob_non_spam)
        else:
            # Word absent: use P(not word | class).
            log_prob_spam += math.log(1.0 - prob_spam)
            log_prob_non_spam += math.log(1.0 - prob_non_spam)
    prob_is_spam = math.exp(log_prob_spam)
    prob_is_non_spam = math.exp(log_prob_non_spam)
    return prob_is_spam / (prob_is_spam + prob_is_non_spam)


# --- demo run on the toy corpus ---
word_set = tokenize(' '.join(s))
print(word_set)
counts = count_words(s)
print(counts)
word_prob = word_probablities(counts, 1, 1)
print(word_prob)
print(spam_message(word_prob, "no is ns"))