Python NLTK natural language processing video tutorial series (24 episodes)
https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.137b4d85bDOUz2&id=564944294779
Train a Twitter sentiment analysis model and save the data with pickle. This step can take up to an hour, so reloading the pickled data later is trivial.
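word_tokenize and nltk.pos_tag used in the script below rely on NLTK models that must be downloaded once. A minimal one-time setup sketch ("punkt" and "averaged_perceptron_tagger" are the standard NLTK resource identifiers for the tokenizer and the POS tagger):

import nltk

# One-time downloads: word_tokenize needs "punkt",
# nltk.pos_tag needs "averaged_perceptron_tagger".
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')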
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 10:44:19 2017
@author: Administrator

Short-review sentiment analysis -- Twitter.
"positive.txt" and "negative.txt" must be converted to UTF-8 first;
online converter: http://www.esk365.com/tools/GB2312-UTF8.asp
features=5000 gives accuracy above 60 percent
(the figure for features=10000 was not filled in).
Training may take up to an hour.
"""
import os
import nltk
import random
import pickle
from nltk.tokenize import word_tokenize

short_pos = open("positive.txt", "r", encoding="utf-8").read()
short_neg = open("negative.txt", "r", encoding="utf-8").read()

documents = []
all_words = []

# POS-tag prefixes: J is adjective, R is adverb, V is verb.
# allowed_word_types = ["J", "R", "V"]
allowed_word_types = ["J"]   # keep adjectives only

for p in short_pos.split('\n'):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

# Make sure the output directory exists before pickling.
os.makedirs("pickled_algos", exist_ok=True)

# Save the labelled documents.
save_documents = open("pickled_algos/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Save the feature words: the 5000 most frequent adjectives
# (raising this to 20000+ should improve accuracy).
all_words = nltk.FreqDist(all_words)
word_features = [w for (w, c) in all_words.most_common(5000)]

save_word_features = open("pickled_algos/word_features5k.pickle", "wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))

# Save the feature sets so sentiment_mod.py can reload them.
save_featuresets = open("pickled_algos/featuresets.pickle", "wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()

training_set = featuresets[:10000]
testing_set = featuresets[10000:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

# Save the trained classifier.
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
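To make the feature representation concrete, here is a minimal sketch of what find_features produces, using a hypothetical three-word feature list: a dict mapping every feature word to whether it occurs in the document (a boolean bag-of-words).

from nltk.tokenize import word_tokenize

word_features = ["great", "boring", "wonderful"]  # hypothetical tiny feature list

def find_features(document):
    words = word_tokenize(document)
    return {w: (w in words) for w in word_features}

print(find_features("A great, wonderful film"))
# {'great': True, 'boring': False, 'wonderful': True}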
sentiment_mod.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:47:51 2017
@author: Administrator
"""
# File: sentiment_mod.py
import nltk
import random
import pickle
from nltk.tokenize import word_tokenize

# Reload the artifacts pickled by the training script;
# classification now takes seconds instead of an hour.
documents_f = open("pickled_algos/documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()

word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb")
word_features = pickle.load(word_features5k_f)
word_features5k_f.close()

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets_f = open("pickled_algos/featuresets.pickle", "rb")
featuresets = pickle.load(featuresets_f)
featuresets_f.close()

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]

open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb")
classifier = pickle.load(open_file)
open_file.close()

def sentiment(text):
    feats = find_features(text)
    return classifier.classify(feats)
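If a confidence score is wanted alongside the label, NLTK's NaiveBayesClassifier also exposes prob_classify. A sketch of an alternative entry point (a hypothetical helper, assuming the same pickled classifier and find_features as above):

def sentiment_with_confidence(text):
    # prob_classify returns a probability distribution over the labels.
    feats = find_features(text)
    dist = classifier.prob_classify(feats)
    label = dist.max()                 # most probable label, "pos" or "neg"
    return label, dist.prob(label)     # e.g. ("pos", 0.87)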
Test
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:50:12 2017
@author: Administrator
"""
import sentiment_mod as s

print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))