# coding=utf-8
from __future__ import division
from nltk.book import *

# Print the text's name
print text1

# Show occurrences of the word "monstrous" in context.
# Note: concordance/similar/common_contexts/collocations print their results
# directly and return None, so they are called without print.
text2.concordance("monstrous")

# Find other words that appear in contexts similar to "monstrous"
text2.similar("monstrous")

# Find contexts shared by two or more words
text2.common_contexts(["monstrous", "very"])

# Plot where selected words appear across the text (dispersion plot)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

# Number of tokens (words and punctuation) in the text
print len(text3)

# Sorted list of distinct item types (deduplicated words and punctuation) and its size
print sorted(set(text3))
print len(set(text3))

# Average number of times each word type is used
print len(text3)/len(set(text3))

# Count how often a single word occurs, and wrap the diversity measure in a function
print text3.count("smote")

def lexical_diversity(text):
    return len(text)/len(set(text))

# Percentage of the text taken up by a word, written as a function
print 100 * text4.count("a")/len(text4)

def percentage(count, total):
    return 100 * count/total

# Treat texts as lists of words and operate on them
sent1 = ["Call", "me", "Zty", "."]
sent2 = ["Hello NLP!"]
print len(sent1)
print lexical_diversity(sent1)
print sent1 + sent2
sent1.append("some")
print sent1
print text4[173]
print text4.index("awaken")
print text5[16715:16735]

sent = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
print sent[0], sent[9]
print sent[5:8], sent[:3]
print text2[141525:]
sent[0] = "first"
sent[9] = "last"
sent[1:9] = ["second", "third"]
print sent

my_sent = ["bold", "Sir", "Robin"]
print sorted(my_sent)

name = "Monty"
print name[0]
print name[:4]
print name * 2
print name + "!"
print " ".join(["Monty", "Python"])
print "Monty Python".split()

# Use FreqDist to find the 50 most common words in the text
fdist1 = FreqDist(text1)
print fdist1
print len(text1)
print len(set(text1))
# In NLTK 2.x, FreqDist.keys() returns samples sorted by decreasing frequency
vocab1 = fdist1.keys()
print vocab1[:50]
print fdist1["whale"]

# Plot a cumulative frequency distribution of the 50 most common words
fdist1.plot(50, cumulative=True)

# Print hapaxes (words that occur only once)
print fdist1.hapaxes()

# Words longer than 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
print sorted(long_words)

# Words longer than 7 characters that occur more than 7 times
fdist5 = FreqDist(text5)
print sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])

# Collocations (frequent bigrams) in the text; collocations() prints directly
text4.collocations()

# Distribution of word lengths in the text
ls = [len(w) for w in text1]
fdist = FreqDist(ls)
print fdist.keys()        # word lengths, most frequent first
print fdist.items()       # (length, count) pairs
print fdist.max()         # most common word length
print fdist[3]            # number of words of length 3
print fdist.freq(3)       # proportion of words of length 3
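
# A small usage sketch of the two helper functions defined above.
# This is an illustrative addition, not part of the original walkthrough;
# the example arguments are chosen to mirror the inline computations earlier
# in the script.
print lexical_diversity(text3)                   # average uses per word type in text3
print percentage(text4.count("a"), len(text4))   # same value as the inline computation above
print percentage(4, 5)                           # 80.0 (true division via __future__ import)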