# -*- coding: UTF-8 -*-
"""Word-frequency statistics for an English text file.

Pipeline: read the text, replace separator characters with spaces,
lower-case and split it into words, count the words, drop grammatical
stop words, sort by frequency and print the most frequent entries.
"""
from collections import Counter

# Location of the text to analyse; the file is expected to be UTF-8 encoded.
TEXT_PATH = "D:/PycharmProjects/speaking.txt"

# Every character listed here is replaced with a space before splitting.
SEPARATORS = """,."? ! : ; '"""

# Grammatical words (articles, conjunctions, ...) excluded from the ranking.
# 's' and 't' are the leftovers of contractions such as "it's" / "don't"
# once the apostrophe has been replaced by a space.
STOP_WORDS = frozenset([
    'a', 'an', 'more', 'for', 'is', 'of', 'to', 'from', 'or', 'that',
    'if', 'the', 'were', 'in', 's', 'not', 'can', 'get', 'could',
    'might', 'up', 'and', 'this', 't',
])

# Number of entries printed by the script.
TOP_N = 20

# Translation table mapping each separator to a space (single pass over text).
_SEPARATOR_TABLE = str.maketrans(SEPARATORS, " " * len(SEPARATORS))


def split_words(text):
    """Return the lower-cased words of *text* with separators removed."""
    return text.translate(_SEPARATOR_TABLE).lower().split()


def count_words(words):
    """Return a Counter of *words*, ignoring everything in STOP_WORDS.

    Each occurrence is counted exactly once, and stop words that never
    occur in the input are simply skipped instead of raising KeyError.
    """
    return Counter(w for w in words if w not in STOP_WORDS)


def top_words(counts, limit=TOP_N):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Fewer pairs are returned when *counts* holds fewer distinct words.
    """
    return counts.most_common(limit)


def main(path=TEXT_PATH):
    """Read the text at *path* and print its most frequent words."""
    # Explicit encoding: the platform default is not necessarily UTF-8.
    with open(path, "r", encoding="utf-8") as source:
        news = source.read()
    print(news)

    word_list = split_words(news)
    print(word_list)

    # Print the ranked (word, count) pairs, not the raw word list.
    for entry in top_words(count_words(word_list)):
        print(entry)


if __name__ == "__main__":
    main()
综合练习
词频统计预处理
下载一首英文的歌词或文章
将所有,.?!’:等分隔符全部替换为空格
将所有大写转换为小写
生成单词列表
生成词频统计
排序
排除语法型词汇,代词、冠词、连词
输出词频最大TOP20
将分析对象存为utf-8编码的文件,通过文件读取的方式获得词频分析内容。