Python 统计文本中单词的个数

1. 读取文件，通过正则表达式匹配

 def statisticWord(input_path=r'D:\test\test.txt', output_path=r'D:\test\output.txt'):
     """Count HTML-entity-like tokens in a text file and write a frequency report.

     Each line of the input file is scanned for tokens that look like
     numeric character references (e.g. ``&#123;``), double-escaped numeric
     references (e.g. ``&amp;#123;``) or named entities (e.g. ``&nbsp;``).
     The report lists one token per line, most frequent first; tokens with
     equal counts keep the order in which they were first encountered.

     Args:
         input_path: UTF-8 text file to scan. Defaults to the path that was
             previously hard-coded in this function.
         output_path: File the report is written to (UTF-8, overwritten).
             Defaults to the path that was previously hard-coded.

     Raises:
         OSError: If either file cannot be opened.
     """
     # Imported locally so the snippet runs on its own; the surrounding
     # file shows no import statements.
     import re
     from collections import Counter

     # Compiled once instead of being looked up again for every line.
     entity_pattern = re.compile(r'&#\d+;|&amp;#\d+;|&\w+;')

     counts = Counter()
     with open(input_path, encoding='utf-8') as a_file:
         for line in a_file:
             counts.update(entity_pattern.findall(line))

     # The input file is closed before the report is written; nothing below
     # needs it any more.
     with open(output_path, encoding='utf-8', mode='w') as b_file:
         # most_common() sorts by count, descending, and is stable for ties.
         for word, count in counts.most_common():
             # Token left-aligned in 15 columns, count right-aligned in 15.
             b_file.write("%-15s:%15s\n" % (word, count))

2. 通过命令行参数

def statisticWord2():
    """Print word frequencies for the files named on the command line.

    Reads ``sys.argv[1:]`` as a list of file names. Every
    whitespace-separated token has leading and trailing whitespace,
    punctuation and digits stripped; tokens shorter than two characters
    after stripping are ignored. The remaining words are printed in sorted
    order together with their occurrence counts.

    With no arguments, or when the first argument is ``-h`` or ``--help``,
    a usage line is printed and the interpreter exits via ``sys.exit()``.

    Raises:
        SystemExit: When no file names are given or help is requested.
        OSError: If one of the named files cannot be opened.
    """
    # Imported locally so the snippet runs on its own; the surrounding
    # file shows no import statements.
    import string
    import sys
    from collections import Counter

    if len(sys.argv) == 1 or sys.argv[1] in {"-h", "--help"}:
        print("usage: filename_1 filename_2 ... filename_n")
        sys.exit()

    # string.punctuation already contains both quote characters, so no
    # extra quote suffix is needed.
    strip_chars = string.whitespace + string.punctuation + string.digits

    words = Counter()
    for filename in sys.argv[1:]:
        # Context manager closes each file promptly instead of leaking it.
        with open(filename) as handle:
            for line in handle:
                for token in line.split():
                    # Only leading/trailing characters are removed; inner
                    # punctuation such as the apostrophe in "it's" stays.
                    word = token.strip(strip_chars)
                    if len(word) >= 2:
                        words[word] += 1

    for word in sorted(words):
        print("'{0}' occurs {1} times".format(word, words[word]))

秒客网

Python 统计文本中单词的个数

相关文章