词频统计设计的改进

时间:2022-11-22 03:16:51
 1 package zuoye1;
2
3 import java.io.BufferedReader;
4 import java.io.FileNotFoundException;
5 import java.io.FileReader;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.StringTokenizer;
14 import java.util.Map.Entry;
15
16 public class FileWord {
17
18 /**
19 * 读入文件,实现词频统计
20 */
21 public static void main(String[] args) {
22 HashMap<String,Integer> map=new HashMap<String,Integer>();//用于统计各个单词的个数,排序
23 //过滤字符串中的所有标点符号
24 String regex=" ?.!:,\"\"'';\n";
25 BufferedReader br;
26 try {
27 //FileReader类创建了一个可以读取文件内容的Reader类、调用构造方法FileReader()
28 br = new BufferedReader(new FileReader("c:\\english.txt"));//文件完整路径
29 String sentence;
30 int wordCount = 0;
31 try {
32 while((sentence = br.readLine()) !=null){ //用readLine读取文件,判断读取文件是否为空
33 sentence = sentence.replaceAll(regex, "");
34 StringTokenizer token=new StringTokenizer(sentence);
35 while(token.hasMoreTokens()){ //循环遍历
36 wordCount++;
37 String word = token.nextToken();
38 if(map.containsKey(word)){ //HashMap不允许重复的key,所以利用这个特性,去统计单词的个数
39 int count=map.get(word);
40 map.put(word, count+1); //如果HashMap已有这个单词,则设置它的数量加1
41 }
42 else{
43 map.put(word, 1); //如果没有这个单词,则新填入,数量为1
44 }
45 }
46 }
47 System.out.println("总共单词数:"+wordCount);
48 sort(map);
49 } catch (IOException e) {
50 e.printStackTrace();
51 }
52 }catch(FileNotFoundException e) {
53 e.printStackTrace();
54 }
55 }
56 //排序
57 public static void sort(HashMap<String,Integer> map){
58 //声明集合folder,存放单词和单词个数
59 List<Map.Entry<String, Integer>> folder = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
60 Collections.sort(folder, new Comparator<Map.Entry<String, Integer>>() {
61 public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
62 return (obj2.getValue() - obj1.getValue());
63 }
64 });
65 //输出
66 for (int i = 0; i < folder.size(); i++) {
67 Entry<String, Integer> en = folder.get(i);
68 System.out.println(en.getKey()+":"+en.getValue());
69 }
70 }
71 }

实现结果

总共单词数:181
as:
7
the:
7
not:
6
it:
6
to:
5
are:
4
a:
4
your:
4
in:
4
they:
3
live:
3
and:
3
of:
2
do:2
may:
2
by:
2
be:
2
clothes:
2
that:
2
often:
2
have:
2
from:
2
above:
2
is:
2
you:
2
door:
1
its:
1
suppose.It:
1
palace.The:
1
contentedly:
1
snow:
1
friends,Turn:
1
yourself:
1
means.which:
1
or:
1
windows:
1
life,poor:
1
bad:
1
quiet:
1
like:
1
without:
1
thoughts.:
1
simply:
1
abode;the:
1
change.Sell:
1
will:
1
some:
1
fault
-finder:1
herb,like:
1
before:
1
most:
1
I:
1
old,
return:1
trouble:
1
life:
1
change;we:
1
supported:
1
is.You:
1
spring.:
1
me:
1
mind:
1
town;but:
1
there,and:
1
paradise.Love:
1
hardnames.It:
1
is,meet:
1
should:
1
seem:
1
independent:
1
new:1
alms
-house:1
poor
-house.The:1
pleasant,thrilling,glorious:
1
;
do:1
garden:
1
happens:
1
keep:
1
but:
1
However:
1
reflected:
1
being:
1
brightly:
1
enough:
1
Cultivate:
1
any.May:
1
looks:
1
more:
1
sage.Do:
1
town
's:1
when:1
faults:
1
richest.The:
1
disreputable.:
1
think:
1
get:
1
so:
1
much:
1
lives:
1
perhaps:
1
early:
1
things,whether:
1
call:
1
dishonest:
1
sun:
1
shun:
1
melts:
1
setting:
1
them.Things:
1
poverty:
1
poorest:
1
mean:
1
receive:
1
find:
1
hourss,even:
1
thoughts,as:
1
rich:
1
poor:
1
man
's:1
cheering:1
great:
1
see:
1
supporting:
1
themselves:
1
misgiving.Most:
1

 ssh://git@git.coding.net:linliaimeli/FileWord.git

 https://git.coding.net/linliaimeli/FileWord.git