本文实例讲述了Python正则表达式匹配中文用法。分享给大家供大家参考,具体如下:
示例代码:
#!/usr/bin/python
# -*- coding: cp936 -*-
"""Count family-name and given-name characters in a GBK-encoded name list.

Approach: the input bytes must be decoded to unicode text before a regular
expression can match Chinese characters, which requires knowing the file's
encoding up front (GBK in this example).

The script reads ``demo.txt`` line by line.  Every line that contains
exactly two characters in the range U+4E00..U+9FA5 is treated as a name:
the first character is tallied as the family name and the second as the
given name.  The tallies are written, highest count first, to
``familyname.txt`` and ``firstname.txt`` as ``<character>TAB<count>`` lines
encoded in GBK.

The code runs unchanged on Python 2.7 and Python 3.3+.
"""
import io
import re
import sys
from collections import Counter
from operator import itemgetter

try:
    import cPickle as mypickle  # Python 2
except ImportError:
    import pickle as mypickle  # Python 3: cPickle was merged into pickle

# Encoding of demo.txt and of the two result files.
SOURCE_ENCODING = 'gbk'

# The pattern must be a unicode string (note the ``u`` prefix), otherwise
# the \uXXXX escapes are not interpreted as code points on Python 2.
HANZI_RE = re.compile(u'[\u4e00-\u9fa5]')


def find_hanzi(text):
    """Return every character of *text* in U+4E00..U+9FA5, in order."""
    return HANZI_RE.findall(text)


def tally_names(raw_lines, encoding=SOURCE_ENCODING):
    """Tally family-name and given-name characters from encoded lines.

    Args:
        raw_lines: iterable of byte strings, one per input line.
        encoding: codec used to decode each line.

    Returns:
        A ``(family_counts, given_counts)`` pair of ``Counter`` objects
        keyed by the (decoded) character.

    Raises:
        UnicodeDecodeError: if a line is not valid in *encoding*.

    Diagnostic output for every line is printed to stdout, mirroring the
    original tutorial script.
    """
    family_counts = Counter()
    given_counts = Counter()
    for raw in raw_lines:
        # bytes.strip() removes ASCII whitespace only, so it cannot split
        # a multi-byte GBK sequence.
        raw = raw.strip()
        print(type(raw))
        print(raw)
        text = raw.decode(encoding)
        print(type(text))
        hanzi = find_hanzi(text)
        print(len(hanzi))
        # Only lines with exactly two Chinese characters count as a name.
        if len(hanzi) == 2:
            family, given = hanzi
            print(family)
            family_counts[family] += 1
            given_counts[given] += 1
    return family_counts, given_counts


def rank_by_count(counts):
    """Return ``(character, count)`` pairs sorted by descending count.

    The sort is stable, so entries with equal counts keep the mapping's
    iteration order.
    """
    return sorted(counts.items(), key=itemgetter(1), reverse=True)


def write_counts(path, ranked, encoding=SOURCE_ENCODING):
    """Write *ranked* pairs to *path* as ``<character>TAB<count>`` lines."""
    # io.open gives the same text-mode, explicit-encoding behaviour on
    # Python 2 and Python 3; the file is closed even if a write fails.
    with io.open(path, 'w', encoding=encoding) as out:
        for char, count in ranked:
            out.write(u'%s\t%d\n' % (char, count))


def main():
    """Read demo.txt, tally the names and write both result files."""
    # demo.txt holds GBK-encoded Chinese text, one entry per line.
    with open('demo.txt', 'rb') as src:
        raw_lines = src.readlines()

    family_counts, given_counts = tally_names(raw_lines)
    print(list(family_counts.items()))

    write_counts('familyname.txt', rank_by_count(family_counts))
    write_counts('firstname.txt', rank_by_count(given_counts))
    print('finish')


if __name__ == '__main__':
    main()
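运行后,程序会把姓氏和名字的统计结果分别写入 familyname.txt 和 firstname.txt(每行格式为"汉字<Tab>出现次数",按次数从高到低排列),并在终端最后输出 finish。(原文此处为运行效果截图。)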
希望本文所述对大家Python程序设计有所帮助。