python3字符串与文本处理

时间:2021-04-11 06:05:45

每个程序都会涉及到文本处理,如拆分字符串、搜索、替换、词法分析等。许多任务都可以通过内建的字符串方法来轻松解决,但更复杂的操作就需要正则表达式来解决。

1、针对任意多的分隔符拆分字符串

In [1]: line = 'asdf fjdk; afed, fjek,asdf,    foo'
#使用正则模块
In [2]: import re
#使用正则split方法可以匹配多分割符
In [3]: re.split(r'[;,\s]\s*',line)
Out[3]: ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
#使用捕获组分割会将匹配的文本也包含在最终结果中
In [4]: re.split(r'(;|,|\s)\s*',line)
Out[4]: ['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']
#如果不想在结果中看到分隔符,可以使用(?:...)形式的非捕获组
In [5]: re.split(r'(?:,|;|\s)\s*',line)
Out[5]: ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

2、在字符串的开头或结尾处做文本匹配

In [6]: url = 'https://www.baidu.com'
#检查字符串的结尾,匹配则返回True
In [7]: url.endswith('.com')
Out[7]: True
In [8]: url.endswith('.cn')
Out[8]: False
#检查字符串的开头,匹配则返回true
In [10]: url.startswith('https:')
Out[10]: True
In [11]: url.startswith('http:')
Out[11]: False
In [1]: import os
In [2]: filenames = os.listdir('.')
In [3]: filenames
Out[3]:
['.tcshrc',
'.bash_logout',
'.mysql_history',
'Python-3.7.0.tgz',
'.bash_history',
'.cache',
'anaconda-ks.cfg',
'.ipython',
'.cshrc',
'.bashrc',
'.viminfo',
'Python-3.7.0',
'mysql-boost-8.0.12.tar.gz',
'heapq_queue.py',
'mysql-8.0.12',
'.bash_profile']
#应用匹配开头字符串过滤
In [4]: [i for i in filenames if i.startswith('.')]
Out[4]:
['.tcshrc',
'.bash_logout',
'.mysql_history',
'.bash_history',
'.cache',
'.ipython',
'.cshrc',
'.bashrc',
'.viminfo',
'.bash_profile']
#多个结果匹配时使用元组集合
In [5]: [i for i in filenames if i.endswith(('.py','.gz','.tgz'))]
Out[5]: ['Python-3.7.0.tgz', 'mysql-boost-8.0.12.tar.gz', 'heapq_queue.py']
#判断目录中是否有.py结尾的文件
In [6]: any(i.endswith('.py') for i in filenames)
Out[6]: True
#应用例子:解析网页或文本内容
from urllib.request import urlopen


def read_date(name):
    """Return the text content of a URL or of a local file.

    Args:
        name: Location to read. A value starting with ``http:``,
            ``https:`` or ``ftp:`` is fetched with ``urlopen``; any other
            value is opened as a local file path.

    Returns:
        The content as ``str``. A remote payload is decoded with the
        default codec of ``bytes.decode()``; a local file is read in text
        mode with the default encoding of ``open()``.
    """
    # startswith() accepts a tuple, so one call covers every URL scheme.
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so the connection is closed
        # even if read() or decode() raises.
        with urlopen(name) as response:
            return response.read().decode()
    with open(name) as f:
        return f.read()


result = read_date('test.txt')
print(result)
# A regular expression can perform the same prefix check
In [9]: import re In [10]: re.match('http:|https:|ftp:','https://www.baidu.com')
Out[10]: <re.Match object; span=(0, 6), match='https:'>

3、利用shell通配符做字符串匹配

#利用fnmatch模块中的fnmatch和fnmatchcase函数匹配文本
In [12]: from fnmatch import fnmatch,fnmatchcase In [13]: fnmatch('foo.txt','*.txt')
Out[13]: True In [14]: fnmatch('foo.txt','?oo.txt')
Out[14]: True In [15]: names = ['dat01.csv','dat99.csv','config.ini','foo.py'] In [18]: [i for i in names if fnmatch(i,'dat[0-9]*.csv')]
Out[18]: ['dat01.csv', 'dat99.csv']
#在Windows等大小写不敏感的操作系统上,fnmatch()匹配时不区分大小写;如需严格区分大小写,可以改用fnmatchcase()函数,它完全按照提供的模式字符串来匹配
In [2]: fnmatch('foo.txt','*.TXT')
Out[2]: True In [3]: fnmatchcase('foo.txt','*.TXT')
Out[3]: False
#用推导式过滤字符串(不限于文件名)
In [20]: addresses = [
...: '5412 N CLARK ST',
...: '1060 W ADDISON ST',
...: '1039 W GRANVILLE AVE',
...: '2122 N CLARK ST',
...: '4802 N BROADWAY',
...: ] In [21]: [i for i in addresses if fnmatch(i,'*ST')]
Out[21]: ['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST'] In [22]: [i for i in addresses if fnmatch(i,'*CLARK*')]
Out[22]: ['5412 N CLARK ST', '2122 N CLARK ST']

4、文本模式的匹配和查找

对于简单的文字匹配我们只需要使用基本的字符串方法str.find()、str.endswith()、str.startswith(),对于更复杂的匹配就需要使用正则表达式模块re来匹配了

In [23]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.'

In [24]: import re

In [25]: re.match(r'\d+/\d+/\d+',text1)

In [28]: re.findall(r'\d+/\d+/\d+',text1)
Out[28]: ['10/19/2018', '3/13/2019']
In [29]: text2 = '11/20/2018'
In [30]: re.match(r'\d+/\d+/\d+',text2)
Out[30]: <re.Match object; span=(0, 10), match='11/20/2018'>
In [31]: result = re.match(r'(\d+)/(\d+)/(\d+)',text2)
In [32]: result.groups()
Out[32]: ('11', '20', '2018')
In [33]: result.group(0)
Out[33]: '11/20/2018'
In [34]: result.group(1)
Out[34]: '11'
In [35]: result.group(2)
Out[35]: '20'
In [36]: result.group(3)
Out[36]: '2018'
#分组取出所有的日期并按格式输出
In [39]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.' In [40]: for month,day,year in re.findall(r'(\d+)/(\d+)/(\d+)',text1):
...: print('{}-{}-{}'.format(year,month,day))
...:
2018-10-19
2019-3-13 #如果文本数据比较大可以使用finditer()方法,以迭代器的方法匹配
In [43]: text1 = 'today is 10/19/2018.Pycon starts 3/13/2019.' In [44]: for i in re.finditer(r'(\d+)/(\d+)/(\d+)',text1):
...: print(i.groups())
...:
('10', '19', '2018')
('3', '13', '2019')

5、查找和替换文本

对于简单的文本模式,可以使用str.replace()方法即可

In [45]: text = 'abcabcabcabc'

In [46]: text.replace('a','ee')
Out[46]: 'eebceebceebceebc'

针对更为复杂的匹配,可以使用re模块中的sub()方法

In [47]: text3 = 'today is 10/19/2018. pycon starts 3/13/2013.'

In [49]: re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',text3)
Out[49]: 'today is 2018-10-19. pycon starts 2013-3-13.'

更为复杂的例子,如把日期换成字符格式

In [54]: text3 = 'today is 10/19/2018. pycon starts 3/13/2013.'

In [55]: from calendar import month_abbr

In [56]: def change_date(m):
...: mon_name = month_abbr[int(m.group(1))]
...: return '{} {} {}'.format(m.group(2),mon_name,m.group(3))
...:
...: In [57]: re.sub(r'(\d+)/(\d+)/(\d+)',change_date,text3)
Out[57]: 'today is 19 Oct 2018. pycon starts 13 Mar 2013.'
#subn()可以返回完成了多少次替换
In [58]: re.subn(r'(\d+)/(\d+)/(\d+)',change_date,text3)
Out[58]: ('today is 19 Oct 2018. pycon starts 13 Mar 2013.', 2)

6、以不区分大小写的方式对文本做查找和替换

要进行不区分大小写的文本操作时,使用re模块进行各种操作时都要加上re.IGNORECASE标记

In [60]: text = 'UPPER PYTHON,lower python, mixed Python'
In [61]: re.findall('python',text,flags=re.IGNORECASE)
Out[61]: ['PYTHON', 'python', 'Python']
import re
def matchcase(word):
    """Build a ``re.sub`` callback that inserts *word* in the matched text's case.

    Args:
        word: Replacement text.

    Returns:
        A function taking a match object and returning *word* upper-cased
        when the match is all upper case, lower-cased when it is all lower
        case, capitalized when only its first character is upper case, and
        unchanged otherwise.
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        if text.islower():
            return word.lower()
        # Slice instead of indexing so an empty match cannot raise IndexError.
        if text[:1].isupper():
            return word.capitalize()
        return word

    return replace
#保持原字符大小写或首字母大写替换实例
text = 'UPPER PYTHON,lower python,Mixed Python'
print(re.sub('python',matchcase('snake'),text,flags=re.IGNORECASE))

7、最短匹配的正则表达式

str_pat = re.compile(r'\"(.*)\"')
text1 = 'computer says "no."'
str_pat.findall(text1)
Out[18]: ['no.']
text2 = 'computer says "no." phone says "yes."'
str_pat.findall(text2) #在使用.*贪婪匹配时它将匹配尽可能多的匹配项
Out[20]: ['no." phone says "yes.']
str_pat = re.compile(r'\"(.*?)\"') #只需要在多匹配后加上?号,就会以最少的匹配模式进行匹配
str_pat.findall(text2)
Out[22]: ['no.', 'yes.']

8、多行模式的正则表达式

comment = re.compile(r'python(.*?)end')
text1 = 'python is ver good \n so so end'
comment.findall(text1) #.*匹配不到换行符
Out[27]: []
comment = re.compile(r'python(.*?)end',flags=re.DOTALL) #加上标记re.DOTALL将匹配所有的字符包括换行符
comment.findall(text1)
Out[29]: [' is ver good \n so so ']
comment = re.compile(r'python((?:.|\n)*?)end') #(?:.|\n)会指定一个非捕获组,它只做匹配但不捕获结果,也不分配组号
comment.findall(text1)
Out[31]: [' is ver good \n so so ']

9、将Unicode文本统一表示为规范形式

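s1 = 'spicy\u00f1o'  #ñ使用的是单个全组成的(fully composed)字符(U+00F1)
s2 = 'spicyn\u0303o' #ñ由拉丁字母n加上组合字符波浪号(U+0303)组合而成
s1 == s2 #两者显示相同,但码位序列不同,所以字符比较是不相等的
Out[35]: False
s1
Out[36]: 'spicyño'
s2
Out[37]: 'spicyño'
#使用unicodedata.normalize()把两者统一为同一种规范形式(如NFC)后再比较
import unicodedata
unicodedata.normalize('NFC',s1) == unicodedata.normalize('NFC',s2)
Out[38]: True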

10、从字符串中去掉不需要的字符

#strip()方法用来从字符串的开始和结尾处去掉字符,lstrip()和rstrip()分别从左或右开始执行去除字符操作,默认去除的是空格符,也可以指定
In [21]: s = ' hello world \n' In [22]: s.strip()
Out[22]: 'hello world' In [23]: s.lstrip()
Out[23]: 'hello world \n' In [24]: s.rstrip()
Out[24]: ' hello world' In [25]: t = '-----hello=====' In [26]: t.lstrip('-') #指定去除字符
Out[26]: 'hello=====' In [27]: t.strip('-=') #可以指定多个字符
Out[27]: 'hello' #使用上面的方法不能去除中间的字符,要去除中间的字符可以使用replace()方法或正则表达式替换
In [28]: s.replace(' ','')
Out[28]: 'helloworld\n' In [29]: re.sub('\s+', '',s)
Out[29]: 'helloworld'

11、对齐文本字符串

#对于基本的字符串对齐,可以使用字符串方法ljust()、rjust()和center(),分别表示左对齐、右对齐和居中对齐,它们还接受一个可选的填充字符参数
In [31]: text = 'hello world'
In [32]: text.ljust(30)
Out[32]: 'hello world                   '
In [33]: text.rjust(30)
Out[33]: '                   hello world'
In [34]: text.center(30)
Out[34]: '         hello world          '
In [35]: text.center(30,'=')
Out[35]: '=========hello world=========='
#format()函数也可以用来完成对齐任务,需要做的就是合理利用'<'、'>'和'^'字符分别表示左对齐、右对齐和居中对齐,并提供一个期望的宽度值,如果想指定填充字符,可以在对齐符前指定:
In [36]: format(text,'>20')
Out[36]: '         hello world'
In [37]: format(text,'<20')
Out[37]: 'hello world         '
In [38]: format(text,'^20')
Out[38]: '    hello world     '
In [39]: format(text,'=^20')
Out[39]: '====hello world====='
In [40]: format(text,'=^20s')
Out[40]: '====hello world====='
In [41]: format(text,'*^20s')
Out[41]: '****hello world*****'
#当格式化多个值时,也可以使用format()方法
In [42]: '{:>10s}{:<10s}'.format('hello','world')
Out[42]: '     helloworld     '
In [43]: '{:#>10s} {:&<10s}'.format('hello','world')
Out[43]: '#####hello world&&&&&'

12、字符串连接及合并

#合并的字符串在一个序列或可迭代对象中,最好的方法是使用join()方法
In [44]: data = ['I','like','is','python'] In [45]: ' '.join(data)
Out[45]: 'I like is python' In [46]: ','.join(data)
Out[46]: 'I,like,is,python' #利用生成器表达式转换后链接字符串会更高效
In [47]: ','.join(str(d) for d in data)
Out[47]: 'I,like,is,python'

13、给字符串中的变量名做插值处理

#在字符串中给变量赋值一般常见的处理方式是使用format()方法
In [5]: str_variable = "{name} today {num} old year" In [6]: str_variable.format(name='zhang',num=20)
Out[6]: 'zhang today 20 old year' #另一种方式是使用format_map()和vars()联合匹配当前环境中的变量名
In [7]: name = 'python' In [8]: num = 18 In [9]: str_variable.format_map(vars())
Out[9]: 'python today 18 old year'
#vars()还可用在类实例上
In [10]: class info:
...: def __init__(self,name,num):
...: self.name = name
...: self.num = num
...: In [11]: a = info('shell',23) In [12]: str_variable.format_map(vars(a))
Out[12]: 'shell today 23 old year'
#对于传递参数不够时将会抛出异常,可以定义一个带有__missing__()方法的字典类来处理
In [13]: class safesub(dict):
...: def __missing__(self,key):
...: return '{' + key + '}'
...: In [14]: del num In [15]: str_variable.format_map(safesub(vars()))
Out[15]: 'python today {num} old year'

14、以固定的列数重新格式化文本

#textwrap模块可以以多种方式重新格式化字符串:
>>> import textwrap
>>> s = "look into eyes, look into my eyes, the eyes,the eyes, \
... the eyes, not around the eyes, don't look around the eyes, \
... look into my eyes, you're under."
>>> print(textwrap.fill(s,70)
... )
look into eyes, look into my eyes, the eyes,the eyes, the eyes, not
around the eyes, don't look around the eyes, look into my eyes, you're
under.
>>> print(textwrap.fill(s,40))
look into eyes, look into my eyes, the
eyes,the eyes, the eyes, not around the
eyes, don't look around the eyes, look
into my eyes, you're under.
>>> print(textwrap.fill(s,40,initial_indent=' '))
look into eyes, look into my eyes, the
eyes,the eyes, the eyes, not around the
eyes, don't look around the eyes, look
into my eyes, you're under.
>>> print(textwrap.fill(s,40,subsequent_indent=' '))
look into eyes, look into my eyes, the
eyes,the eyes, the eyes, not around the
eyes, don't look around the eyes, look
into my eyes, you're under.
#可以通过os.get_terminal_size()来获取终端的尺寸大小
>>> import os
>>> print(textwrap.fill(s,os.get_terminal_size().columns))
look into eyes, look into my eyes, the eyes,the eyes, the eyes, not around the eyes, don't look around
the eyes, look into my eyes, you're under.
>>> print(os.get_terminal_size())
os.terminal_size(columns=105, lines=32)

15、在文本中处理HTML和XML实体

#使用html.escape()函数来替换HTML标签为文本样式
In [1]: s = 'Elements are written aa "<tag>text</tag>".' In [2]: import html In [3]: s
Out[3]: 'Elements are written aa "<tag>text</tag>".' In [4]: html.escape(s)
Out[4]: 'Elements are written aa &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'
#忽略quote标签
In [5]: html.escape(s,quote=False)
Out[5]: 'Elements are written aa "&lt;tag&gt;text&lt;/tag&gt;".'
#处理ASCII文本中的HTML实体(注意:HTMLParser.unescape()已在Python 3.9中移除,新版本请直接使用html.unescape(s1))
In [6]: s1 = 'Spicy &quot;Jalape&#241;o&quot.'
In [7]: from html.parser import HTMLParser
In [9]: p = HTMLParser()
In [11]: p.unescape(s1)
Out[11]: 'Spicy "Jalapeño".'
#生成ASCII文本
In [12]: s2 = p.unescape(s1)
In [13]: s2.encode('ascii',errors='xmlcharrefreplace')
Out[13]: b'Spicy "Jalape&#241;o".'
#处理XML实体
In [14]: s3 = 'the prompt is &gt;&gt;&gt;'
In [15]: from xml.sax.saxutils import unescape
In [16]: unescape(s3)
Out[16]: 'the prompt is >>>'

16、文本分词

#从左到右将字符串解析为标记流(stream of tokens)
In [17]: text = 'foo = 23 + 42 * 10'
In [18]: tokens= [('NAME','foo'),('EQ','='),('NUM','23'),('PLUS','+'),('NUM','42'),('TIMES','*'),('NUM','10')]
In [19]: import re
#使用正则表达式
In [20]: NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
In [21]: NUM = r'(?P<NUM>\d+)'
In [22]: PLUS = r'(?P<PLUS>\+)'
In [23]: TIMES = r'(?P<TIMES>\*)'
In [24]: EQ = r'(?P<EQ>=)'
In [25]: WS = r'(?P<WS>\s+)'
In [26]: master_pat = re.compile('|'.join([NAME,NUM,PLUS,TIMES,EQ,WS]))
#使用模式对象的scanner()方法来完成分词操作
In [27]: scanner = master_pat.scanner('foo = 42')
#在给定的文本中重复调用match()方法,一次匹配一个模式,下面是匹配过程
In [28]: scanner.match()
Out[28]: <re.Match object; span=(0, 3), match='foo'> In [29]: _.lastgroup,_.group()
Out[29]: ('NAME', 'foo') In [30]: scanner.match()
Out[30]: <re.Match object; span=(3, 4), match=' '> In [31]: _.lastgroup,_.group()
Out[31]: ('WS', ' ') In [32]: scanner.match()
Out[32]: <re.Match object; span=(4, 5), match='='> In [33]: _.lastgroup,_.group()
Out[33]: ('EQ', '=') In [34]: scanner.match()
Out[34]: <re.Match object; span=(5, 6), match=' '> In [35]: _.lastgroup,_.group()
Out[35]: ('WS', ' ') In [36]: scanner.match()
Out[36]: <re.Match object; span=(6, 8), match='42'>
In [37]: _.lastgroup,_.group()
Out[37]: ('NUM', '42')
#通过生成器函数来转化为代码的形式
In [40]: from collections import namedtuple In [41]: token = namedtuple('token',['type','value']) In [42]: def generate_tokens(pat,text):
...: scanner = pat.scanner(text)
...: for m in iter(scanner.match,None):
...: yield token(m.lastgroup,m.group())
...: In [43]: for tok in generate_tokens(master_pat,'foo = 42'):
...: print(tok)
...:
token(type='NAME', value='foo')
token(type='WS', value=' ')
token(type='EQ', value='=')
token(type='WS', value=' ')
token(type='NUM', value='42')
#过滤空格标记
In [45]: tokens = (tok for tok in generate_tokens(master_pat,text) if tok.type != 'WS') In [46]: for tok in tokens:print(tok)
token(type='NAME', value='foo')
token(type='EQ', value='=')
token(type='NUM', value='23')
token(type='PLUS', value='+')
token(type='NUM', value='42')
token(type='TIMES', value='*')
token(type='NUM', value='10')

17、编写一个简单的递归下降解析器

import re
import collections #定义文本分词变量
# Token patterns: one named group per token type, so that
# ``match.lastgroup`` reports which kind of token was matched.
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS]))
Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield a ``Token`` for each lexeme in *text*, skipping whitespace.

    Scanning stops at the first position where none of the token patterns
    matches, because ``scanner.match()`` returns ``None`` there.
    """
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok


class ExpressionEvaluator:
    """Recursive descent evaluator for integer arithmetic expressions.

    Grammar implemented by the methods below::

        expr   ::= term { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= NUM | '(' expr ')'

    The parser keeps one token of lookahead: ``tok`` is the token most
    recently consumed and ``nexttok`` is the one about to be examined.
    """

    def parse(self, text):
        """Tokenize *text* and return the value of the expression it starts with.

        Raises:
            SyntaxError: If a number or ``(`` is expected but not found, or
                a closing ``)`` is missing.
        """
        self.tokens = generate_tokens(text)
        self.nexttok = None
        self.tok = None
        self._advance()  # load the first lookahead token
        return self.expr()

    def _advance(self):
        """Shift the lookahead into ``tok`` and fetch the next token (or None)."""
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        """Consume the lookahead and return True if it has type *toktype*."""
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        return False

    def _expect(self, toktype):
        """Consume a token of type *toktype* or raise ``SyntaxError``."""
        if not self._accept(toktype):
            # The separating space was missing from the original message.
            raise SyntaxError('Expected ' + toktype)

    def expr(self):
        """Evaluate ``term { ('+' | '-') term }``."""
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        """Evaluate ``factor { ('*' | '/') factor }``; ``/`` is true division."""
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval

    def factor(self):
        """Evaluate ``NUM`` or a parenthesized ``expr``."""
        if self._accept('NUM'):
            return int(self.tok.value)
        if self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        raise SyntaxError('Expected NUMBER or LPAREN')


if __name__ == '__main__':
    e = ExpressionEvaluator()
    # An empty expression raises SyntaxError, so the demo starts with '2'.
    print(e.parse('2'))
    print(e.parse('2 + 3'))
    print(e.parse('2 + 3 * 4'))
    print(e.parse('2 + (3 + 4) * 5'))

18、在字节串上进行文本操作

In [2]: data = b'hello world'

In [3]: data
Out[3]: b'hello world'
#切片
In [4]: data[0:5]
Out[4]: b'hello'
#分隔
In [6]: data.split()
Out[6]: [b'hello', b'world']
#替换
In [7]: data.replace(b'hello',b'python')
Out[7]: b'python world'
#索引单个元素时返回的是该字节对应的整数值(如h的ASCII码104),而不是长度为1的字节串
In [8]: data[0]
Out[8]: 104