# 模式匹配与正则表达式


# 不用正则表达式来查找文本模式(匹配一个电话号码，格式为：415-555-4242)

def phone_num(text):
    """Return True if *text* is a phone number shaped like 415-555-4242.

    The text must be exactly 12 characters long: three decimal digits, a
    hyphen, three decimal digits, a hyphen, and four decimal digits.
    Anything else yields False.
    """
    if len(text) != 12:
        return False
    # Walk the characters left to right; offsets 3 and 7 hold the two
    # hyphens, every other offset must hold a decimal digit.
    for position, symbol in enumerate(text):
        if position in (3, 7):
            if symbol != "-":
                return False
        elif not symbol.isdecimal():
            return False
    return True

if __name__ == "__main__":
    message = input(">>>:")
    # Slide a 12-character window over the input, one window per starting
    # offset, and report each window that phone_num() accepts.
    candidates = (message[start:start + 12] for start in range(len(message)))
    for candidate in candidates:
        if phone_num(candidate):
            print("\033[31;1mphone num\033[0m:%s" % candidate)
# >>>: Call me at 415-555-1011 tomorrow. 415-555-9999 is my office
# phone num:415-555-1011
# phone num:415-555-9999

# 用正则表达式查找文本模式

import re
# Compile the phone-number pattern once: three digits, a hyphen, three
# digits, a hyphen, then four digits.
phone_num_re = re.compile(r"\d\d\d-\d\d\d\-\d\d\d\d")
number = input(">>>:")

# search() returns None when the input holds no phone number, so group() is
# only called on a real Match object; otherwise None is printed instead of
# the script dying with AttributeError.
chunk_re = phone_num_re.search(number)
first_number = chunk_re.group() if chunk_re is not None else None
print("\033[31;1mphone_num_re\033[0m:%s" % first_number)
# Example input:  Call me at 415-555-1011 tomorrow
# Example output: phone_num_re:415-555-1011

# findall() returns every match as a list of strings (empty if none match).
chunk_re_find = phone_num_re.findall(number)
print("\033[31;1mphone_num_re\033[0m:%s" % chunk_re_find)
# Example input:  Call me at 415-555-1011 tomorrow. 415-555-9999 is my office
# Example output: phone_num_re:['415-555-1011', '415-555-9999']

# 用管道匹配多个分组
"""
字符|称为“管道”。希望匹配许多表达式中的一个时，就可以使用它
利用findall()方法，可以找到“所有”匹配的地方
"""

import re
# The pipe makes the pattern match either "linux" or "python".
heroRegex = re.compile(r"linux|python")
hero_sentence = "linux and python is my love"

# search() reports only the first alternative found in the text.
mo1 = heroRegex.search(hero_sentence)
print(mo1.group(0))
# Output: linux

# findall() collects every match in the text.
mo2 = heroRegex.findall(hero_sentence)
print(mo2)
# Output: ['linux', 'python']

# 用问号实现可选匹配
# 字符?表明它前面的分组在这个模式中是可选的

# "(wo)?" makes the "wo" group optional, so plain "Batman" still matches.
batRefex = re.compile(r"Bat(wo)?man")
batman_title = "The Adventures of Batman"
mo3 = batRefex.search(batman_title)
print(mo3.group(0))



# 用星号匹配零次或多次

# "(wo)*" lets the "wo" group repeat any number of times, including zero.
batRegex1 = re.compile(r'Bat(wo)*man')
mo5 = batRegex1.search('The Adventures of Batwoman')
mo4 = batRegex1.search('The Adventures of Batwowowowoman')
# Print the single-repeat match first, then the multi-repeat one.
for star_match in (mo5, mo4):
    print(star_match.group())


# 用加号匹配一次或多次
# *意味着“匹配零次或多次”，+（加号）则意味着“匹配一次或多次”。星号不要求分组出现在匹配的字符串中，但加号不同，加号前面的分组必须“至少出现一次”

# "(wo)+" requires the "wo" group to appear at least once.
batRegex2 = re.compile(r'Bat(wo)+man')
mo6 = batRegex2.search('The Adventures of Batwoman')
mo7 = batRegex2.search('The Adventures of Batwowowowoman')
for plus_match in (mo6, mo7):
    print(plus_match.group())
# Output:
# Batwoman
# Batwowowowoman

# 用花括号匹配特定次数
# 例如：(Ha){3} 只匹配 'HaHaHa'；(Ha){3,5} 匹配重复3到5次的 'Ha'



"""
findall()方法

search()将返回一个Match对象，包含被查找字符串中的“第一次”匹配的文本，而findall()方法将返回一组字符串，包含被查找字符串中的所有匹配。
"""

# 通配字符
# .（句点）字符称为“通配符”。它匹配除了换行之外的所有字符,句点字符只匹配一个字符.
# 要匹配真正的句点，就是用倒斜杠转义：\.

# "." stands for any single character except a newline, so ".at" matches any
# three-character run ending in "at".
atRegex = re.compile(r'.at')
at_words = atRegex.findall('The cat in the hat sat on the flat mat.')
print(at_words)
# Output: ['cat', 'hat', 'sat', 'lat', 'mat']



# 用点-星匹配所有字符

# Each "(.*)" group captures whatever text follows its label.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mm = nameRegex.search('First Name: Al Last Name: Sweigart')
captured_first, captured_last = mm.groups()
print(captured_first)
print(captured_last)
# Output:
# Al
# Sweigart



# 用句点字符匹配换行
# 通过传入re.DOTALL 作为re.compile()的第二个参数，可以让句点字符匹配所有字符，包括换行字符

prime_directives = 'Serve the public trust.\nProtect the innocent.\nUphold the law.'

# Without re.DOTALL the dot stops at the first newline.
noNewlineRegex = re.compile('.*')
first_line_match = noNewlineRegex.search(prime_directives)
mm2 = first_line_match.group()
print(mm2)
# Output: Serve the public trust.

# With re.DOTALL the dot also matches newlines, so ".*" spans all lines.
newlineRegex1 = re.compile('.*', re.DOTALL)
all_lines_match = newlineRegex1.search(prime_directives)
mm3 = all_lines_match.group()
print(mm3)
# Output:
# Serve the public trust.
# Protect the innocent.
# Uphold the law.



# 不区分大小写的匹配
# 让正则表达式不区分大小写，可以向re.compile()传入re.IGNORECASE 或re.I，作为第二个参数

# re.IGNORECASE (alias re.I) makes the pattern match regardless of letter case.
robocop = re.compile(r'robocop', re.IGNORECASE)
robocop_match = robocop.search('RoboCop is part man, part machine, all cop.')
ro = robocop_match.group()
print(ro)
# Output: RoboCop



# 用sub()方法替换字符串
# Regex对象的sub()方法需要传入两个参数。第一个参数是一个字符串，用于取代发现的匹配。第二个参数也是一个字符串，即要在其中进行查找和替换的目标文本（正则表达式所作用的字符串）。sub()方法返回替换完成后的字符串

# Replace every "Agent <name>" occurrence with the word CENSORED.
namesRegex = re.compile(r'Agent \w+')
secret_report = 'Agent Alice gave the secret documents to Agent Bob.'
resub = namesRegex.sub('CENSORED', secret_report)
print(resub)
# Output: CENSORED gave the secret documents to CENSORED.



# 管理复杂的正则表达式
# 忽略正则表达式字符串中的空白符和注释,可以向re.compile()传入变量re.VERBOSE，作为第二个参数。
# 使用了三重引号(''')，创建了一个多行字符串。这样就可以将正则表达式定义放在多行中，让它更可读
# 示例：

# Phone-number pattern written with re.VERBOSE, which ignores whitespace and
# "#" comments inside the pattern so each part can sit on its own line.
# The dot in the "ext." alternative is escaped so it matches only a literal
# period; unescaped it matched any character (e.g. "extA12").
phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))?  # area code
(\s|-|\.)?           # separator
\d{3}                # first 3 digits
(\s|-|\.)           # separator
\d{4}                # last 4 digits
(\s*(ext|x|ext\.)\s*\d{2,5})? # extension
)''', re.VERBOSE)



# 组合使用re.IGNORECASE、re.DOTALL 和re.VERBOSE
# 可以使用管道字符（|）将变量组合起来，例如：re.compile('foo', re.IGNORECASE | re.DOTALL)




"""
项目：电话号码和E-mail 地址提取程序

# 为电话号码创建一个正则表达式

PhoneRegex = re.compile(r"(\d{3}|\(\d{3}\))?(\s|-|\.)?(\d{3})(\s|-|\.)(\d{4})(\s*(ext|x|ext.)\s*(\d{2,5}))?")


# 为E-mail 地址创建一个正则表达式

MailRegex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}")

参考资料：Python编程快速上手让繁琐工作自动化

秒客网

模式匹配与正则表达式

相关文章