re模块(正则)

一, 什么是正则?

　　正则就是用一些具有特殊含义的符号组合到一起(称为正则表达式)来描述字符或者字符串的方法.

　　在 Python 中,正则表达式引擎是内置的,并通过 re 模块提供;正则表达式模式会被编译成一系列字节码,然后由用 C 编写的匹配引擎执行.

二,常用的匹配模式(元字符)

re模块(正则)

import re

# re.findall scans the whole string and returns every non-overlapping match
# of the pattern as a list.
print(re.findall('alex', 'haha alex is alex is dsb'))
# >>>: ['alex', 'alex']

# Patterns that contain a backslash are written as raw strings (r'...') so
# Python passes the backslash through to the regex engine instead of
# treating it as a (possibly invalid) string escape.
#
# \w matches one letter, digit or underscore (each \w consumes one character)
# \W matches one character that is NOT a letter, digit or underscore
print(re.findall(r'\w', 'Aah123 +-_'))
# >>>: ['A', 'a', 'h', '1', '2', '3', '_']

print(re.findall(r'\w\w', 'Aah123 +-_'))
# >>>: ['Aa', 'h1', '23']

print(re.findall(r'\w9\w', 'Aa9h123 aaa9c+-_'))
# >>>: ['a9h', 'a9c']

# \s matches any whitespace character, e.g. [ \t\n\r\f\v]
# \S matches any non-whitespace character
# \d matches any digit, e.g. [0-9]
# \D matches any non-digit character
# ^ : match only at the start of the string
# $ : match only at the end of the string
starts_with_alex = '^alex'

# The subject begins with 'alex', so exactly one match is found at position 0.
print(re.findall(starts_with_alex, 'alex is alex is alex'))
# >>>: ['alex']

# The subject begins with '1', so the anchored pattern cannot match anywhere.
print(re.findall(starts_with_alex, '1alex is alex is alex'))
# >>>: []

重复匹配: | . | * | ? | .* | .*? | + | {n,m}

# . : stands for exactly one character, which may be any character except a
# newline.
a_any_c = 'a.c'

print(re.findall(a_any_c, 'a alc aaac a c asfdsaf'))
# >>>: ['alc', 'aac', 'a c']

# With the re.DOTALL flag, '.' matches every character including the newline.
print(re.findall(a_any_c, 'a alc aaac a\nc asfd', flags=re.DOTALL))
# >>>: ['alc', 'aac', 'a\nc']

# All four quantifier examples run against the same subject so the results
# can be compared directly.
quantifier_subject = 'a ab abb abbbb a123b a123bbbb'

# ? : the character to its left occurs 0 or 1 times
print(re.findall('ab?', quantifier_subject))
# >>>: ['a', 'ab', 'ab', 'ab', 'a', 'a']

# * : the character to its left occurs 0 or more times
print(re.findall('ab*', quantifier_subject))
# >>>: ['a', 'ab', 'abb', 'abbbb', 'a', 'a']

# + : the character to its left occurs 1 or more times
print(re.findall('ab+', quantifier_subject))
# >>>: ['ab', 'abb', 'abbbb']

# {n,m} : the character to its left occurs between n and m times
print(re.findall('ab{1,3}', quantifier_subject))
# >>>: ['ab', 'abb', 'abbb']

# Same subject for both calls, to contrast greedy and non-greedy matching.
greedy_subject = 'a132142qwdcavcccc(((()))))c2333'

# .* : matches any 0 or more characters, greedy (extends to the last 'c')
print(re.findall('a.*c', greedy_subject))
# >>>: ['a132142qwdcavcccc(((()))))c']

# .*? : matches any 0 or more characters, non-greedy (stops at the first 'c')
print(re.findall('a.*?c', greedy_subject))
# >>>: ['a132142qwdc', 'avc']

sentence = 'Too many companies have gone bankrupt,c and the next one is my company'

# | : alternation ("or") between whole alternatives
print(re.findall('companies|company', sentence))
# >>>: ['companies', 'company']

# () : grouping. The (?:...) form is a non-capturing group, so findall still
# returns the whole match rather than only the text inside the parentheses.
print(re.findall('compan(?:ies|y)', sentence))
# >>>: ['companies', 'company']

# \ : escaping. To match one literal backslash the regex engine must receive
# two backslashes. The subject is a raw string so that its single backslash
# is taken literally and is not an invalid string escape.

# Non-raw pattern: every backslash is doubled again for the Python string
# literal, giving four in the source.
print(re.findall('a\\\\c', r'a\c aac'))

# Raw pattern: only the regex-level doubling is needed.
print(re.findall(r'a\\c', r'a\c aac'))

# >>>: ['a\\c']   (printed by both calls)

# re.I (re.IGNORECASE): 忽略大小写

# print(re.findall('alex','my name is alex ALex is dSB',re.I))

# # >>>: ['alex', 'ALex']

# msg = '''my name is egon

# asdfassg egon

# 122324324egon'''

# re.M (re.MULTILINE): 使 ^ 和 $ 能匹配每一行的行首和行尾

# print(re.findall('egon$',msg,re.M))

# >>>: ['egon', 'egon', 'egon']

# [] : matches exactly one character taken from the set listed inside the
# brackets. None of these patterns contains '.', so no re.DOTALL flag is
# passed (it would have no effect).

print(re.findall('a[1]c', 'a a1c aaac a c asfdsaf'))
# >>>: ['a1c']

# [0-9] : any single digit
print(re.findall('a[0-9]c', 'a a1c aaac a7c asfdsaf'))
# >>>: ['a1c', 'a7c']

# [a-zA-Z] : any single ASCII letter
print(re.findall('a[a-zA-Z]c', 'a a1c aaac a7c asfdsaf'))
# >>>: ['aac']

# Inside [], '-' between two characters denotes a range; placed first or last
# it is a literal hyphen. The subject contains none of + * / -, so nothing
# matches.
print(re.findall('a[+*/-]c', 'a a1c aaac a7c asfdsaf'))
# >>>: []

# Other re functions: findall with capturing groups versus search.
html = '<p>动感视频</p><a href="https://www.douniwan.com/1.mp4">逗你玩呢</a><a href="https://www.xxx.com/2.mp4">葫芦娃</a>'
href_pattern = '(href)="(.*?)"'

# When the pattern has capturing groups, findall returns one tuple of the
# group values for every match.
res = re.findall(href_pattern, html)
print(res)

# search returns a match object for the first match only.
res = re.search(href_pattern, html)
print(res)

# group(0) is the whole match; group(1) and group(2) are the captured groups.
print(res.group(0), res.group(1), res.group(2), sep='\n')

# 运行结果 (以下为 Python 3.6 及更早版本的输出; 在 Python 3.7+ 中匹配对象显示为 <re.Match object; ...>)

[('href', 'https://www.douniwan.com/1.mp4'), ('href', 'https://www.xxx.com/2.mp4')]

<_sre.SRE_Match object; span=(14, 51), match='href="https://www.douniwan.com/1.mp4"'>

href="https://www.douniwan.com/1.mp4"

href

https://www.douniwan.com/1.mp4

相关文章