Python爬虫之使用正则表达式抓取数据

匹配标签

匹配title标签

a标签

相关文章：Linux中的正则表达式

实例：

匹配标签

匹配title标签

匹配网页的 <title></title> 标签，也就是网页的标题。 .*？就是匹配1个或多个字符，也就是这里不能是空的。当加入括号的话，就是代表取值了 (.*?)

import re

import requests

resp=requests.get("http://www.baidu.com")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text

title=re.findall(r'<title>.*?</title>',html)  #匹配 <title></title>

for t in title:

    print(t)

title_value=re.findall(r'<title>(.*?)</title>',html)  #匹配 <title></title>里面的内容

for t in title_value:

    print(t)

#####################################################################

<title>百度一下，你就知道</title>

百度一下，你就知道

a标签

匹配<a href="" ></a> ，并且获取a标签里面的内容

import re

import requests

resp=requests.get("http://www.baidu.com")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text  

urls = re.findall(r"<a.*?>.*?<\/a>", html)   #匹配所有的a标签

for u in urls:

    print(u)

texts = re.findall(r"<a.*?>(.*?)</a>", html)   #获取超链接<a>和</a>之间内容

for t in texts:

    print(t)

#######################################################################################

<a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a>

<a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a>

<a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a>

<a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a>

<a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a>

<a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a>

<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>

<a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a>

<a href=http://home.baidu.com>关于百度</a>

<a href=http://ir.baidu.com>About Baidu</a>

<a href=http://www.baidu.com/duty/>使用百度前必读</a>

<a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a>

新闻

hao123

地图

视频

贴吧

登录

登录

更多产品

关于百度

About Baidu

使用百度前必读

意见反馈

table标签

抓取 <table></table> 表格中的内容。

假设现在有这么一个网页

<html>

<table class="table">

            <tr>

                <th>姓名</th>

                <th>性别</th>

            </tr>

            <tr>

                <td>小谢</td>

                <td>男</td>

            </tr>

            <tr>

                <td>小红</td>

                <td>女</td>

            </tr>

</table>

</html>

匹配代码

import re

import requests

resp=requests.get("http://127.0.0.1/1.html")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text  

#匹配table标签

tables=re.findall(r"<table.*?>.*?<\/table>",html,re.M|re.S)

for table in tables:

    print(table)

#匹配<tr></tr>之间的内容

trs=re.findall(r"<tr>(.*?)</tr>",html,re.S|re.M) #因为<tr>标签大多数不是在同一行，所以要加 re.S和re.M多行匹配

for tr in trs:

    print(tr)

#匹配<th></th>之间的内容

for row in trs:

    ths=re.findall(r"<th>(.*?)</th>",row,re.S|re.M)

    for th in ths:

        print(th)

#匹配<td></td>之间的内容

for row in trs:

    tds=re.findall(r"<td>(.*?)</td>",row,re.S|re.M)

    for td in tds:

        print(td)

##################################################################################

<table class="table">

            <tr>

                <th>姓名</th>

                <th>性别</th>

            </tr>

            <tr>

                <td><B>小谢</B></td>

                <td>男<br/></td>

            </tr>

            <tr>

                <td><B>小红</B></td>

                <td>女<br/></td>

            </tr>

</table>

                <th>姓名</th>

                <th>性别</th>

                <td>小谢</td>

                <td>男</td>

                <td>小红</td>

                <td>女</td>

姓名

性别

小谢

男

小红

女

匹配标签里面的属性

匹配a标签里面的URL

假如现在有网页

<html>

	<a href="http://www.baidu.com">百度一下，你就知道</a>

	<a href="http://www.mi.com">小米官网</a>

</html>

import re

import requests

resp=requests.get("http://127.0.0.1/1.html")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text  

urls=re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",html,re.I|re.S|re.M)  #匹配 href=""

for url in urls:

    print(url)

###################################################################################

http://www.baidu.com

http://www.mi.com

匹配img标签里的 src

加入现在有网页

<html>

	<img src="http://t1.27270.com/uploads/tu/201811/310/f3e9db6b68.jpg"  name="美女"/>

	<img src="http://t1.27270.com/uploads/tu/201811/229/ea7fda100e.jpg" />

</html>

匹配代码：

import re

import requests

resp=requests.get("http://127.0.0.1/1.html")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text  

srcs=re.findall(r'src="(.*?)"',html,re.I|re.S|re.M)

for src in srcs:

    print(src)

##################################################################

http://t1.27270.com/uploads/tu/201811/310/f3e9db6b68.jpg

http://t1.27270.com/uploads/tu/201811/229/ea7fda100e.jpg

#假如要获取图片的名字，也就是上面的 f3e9db6b68.jpg 或者  ea7fda100e.jpg

import re

import requests

resp=requests.get("http://127.0.0.1/1.html")

resp.encoding="utf-8"  #设置编码格式为utf-8

html=resp.text  

srcs=re.findall(r'src="(.*?)"',html,re.I|re.S|re.M)

for src in srcs:

    name=src.split("/")[-1]

    print(name)

##################################################################

f3e9db6b68.jpg

ea7fda100e.jpg