Python爬虫之pyquery库的基本使用

时间:2021-11-28 07:40:34
 # 字符串初始化
html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
''' from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li')) # url初始化
from pyquery import PyQuery as pq
doc = pq(url = "http://www.baidu.com")
print(doc("head")) # 文件初始化
from pyquery import PyQuery as pq
doc = pq(filename = "demo.html")
print(doc('li')) # 基本CSS选择器
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 注意下面id 前面需要加上#,class 前面需要加上.
print(doc('#container .list li')) # 查找元素
# 子元素
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
lis = items.find('li')
print(type(lis))
print(lis) lis = items.children()
print(type(lis))
print(lis) lis = items.children('.active')
print(lis) # 父元素
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container))
print(container) html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents()
print(type(parents))
print(parents) parents = items.parents('.wrap')
print(parents)
 # 兄弟元素
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 注意下面item-0后面直接是. 没有空格
li = doc('.list .item-0.active')
print(li.siblings()) print(li.siblings('.active')) # 遍历
# 单个元素
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li) html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
print(li) # 获取信息
# 获取属性
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
# 获取属性的两种方法
print(a.attr('href'))
print(a.attr.href) # 获取文本
print(a.text()) # 获取html
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
# 得到<li>标签里面的代码
print(li.html()) # DOM操作
# addClass、removeClass
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active')
print(li)
li.add_class('active')
print(li) # attr CSS
li.attr('name', 'link')
print(li)
li.css('font-size', '14px')
print(li) # remove
html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove()
print(wrap.text()) # 伪类选择器
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 获取第一个元素
li = doc('li:first-child')
print(li)
# 获取最后一个元素
li = doc('li:last-child')
print(li)
# 获取第二个元素
li = doc('li:nth-child(2)')
print(li)
# 获取下标为2的元素后面的所有元素(下标从0开始)
li = doc('li:gt(2)')
print(li)
# 获取下标为偶数的元素
li = doc('li:nth-child(2n)')
print(li)
# 获取内容包含second 的元素
li = doc('li:contains(second)')
print(li)

Python爬虫之pyquery库的基本使用的更多相关文章

  1. Python爬虫之PyQuery使用(六)

    Python爬虫之PyQuery使用 PyQuery简介 pyquery能够通过选择器精确定位 DOM 树中的目标并进行操作.pyquery相当于jQuery的python实现,可以用于解析HTML网 ...

  2. python爬虫之urllib库(三)

    python爬虫之urllib库(三) urllib库 访问网页都是通过HTTP协议进行的,而HTTP协议是一种无状态的协议,即记不住来者何人.举个栗子,天猫上买东西,需要先登录天猫账号进入主页,再去 ...

  3. python爬虫之urllib库(二)

    python爬虫之urllib库(二) urllib库 超时设置 网页长时间无法响应的,系统会判断网页超时,无法打开网页.对于爬虫而言,我们作为网页的访问者,不能一直等着服务器给我们返回错误信息,耗费 ...

  4. python爬虫之urllib库(一)

    python爬虫之urllib库(一) urllib库 urllib库是python提供的一种用于操作URL的模块,python2中是urllib和urllib2两个库文件,python3中整合在了u ...

  5. Python爬虫之selenium库使用详解

    Python爬虫之selenium库使用详解 本章内容如下: 什么是Selenium selenium基本使用 声明浏览器对象 访问页面 查找元素 多个元素查找 元素交互操作 交互动作 执行JavaS ...

  6. Mac os 下 python爬虫相关的库和软件的安装

      由于最近正在放暑假,所以就自己开始学习python中有关爬虫的技术,因为发现其中需要安装许多库与软件所以就在这里记录一下以避免大家在安装时遇到一些不必要的坑. 一. 相关软件的安装:   1. h ...

  7. python爬虫&lpar;四&rpar;&lowbar;urllib2库的基本使用

    本篇我们将开始学习如何进行网页抓取,更多内容请参考:python学习指南 urllib2库的基本使用 所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很 ...

  8. python爬虫之PyQuery的基本使用

    PyQuery库也是一个非常强大又灵活的网页解析库,如果你有前端开发经验的,都应该接触过jQuery,那么PyQuery就是你非常绝佳的选择,PyQuery 是 Python 仿照 jQuery 的严 ...

  9. python爬虫之requests库

    在python爬虫中,要想获取url的原网页,就要用到众所周知的强大好用的requests库,在2018年python文档年度总结中,requests库使用率排行第一,接下来就开始简单的使用reque ...

随机推荐

  1. workman源代码阅读 - 使用信号处理器实现定时器

    <?php /** * SIGALRM信号处理器注册成功后,在什么情况下进程会收到该信号呢? * * 在Linux系统下,每个进程都有惟一的一个定时器,该定时器提供了以秒为单位的定时功能.在定时 ...

  2. Java中this关键字在构造方法中的使用

    1. Java中this关键字代表对象本身.用this关键字可以在类的内部调用属性和方法,这样代码的可读性比较高,因为它明确的指出了这个属性或方法的来源. 2. 同时在构造函数中也可以使用this关键 ...

  3. jquery实现表格行的动态增加和删除

    $("#Addmaterial").click(function () {//Addmaterial是增加按钮的ID $("#tab tr").attr(&qu ...

  4. 大数据时代的数据存储,非关系型数据库MongoDB(一)

    原文地址:http://www.cnblogs.com/mokafamily/p/4076954.html 爆炸式发展的NoSQL技术 在过去的很长一段时间中,关系型数据库(Relational Da ...

  5. flutter 自定义主题切换

    1. 定义local_srorage.dart文件 使用Flutter第三方插件shared_preferences实现存储键值对信息 相关shared_preferences插件可参考: flutt ...

  6. py-day4-2 python 内置函数

    zip() #zip 拉链方法 一一对应 只要是序列类型的都可以 print(list(zip(('a','b','c'),(1,2,3)))) 结果: [('a', 1), ('b', 2), (' ...

  7. Keras bug in model&period;predict

    When I use Keras to predict behind a web service, it occurred an error. and the error message is lik ...

  8. css 子盒子上下居中 文字溢出省略号

    <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8&quo ...

  9. DVWA手记——取消登录

    DVWA在渗透测试方面表现相当不错,可以自定义高中低的安全级别.同事为了测试一个小工具,只好取消登录认证.本以为Config可以设置,结果没有,只好自己动手了——才能风衣足食. 更改文件:\dvwa\ ...

  10. 什么是WCF&lpar;转&rpar;

    什么是WCF(Windows Communication Foundation(WCF)) 大家可以百度一下了解什么是WCF.当然有些人看到密密麻麻的黑框白字就懒的读.即使读了 可能也没明白确切的含义 ...