1. 以下是两个爬虫示例。欲练此功,必先掌握正则表达式。
"""古诗文爬取""" import requests import re def parse_page(url): rep = requests.get( url=url, headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"} ) text = rep.text # re正则匹配古诗文标题 titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL) # re正则匹配古诗文朝代 dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text) # re正则匹配古诗文作者 authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL) # re正则匹配古诗文内容 content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL) contents = [] # 清除诗文内容br标签 for content in content_tags: data = re.sub(r"<.*?>", "", content) contents.append(data.strip()) poems = [] # zip参数可放置一个或多个迭代器,并把对应的元素打包成元组 for value in zip(titles, dynasties, authors, contents): title, dynastie, author, content = value poem = { "title": title, "dynastie": dynastie, "author": author, "content": content } poems.append(poem) print(poems) def main(): for x in range(1, 101): url = "https://www.gushiwen.org/default_{}.aspx".format(x) parse_page(url) if __name__ == '__main__': main()
"""糗事百科笑话段子""" import requests import re def parse_detail(url): rep = requests.get( url=url, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36" } ) text = rep.content.decode(encoding="utf-8") # 匹配所有作者 users_tag = re.findall(r'<div\sclass="author clearfix">.*?<h2>(.*?)</h2>', text, re.DOTALL) users = [] for user in users_tag: users.append(user.strip()) # 匹配作者年龄 ages = re.findall(r'<div\sclass="author clearfix">.*?<div.*?>(.*?)</div>', text, re.S) # 匹配作者内容 content_tags = re.findall(r'<div\sclass="article\sblock.*?">.*?<span>(.*?)</span>', text, re.S) contents = [] for content in content_tags: data = re.sub(r'<.*?>', "", content).strip() contents.append(data) user_infos = [] for value in zip(users, ages, contents): user, age, content = value user_info = { "user": user, "age": age, "content": content, } user_infos.append(user_info) print(user_infos) def spider(): for i in range(1, 14): url = "https://www.qiushibaike.com/text/page/{}/".format(i) parse_detail(url) if __name__ == '__main__': spider()
正则表达式学习链接:http://www.runoob.com/regexp/regexp-tutorial.html