In this hands-on exercise we will scrape jokes from Qiushibaike (糗事百科).
First, find the Qiushibaike homepage: http://www.qiushibaike.com/
This time we want the text-only section, which lives at http://www.qiushibaike.com/text/
Next, figure out how many pages there are in total. Open the browser's Inspect Element panel, click the picker arrow in the top-left corner, and hover over elements on the page.
You can see that
the content we need sits inside a span under each li tag, which is the key fact we rely on when extracting the text later, as the short sketch below illustrates.
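To make that concrete, here is a minimal BeautifulSoup sketch of the extraction, using a simplified stand-in for Qiushibaike's real markup (the full spider below does essentially the same thing against the live pages, with the lxml parser instead of html.parser):

```python
from bs4 import BeautifulSoup

# Simplified stand-in for the page structure: each joke's text sits in a
# <span> inside a div of class "content" under an <li>.
html = """
<ul>
  <li><div class="content"><span>first joke text</span></div></li>
  <li><div class="content"><span>second joke text</span></div></li>
</ul>
"""

soup = BeautifulSoup(html, 'html.parser')
for content in soup.find_all('div', class_="content"):
    print(content.find('span').get_text())
# first joke text
# second joke text
```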
Now look closely at the URL of each page, e.g. page 2, page 8, page 35, and a pattern emerges:
every page follows http://www.qiushibaike.com/text/page/X/ (this holds for page 1 as well).
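As a quick sanity check, this tiny snippet builds those page addresses from the pattern (nothing is fetched; it just prints the URLs):

```python
# Build the address of any listing page from its number;
# page 1 follows the same scheme as all the others.
base = 'http://www.qiushibaike.com/text/'
for page in (1, 2, 8, 35):
    print(base + 'page/' + str(page) + '/')
# http://www.qiushibaike.com/text/page/1/
# http://www.qiushibaike.com/text/page/2/
# http://www.qiushibaike.com/text/page/8/
# http://www.qiushibaike.com/text/page/35/
```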
Putting all of this together, we can write our Python crawler:
```python
# -*- coding: utf-8 -*-
import re
import random

import requests
from bs4 import BeautifulSoup


# ----------- Clean up the various tags found on a page -----------
class HTML_Tool:
    # Non-greedy match for \t, \n, spaces, hyperlinks, or images
    BgnCharToNoneRex = re.compile(r"(\t|\n| |<a.*?>|<img.*?>)")
    # Non-greedy match for any <> tag
    EndCharToNoneRex = re.compile(r"<.*?>")
    # Non-greedy match for any <p> tag
    BgnPartRex = re.compile(r"<p.*?>")
    CharToNewLineRex = re.compile(r"(<br/>|</p>|<tr>|<div>|</div>)")
    CharToNextTabRex = re.compile(r"<td>")

    # Map HTML character entities back to their literal symbols
    replaceTab = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"),
                  ("&quot;", "\""), ("&nbsp;", " ")]

    def Replace_Char(self, x):
        x = self.BgnCharToNoneRex.sub("", x)
        x = self.BgnPartRex.sub("\n ", x)
        x = self.CharToNewLineRex.sub("\n", x)
        x = self.CharToNextTabRex.sub("\t", x)
        x = self.EndCharToNoneRex.sub("", x)
        for t in self.replaceTab:
            x = x.replace(t[0], t[1])
        return x


class Qiubai_Spider:
    # Set up the spider's attributes
    def __init__(self, url):
        # Pool of User-Agent strings to pick from at random
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        # Harvest a list of proxy IPs; they are collected here but not
        # applied to any request yet (see the notes at the end)
        self.iplist = []
        response = requests.get('http://haoip.cc/index/2061578.htm')
        ips = BeautifulSoup(response.text, 'lxml').find_all('tr')
        for ip in ips:
            self.iplist.append(ip.find('td').get_text().strip())
        # Pick a random User-Agent and fetch the first listing page
        UA = random.choice(self.user_agent_list)
        print(UA)
        header = {'User-Agent': UA}
        html = requests.get(url, headers=header)
        # The last "page-numbers" span in the pager holds the page count
        contents = BeautifulSoup(html.text, 'lxml').find_all(
            'span', class_="page-numbers")[-1].get_text()
        print(contents)
        # Walk every page: http://www.qiushibaike.com/text/page/X
        for i in range(1, int(contents) + 1):
            self.myUrl = url + 'page/' + str(i)
            print(self.myUrl)
            self.datas = []
            self.myTool = HTML_Tool()
            self.find_title(self.myUrl)
        print('The Qiushibaike spider is up and running, crunch crunch')

    def find_title(self, myPage):
        # Fetch one listing page and print the text of every joke on it
        page_html = requests.get(myPage)
        contents = BeautifulSoup(page_html.text, 'lxml').find_all(
            'div', class_="content")
        for content in contents:
            # Strip characters that would be illegal in a file name
            text = (content.find('span').get_text()
                    .replace('\\', '').replace('/', '').replace(':', '')
                    .replace('*', '').replace('?', '').replace('"', '')
                    .replace('>', '').replace('<', '').replace('|', ''))
            print(text + '\n\n')


# -------- Program entry point ------------------
print("""#---------------------------------------
# Program: Qiushibaike spider
""")
bdurl = 'http://www.qiushibaike.com/text/'
mySpider = Qiubai_Spider(bdurl)
```
Running it prints the scraped jokes one after another, separated by blank lines.
Of course there is still plenty of room for improvement, such as saving the results to disk, actually wiring up the proxies, adding timeouts, and so on, but I have to head to class now (sigh), so that will have to wait.
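For reference, here is a rough, untested sketch of what the proxy, timeout, and save-to-disk pieces could look like, reusing the iplist gathered in __init__. It assumes the entries are plain ip:port strings, which is an assumption about haoip.cc's format rather than something verified here:

```python
import random
import requests


def fetch_page(url, header, iplist):
    # Route the request through a random proxy from iplist with a 10 s
    # timeout; fall back to a direct connection if the proxy is dead.
    # Assumes entries look like "1.2.3.4:8080" (unverified assumption).
    proxy = {'http': 'http://' + random.choice(iplist)}
    try:
        return requests.get(url, headers=header, proxies=proxy, timeout=10)
    except requests.exceptions.RequestException:
        return requests.get(url, headers=header, timeout=10)


def save_jokes(texts, path='qiushibaike.txt'):
    # Append each joke to a UTF-8 text file, separated by blank lines
    with open(path, 'a', encoding='utf-8') as f:
        for text in texts:
            f.write(text + '\n\n')
```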