day48-python爬虫学习三

Python的urllib和urllib2模块都做与请求URL相关的操作，但它们提供不同的功能。它们两个最显著的差异如下：

　　urllib2可以接受一个Request对象，并以此来设置一个URL请求的headers，但是urllib只接收一个URL。这意味着，只使用urllib时，你不能伪装你的用户代理（User-Agent）字符串等。
　　urllib模块提供了urlencode方法，该方法用于生成GET查询字符串，而urllib2不具有这样的功能。这就是urllib与urllib2经常在一起使用的原因。

例子：

#爬糗事百科段子

import urllib,urllib2

import re

import sys

# NOTE(review): `page` is not read by any code visible in this file; the
# __main__ loop keeps its own page counter (`num`). Presumably a leftover --
# confirm nothing else imports it before removing.
page = 2

def getPage(page_num=1):
    """Fetch the raw HTML of one Qiushibaike "8hr" listing page.

    Args:
        page_num: Page index appended to the listing URL (default 1).

    Returns:
        The response body as returned by ``response.read()``, or ``None``
        when the request raises ``urllib2.URLError`` (which includes its
        ``HTTPError`` subclass). The error is reported on stdout.
    """
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num)
    # Send a browser-like User-Agent rather than urllib2's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    request = urllib2.Request(url, headers=headers)

    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    except urllib2.URLError as e:
        # An HTTP-level failure exposes a status code; other failures
        # only expose a reason. Report whichever is available.
        if hasattr(e, "code"):
            print("连接服务器失败，错误代码: {0}".format(e.code))
        elif hasattr(e, "reason"):
            print("连接服务器失败，错误原因: {0}".format(e.reason))
        return None

def getPageCoent(page_num=1):
    """Download one listing page and extract the stories it contains.

    Args:
        page_num: Page index passed through to ``getPage`` (default 1).

    Returns:
        A list with one ``[page_num, author, content, votes]`` entry per
        regex match. ``author``, ``content`` and ``votes`` are the stripped
        capture groups, with every ``<br/>`` in the content replaced by a
        newline. The list is empty when ``getPage`` returned no HTML or
        when nothing matched.
    """
    html = getPage(page_num)
    if not html:
        # getPage() returns None on a failed request; findall(None) would
        # raise TypeError, so report "no stories" instead.
        return []

    # Groups: (1) author name from the avatar's alt text, (2) story body,
    # (3) vote count. re.S lets '.' span the newlines between the tags.
    re_page = re.compile(
        r'<div class="author.*?>.*?<a.*?<img.*?alt="(.*?)">.*?<div class="content">.*?<span>(.*?)</span>.*?</div>.*?<span class="stats-vote">.*?<i class="number">(\d+)</i>',
        re.S)

    return [
        [page_num,
         author.strip(),
         content.replace('<br/>', '\n').strip(),
         votes.strip()]
        for author, content, votes in re_page.findall(html)
    ]

def getOneStory(page_contents):
    """Print the stories of one page, one story per line of user input.

    Before each story the function waits for a line on stdin. Entering
    ``q`` or ``Q`` terminates the program through ``sys.exit()``; so does
    end-of-file on stdin.

    Args:
        page_contents: Iterable of ``[page_num, author, content, votes]``
            entries, indexed positionally.

    Returns:
        None. Returns immediately when ``page_contents`` is empty.
    """
    for story in page_contents:
        try:
            command = raw_input()
        except EOFError:
            # stdin was closed (e.g. Ctrl-D): quit instead of crashing
            # with a traceback.
            sys.exit()

        if command in ('Q', 'q'):
            sys.exit()

        print("第{0}页\t发布人:{1}\t赞;{2}\n{3}\n".format(story[0], story[1], story[3], story[2]))

if __name__ == '__main__':
    # Interactive reader: walk the listing pages in order, starting at
    # page 1, and show the stories of each page one key press at a time.
    print("正在看段子，按回车看新段子，退出q")

    num = 1
    while True:
        page_contents = getPageCoent(num)
        if not page_contents:
            # A page without stories gives getOneStory() nothing to wait
            # on, so the loop would request page after page with no user
            # interaction. Stop instead.
            print("没有获取到段子，程序退出")
            break

        getOneStory(page_contents)
        num += 1

秒客网

day48-python爬虫学习三

相关文章