Python爬虫：爬取糗事百科

在网上看到一篇教程，但它是用正则表达式写的，并不能运行。后来我改用 XPath 重写了解析逻辑，并且使用了多线程（线程池大小为 4），也算是原创了吧。
#!/usr/bin/python
# -*- encoding:utf-8 -*-

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# Encoding setup: re-import `sys`, then set the process-wide default string
# encoding to UTF-8.
# NOTE(review): the bare `reload` builtin and `sys.setdefaultencoding` are
# Python 2 features, so this script presumably targets Python 2 -- confirm
# before running it on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

# Output helper
def towrite(contentdict, out=None):
    """Append one scraped post to the output file as a four-line record.

    The record is assembled first and emitted with a single ``write`` call.
    This function is called from several worker threads at once; one call per
    record minimises the window in which records from different threads can
    interleave (the previous four separate calls could be split apart).

    Args:
        contentdict: mapping with the string values ``'author'``,
            ``'content'``, ``'vote'`` and ``'span'`` (comment count).
        out: optional writable text object. When omitted, the module-level
            file object ``f`` opened by the script entry point is used.

    Raises:
        KeyError: if one of the four expected keys is missing.
    """
    record = (
        u'作者:' + contentdict['author'] + '\n'
        + u'内容:' + contentdict['content'] + '\n'
        + u'好笑:' + contentdict['vote'] + '\n'
        + u'评论:' + contentdict['span'] + '\n\n'
    )
    # Look up the global only when no explicit target was supplied.
    target = f if out is None else out
    target.write(record)

def spider(url, timeout=10):
    """Download one listing page and write every post found on it.

    For each post block under ``#content-left`` the author, text, vote count
    and comment count are extracted with XPath and handed to ``towrite``.

    Args:
        url: address of the listing page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        None. Results are emitted through ``towrite``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when ``timeout`` elapses).
    """
    # Fetch the page, sending a browser-like User-Agent instead of the
    # requests default.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # Parse the markup and select one node per post.
    selector = etree.HTML(response.text)
    posts = selector.xpath('//*[@id="content-left"]/div[@class="article block untagged mb15"]')

    for post in posts:
        # Author: flatten the author block to text and drop newlines/spaces.
        # A missing block yields an empty name instead of an IndexError that
        # would abort the whole crawl.
        author_nodes = post.xpath('div[@class="author clearfix"]')
        author = ''
        if author_nodes:
            author = author_nodes[0].xpath('string(.)').replace('\n', '').replace(' ', '')

        # Content: the text may be split into several text nodes; strip
        # newlines, spaces and tabs from each piece and glue them together.
        fragments = post.xpath('div[@class="content"]/text()')
        content = ''.join(
            str(piece.replace('\n', '').replace(' ', '').replace('\t', ''))
            for piece in fragments
        )

        # Votes ("funny" count); fall back to '0' when the node is absent.
        votes = post.xpath('div[@class="stats"]/span[@class="stats-vote"]/i/text()')
        vote = str(votes[0]) if votes else '0'

        # Comments: when a post has no comments the <i> node is not rendered,
        # so fall back to '0'.
        comments = post.xpath('div[@class="stats"]/span[@class="stats-comments"]/a/i/text()')
        span = str(comments[0]) if comments else '0'

        # Build a fresh dict per post rather than mutating a shared one.
        towrite({
            'author': author,
            'content': content,
            'vote': vote,
            'span': span,
        })

if __name__ == '__main__':
    # Listing pages 1..35 of the "hot" section.
    url = ['http://www.qiushibaike.com/hot/page/' + str(i) for i in range(1, 36)]

    # `f` must remain a module-level name: towrite() reads it as a global.
    # The `with` block guarantees the file is closed even if a worker raises.
    with open('content.txt', 'a') as f:
        pool = ThreadPool(4)
        try:
            # spider() returns nothing useful, so the map result is discarded.
            pool.map(spider, url)
        finally:
            # Shut the pool down and wait for the workers before the file is
            # closed, so no thread is left writing to a closed handle.
            pool.close()
            pool.join()
秒客网

Python爬虫：爬取糗事百科

相关文章