为了收集笑话也是挺拼的,我就不相信你所有的都看过了。还有,请问哪位仁兄能指点一下,怎么把网上抓取到的图片写到Word里面?琢磨了好久都没弄出来。
糗百不需要登录,HTML可以直接解析,只要在发送request的时候加上header就行了,是本菜鸟入手的最佳选择。大部分时间都花在了正则表达式的匹配上,然后利用bs4获取热点笑话的页面总数,最后为了方便,把代码封装成了类。该代码抓取了网站今天24小时内的所有热点笑话(当然,带图片的笑话还没实现,有时间再琢磨一下~),并写到文件中。
代码(文件参见:https://github.com/zgcao/Claw):
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 10 10:40:03 2015
@author: zhigang
"""
import re
import time
import urllib
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
class my_qiubai:
    """Collect the text-only "hot" stories from qiushibaike.com into a file.

    Typical use is ``viewAll()`` followed by ``write()``: the first walks
    every listing page and accumulates stories in ``self.stories``, the
    second dumps them to ``self.output``.

    Attributes:
        stories: list of formatted ``"author:\\ncontent\\n"`` strings.
        output: path of the text file produced by ``write()``.
        headers: HTTP headers sent with every request (User-Agent only).
        pagecount: number of listing pages; 1 until ``getPageCount()`` runs.
    """

    # Listing pages live at BASE_URL + <page number>.
    BASE_URL = 'http://www.qiushibaike.com/hot/page/'

    # Three capture groups are expected: author, story text, image markup.
    # NOTE(review): this default looks like it lost its HTML tag anchors
    # somewhere (it only splits on newlines, so the author group is always
    # empty). Restore it from the original project / the live page markup,
    # or pass ``pattern=`` to the constructor.
    STORY_PATTERN = '.*?(.*?).*?(.*?)\n(.*?)\n'

    def __init__(self, base_url=None, pattern=None, output=None):
        """Set up an empty spider.

        Args:
            base_url: prefix the page number is appended to; defaults to
                ``BASE_URL``. Must end so that ``base_url + "1"`` is a
                valid URL.
            pattern: regex (string or compiled) with at least three groups
                (author, content, image markup); defaults to
                ``STORY_PATTERN``. Strings are compiled with ``re.S``.
            output: destination file; defaults to
                ``D:\\qiubai_hot_<YYYY-MM-DD>.txt`` using today's local date.
        """
        self.stories = []
        if output is None:
            output = "D:\\qiubai_hot_" + time.strftime('%Y-%m-%d') + ".txt"
        self.output = output
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.pagecount = 1
        self.base_url = self.BASE_URL if base_url is None else base_url
        if pattern is None:
            pattern = self.STORY_PATTERN
        # Compile once here instead of on every page; an already compiled
        # pattern is used untouched (re.compile rejects flags for those).
        if isinstance(pattern, str):
            pattern = re.compile(pattern, re.S)
        self.pattern = pattern
        print('Spider has started...')

    def _fetch(self, url):
        """Return the body of *url* decoded as UTF-8, sending self.headers."""
        request = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def getPageCount(self):
        """Read the pager on page 1 and store the page total in pagecount.

        The last numeric label inside ``<div class="pagenumber">`` wins.
        ``pagecount`` is left unchanged when the div is missing or holds no
        numeric label. Network errors from the request propagate.
        """
        nowcontent = self._fetch(self.base_url + '1')
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(nowcontent, 'html.parser')
        pager = soup.find("div", {"class": "pagenumber"})
        if pager is None:
            return
        for page in pager.stripped_strings:
            try:
                self.pagecount = int(page)
            except ValueError:
                # Non-numeric pager labels are skipped.
                continue

    def getPageContent(self, pagenumber):
        """Fetch one listing page and append its image-free stories.

        Args:
            pagenumber: page index appended to the base URL.

        An ``HTTPError`` ends the call without adding stories; its status
        code is printed unless it is 404.
        """
        url = self.base_url + str(pagenumber)
        try:
            content = self._fetch(url)
        except urllib.error.HTTPError as e:
            # e.code is an int, so compare against 404, not '404'.
            if e.code != 404:
                print(e.code)
            return
        for item in self.pattern.findall(content):
            # item[0]: author name, item[1]: content, item[2]: image markup
            if 'img' in item[2]:
                continue
            story = item[0].strip() + ":\n" + item[1].strip() + '\n'
            self.stories.append(story)

    def loadPage(self):
        """Fetch only the first listing page."""
        self.getPageContent(1)

    def write(self):
        """Write every collected story to self.output (UTF-8, overwriting)."""
        with open(self.output, 'w', encoding='utf-8') as f:
            f.writelines(self.stories)
        print(self.output + ' has been stored.')

    def viewAll(self):
        """Fetch every listing page from 1 to pagecount, in order."""
        startindex = 1
        self.getPageCount()
        for i in range(startindex, self.pagecount + 1):
            # Fetch page i (the loop variable), not page 1 every time.
            self.getPageContent(i)
            print('Page:' + str(i) + ' has been fetched...')
        print('All pages have been fetched...')
def main():
    """Run one full crawl: fetch every hot page, then save the stories.

    Returns the spider instance so it stays reachable at module level.
    """
    crawler = my_qiubai()
    crawler.viewAll()
    crawler.write()
    print('Spider program stoped...')
    return crawler


# Runs unconditionally, exactly like the original top-level statements.
spider = main()