import urllib.request
import re


class Spider:
    """Crawler for joke articles on www.neihan8.com.

    Downloads a listing page, extracts links to the full articles,
    fetches each article, strips the paragraph tags and appends the
    text to a per-page result file.
    """

    # Matches the href of each article link on a listing page.
    # (raw string: the original used a plain string with \s escapes)
    LINK_PATTERN = re.compile(r'<a\shref="(.*?)"\sclass="title"\stitle')
    # Matches the article body between the header's closing </p> and
    # the trailing ad container; re.S lets .*? span newlines.
    CONTENT_PATTERN = re.compile(r'</p>(.*?)<div\sclass="ad610">', re.S)

    # Browser-like User-Agent so the site serves the normal page.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def __init__(self, page):
        self.page = page    # page number, kept as a string (used in URL and filename)
        self.switch = True  # crawl switch: decides whether the user keeps crawling

    def _fetch(self, url):
        """Download *url* with the browser headers and return it decoded as UTF-8.

        Uses a context manager so the HTTP response is always closed
        (the original leaked the response objects).
        """
        request = urllib.request.Request(url, headers=self.HEADERS)
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")

    def loadPage(self):
        """Download the listing page, then fetch and process every article on it."""
        print("页面下载中......")
        if self.page == "1":
            # Page 1 has no numeric suffix on this site.
            url = "http://www.neihan8.com/article/index.html"
        else:
            url = "http://www.neihan8.com/article/index_" + self.page + ".html"
        html = self._fetch(url)
        content_url_list = self.LINK_PATTERN.findall(html)
        print("页面下载完成!")
        for content_url in content_url_list:
            # Listing links are site-relative; prepend the host.
            article_html = self._fetch("http://www.neihan8.com" + content_url)
            content_list = self.CONTENT_PATTERN.findall(article_html)
            self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip <p>/</p> tags from each extracted joke and write it out."""
        for content in content_list:
            content = content.replace('<p>', '').replace('</p>', '')
            self.writPage(content)

    def writPage(self, content):
        """Append one joke plus a separator line to this page's result file.

        The encoding is pinned to UTF-8 so the output is stable regardless
        of the platform's locale encoding (the original relied on the default,
        which can raise UnicodeEncodeError for Chinese text on some systems).
        """
        print("文件写入中......")
        with open("内涵段子第" + self.page + "页集合.txt", "a", encoding="utf-8") as f:
            f.write(content)
            f.write("\n" + ("-" * 50))

    def work(self):
        """Interactive loop: ask the user whether to crawl another page."""
        print("文件写入完成!感谢使用!")
        while self.switch:
            command = input("如果确定继续爬取,请按回车(退出按q):")
            if command == "q":
                self.switch = False
            else:
                page_num = input("请输入要再次爬取的页码:")
                self.page = page_num
                self.loadPage()


if __name__ == '__main__':
    page_num = input("请输入要爬取的页码:")
    # Fix: the original assigned the instance to the name `Spider`,
    # shadowing the class itself; use a distinct lowercase name.
    spider = Spider(page_num)
    spider.loadPage()
    spider.work()