# 爬虫小练习:爬取内涵段子指定页数段子(可控制是否继续爬取)
#
# 时间:2022-06-25 20:37:34
import urllib.request
import re


# pattern1 = re.compile('<a\shref="(.*?)"\sclass="title"\stitle')  匹配完整段子内容链接
#
# content_url_list = pattern1.findall(html)
#
# pattern2 = re.compile('</p>(.*?)<div\sclass="ad610">',re.S)  匹配点开段子标题后完整段子的内容
#
# content_list = pattern2.findall(html)
#
# http://www.neihan8.com/article/index_3.html
#
# User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36


class Spider:
    """Interactive scraper for the joke listing pages of neihan8.com.

    For a given listing page the spider downloads the index, follows every
    article link found on it, extracts the joke text from each article and
    appends it to a per-page text file in the current working directory.

    Attributes:
        page: Page number to crawl. Usually the string typed by the user,
            but any value whose ``str()`` is the page number (e.g. an int)
            is accepted.
        switch: Crawl switch; ``work`` keeps prompting while it is True.
    """

    BASE_URL = "http://www.neihan8.com"
    HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled server cannot hang the program forever.
    TIMEOUT = 30

    # Compiled once at class creation instead of on every loop iteration.
    # Raw strings keep ``\s`` a regex escape rather than an (invalid) string
    # escape.
    # Matches the link to the full article of each joke on a listing page.
    _LINK_PATTERN = re.compile(r'<a\shref="(.*?)"\sclass="title"\stitle')
    # Matches the joke body on an article page.
    _CONTENT_PATTERN = re.compile(r'</p>(.*?)<div\sclass="ad610">', re.S)

    def __init__(self, page):
        self.page = page
        self.switch = True  # crawl switch: whether the user keeps crawling

    def _page_text(self):
        """Return the page number as a whitespace-trimmed string."""
        return str(self.page).strip()

    def _fetch(self, url):
        """Download *url* and return its body decoded as UTF-8.

        The response is closed as soon as the body has been read.
        """
        request = urllib.request.Request(url, headers=self.HEADERS)
        with urllib.request.urlopen(request, timeout=self.TIMEOUT) as response:
            return response.read().decode("utf-8")

    def loadPage(self):
        """Download the listing page, then every article linked from it.

        The content fragments extracted from each article are handed to
        ``dealPage``. Network and decoding errors propagate to the caller.
        """
        print("页面下载中......")
        page = self._page_text()
        # The first listing page has no numeric suffix in its URL.
        if page == "1":
            url = self.BASE_URL + "/article/index.html"
        else:
            url = self.BASE_URL + "/article/index_" + page + ".html"
        html = self._fetch(url)
        content_url_list = self._LINK_PATTERN.findall(html)
        print("页面下载完成!")
        for content_url in content_url_list:
            # Links on the listing page are site-relative paths.
            article_html = self._fetch(self.BASE_URL + content_url)
            self.dealPage(self._CONTENT_PATTERN.findall(article_html))

    def dealPage(self, content_list):
        """Strip paragraph tags from each fragment and write it to the file.

        Args:
            content_list: Iterable of raw HTML fragments, one per joke.
        """
        for content in content_list:
            content = content.replace('<p>', '').replace('</p>', '')
            self.writPage(content)

    def writPage(self, content):
        """Append one joke followed by a separator line to the page's file.

        Args:
            content: Joke text to store.
        """
        print("文件写入中......")
        filename = "内涵段子第" + self._page_text() + "页集合.txt"
        # Explicit UTF-8 so the Chinese text is written identically on every
        # platform instead of depending on the locale's default encoding.
        with open(filename, "a", encoding="utf-8") as f:
            f.write(content)
            # Terminate the separator with a newline so the next appended
            # joke does not start on the separator line.
            f.write("\n" + ("-" * 50) + "\n")

    def work(self):
        """Prompt the user in a loop for further pages to crawl.

        Typing ``q`` turns the crawl switch off and ends the loop; any other
        input asks for a new page number and crawls that page.
        """
        print("文件写入完成!感谢使用!")
        while self.switch:
            command = input("如果确定继续爬取,请按回车(退出按q):")
            if command == "q":
                self.switch = False
            else:
                self.page = input("请输入要再次爬取的页码:")
                self.loadPage()


if __name__ == '__main__':
    # Script entry point: crawl the page the user asks for, then enter the
    # interactive loop that offers to crawl further pages.
    page_num = input("请输入要爬取的页码:")
    # Bind the instance to a lowercase name; reusing ``Spider`` would shadow
    # the class and make it impossible to create another instance afterwards.
    spider = Spider(page_num)
    spider.loadPage()
    spider.work()