# coding=utf-8
"""Scrape the ithome.com front-page news list and export article metadata to Excel.

For every article linked from the front page's "new-list" container, the script
opens the article page with Selenium, extracts publish date, source, author and
comment count via regular expressions, and writes the collected rows to
``ShuJuPa.xlsx`` with pandas.
"""
import numpy as np
import pandas as pd
import sys

from selenium import webdriver
import time
import requests
import re
from openpyxl.workbook import Workbook
import matplotlib.pyplot as plt
import matplotlib


def save_to_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text.

    The page source saved below is decoded as UTF-8 and contains Chinese
    characters; an explicit encoding avoids a ``UnicodeEncodeError`` on
    platforms whose default locale encoding cannot represent them (e.g.
    GBK on Windows). The context manager guarantees the handle is closed.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)


def main():
    """Crawl the front page, then each linked article, and export the results."""
    urls = []        # every article URL found on the front page
    titles = []      # title paired with each entry of ``urls``
    # Parallel result lists — only articles whose page matched the metadata
    # pattern contribute a row, hence the separate ``*_new`` lists.
    urls_new = []
    titles_new = []
    days = []
    comments = []
    authors = []
    sources = []
    ty = []          # article category, taken from the URL path/subdomain

    url = "https://www.ithome.com/"
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
    rep = requests.get(url, headers=headers)
    rep.encoding = "utf-8"
    strw = rep.text
    save_to_file('ithome.html', strw)

    # Isolate the front-page "new-list" container, then each column block in it.
    list_pat = re.compile(r'<div class="lst lst-1 new-list">(.*?)</div>\s*?</div>')
    containers = list_pat.findall(strw)
    print(len(containers[0]))
    block_pat = re.compile(r'<div class="block \d{4} new-list-\d{1}"(?: style=".*?")?><ul>(.*?)</ul></div>')
    blocks = block_pat.findall(containers[0])
    print(len(blocks))

    # Collect article URL + title from every column block.
    title_pat = re.compile(r'</span><span class="title">.*?href="(.*?)">(?:<.*?>)?(.*?)(?:</font>)?</a></span></li>')
    for block in blocks:
        for link, title in title_pat.findall(block):
            urls.append(link)
            titles.append(title)
    print(len(urls))

    # Patterns reused for every article page — compiled once, outside the loop.
    type_pat = re.compile(r'https://\w+?.ithome.com/(?:html/)?(.*?)/.*?')
    meta_pat = re.compile(r'<span id="pubtime_baidu">(\d*-\d*-\d*).*?</span><span id="source_baidu">'
                          r'来源:<a href=".*?" .*?>(.*?)</a></span><span id="author_baidu">'
                          r'作者:(?:<strong>)?(.*?)(?:</strong>)?</span>.*?<span id="commentcount">(.*?)</span>')

    # Raw string: the original relied on ``\谷`` not being a recognized escape,
    # which is a DeprecationWarning today and a SyntaxError in future Pythons.
    # NOTE(review): passing the executable path positionally is the Selenium 3
    # API; Selenium 4 needs webdriver.Chrome(service=Service(path)) — confirm
    # the installed version.
    browser = webdriver.Chrome(r'D:\谷歌\Google\Chrome\Application\chromedriver.exe')
    try:
        for link, title in zip(urls, titles):
            print(u'读取中' + link)
            browser.get(link)
            time.sleep(1)  # give the page a moment to render before reading it
            page = browser.page_source
            kinds = type_pat.findall(link)
            print(kinds)
            meta = meta_pat.findall(page)
            print(meta)
            if meta:  # skip articles whose page layout does not match the pattern
                day, source, author, comment_count = meta[0]
                days.append(day)
                sources.append(source)
                authors.append(author)
                urls_new.append(link)
                comments.append(comment_count)
                titles_new.append(title)
                ty.append(kinds[0])
    finally:
        browser.quit()  # always release the browser, even if a fetch raised

    print("读取结束")
    data = {'日期': days, '作者': authors, '来源': sources, '标题': titles_new,
            '链接': urls_new, '评论数量': comments, '新闻类型': ty}
    df = pd.DataFrame(data, columns=['日期', '作者', '来源', '标题', '链接', '评论数量', '新闻类型'])
    # ``encoding=`` was removed from DataFrame.to_excel in pandas 1.5+ and was
    # ignored by the openpyxl engine anyway, so it is dropped here.
    df.to_excel(r'ShuJuPa.xlsx', sheet_name='数据爬取结果')


if __name__ == "__main__":
    main()