# -*- coding: utf-8 -*-
"""Download JD.com search-result pages and print every link found in them.

The script fetches a fixed search query (a percent-encoded keyword with a
Haier brand filter), stores the raw HTML of all fetched pages in one file,
then parses that file with BeautifulSoup and prints all ``<a>`` tags.
"""
import re
import time

import requests
from bs4 import BeautifulSoup

# Search URL without the page number; the page number is appended to it.
SEARCH_URL = (
    "http://search.jd.com/search"
    "?keyword=%E7%83%AD%E6%B0%B4%E5%99%A8"
    "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2"
    "&wq=%E7%83%AD%E6%B0%B4%E5%99%A8"
    "&ev=exbrand_%E6%B5%B7%E5%B0%94%EF%BC%88Haier%EF%BC%89%5E"
    "&stock=1&page="
)

# File that accumulates the HTML of every downloaded page.
PAGES_FILE = "Pages.html"

# Pause between consecutive requests, in seconds.
REQUEST_DELAY = 0.5


def getHTMLText(url: str) -> str:
    """Fetch *url* and return the response body as text.

    The response encoding is taken from ``apparent_encoding`` (detected from
    the content) rather than from the HTTP headers.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, or a non-2xx status code).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: a failed page is skipped rather than aborting the run.
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def getPage(n: int, path: str = PAGES_FILE) -> None:
    """Download the first *n* result pages and store them in *path*.

    Args:
        n: Number of pages to fetch; ``n <= 0`` produces an empty file.
        path: Output file. It is truncated first, then the HTML of every
            page is written to it back to back.
    """
    # Write UTF-8 explicitly: the platform default (e.g. gbk) may be unable
    # to encode the already-decoded page text.
    with open(path, "w", encoding="utf-8") as fo:
        # Only odd values of the "page" parameter are requested: 1, 3, ...
        for i in range(1, 2 * n + 1, 2):
            time.sleep(REQUEST_DELAY)
            # Sequential writes already append. The former ``fo.seek(2)``
            # jumped to absolute offset 2 and made each page overwrite the
            # previous one.
            fo.write(getHTMLText(SEARCH_URL + str(i)))


def main() -> None:
    """Download 46 pages, then parse the saved file and print all links."""
    getPage(46)
    with open(PAGES_FILE, "rt", encoding="utf-8") as fo:
        html = fo.read()
    soup = BeautifulSoup(html, "html.parser")
    print(soup.find_all("a"))


if __name__ == "__main__":
    main()