实例1.京东商品爬取
# Example 1: fetch a JD.com product page and print a slice of its HTML.
import requests

try:
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get("https://item.jd.com/3888216.html", timeout=30)
    # Turn 4xx/5xx responses into an exception instead of printing an error page.
    r.raise_for_status()
    # Decode with the encoding guessed from the body rather than the header default.
    r.encoding = r.apparent_encoding
    print(r.text[200:500])
except requests.RequestException:
    # Covers connection errors, timeouts and the HTTPError raised above,
    # without also swallowing KeyboardInterrupt or programming errors.
    print("爬取失败")
<meta name="keywords" content="HUAWEI畅享6,华为畅享6,华为畅享6报价,HUAWEI畅享6报价"/>
<meta name="description" content="【华为畅享6】京东JD.COM提供华为畅享6正品行货,并包括HUAWEI畅享6网购指南,以及华为畅享6图片、畅享6参数、畅享6评论、畅享6心得、畅享6技巧等信息,网购华为畅享6上京东,放心又轻松" />
<meta name="format-detection" content="telephone=no">
实例2.亚马逊商品爬取
# Example 2 (first attempt): fetch an Amazon product page with the default
# requests headers and, on failure, show the status code and the request
# headers that were sent.
import requests

url = "https://www.amazon.cn/gp/product/B01G2WHK6Q/ref=s9_acsd_top_hd_bw_by84kd_c_x_1_w?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=merchandised-search-2&pf_rd_r=SJKQ5Y5MSPBYRY3DB4Q7&pf_rd_r=SJKQ5Y5MSPBYRY3DB4Q7&pf_rd_t=101&pf_rd_p=73d9023c-d182-4530-8e41-0e85293f5983&pf_rd_p=73d9023c-d182-4530-8e41-0e85293f5983&pf_rd_i=888505051"

try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[200:500])
except requests.RequestException as exc:
    # Read the response from the exception: if the request never completed
    # (DNS failure, refused connection, timeout) there is no `r` to inspect.
    failed = exc.response
    # Compare against None explicitly; a Response with an error status is falsy.
    if failed is not None:
        print(failed.status_code)
        print(failed.request.headers)
    else:
        print("爬取失败")
输出:503
{'User-Agent': 'python-requests/2.18.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
爬取失败(状态码503)。从上面输出的请求头可以看到,User-Agent 是 python-requests/2.18.4,亚马逊据此识别出这是爬虫程序并拒绝了访问。因此我们需要通过 headers 参数修改请求头中的 User-Agent,把请求模拟成浏览器访问:
# Example 2 (second attempt): fetch the same Amazon product page, this time
# overriding the User-Agent header so the request looks like a browser.
import requests

url = "https://www.amazon.cn/gp/product/B01G2WHK6Q/ref=s9_acsd_top_hd_bw_by84kd_c_x_1_w?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=merchandised-search-2&pf_rd_r=SJKQ5Y5MSPBYRY3DB4Q7&pf_rd_r=SJKQ5Y5MSPBYRY3DB4Q7&pf_rd_t=101&pf_rd_p=73d9023c-d182-4530-8e41-0e85293f5983&pf_rd_p=73d9023c-d182-4530-8e41-0e85293f5983&pf_rd_i=888505051"
# Replaces the default "python-requests/<version>" User-Agent.
hd = {"User-Agent": "Chrome/10"}

try:
    r = requests.get(url, headers=hd, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[200:500])
except requests.RequestException as exc:
    # Read the response from the exception: if the request never completed
    # (DNS failure, refused connection, timeout) there is no `r` to inspect.
    failed = exc.response
    # Compare against None explicitly; a Response with an error status is falsy.
    if failed is not None:
        print(failed.status_code)
        print(failed.request.headers)
    else:
        print("爬取失败")
实例3.百度搜索关键词提交
# Example 3: submit a search keyword to Baidu and print the size of the result page.
import requests

# Baidu's search URL has the form http://www.baidu.com/s?wd=keyword
search = {"wd": "python"}

try:
    # `params` lets requests build and encode the query string.
    r = requests.get("http://www.baidu.com/s", params=search, timeout=30)
    # Show the URL that was actually requested.
    print(r.request.url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(len(r.text))
except requests.RequestException as exc:
    # If the request never completed there is no response to take a status
    # code from, so fall back to a plain failure message.
    if exc.response is not None:
        print(exc.response.status_code)
    else:
        print("爬取失败")
实例4.网页图片的爬取(URL需直接指向图片文件,即以.jpg等图片格式后缀结尾)
将网页中的图片爬取到指定目录的文件夹下
# Example 4: download an image into a target directory, named after the last
# path segment of its URL. Skips the download if the file already exists.
import os

import requests

root = "D:\\"
url = "http://uploads.xuexila.com/allimg/1612/902-16120GA509.jpg"
# os.path.join also works if `root` is later changed to a path without a
# trailing separator.
path = os.path.join(root, url.split('/')[-1])

try:
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(root, exist_ok=True)
    if os.path.exists(path):
        print("文件已存在")
    else:
        r = requests.get(url, timeout=30)
        # Fail before touching the disk so an HTTP error page is never saved
        # as if it were the image.
        r.raise_for_status()
        # `with` closes the file even if the write fails part-way.
        with open(path, 'wb') as f:
            f.write(r.content)
# RequestException derives from OSError, so it must be handled first.
except requests.RequestException as exc:
    if exc.response is not None:
        print(exc.response.status_code)
    else:
        print("爬取失败")
except OSError as exc:
    # Filesystem problems (missing drive, permissions, disk full) have no
    # HTTP status code to report.
    print("文件保存失败:", exc)
实例5.查询IP归属地
# Example 5: look up the location of an IP address.
import requests

# The lookup uses ip138.com; the address is appended as the `ip=` query value.
url = "http://m.ip138.com/ip.asp?ip="

try:
    r = requests.get(url + "159.226.43.225", timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    # Only the last 500 characters of the page are printed.
    print(r.text[-500:])
except requests.RequestException as exc:
    # If the request never completed there is no response to take a status
    # code from, so fall back to a plain failure message.
    if exc.response is not None:
        print(exc.response.status_code)
    else:
        print("爬取失败")