https://www.cnblogs.com/gaochsh/p/6757475.html
https://cuiqingcai.com/2621.html
You can install an XPath plugin in Chrome to verify the XPath expressions used by the crawler (process omitted here).
The first example scrapes app categories from Wandoujia. The input is an .xlsx file consisting mainly of app URLs built from package names, all sharing the same prefix.
The code is as follows:
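If the Chrome plugin is not handy, the same check can be done locally with lxml. A minimal sketch, using the Wandoujia app-name expression from the script below; the package name in the URL is only an illustration, not from the original post:

# -*- coding: utf-8 -*-
# Sketch: verify an XPath expression locally instead of with the Chrome plugin.
import urllib2
from lxml import etree

url = 'http://www.wandoujia.com/apps/com.tencent.mm'   # illustrative package name only
html = urllib2.urlopen(urllib2.Request(url)).read().decode('utf-8')
selector = etree.HTML(html)
# Print whatever the expression matches; an empty list means the XPath needs adjusting
print selector.xpath('//div[@class="app-info"]/p/span/text()')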
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import xlrd
import xlwt
import urllib2
from lxml import etree

# Read the input file (see the screenshot above); the first column holds the app URLs
data = xlrd.open_workbook('app0306.xlsx')
table = data.sheets()[0]
n_row = table.nrows
n_col = table.ncols

x = []
y = []
for i in range(n_row):
    x.append(table.row_values(i)[0])

workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('app')

count = 0
for xx in x:
    url = xx
    # Send the request
    req = urllib2.Request(url)
    fd = urllib2.urlopen(req)
    data = fd.read()
    data = data.decode('utf-8')
    print "on scanning ", count + 1
    if '抱歉,该应用已下架' in data:   # "Sorry, this app has been taken down"
        y.append('NO')
        flag = 'NO'
        print('NO')
    else:
        y.append('Yes')
        flag = 'Yes'
        print('Yes')
    selector = etree.HTML(data)
    zz = "http://www.wandoujia.com/apps/"   # Wandoujia URL prefix to strip
    l = len(zz)
    if flag == 'Yes':
        # Locate the app name
        content1 = selector.xpath('//div[@class="app-info"]/p/span/text()')
        # Locate the app category
        content2 = selector.xpath('//div[@class="col-right"]/div/dl/dd/a/text()')
        worksheet.write(count, 0, url[l:])              # column 1: package name
        worksheet.write(count, 1, ', '.join(content1))  # column 2: app name (xlwt cells cannot take lists)
        worksheet.write(count, 2, ', '.join(content2))  # column 3: category
    else:
        worksheet.write(count, 0, url[l:])  # column 1: package name
        worksheet.write(count, 1, "NO")     # column 2: mark as taken down
    count += 1

workbook.save('app.xls')   # output path, change as needed
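Note that this script assumes every request succeeds: an unreachable URL stops the loop with an uncaught exception. A minimal sketch of wrapping the fetch in a try/except, mirroring the URLError handling the Google Play script below already uses; the helper name is my own, not from the original post:

# Sketch (not in the original script): fetch a page and swallow network errors.
import urllib2
from urllib2 import URLError

def fetch_page(url):   # hypothetical helper name
    try:
        fd = urllib2.urlopen(urllib2.Request(url))
        return fd.read().decode('utf-8')
    except URLError:
        return None    # caller writes "NO" for this row and moves on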
The next example scrapes app categories from Google Play. The input is a txt file; it is best to pause between requests and route them through a proxy to get around anti-crawling measures (a sketch of both follows).
Format of the input txt file: one Google Play app page URL per line.
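The delay and proxy setup is only mentioned, not shown in the script below. A minimal sketch of adding both with urllib2; the proxy address is a placeholder and the sleep interval is my own choice:

# Sketch (assumption, not in the original script): route urllib2 through a proxy
# and pause a random interval between requests to look less like a crawler.
import random
import time
import urllib2

proxy = urllib2.ProxyHandler({'http': 'http://123.45.67.89:8080'})   # placeholder proxy address
urllib2.install_opener(urllib2.build_opener(proxy))

# ... inside the request loop ...
time.sleep(random.uniform(1, 3))   # sleep 1-3 seconds instead of a fixed 1 second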
# -*- coding: UTF-8 -*-
"""
Created on Sun Nov 5 11:03:06 2017
@author: Administrator
"""
# Batch-check URL reachability and scrape the Google Play app name and category

import urllib2
from urllib2 import URLError
import xlwt
import datetime, time
import requests
from lxml import etree

result_url = []
result = []
count = 0
not_200 = 0

f = open("app0306.txt", "r")   # txt file of domains or URLs, one per line
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My workshet')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
headers = {'User-Agent': user_agent}

for line in f:
    count += 1
    line = line.strip()   # drop the trailing newline before building the request
    print "on scanning ", count
    try:
        # req = requests.request("get", "http://" + line)
        # print req.status_code
        req = urllib2.Request(line, headers=headers)   # full URL
        # req = urllib2.Request("http://" + line)      # domain only
        response = urllib2.urlopen(req)
        data = response.read()
        data = data.decode('utf-8')
    except URLError, e:
        if hasattr(e, 'reason'):    # URLError: server unreachable
            print "can not reach a server, writing..."
        elif hasattr(e, 'code'):    # HTTPError: bad status code
            print "find http error, writing..."
        else:                       # unknown error
            print "unknown error, writing..."
        not_200 += 1
        re = 'NO'
        time.sleep(1)   # sleep 1 second
    else:
        # No need to check response.code here: if no exception was raised, the status was 200
        print "Yes!"
        response.close()
        time.sleep(1)   # sleep 1 second
        re = 'YES'
    finally:
        pass
    if re == 'YES':
        selector = etree.HTML(data)
        # Locate the app name
        content1 = selector.xpath('//div[@class="details-info"]/div/div/h1/div/text()')
        # Locate the app category
        content2 = selector.xpath('//div[@class="left-info"]/div/a/span[@itemprop="genre"]/text()')
        worksheet.write(count - 1, 0, line)                 # column 1: URL
        worksheet.write(count - 1, 1, ', '.join(content1))  # column 2: app name (xlwt cells cannot take lists)
        worksheet.write(count - 1, 2, ', '.join(content2))  # column 3: category
    else:
        worksheet.write(count - 1, 0, line)   # column 1: URL
        worksheet.write(count - 1, 1, "NO")   # column 2: mark as unreachable

workbook.save('appmonth.xls')   # output path, change as needed
print "scanning over, total", count, "; did not response 200:", not_200
f.close()
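The script imports requests but only uses it in a commented-out line. A minimal sketch of the same reachability check done with requests instead of urllib2; the function name and timeout value are my own choices, not from the original post:

# Sketch (assumption): the same check with requests instead of urllib2.
import requests
from lxml import etree

def check_with_requests(url, headers):   # hypothetical helper name
    try:
        r = requests.get(url, headers=headers, timeout=10)   # timeout chosen arbitrarily
    except requests.RequestException:
        return 'NO', None
    if r.status_code != 200:
        return 'NO', None
    selector = etree.HTML(r.text)
    name = selector.xpath('//div[@class="details-info"]/div/div/h1/div/text()')
    genre = selector.xpath('//div[@class="left-info"]/div/a/span[@itemprop="genre"]/text()')
    return 'YES', (name, genre)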