Simple Static Web Page Crawler - XPath

Date: 2021-02-02 19:31:16
For XPath syntax, see:

https://www.cnblogs.com/gaochsh/p/6757475.html

https://cuiqingcai.com/2621.html

You can also install an XPath plugin in Chrome to verify the key XPath expressions you write for the crawler (process omitted).
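
Alternatively, an expression can be verified locally with lxml itself; a minimal sketch (the HTML snippet here is made up purely for illustration):

# -*- coding: utf-8 -*-
from lxml import etree

# Made-up HTML snippet, only to check that an XPath expression matches
html = '<div class="app-info"><p><span>Demo App</span></p></div>'
selector = etree.HTML(html)
# Same pattern as the Wandoujia crawler below: span text under div.app-info
print(selector.xpath('//div[@class="app-info"]/p/span/text()'))  # ['Demo App']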

The first example crawls app categories from Wandoujia. The input is an .xlsx file of URLs, consisting mainly of package names behind a common prefix.

[Screenshot of the input .xlsx file]
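
Each cell holds the common Wandoujia prefix followed by an app's package name, which is why the code below can recover the package name by slicing off the prefix. A tiny sketch of that convention (the package name is made up):

prefix = "http://www.wandoujia.com/apps/"
package = "com.example.demo"       # illustrative package name, not a real row
url = prefix + package             # what a cell in the .xlsx holds
print(url[len(prefix):])           # recovers the package name, as url[l:] does below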

 

The code is as follows:

 
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf8')
import xlrd
import xlwt
import urllib2
from lxml import etree

# Read the input workbook (the file shown in the screenshot above)
data = xlrd.open_workbook('app0306.xlsx')
table = data.sheets()[0]
n_row = table.nrows
n_col = table.ncols

x = []  # URLs read from the first column
y = []  # availability flags, one per URL
for i in range(n_row):
    x.append(table.row_values(i)[0])

workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('app')
count = 0
for xx in x:
    url = xx
    # Send the request
    req = urllib2.Request(url)
    fd = urllib2.urlopen(req)

    data = fd.read()
    data = data.decode('utf-8')
    # print(type(data))

    print "on scanning ", count + 1
    # The page shows this notice ("Sorry, this app has been removed")
    # when the app is no longer available
    if '抱歉,该应用已下架' in data:
        y.append('NO')
        flag = 'NO'
        print('NO')
    else:
        y.append('Yes')
        flag = 'Yes'
        print('Yes')

    selector = etree.HTML(data)
    zz = "http://www.wandoujia.com/apps/"  # Wandoujia URL prefix to strip
    l = len(zz)
    if flag == 'Yes':
        # Locate the app name
        content1 = selector.xpath('//div[@class="app-info"]/p/span/text()')
        # Locate the app category
        content2 = selector.xpath('//div[@class="col-right"]/div/dl/dd/a/text()')
        # xpath() returns a list; join it so xlwt can write it into a single cell
        worksheet.write(count, 0, url[l:])               # column 1: package name
        worksheet.write(count, 1, ', '.join(content1))   # column 2: app name
        worksheet.write(count, 2, ', '.join(content2))   # column 3: category
    else:
        worksheet.write(count, 0, url[l:])  # column 1: package name
        worksheet.write(count, 1, "NO")     # column 2: unavailable marker
    count += 1
    workbook.save('app.xls')  # output path; set as you like
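
Note that urllib2 exists only on Python 2. On Python 3 the same request-and-parse step can be done with requests; a rough sketch (the URL is illustrative):

import requests
from lxml import etree

url = 'http://www.wandoujia.com/apps/com.example.demo'  # illustrative URL
resp = requests.get(url, timeout=10)
resp.encoding = 'utf-8'
selector = etree.HTML(resp.text)
# Same app-name XPath as the Python 2 version above
print(selector.xpath('//div[@class="app-info"]/p/span/text()'))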


The next example crawls Google Play app categories from a txt input file. It is best to add pauses between requests and use proxies to get around anti-crawling measures.

Example of the input txt:

[Screenshot of the input .txt file]
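
For the pause-and-proxy advice above, urllib2 can route requests through a proxy via ProxyHandler; a minimal sketch (the proxy address is a placeholder, not a real server):

import time
import urllib2

# Placeholder proxy address; substitute a real one
proxy = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)  # later urllib2.urlopen() calls go through the proxy

time.sleep(1)  # pause between requests, as the loop below already does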

# -*- coding: utf-8 -*-
"""
Created on Sun Nov  5 11:03:06 2017

@author: Administrator
"""
# Batch-check URL validity
import urllib2
from urllib2 import URLError
import xlwt
import datetime, time
import requests
from lxml import etree

result_url = []
result = []
count = 0
not_200 = 0
f = open("app0306.txt", "r")    # txt file of domains or URLs
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My worksheet')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
headers = {'User-Agent': user_agent}
for line in f:
    line = line.strip()  # drop the trailing newline, or the URL is invalid
    count += 1
    print "on scanning ", count
    try:
        # req = requests.request("get", "http://" + line)
        # print req.status_code
        req = urllib2.Request(line, headers=headers)  # full URL per line
        # req = urllib2.Request("http://" + line)     # bare domain per line
        response = urllib2.urlopen(req)
        data = response.read()
        data = data.decode('utf-8')
    except URLError, e:
        if hasattr(e, 'reason'):   # URLError: cannot reach the server
            print "can not reach a server, writing..."
        elif hasattr(e, 'code'):   # HTTPError: non-200 status
            print "find http error, writing..."
        else:                      # unknown error
            print "unknown error, writing..."
        not_200 += 1
        # result_url.append(line)
        # result.append('NO')
        re = 'NO'
        time.sleep(1)  # sleep for 1 second
    else:
        # print "url is reachable!"
        # No need to check response.code in the else branch: if no exception
        # was raised, the request returned 200, so just close the response.
        # result.append('YES')
        print "Yes!"
        response.close()
        time.sleep(1)  # sleep for 1 second
        re = 'YES'
    finally:
        pass

    if re == 'YES':
        selector = etree.HTML(data)
        # App name
        content1 = selector.xpath('//div[@class="details-info"]/div/div/h1/div/text()')
        # App category
        content2 = selector.xpath('//div[@class="left-info"]/div/a/span[@itemprop="genre"]/text()')
        # xpath() returns a list; join it so xlwt can write it into a single cell
        worksheet.write(count - 1, 0, line)                  # column 1: URL
        worksheet.write(count - 1, 1, ', '.join(content1))   # column 2: app name
        worksheet.write(count - 1, 2, ', '.join(content2))   # column 3: category
    else:
        worksheet.write(count - 1, 0, line)  # column 1: URL
        worksheet.write(count - 1, 1, "NO")  # column 2: unreachable marker
    workbook.save('appmonth.xls')  # output path; set as you like

print "scanning over, total", count, "; did not return 200:", not_200
f.close()
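
The commented-out requests lines in the code above hint at a simpler way to do the validity check; a hedged sketch of that variant:

import requests

def is_reachable(url):
    # True when the server answers 200; any network error counts as a failure
    try:
        return requests.get(url, timeout=10).status_code == 200
    except requests.RequestException:
        return False

print(is_reachable('http://www.example.com'))  # illustrative URL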