本文实例讲述了Python查询阿里巴巴关键字排名的方法。分享给大家供大家参考。具体如下:
这里使用 Python 库 urllib2 及 pyquery 的基本功能,实现阿里巴巴关键词排名的查询,其中涉及 urllib2 代理的设置,以及 pyquery 对 HTML 文档的解析。
1. urllib2 基础模块的应用:通过 ProxyScrapy 类获取 URL 对应的 HTML 文档,内部可以重写代理的获取方法。
代码如下:
class ProxyScrapy(object):
    """Download HTML pages with cookie support and an optional HTTP proxy.

    Relies on names defined elsewhere in the module: ``ProxyRobot`` (source of
    proxies and sink for success/failure reports), the ``PROXY_ENABLE`` flag,
    and the Python 2 ``urllib2`` / ``cookielib`` modules.
    """

    def __init__(self):
        self.proxy_robot = ProxyRobot()
        # Proxy chosen for the most recent request; stays None until a proxy
        # has been picked (i.e. while PROXY_ENABLE is false).
        self.current_proxy = None
        # One cookie jar shared by every opener this instance builds.
        self.cookie = cookielib.CookieJar()

    def __builder_proxy_cookie_opener(self):
        """Build, globally install and return a fresh urllib2 opener.

        The opener always carries the instance's cookie jar. When
        ``PROXY_ENABLE`` is true, a random proxy is fetched from the proxy
        robot, remembered in ``self.current_proxy`` and used for HTTP traffic.
        """
        handlers = [urllib2.HTTPCookieProcessor(self.cookie)]
        if PROXY_ENABLE:
            self.current_proxy = ip_port = self.proxy_robot.get_random_proxy()
            # NOTE(review): [7:] presumably strips a leading 'http://'
            # (7 characters) from the proxy address -- confirm against
            # ProxyRobot.get_random_proxy().
            handlers.append(urllib2.ProxyHandler({'http': ip_port[7:]}))
        opener = urllib2.build_opener(*handlers)
        # Also make it the process-wide default used by urllib2.urlopen().
        urllib2.install_opener(opener)
        return opener

    def get_html_body(self, url):
        """Return the body of ``url``, retrying until a 200 response arrives.

        Every attempt builds a new opener (and therefore picks a new random
        proxy when ``PROXY_ENABLE`` is true). A non-200 status or any
        exception counts as a failed attempt: the current proxy is reported
        via ``handle_double_proxy`` and the request is tried again.

        There is no retry limit, so this call does not return while the
        target stays unreachable. Retries run in a loop rather than by
        recursion so that a long run of failures cannot exhaust the
        interpreter stack.
        """
        while True:
            opener = self.__builder_proxy_cookie_opener()
            request = urllib2.Request(url)
            # Optional headers, left disabled:
            # request.add_header("Accept-Encoding", "gzip,deflate,sdch")
            # request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            # request.add_header("Cache-Control", "no-cache")
            # request.add_header("Connection", "keep-alive")
            try:
                response = opener.open(request, timeout=2)
                try:
                    if response.getcode() == 200:
                        if PROXY_ENABLE:
                            self.proxy_robot.handle_success_proxy(self.current_proxy)
                        return response.read()
                finally:
                    # Release the connection on success and failure alike.
                    response.close()
                # Reached only for a non-200 status: report and retry.
                if PROXY_ENABLE:
                    self.proxy_robot.handle_double_proxy(self.current_proxy)
            except Exception as inst:
                print('%s %s' % (inst, self.current_proxy))
                # Only report a proxy when one is actually in use; without
                # proxies current_proxy is None and there is nothing to report.
                if PROXY_ENABLE:
                    self.proxy_robot.handle_double_proxy(self.current_proxy)
|
2. 根据输入的公司名及关键词列表,返回每个关键词的排名
代码如下:
def search_keywords_rank(keyword_company_name, keywords):
    """Find where a company's product first appears for each search keyword.

    For every keyword, result pages 1 through 8 are fetched in order and
    scanned for the first listing whose company link contains
    ``keyword_company_name``.

    Relies on module-level names defined elsewhere: ``url`` (a %-format
    string taking the keyword slug and the page number), ``curl`` (an object
    providing ``get_html_body``), ``pq`` (pyquery), ``clock``, ``re`` and
    ``OrderedDict``.

    Args:
        keyword_company_name: Substring looked for in each listing's company
            link ``href``.
        keywords: Iterable of keyword strings to search for.

    Returns:
        An ``OrderedDict`` mapping each keyword that produced a match to a
        tuple ``(product_title, product_url, page_index, index_on_page,
        total_index, page_url)``. Keywords with no match within the scanned
        pages are absent.
    """

    def get_context(page_url):
        """Fetch ``page_url`` until it yields at least 38 listing items.

        Returns the matched PyQuery items and their count. A shorter page is
        treated as an incomplete download and fetched again; this is a loop
        (not recursion) so repeated short pages cannot exhaust the stack.
        """
        while True:
            start = clock()
            html = curl.get_html_body(page_url)
            finish = clock()
            print('%s %s' % (page_url, finish - start))
            items = pq(html)("#J-items-content .ls-item")
            items_c = len(items)
            print(items_c)
            if items_c >= 38:
                return items, items_c

    result = OrderedDict()
    for keyword in keywords:
        # Whitespace runs in the keyword become underscores in the URL.
        slug = re.sub(r'\s+', '_', keyword.strip())
        for page_index in range(1, 9):
            u = url % (slug, page_index)
            items, items_c = get_context(u)
            found = False
            for item_index in range(items_c):
                item = items.eq(item_index)
                company_url = item.find('.cright h3 .dot-product').attr('href')
                # A listing without a company link has no href (None);
                # skip it instead of failing on the substring test.
                if company_url is None or keyword_company_name not in company_url:
                    continue
                title_link = item.find('.title a')
                p_title = title_link.text()
                p_url = title_link.attr('href')
                # NOTE(review): the +5 applied to every page after the first
                # presumably accounts for extra entries shown on page 1 --
                # confirm against the live result page.
                total_index = ((page_index - 1) * 38 + item_index + 1
                               + (0 if page_index == 1 else 5))
                print('page %s, index %s, total index %s'
                      % (page_index, item_index + 1, total_index))
                found = True
                # Keep the first hit if the same keyword is listed twice.
                if keyword not in result:
                    result[keyword] = (p_title, p_url, page_index,
                                       item_index + 1, total_index, u)
                break
            if found:
                # First match found: skip the remaining pages for this keyword.
                break
    return result
|
希望本文所述对大家的Python程序设计有所帮助。