本文主要分享关于python登录并爬取淘宝信息的相关代码,还是挺不错的,大家可以了解下。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from selenium import webdriver
import time
import datetime
import traceback
import logging
import os
from selenium.webdriver.common.action_chains import ActionChains
import codecs
#登录
def login(driver, site):
    """Open *site* and sign in to Taobao through its login form.

    Parameters:
        driver: a selenium WebDriver instance.
        site: URL of a page showing the "please log in" link.

    Side effects: navigates the browser and submits credentials.
    The fixed sleeps are crude page-load waits; WebDriverWait would be
    more robust but needs selenium imports this file does not use.
    """
    driver.get(site)
    time.sleep(5)
    try:
        # Click the "please log in" link (class "h" on the landing page).
        driver.find_element_by_class_name("h").click()
        time.sleep(5)
        # NOTE(review): credentials are hard-coded placeholders; replace
        # "yourusername"/"yourpsd" or load them from configuration.
        driver.find_element_by_id("TPL_username_1").send_keys("yourusername")
        time.sleep(5)
        driver.find_element_by_id("TPL_password_1").send_keys("yourpsd")
        time.sleep(5)
        # Submit the login form.
        driver.find_element_by_id("J_SubmitStatic").click()
        time.sleep(30)
    except Exception:
        # Original used a bare except; keep the best-effort behavior but
        # narrow the catch so SystemExit/KeyboardInterrupt still propagate.
        print("failure")
def crawlmarket(driver, filename, site):
    """Crawl a Taobao "markets" page and save item-link texts to *filename*.

    Parameters:
        driver: a selenium WebDriver instance.
        filename: UTF-8 output file for the scraped link texts, one per line.
        site: URL of the markets page to scrape.

    If no J_ItemLink anchors are found, assumes we are logged out and runs
    login(). NOTE(review): the element list is NOT re-fetched after login,
    so the first visit to a login-gated page always takes the error path
    below; the retry only succeeds on a later run. Failed pages are
    appended to error.txt as "filename,site".
    """
    driver.get(site)
    driver.maximize_window()
    time.sleep(10)
    driver.refresh()
    time.sleep(10)
    links = driver.find_elements_by_xpath("//a[@class='J_ItemLink']")
    # An empty result usually means we are not logged in yet.
    if not links:
        login(driver, site)
        time.sleep(30)
    pieces = []
    for link in links:
        if link.text != "":
            line = link.text.strip() + "\n"
            print(line)
            pieces.append(line)
    result = "".join(pieces)
    if result:
        # Context manager guarantees the file is closed even on write errors.
        with codecs.open(filename, "w", "utf-8") as f:
            f.write(result)
    else:
        # Nothing scraped: record the failure so the run can be retried.
        info = filename + "," + site
        print(info)
        with codecs.open("error.txt", "a", "utf-8") as ferror:
            ferror.write(info)
    # NOTE(review): quitting here closes the browser after the first URL,
    # which breaks subsequent iterations in jiexi(); consider moving the
    # quit() to the caller.
    driver.quit()
def crawltaobaosousuo(driver, filename, site):
    """Crawl a Taobao search-results page and save link texts to *filename*.

    Parameters:
        driver: a selenium WebDriver instance.
        filename: UTF-8 output file for the scraped link texts, one per line.
        site: URL of the search page to scrape.

    Unlike crawlmarket(), no login is attempted here. Pages that yield no
    text are appended to error.txt as "filename,site".
    """
    driver.get(site)
    driver.maximize_window()
    time.sleep(10)
    # Load twice plus refresh — crude workaround for lazily rendered results.
    driver.get(site)
    time.sleep(30)
    driver.refresh()
    links = driver.find_elements_by_xpath("//a[@class='J_ClickStat']")
    pieces = []
    for link in links:
        if link.text != "":
            line = link.text.strip() + "\n"
            print(line)
            pieces.append(line)
    result = "".join(pieces)
    if result:
        with codecs.open(filename, "w", "utf-8") as f:
            f.write(result)
    else:
        # Nothing scraped: record the failure so the run can be retried.
        info = filename + "," + site
        print(info)
        with codecs.open("error.txt", "a", "utf-8") as ferror:
            ferror.write(info)
    # NOTE(review): quitting here closes the browser after the first URL,
    # which breaks subsequent iterations in jiexi(); consider moving the
    # quit() to the caller.
    driver.quit()
def jiexi(driver):
    """Read "1.txt" (lines of "filename,url") and crawl each URL.

    URLs containing "markets" are dispatched to crawlmarket(); everything
    else goes to crawltaobaosousuo(). Sleeps 60s between lines to reduce
    the chance of triggering Taobao's anti-bot controls.

    NOTE(review): both crawl functions call driver.quit(), so only the
    first line of 1.txt can actually be processed with a single driver.
    """
    # codecs.open decodes the file as UTF-8 up front, replacing the old
    # per-field .decode("utf-8") calls.
    with codecs.open("1.txt", "r", "utf-8") as f:
        for raw in f:
            time.sleep(60)
            # strip() removes the trailing newline, which previously was
            # left attached to the URL passed to driver.get().
            parts = raw.strip().split(",")
            filename = parts[0]
            href = parts[1]
            print(filename)
            if "markets" in href:
                crawlmarket(driver, filename, href)
            else:
                crawltaobaosousuo(driver, filename, href)
if __name__ == '__main__':
    # Launch Firefox (requires geckodriver on PATH) and run the crawl.
    # The crawl functions quit the driver themselves, so no cleanup here.
    driver = webdriver.Firefox()
    jiexi(driver)
小结
欢迎一起探讨改进策略。本代码可以抓取淘宝部分网页内容,大家可以根据自己的需求修改。注意:频繁抓取会触发淘宝风控;个人觉得不登录的抓取效果更好。
以上就是本文关于python 登录并爬取淘宝信息代码示例的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题。如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!
原文链接:http://blog.csdn.net/Japan__/article/details/50821586