1. Fetching the data
import requests

def drg(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise an HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return "An exception occurred"

url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))
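Because the query is just a URL parameter, the same `drg()` function can be reused for other numbers. A small usage sketch (the numbers below are placeholders, not real data):

for number in ["13000000000", "13100000000"]:
    url = "https://www.ip138.com/mobile.asp?mobile={0}&action=mobile".format(number)
    print(drg(url)[:200])  # print only the first 200 characters of each response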
2. Parsing the data
import requests

def login():
    try:
        # URL used for the login request
        urllogin = "http://www.cqooc.com/user/login?username=12608199000635&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F"
        s = requests.session()
        r = s.post(urllogin, data=Form, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return s
    except Exception as error:
        print(error)

def get_html(s, url):
    try:
        r = s.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

if __name__ == "__main__":
    # User-Agent copied from the browser after logging in
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    }
    # Replace these form values with your own account details
    Form = {
        "username": "12608199000635",
        "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
        "nonce": "6BA36BBB1F623279",
        "cnonce": "8257070573EFE28F"
    }
    lin = login()
    # URL of the personal center page
    url = "http://www.cqooc.com/my/learn"
    html = get_html(lin, url)
    print(html)
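The crawl of the personal center only works if the session returned by `login()` actually carries the login cookies. A minimal sanity check, which could be appended to the `__main__` block above, is sketched below; the exact cookie names depend on the site and are not part of the original post, so the check only verifies that some cookie was set:

s = login()
if s is None or not s.cookies.get_dict():
    print("Login appears to have failed; check Form and headers.")
else:
    print("Session cookies:", s.cookies.get_dict())
    # peek at the first 200 characters of the protected page
    print(get_html(s, "http://www.cqooc.com/my/learn")[:200])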
3. Saving the data as CSV and into a database
Save as CSV
import requests
from lxml import etree
import csv

# Fetch the page
def get_html(url, time=30):
    try:
        r = requests.get(url, timeout=time)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as error:
        print(error)

def parser(html):  # parsing function
    doc = etree.HTML(html)  # parse the HTML string into an lxml element tree
    out_list = []  # list that collects the parsed rows
    # two-step lookup: locate each book node, then read its fields
    for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
        row_data = [
            row.xpath("h4/a/text()")[0],                    # title
            row.xpath("p[@class='author']/a/text()")[0],    # author
            row.xpath("p[2]/text()")[0].strip(),            # introduction
            row.xpath("p[@class='update']/span/text()")[0]  # last update date
        ]
        out_list.append(row_data)  # append each parsed row to the output list
    return out_list

def save_csv(item, path):  # write the list to a file; utf-8 avoids garbled characters
    with open(path, "a+", newline='', encoding="utf-8") as f:  # open a utf-8 encoded file
        csv_write = csv.writer(f)   # create the writer object
        csv_write.writerows(item)   # write all rows at once

if __name__ == "__main__":
    for i in range(1, 6):
        url = "https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
        html = get_html(url)                # fetch the page
        out_list = parser(html)             # parse it into a list of rows
        save_csv(out_list, "d:\\book.csv")  # save the rows
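Since `save_csv()` opens the file in append mode, re-running the script keeps adding rows and the file never gets column names. If you want a header row, a small variant like the sketch below can write it once before the crawl loop; the column labels here are my own, not from the original post:

import csv

def save_header(path):
    # Write the column names once; subsequent save_csv() calls append the data rows.
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(["title", "author", "introduction", "updated"])

# Call it once at the start of __main__:
# save_header("d:\\book.csv")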
Save into a database
import pymysql
import requests
from lxml import etree

def get_html(url, time=3000):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
        }
        r = requests.get(url, timeout=time, headers=headers)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except Exception as err:
        print(err)

result = []

def parse_html(html):
    html = etree.HTML(html)
    for row in html.xpath('//*[@id="content"]/div/div[1]/ul/li'):
        Naame = row.xpath("div[2]/h2/a/text()")[0].strip()            # book title
        score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()    # rating
        info = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")  # publication info split on "/"
        price = info[0]
        content = info[1]
        a = info[2]
        b = info[-1]
        detail = [Naame, score, price, content, a, b]
        result.append(detail)

def join_all(sql_insert, vals, **dbinfo):
    try:
        connet = pymysql.connect(**dbinfo)
        cursor = connet.cursor()
        cursor.executemany(sql_insert, vals)  # insert all rows in one call
        connet.commit()
        print('Inserted successfully!')
    except Exception as err:
        print(err)
        connet.rollback()
        cursor.close()

if __name__ == "__main__":
    for page in range(1, 16):
        url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(str(page))
        parms = {
            "host": "127.0.0.1",
            "port": 3306,
            "user": "root",
            "passwd": "123456",
            "db": "db",
            "charset": "utf8"
        }
        html = get_html(url)
        parse_html(html)
        sql_insert = "INSERT INTO db(Naame, score, price, content, a, b) \
                      VALUES(%s, %s, %s, %s, %s, %s)"
        join_all(sql_insert, result, **parms)
        print(result)
        result.clear()  # clear the buffer so rows from this page are not inserted again on the next pass
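The INSERT statement assumes a table named `db` with the six columns collected in `detail`. If that table does not exist yet, something like the sketch below can create it first; the column types (and reusing `db` as both the database name and the table name) are assumptions based on the code above, not a schema from the original post:

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS db (
    Naame   VARCHAR(255),   -- book title
    score   VARCHAR(32),    -- rating
    price   VARCHAR(64),    -- first field of the publication info line
    content VARCHAR(255),
    a       VARCHAR(255),
    b       VARCHAR(255)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="123456", db="db", charset="utf8")
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()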
Summary
That brings this article to a close. I hope it has been helpful, and I hope you will keep following 服务器之家 for more content!
Original article: https://blog.csdn.net/qq_50951790/article/details/120643441