1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
import requests
import re
import json
import os
# One shared HTTP session for every request in this script, so cookies and
# pooled connections are reused between calls.  ``requests.Session()`` is the
# supported constructor; the lowercase ``requests.session()`` alias is kept by
# the library only for backwards compatibility.
session = requests.Session()
def fetch_url(url):
    """Download *url* through the shared session and return its body as text.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The raw response body decoded as GBK.

    Raises:
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # Decode the raw bytes ourselves instead of trusting the encoding that
    # requests would guess from the response headers.
    return session.get(url).content.decode('gbk')
def get_doc_id(url):
    """Extract the document id from a ``.../view/<id>.html`` URL.

    Args:
        url: Document URL containing a ``view/<id>.html`` segment.

    Returns:
        The text between ``view/`` and the first ``.html`` that follows it.

    Raises:
        IndexError: If *url* has no ``view/<id>.html`` segment.
    """
    # The dot is escaped and the group is lazy so the id stops at the first
    # literal ".html", even when ".html" appears again in the query string.
    return re.findall(r'view/(.*?)\.html', url)[0]
def parse_type(content):
    """Return the document type declared in the view page source.

    Looks for the first ``docType ... : ... '<value>',`` occurrence in
    *content* and returns the single-quoted value.

    Args:
        content: Text of the document's view page.

    Returns:
        The quoted value that follows ``docType`` (``main`` in this file
        branches on ``'doc'`` and ``'txt'``).

    Raises:
        IndexError: If *content* has no matching ``docType`` entry.
    """
    return re.findall(r"docType.*?\:.*?\'(.*?)\'\,", content)[0]
def parse_title(content):
    """Return the document title declared in the view page source.

    Looks for the first ``title ... : ... '<value>',`` occurrence in
    *content* and returns the single-quoted value.

    Args:
        content: Text of the document's view page.

    Returns:
        The quoted value that follows ``title``.

    Raises:
        IndexError: If *content* has no matching ``title`` entry.
    """
    return re.findall(r"title.*?\:.*?\'(.*?)\'\,", content)[0]
def parse_doc(content):
    """Rebuild the plain text of a ``doc``-type document from its page JSON.

    The view page embeds escaped ``https...0.json...`` URLs.  Each one that
    is kept is downloaded with ``fetch_url`` and its ``"c"`` text fragments
    are concatenated, starting a new output line whenever a fragment's
    ``"y"`` value differs from the previous fragment's.

    Args:
        content: Text of the document's view page.

    Returns:
        The reconstructed text, or ``''`` when no URL is left to fetch.
    """
    page_urls = re.findall('(https.*?0.json.*?)\\\\x22}', content)
    # The URLs are embedded with escaped slashes; turn them back into "/".
    page_urls = [addr.replace("\\\\\\/", "/") for addr in page_urls]

    pieces = []
    # The last five matches are skipped -- presumably they are not text
    # pages of the document (TODO confirm against a live page).
    for url in page_urls[:-5]:
        page = fetch_url(url)
        # "y" is captured as a string, so the first fragment of every page
        # always differs from this initial 0 and therefore opens a new line.
        last_y = 0
        for text, y in re.findall('"c":"(.*?)".*?"y":(.*?),', page):
            if y != last_y:
                last_y = y
                pieces.append('\n')
            # Resolve \uXXXX escapes.  Encoding as latin-1 with
            # backslashreplace keeps any literal non-ASCII character intact;
            # a UTF-8 round trip through unicode_escape would turn such
            # characters into mojibake.
            pieces.append(
                text.encode('latin-1', 'backslashreplace')
                .decode('unicode_escape', 'ignore')
            )
    return ''.join(pieces)
def parse_txt(doc_id):
    """Download the text of a ``txt``-type document.

    Fetches the document info for *doc_id*, pulls ``md5sum``,
    ``totalPageNum`` and ``rsign`` out of it, builds the text-retrieval URL
    from them, and joins the ``c`` field of every paragraph in the JSON
    reply.

    Args:
        doc_id: Document id as returned by ``get_doc_id``.

    Returns:
        The document text with literal ``\\r`` / ``\\n`` sequences turned
        into real carriage returns and newlines.

    Raises:
        IndexError: If the info response lacks one of the three fields.
    """
    info_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id
    info = fetch_url(info_url)
    md5 = re.findall('"md5sum":"(.*?)"', info)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', info)[0]
    rsign = re.findall('"rsign":"(.*?)"', info)[0]

    # md5 is appended right after "&type=txt" with no separator; presumably
    # the value already starts with its own "&name=" prefix (TODO confirm).
    text_url = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    )
    pages = json.loads(fetch_url(text_url))

    return ''.join(
        parag['c'].replace('\\r', '\r').replace('\\n', '\n')
        for page in pages
        for parag in page['parags']
    )
def parse_other(doc_id):
    """Download every page image of a non-text document into ``./<doc_id>/``.

    Requests the image URL list for *doc_id*, then saves each image as
    ``<index>.jpg`` (zero-based) inside a directory named after the id.

    Args:
        doc_id: Document id as returned by ``get_doc_id``.
    """
    list_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt"
    listing = fetch_url(list_url)
    image_urls = re.findall('{"zoom":"(.*?)","page"', listing)
    # Drop the backslashes that escape the slashes in the returned URLs.
    image_urls = [item.replace("\\", '') for item in image_urls]

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(doc_id, exist_ok=True)

    for index, url in enumerate(image_urls):
        image = session.get(url).content
        path = os.path.join(doc_id, str(index) + '.jpg')
        with open(path, 'wb') as f:
            f.write(image)
    print("图片保存在" + doc_id + "文件夹")
def save_file(filename, content):
    """Write *content* to *filename* as UTF-8 text and report the path.

    Args:
        filename: Destination path; an existing file is overwritten.
        content: Text to write.
    """
    with open(filename, 'w', encoding='utf8') as f:
        f.write(content)
    # Report only after the file has been closed, i.e. fully written.
    print('已保存为:' + filename)
# test_txt_url = 'https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search'
# test_ppt_url = 'https://wenku.baidu.com/view/2b7046e3f78a6529657d5376.html?from=search'
# test_pdf_url = 'https://wenku.baidu.com/view/dd6e15c1227916888586d795.html?from=search'
# test_xls_url = 'https://wenku.baidu.com/view/eb4a5bb7312b3169a551a481.html?from=search'
def main():
    """Prompt for a document URL and download it according to its type.

    ``doc`` and ``txt`` documents are saved as ``<title>.txt``; every other
    type is handed to ``parse_other``, which saves page images instead.
    """
    url = input('请输入要下载的文库URL地址')
    content = fetch_url(url)
    doc_id = get_doc_id(url)
    # Named doc_type rather than `type` so the builtin is not shadowed.
    doc_type = parse_type(content)
    title = parse_title(content)

    if doc_type == 'doc':
        result = parse_doc(content)
    elif doc_type == 'txt':
        result = parse_txt(doc_id)
    else:
        # Anything else is downloaded as images; there is no text to save.
        parse_other(doc_id)
        return
    save_file(title + '.txt', result)
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
爬取结果
以上就是 Python 爬取百度文库并下载的详细内容,更多关于 Python 爬取百度文库的资料请关注服务器之家其它相关文章!
原文链接:https://github.com/Jack-Cherish/python-spider