1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/02/11 21:44
# @Author : dangxusheng
# @Email : dangxusheng163@163.com
# @File : download_by_href.py
'''
自动从arxiv.org 下载文献
'''
import os
import os.path as osp
import requests
from lxml import etree
from pprint import pprint
import re
import time
import glob
# HTTP headers sent with the arXiv search request (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": 'arxiv.org',
}

# Base URL of the Chinese mirror, tried first for every PDF.
HREF_CN = 'http://cn.arxiv.org/pdf/'
# Base URL of the main site, used as the fallback when the mirror fails.
# It must differ from HREF_CN, otherwise the retry just repeats the request
# that has already failed.
HREF_SRC = 'https://arxiv.org/pdf/'

# Directory that receives the downloaded PDFs; created on import if missing.
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)

# Search-result entries whose PDF could not be downloaded from any source.
FAIL_URLS = []
# Text file to which the links of the failed entries are appended.
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'
def download(url, title, timeout=60):
    """Download the PDF at ``url`` into ``SAVE_PATH``, named after ``title``.

    Args:
        url: Direct URL of the PDF file.
        title: Paper title. Runs of characters that are not allowed in file
            names (``\\ / : * ? " ' < > |`` and line breaks) are replaced by
            a single space to build the file name.
        timeout: Seconds to wait for the connection and for each chunk of
            data before giving up. ``None`` waits indefinitely.

    Returns:
        True if a file larger than 50 KiB already exists under that name, or
        if the download produced a file of at least 10 KiB; False otherwise.
        Network and file errors are printed and reported as False.
    """
    pattern = r'[\\/:*?"\'<>|\r\n]+'
    new_title = re.sub(pattern, " ", title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)

    # An existing file above 50 KiB is treated as a finished earlier download.
    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print('this pdf is be existed.')
        return True

    try:
        # Request first, so that a failed request does not leave an empty
        # file behind, and reject HTTP error responses instead of saving an
        # error page under a .pdf name.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(save_filepath, 'wb') as pdf_file:
                # Stream the body in small chunks.
                for chunk in response.iter_content(2048):
                    pdf_file.write(chunk)
        # Measured after the file is closed so buffered bytes are counted.
        downloaded_size = osp.getsize(save_filepath)
    except (requests.RequestException, OSError) as e:
        print(e)
        return False

    # Anything smaller than 10 KiB is not accepted as a real paper.
    if downloaded_size >= 10 * 1024:
        print('%s 下载成功.' % title)
        return True
    return False
# Query the arxiv.org advanced search.
def search(start_size=0, title_keywords='Facial Expression', timeout=60):
    """Fetch one page of arXiv advanced-search results matched by title.

    Args:
        start_size: Offset of the first result to return (sent as ``start``).
        title_keywords: Words that must appear in the paper title.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(info_list, total)``. ``info_list`` holds one
        ``{'title': ..., 'href': ...}`` dict per listed paper that has a
        link; ``total`` is the total number of hits reported by the page.
        ``([], 0)`` is returned when the page reports no count.
    """
    # Legacy listing URL, kept for reference:
    # https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
    req_url = 'https://arxiv.org/search/advanced'
    req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range',  # date_range | specific_year
        # Left empty here; presumably only consulted for 'specific_year'.
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        # submitted_date | submitted_date_first | announced_date_first
        'date-date_type': 'announced_date_first',
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
    }
    res = requests.get(req_url, params=req_data, headers=headers,
                       timeout=timeout)
    html = etree.HTML(res.content.decode())

    total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
    total_text = ' '.join(total_text).replace(' \n ', ' ').strip(' ')
    # Expected form: "Showing 1–50 of 355 results". The count may carry
    # thousands separators ("1,234"), so commas are part of the number.
    match = re.search(r'of\s+(\d[\d,]*)\s+results?', total_text)
    if match:
        count_text = match.group(1)
    else:
        # Unknown wording: fall back to the last number in the heading.
        numbers = re.findall(r'\d[\d,]*', total_text)
        # "Sorry, your query returned no results" contains no number.
        if not numbers:
            return [], 0
        count_text = numbers[-1]
    total = int(count_text.replace(',', ''))  # total number of hits

    info_list = []
    for p in html.xpath('//ol[@class="breathe-horizontal"]/li'):
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ' '.join(title).replace(' \n ', ' ').strip(' ')
        links = p.xpath('./div/p/a/@href')
        if not links:
            # An entry without a link cannot be downloaded; skip it instead
            # of failing the whole page with an IndexError.
            continue
        info_list.append({'title': title, 'href': links[0]})
    return info_list, total
# Collect papers from one specific listing page.
def search_special(timeout=60):
    """Scrape the paper list from the "the-gan-zoo" repository page on gitee.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'title': ..., 'href': ...}`` dicts, one per list item
        of the rendered markdown that has a direct link. The list is also
        pretty-printed to stdout.
    """
    res = requests.get(
        'https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search',
        timeout=timeout,
    )
    html = etree.HTML(res.content.decode())
    info_list = []
    for item in html.xpath('//div[@class="file_content markdown-body"]//li'):
        title = item.xpath('.//text()')
        title = ' '.join(title).replace(' \n ', ' ').strip(' ')
        links = item.xpath('./a/@href')
        if not links:
            # List items without a direct link have nothing to download;
            # indexing [0] here would raise IndexError.
            continue
        info_list.append({'title': title, 'href': links[0]})
    pprint(info_list)
    return info_list
def _pdf_url(base, paper_href):
    """Return the PDF URL under ``base`` for a search-result link."""
    # The last path component of the result link is used as the paper id.
    return base + paper_href.split('/')[-1] + '.pdf'


def main(keywords='Facial Action Unit', page_size=50):
    """Download every paper whose title matches ``keywords``.

    Walks the search results page by page, tries the mirror first and the
    main site second for each paper, and finally appends the links of all
    papers that could not be downloaded to ``FAIL_URLS_TXT``.

    Args:
        keywords: Title keywords passed to ``search``.
        page_size: Number of results per page; must equal the ``size``
            value that ``search`` sends (50).
    """
    start = 0
    total = None  # unknown until the first page has been fetched
    while total is None or start < total:
        paper_list, total = search(start, keywords)
        print(f'total: {total}')
        if total == 0 and start == 0:
            print('no found .')
            return
        if total == 0 or not paper_list:
            # Ran past the end of the results: stop paging, but still
            # record the failures collected so far.
            break
        for p in paper_list:
            title = p['title']
            href = _pdf_url(HREF_CN, p['href'])
            print(href)
            if not download(href, title):
                print('从国内镜像下载失败,从源地址开始下载 >>>>')
                # Retry once from the main site.
                href = _pdf_url(HREF_SRC, p['href'])
                if not download(href, title):
                    FAIL_URLS.append(p)
            # Short pause between papers to avoid hammering the server.
            time.sleep(1)
        start += page_size

    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a') as f:
        f.writelines(item['href'] + '\n' for item in FAIL_URLS)
    print('done.')


if __name__ == '__main__':
    main()
|
以上就是python自动从arxiv下载paper的示例代码的详细内容,更多关于python 从arxiv下载paper的资料请关注服务器之家其它相关文章!
原文链接:https://www.cnblogs.com/dxscode/p/13406238.html