python3 爬虫新手笔记(二):PRAW、API爬取Reddit
一. Reddit
1. 前期准备
API · Reddit 阅读说明
OAuth2 · Reddit:create an app,得到client id和secret
acquire a token
import requests
import requests.auth

# Request an OAuth2 access token using the "password" grant.
# The app's client id / secret are sent as HTTP Basic credentials; the Reddit
# account's username / password go in the form body.
client_auth = requests.auth.HTTPBasicAuth('client_id', 'secret')
post_data = {"grant_type": "password", "username": "XXX", "password": "XXX"}
headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
response = requests.post(
    "https://www.reddit.com/api/v1/access_token",
    auth=client_auth,
    data=post_data,
    headers=headers,
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout=10,
)
print(response.json())
{'access_token': 'XXX', 'token_type': 'bearer', 'expires_in': 3600, 'scope': '*'}
2. 表结构
参数 | 类型 | 描述 |
id | int | 唯一标识 |
url | varchar(255) | |
url_md5 | varchar(255) | |
title | varchar(255) | |
author | varchar(255) | |
created_time | datetime | 由帖子的 created_utc 时间戳转换而来 |
selftext | text | 本身的文本或跳转到的超链接 |
score | int | 得分 |
num_comments | int | 评论的数量 |
upvote_ratio | float | up的比例 |
3. 实现
3.1 API包装器PRAW
PRAW: The Python Reddit API Wrapper —— PRAW是一个reddit API的包装器,提供了使用API的接口。
class RedditSpider(scrapy.Spider):
    """Scrapy spider that reads the r/dapps ``new`` listing through PRAW."""

    name = "reddit"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "https://www.reddit.com"
    ]

    def parse(self, response):
        """Yield one ``RedditItem`` per submission in the r/dapps ``new`` listing.

        ``response`` is the Scrapy response for the start URL; its raw body is
        stored unchanged on every item as ``item['html']``.
        """
        # Log in with the app's client id and secret.
        reddit = praw.Reddit(client_id='XXX',
                             client_secret='XXX',
                             grant_type='client_credentials',
                             user_agent='mytestscripts/1.0')

        # Debugging aid (disabled): fetch one submission and dump its attributes.
        # sub = reddit.submission(id='9klf7s')
        # print(sub.title)
        # pprint.pprint(vars(sub))

        # subreddit.stream.submissions() can be used to watch a subreddit for
        # newly created posts:
        # subreddit = reddit.subreddit('dapps')
        # for sub in subreddit.stream.submissions():

        # limit=None asks for all posts (the default limit is 100).
        # The set of attributes returned for each submission may vary.
        subs = reddit.subreddit('dapps').new(limit=None)

        # The page body is the same for every item, so read it once.
        page_html = response.body

        for sub in subs:
            item = RedditItem()
            item['html'] = page_html
            # print(item['html'])

            # permalink is only the site-relative path of the post; prepend the
            # site address to build the full link.
            url = 'https://{}{}'.format(self.allowed_domains[0], sub.permalink)
            item['url'] = url
            # ...... (remaining fields omitted in the original notes)

            # The author may be missing.
            redditor = sub.author
            # print("author:", redditor.name)
            if redditor is not None:
                item['author'] = redditor.name
            else:
                item['author'] = ""

            # sub.created_utc is a UTC timestamp; convert it to a datetime.
            # print("created utc:", sub.created_utc)
            item['created_time'] = datetime.datetime.utcfromtimestamp(sub.created_utc)

            # For a post that is only a hyperlink, sub.selftext is empty, so
            # store the link target instead.
            if sub.is_self:
                item['selftext'] = sub.selftext
            else:
                item['selftext'] = sub.url
            # ...... (remaining fields omitted in the original notes)

            yield item
# Parameterized statements: the values are passed separately to
# cursor.execute(), so the driver does the quoting and escaping. This replaces
# string formatting with pymysql.escape_string on text scraped from Reddit.
html_insert = '''insert into reddit_dapps_html(html) values(%s)'''
# Item keys in the same order as the column list of reddit_insert.
reddit_columns = ('url', 'url_md5', 'title', 'author', 'created_time',
                  'selftext', 'score', 'num_comments', 'upvote_ratio')
reddit_insert = '''insert into reddit_dapps(url, url_md5, title, author, created_time, selftext, score, num_comments, upvote_ratio) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)'''


def process_item(self, item, spider):
    """Normalize the item's fields in place and insert it into both tables.

    ``item`` must provide ``html`` plus every key in ``reddit_columns``.
    Returns the same (mutated) item. ``spider`` is unused here.
    """
    html = item['html']
    if html:
        item['html'] = html.strip().decode(encoding="utf-8")
    # ...... (further processing omitted in the original notes)

    # Format the creation time as a string.
    created_time = item['created_time']
    if created_time:
        item['created_time'] = created_time.strftime("%Y-%m-%d %H:%M:%S")

    selftext = item['selftext']
    if selftext:
        # NOTE(review): the second replace() looks like a no-op as written;
        # presumably its first argument was a non-breaking space originally.
        item['selftext'] = selftext.replace('\n', '').replace(' ', ' ')
    # ...... (further processing omitted in the original notes)

    # Numeric fields (score, ...) need no str() conversion when passed as
    # query parameters.
    self.cursor.execute(self.html_insert, (item['html'],))
    self.cursor.execute(
        self.reddit_insert,
        tuple(item[column] for column in self.reddit_columns))
    return item


def open_spider(self, spider):
    """Open the MySQL connection using the MYSQL_* values from ``self.settings``."""
    # Connect to the database, using the 'utf8mb4' character set.
    # NOTE(review): process_item uses self.cursor, which this excerpt never
    # creates; confirm it is set in the part of the method not shown here.
    self.connect = pymysql.connect(
        host=self.settings.get('MYSQL_HOST'),
        port=self.settings.get('MYSQL_PORT'),
        db=self.settings.get('MYSQL_DBNAME'),
        user=self.settings.get('MYSQL_USER'),
        passwd=self.settings.get('MYSQL_PASSWD'),
        charset='utf8mb4',
        use_unicode=True)
3.2 直接通过API(仅测试)
- 需要使用 oauth token
reddit.com: api documentation
Many endpoints on reddit use the same protocol for controlling pagination and filtering. These endpoints are called Listings and share five common parameters: `after` / `before`, `limit`, `count`, and `show`.
Listings do not use page numbers because their content changes so frequently. Instead, they allow you to view slices of the underlying data. Listing JSON responses contain `after` and `before` fields which are equivalent to the “next” and “prev” buttons on the site and in combination with `count` can be used to page through the listing.
The common parameters are as follows:
- `after` / `before` - only one should be specified. these indicate the fullname of an item in the listing to use as the anchor point of the slice.
- `limit` - the maximum number of items to return in this slice of the listing.
- `count` - the number of items already seen in this listing. on the html site, the builder uses this to determine when to give values for `before` and `after` in the response.
- `show` - optional parameter; if `all` is passed, filters such as “hide links that I have voted on” will be disabled.
To page through a listing, start by fetching the first page without specifying values for `after` and `count`. The response will contain an `after` value which you can pass in the next request. It is a good idea, but not required, to send an updated value for `count` which should be the number of items already fetched.
# Page through r/dapps/new directly via the OAuth API (test only).
# Placeholder header value: substitute the token_type and access_token
# returned by the access_token request.
slice_headers = {'Authorization':'token_type access_token'}
params = {'limit':'1'}  # cap the number of items fetched per request
# Permalinks are site-relative, so the host has to be prepended. The original
# read self.allowed_domains[0], which is undefined in a standalone script.
domain = 'reddit.com'
count = 1
while count < 3:
    response = requests.get("https://oauth.reddit.com/r/dapps/new",
                            headers=slice_headers, params=params, timeout=10)
    if response.status_code != 200:
        # Previously an error status repeated the same request forever.
        break
    response_json = response.json()
    for child in response_json['data']['children']:
        data = child['data']
        print("submission json:", child)
        url = 'https://{}{}'.format(domain, data['permalink'])
        print("url:", url)
        print("title:", data['title'])
        print("author:", data['author'])
        print("created time:", datetime.datetime.utcfromtimestamp(data['created_utc']))
        # For a link-only post, show the link target instead of the self text.
        if data['is_self']:
            print("self text:", data['selftext'])
        else:
            print("self text:", data['url'])
        print("score: ", data['score'])
        print("num comments:", data['num_comments'])
    after = response_json['data']['after']
    if after is None:
        # No anchor was returned, so there is nothing to request next.
        # (The original condition was inverted and only forwarded `after`
        # when it was None.)
        break
    params = {'limit':'1', 'after':after}
    # The page counter was never advanced, so the loop could not terminate.
    count += 1