# 本文介绍:传入 Twitter 视频播放链接,通过 http://twdown.net/ 网站获得该视频的 MP3、MP4 下载链接,并用 youtube-dl 下载 Twitter 视频。
#coding=utf-8
import os
import re
import sys
import time
import datetime
import requests
# import pdfkit
import hashlib
from scrapy.http import Request, HtmlResponse
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient
import smtplib
import urlparse
import json
import redis
from gevent.pool import Pool
from gevent import monkey
# Patch the blocking standard-library modules so they cooperate with gevent.
# NOTE(review): gevent's documentation recommends patching as early as
# possible, before other modules are imported; here it runs after requests,
# pymongo and redis have already been imported -- confirm the ordering is OK.
monkey.patch_all()
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up; reload(sys) restores it so the process-wide implicit
# str/unicode codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
def get_url(data):
    """POST ``data`` to the twdown.net download endpoint and parse the reply.

    The request is attempted up to 10 times, sleeping 2 seconds after each
    failed attempt.

    Args:
        data: Form fields for the POST body; the caller in this file passes
            ``{'URL': <tweet url>}``.

    Returns:
        A ``(hxs, html)`` tuple.  ``hxs`` is an ``HtmlXPathSelector`` built
        from the response, or ``None`` when every attempt failed.  ``html``
        is the last response body that was fetched, or ``None`` when no
        request ever succeeded.
    """
    # NOTE(review): only an 'https' proxy is configured while the endpoint
    # below is plain http, so this POST presumably bypasses the proxy --
    # confirm that is intended.
    proxies = {'https': '127.0.0.1:8123'}
    url = 'http://twdown.net/download.php/'
    print(data)
    # Browser-like headers captured from a real session.  Content-Length is
    # deliberately not set: requests derives it from the encoded form body,
    # and a fixed value would be wrong for any other URL length.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': '__cfduid=d5949cf9b7f7659555bf30bd1176139981497940317; td_cookie=18446744071819015387; _ga=GA1.2.2022387890.1497940226; _gid=GA1.2.1592134325.1497940226',
        'Host': 'twdown.net',
        'Origin': 'http://twdown.net',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://twdown.net/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    # Both results start as None so the final return is always well defined,
    # even when every attempt raises before anything is assigned.
    hxs = None
    html = None
    for attempts_left in range(10, 0, -1):
        print(attempts_left)
        try:
            # The timeout keeps one stalled connection from blocking the
            # whole run; a timed-out attempt is retried like any other error.
            html = requests.post(url, headers=headers, proxies=proxies,
                                 data=data, timeout=30).content
            response = HtmlResponse(url=url, body=html, encoding='utf-8')
            hxs = HtmlXPathSelector(response)
            break
        except Exception as e:
            print(e)
            time.sleep(2)
            print(attempts_left)
    return hxs, html
def get_download_url(p):
    """Resolve MP4/MP3 download links for one stored tweet via twdown.net.

    Loads the document whose ``_id`` is ``p`` from
    ``alex_movie.alex_movie_bytwitter``, submits its ``url`` field to
    twdown.net through ``get_url`` and scans the result table for links.
    When a link is found it is written back to the document as ``mp4_url``
    and/or ``mp3_url`` together with ``is_download = True``.  If the table
    holds several matching links, the last one of each kind wins.

    Args:
        p: ``_id`` of the document to process.

    Returns:
        None.  The function returns early, without writing anything, when
        the document does not exist or the page could not be fetched.
    """
    link = 'http://twdown.net/'
    collection = db_client.alex_movie.alex_movie_bytwitter
    temp = collection.find_one({'_id': p})
    if temp is None:
        # The document may have been removed since it was queued.
        print('no document found for _id %s' % p)
        return

    hxs, html = get_url({'URL': temp['url']})
    if hxs is None:
        # Every fetch attempt failed; get_url has already printed the errors.
        return

    mp4 = ''
    mp3 = ''
    rows = hxs.select('//div[@class="col-md-8"]/table[@class="table table-condensed table-hover table-striped"]/tbody/tr')
    for row in rows:
        for anchor in row.select('./td/a'):
            href = ''.join(anchor.select('./@href').extract()).strip()
            if '.mp4' in href:
                mp4 = href
                print(mp4)
            if 'mp3.' in href:
                # The mp3 href is resolved against the site root before it is stored.
                mp3 = urlparse.urljoin(link, href)
                print(mp3)

    # Collect whatever was found and write it in a single $set so the
    # document is never left half-updated between two separate writes.
    found = {}
    if mp4:
        found['mp4_url'] = mp4
    if mp3:
        found['mp3_url'] = mp3
    if found:
        found['is_download'] = True
        collection.update({'_id': temp['_id']}, {'$set': found})
def download(p):
    """Download the video of one stored tweet with youtube-dl.

    Loads the document whose ``_id`` is ``p`` from
    ``alex_movie.alex_movie_bytwitter`` and runs youtube-dl on its ``url``
    field, up to 10 times with a 3 second pause after each failure.  On
    success the document gets ``is_mp4``, ``movie_download_site`` and
    ``download_time``; if the last attempt also fails, ``is_download`` is
    set to ``False``.

    Args:
        p: ``_id`` of the document to download.  Assumed to be a text
            string, since it is ``.encode``d to build the file name --
            TODO confirm against the code that inserts these documents.

    Returns:
        None.
    """
    # Local import: subprocess is not among the file-level imports.
    import subprocess

    movie_site = '/mnt/parastor/data/downdata/videos/usatoday_mp4/'
    collection = db_client.alex_movie.alex_movie_bytwitter
    temp = collection.find_one({'_id': p})
    if temp is None:
        print('no document found for _id %s' % p)
        return

    url = temp['url']
    alex_id = temp['_id']
    video_path = os.path.join(movie_site, '%s.mp4' % alex_id.encode('utf-8'))
    # An argument list (no shell) is used so that quotes or shell
    # metacharacters in the stored URL cannot alter the command.
    command = [
        'youtube-dl',
        '--proxy', 'socks5://127.0.0.1:1080',
        '-i',
        '--no-check-certificate',
        '-o', video_path,
        '-R', '20',
        url,
    ]
    try:
        for attempts_left in range(10, 0, -1):
            exit_code = subprocess.call(command)
            if exit_code == 0:
                print('成功')
                collection.update(
                    {'_id': temp['_id']},
                    {'$set': {'is_mp4': True,
                              'movie_download_site': video_path,
                              'download_time': int(time.time())}})
                break
            # Any non-zero exit code is a failed attempt.
            print('错误')
            if attempts_left == 1:
                # Out of retries: record the failure on the document.
                collection.update({'_id': temp['_id']},
                                  {'$set': {'is_download': False}})
            time.sleep(3)
    except Exception as e:
        # Best effort: report the problem (for example youtube-dl missing)
        # instead of hiding it, but do not abort the caller.
        print(e)
if __name__ == '__main__':
    # Script entry point: repeatedly take one document that is flagged
    # is_download=True but has no mp3_url yet, and resolve its download
    # links.  The loop ends when no such document is left.
    # download(p) is defined in this file but is not invoked here.
    db_client = MongoClient('192.168.86.136', 27017)
    try:
        collection = db_client.alex_movie.alex_movie_bytwitter
        pending = {'is_download': True, 'mp3_url': {'$exists': False}}
        while True:
            # find_one returns None when nothing matches.  A find() cursor is
            # always truthy, so testing it can never end the loop.
            r_1 = collection.find_one(pending)
            if r_1 is None:
                break
            print('------>ok')
            p = r_1['_id']
            # Clear the flag first so the document is not selected again
            # unless get_download_url sets it back to True.
            collection.update({'_id': p}, {'$set': {'is_download': False}})
            # NOTE(review): if a page yields an mp4 link but no mp3 link,
            # get_download_url sets is_download back to True while mp3_url
            # stays absent, so the same document would match again --
            # confirm whether that can happen on twdown.net.
            get_download_url(p)
    finally:
        # Close the connection even when processing raises.
        db_client.close()