# -*- coding: utf-8 -*-
"""
Created on Sun May 22 21:19:26 2016

This script collects the detail information from third-level pages,
including the third-level URL, author, year, page count, language,
ISBN, category, and description.

Home page:
    http://www.allitebooks.com/
First-level URLs:
    http://www.allitebooks.com/page/1/
    ...
    http://www.allitebooks.com/page/596/
Second-level URLs:
    http://www.allitebooks.com/3d-printed-science-projects/
Third-level URLs:
    http://file.allitebooks.com/20160521/3D Printed Science Projects.pdf

@author: daxiong
"""
import requests, bs4, csv, time, threading

# The URL list is split into roughly this many chunks, one thread per chunk
divident = 10
base_dir = "C:/Users/daxiong/Desktop/采集ebooks/"
# CSV holding all second-level URLs
#fileName = base_dir + 'secondLinks.csv'
# Collect the supplementary/updated batch instead
fileName = base_dir + 'renew.csv'
# Read the second-level URLs from the CSV file
def Get_secondLinks_from_csv(fileName):
    with open(fileName) as f:
        content = f.readlines()
    return content
# Fetch one third-level (download) URL and write it to its own CSV file
def Get_thirdLink(url):
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    elems = soup.select(".download-links a")
    thirdLink = elems[0].get('href')
    # Name the CSV after the book's URL slug, e.g. "3d-printed-science-projects.csv"
    parts = url.split("/")
    fileName1 = parts[3] + ".csv"
    Write_table_to_csv(fileName1, [(url, thirdLink)])
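# A slightly hardened variant of Get_thirdLink (a sketch, not used by the run
# below): the name Get_thirdLink_safe, the timeout and the polite delay are
# assumptions, not something the original script sets.
def Get_thirdLink_safe(url, timeout=10, delay=1.0):
    res = requests.get(url, timeout=timeout)  # fail fast on a stalled server
    res.raise_for_status()                    # surface HTTP errors explicitly
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    elems = soup.select(".download-links a")
    if not elems:                             # page without a download block
        raise ValueError("no download link on " + url)
    thirdLink = elems[0].get('href')
    parts = url.split("/")
    Write_table_to_csv(parts[3] + ".csv", [(url, thirdLink)])
    time.sleep(delay)                         # be polite between requests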
'''
# Collect the detail information of one third-level page (unfinished draft)
def Get_author(url):
    # Load the page
    res = requests.get(url)
    # Parse the page
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    # Collect the third-level link
    thirdLinkElems = soup.select(".download-links a")
    thirdLink = thirdLinkElems[0].get('href')
    # Collect the author
    author_elem = soup.select(".book-detail a")
    author = author_elem[0].get('href')
'''
# Write every third-level URL in the list to its CSV file
def Get_thirdLinks(list1, start):  # start is unused; kept for the call signature
    for url in list1:
        try:
            Get_thirdLink(url)
        except Exception:
            print("bad thirdlink:", url)
            continue
# step is the chunk size: how many URLs each thread handles
def Step(urls_list, divident):
    step = len(urls_list) // divident
    return step
# left is the remainder: URLs that do not fit evenly into the chunks
def Left(urls_list):
    step = Step(urls_list, divident)
    left = len(urls_list) - step * divident
    return left
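# Worked example (the list length of 5963 is assumed for illustration):
# with divident = 10,
#   Step -> 5963 // 10 = 596      (each thread handles 596 URLs)
#   Left -> 5963 - 596 * 10 = 3   (3 URLs left over)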
# Write one page of content to a CSV file; list_tableContent is a
# two-dimensional list such as [[a],[b],[c]] (a bare string would be
# written character by character, so rows must be wrapped in a list)
def Write_table_to_csv(fileName, list_tableContent):
    with open(fileName, 'w', newline='') as file:
        writer1 = csv.writer(file)
        writer1.writerows(list_tableContent)
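# Mode 'w' truncates the file on every call, which is why each book ends up in
# its own one-row CSV. An append-mode variant (a sketch; Append_table_to_csv is
# not part of the original script) would let rows accumulate in a single file:
def Append_table_to_csv(fileName, list_tableContent):
    with open(fileName, 'a', newline='') as f:
        csv.writer(f).writerows(list_tableContent)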
# Collect the third-level links for the slice [start:end] of the URL list
def download_range(list_pages, start, end):
    urls_list_range1 = list_pages[start:end]
    Get_thirdLinks(urls_list_range1, start)
    #print("well Done")
# Three parameters: list1 is the URL list, target1 the worker function,
# step the chunk size; one thread is started per chunk of the list
def Multi_thread(list1, target1, step):
    for i in range(0, len(list1), step):  # one thread per slice of the list
        try:
            downloadThread = threading.Thread(target=target1, args=(list1, i, i + step))
            downloadThreads.append(downloadThread)
            downloadThread.start()
        except Exception:
            print("bad thread:", downloadThread)
            continue
    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
        print("Done one thread")
    print('Done.')
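# Chunking example (the list length of 5963 is assumed, as above): with
# step = 596, range(0, 5963, 596) yields i = 0, 596, ..., 5960, i.e. 11
# threads; the last slice [5960:6556] is clamped by Python slicing, so the
# remainder computed by Left() is in fact already covered by this loop.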
# Report the elapsed time
def time_count(time2, time1):
    print("time consuming:", time2 - time1)
    return time2 - time1
secondLinks = Get_secondLinks_from_csv(fileName)
# Extract all second-level URLs (the URL is the second comma-separated field of each row)
secondLinks1 = [i.split(',')[1].strip('\n') for i in secondLinks]
# Optionally collect only a slice first (the commented example takes URLs 4000-5000)
#secondLinks2 = secondLinks1[4000:5000]
step = Step(secondLinks1, divident)
left = Left(secondLinks1)
downloadThreads = []  # a list of all the Thread objects
'''
time1=time.time()
Multi_thread(secondLinks1,download_range,step)
time2=time.time()
time_count(time2,time1)
'''
site = secondLinks1[1]
# e.g. 'http://www.allitebooks.com/mapping-experiences/'
res = requests.get(site)
soup = bs4.BeautifulSoup(res.text, 'lxml')
elems = soup.select(".download-links a")
thirdLink = elems[0].get('href')
# Collect the author: take the slug from the author link's href
author_elem = soup.select(".book-detail a")
author = author_elem[0].get('href')
author1 = author.split("/")
author2 = author1[-2]  # final result
# Collect ISBN, year, pages, language and category: they all live in
# ".book-detail dd" cells, so select once and index into the result
detail_elems = soup.select(".book-detail dd")
isbn1 = detail_elems[1].getText()      # final result: ISBN
year1 = detail_elems[2].getText()      # final result: year
pages1 = detail_elems[3].getText()     # final result: page count
language1 = detail_elems[4].getText()  # final result: language
category1 = detail_elems[7].getText()  # final result: category
# Collect the description text
description_elems = soup.select(".entry-content")
description = description_elems[0].getText()  # final result
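# Putting the scraped fields together (a sketch, not part of the original run):
# write one full record using the helper above. The output file name
# "book_details.csv" is an assumption.
record = (site, thirdLink, author2, year1, pages1, language1,
          isbn1, category1, description)
Write_table_to_csv("book_details.csv", [record])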