An entry-level crawler: it only fetches book names, descriptions, and download addresses, and stores them in a database.
Database utility class: DBUtil.py
import pymysql

class DBUtils(object):
    def connDB(self):  # open a database connection
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):  # run an update or insert statement
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):  # delete rows; not used in this demo
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):  # run a query
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):  # close the connection and release resources
        cur.close()
        conn.close()

if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
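Note that exeUpdate runs whatever SQL string it is handed, and the callers below build those strings with % interpolation, so a single quote inside a book title breaks the statement (and invites SQL injection). pymysql can bind the values itself with %s placeholders; here is a minimal sketch of the same insert done that way (same book object and DBUtils helper as in this post):

# Sketch: let pymysql escape the values instead of splicing them into the SQL.
dbUtil = DBUtils()
conn, cur = dbUtil.connDB()
insertBookSql = "insert into book (bookName, bookUrl, bookInfo) values (%s, %s, %s)"
cur.execute(insertBookSql, (book.bookName, book.downLoadUrl, book.mainInfo))
conn.commit()
dbUtil.connClose(conn, cur)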
Book operations file: bookOpe.py
from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(
    level=logging.INFO
)

class BookOperator(object):
    def __addBook(self, book):  # insert the book row
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"
                         % (book.bookName, book.downLoadUrl, book.mainInfo))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):  # fetch the id of the most recently inserted book
        logging.info("selectLastBookId")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):  # insert the download links for one book
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadinfo in downLoadInfos:
            insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"
                                      % (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):  # store a book and its download links
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)

if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "book"))
    bookope.addBookInfo(book)
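__selectLastBookId re-reads the highest id from the table, which can return another writer's row if two crawls insert at the same time. pymysql exposes the AUTO_INCREMENT id generated by the last insert on the cursor itself; a minimal sketch of that alternative (using the same DBUtils helper, not how the original code does it):

# Sketch: take the generated id straight from the cursor after the insert.
dbUtil = DBUtils()
conn, cur = dbUtil.connDB()
cur.execute("insert into book (bookName, bookUrl, bookInfo) values (%s, %s, %s)",
            (book.bookName, book.downLoadUrl, book.mainInfo))
conn.commit()
bookId = cur.lastrowid  # id generated by this connection's insert
dbUtil.connClose(conn, cur)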
Book info file: bookInfo.py
class Book(object):
    # book info #
    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo
        self.downLoadUrl = downLoadUrl
        self.bookName = bookName
        self.downLoadInfos = []

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        print("bookName :%s" % (self.bookName))

class DownLoadInfo(object):
    # download link info #
    def __init__(self, downUrl, downName):
        self.downUrl = downUrl
        self.downName = downName

    def print_down_info(self):
        print("downLoad %s - %s" % (self.downUrl, self.downName))
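Book and DownLoadInfo are plain data holders. On Python 3.7+ the same shape can be written as dataclasses, which generate __init__ and __repr__ for free; a sketch of that alternative (equivalent fields, not code from the repository):

from dataclasses import dataclass, field
from typing import List

@dataclass
class DownLoadInfo:
    downUrl: str
    downName: str

@dataclass
class Book:
    mainInfo: str
    downLoadUrl: str
    bookName: str
    downLoadInfos: List[DownLoadInfo] = field(default_factory=list)

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)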
Page-parsing file (named after 51job): FiveOneJobFetch.py
import requests
from bs4 import BeautifulSoup
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

class PageFetch(object):
    host = "https://www.zzvips.com/"  # site root (requests needs a URL scheme)
    category = "books/"               # listing path under the root

    def __init__(self, pageUrl):
        self.pageUrl = pageUrl  # page file name, e.g. list152_1.html
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    @staticmethod
    def getPageContent(url):
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        # Find the max page number and the pager URL pattern.
        # Pager URLs look like list45_2.html, where 2 is the page number.
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":   # reached the last page; pattern found
                    maxLink = alink[1]['href']
                else:                          # follow the last pager link forward
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        # Build a page URL of the form list45_2.html for the given page index.
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        # URLs of every listing page in this category.
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        # Collect the detail-page URLs of every book on one listing page.
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        logging.info("fetching book info, url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book

if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("================ summary ===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
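getPageContent calls requests.get with no timeout, so a single stalled connection hangs the whole crawl, and any non-200 page silently becomes an empty string. A hardened sketch of the same helper (the 10-second timeout, 3 retries, and 1-second pause are assumptions, not values from the original):

import time
import logging
import requests

def get_page_content(url, retries=3, timeout=10):
    # Retry transient failures; the timeout keeps a dead socket from blocking forever.
    for attempt in range(retries):
        try:
            req = requests.get(url, timeout=timeout)
            if req.status_code == 200:
                req.encoding = "gb2312"
                return req.text
            logging.warning("unexpected status %d for %s", req.status_code, url)
        except requests.RequestException as exc:
            logging.warning("fetch failed for %s (attempt %d): %s", url, attempt + 1, exc)
        time.sleep(1)  # brief pause before the next try
    return ""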
Entry-point file: 51Job.py. Copy all the files above into the same folder and run this one.
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("data fetched successfully: " + url)

if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html", "list977_2.html",
            "list572_5.html", "list509_2.html", "list481_1.html", "list576_1.html",
            "list482_1.html", "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
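Each entry in urls is crawled independently and every database call opens its own connection, so the categories can also run concurrently. A sketch with a small thread pool (the pool size of 4 is an assumption; keep it small to stay polite to the site):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(main, urls)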
Database tables: the book info table and the download address table
CREATE TABLE `book` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookName` VARCHAR(200) NULL DEFAULT NULL,
  `bookUrl` VARCHAR(500) NULL DEFAULT NULL,
  `bookInfo` TEXT NULL,
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookId` INT(11) NOT NULL DEFAULT '0',
  `downName` VARCHAR(200) NOT NULL DEFAULT '0',
  `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
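One mismatch worth noting: both tables use a utf8mb4 collation, but DBUtils connects with charset='utf8', which in MySQL is a 3-byte subset that cannot store 4-byte characters such as emoji. If the schema stays utf8mb4, the safer connection setting is one changed argument in connDB (credentials as in DBUtil.py):

conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                       passwd='b6f3g2', db='yangsj', charset='utf8mb4')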
Git repository: https://git.oschina.net/yangsj/BookFetch/tree/master