用selenium 自动爬取某一本小说章节及其内容,并存入数据库中

时间:2022-09-18 17:29:28
用selenium 自动爬取某一本小说章节及其内容,并存入数据库中
 1 from selenium import webdriver
 2 import pymysql
 3 from selenium.webdriver.support.ui import WebDriverWait     # 等待
 4 from selenium.webdriver.support import expected_conditions as ec # 等待条件
 5 from selenium.webdriver.common.by import By
 6 import html
 7 import _thread
 8 from selenium.webdriver.chrome.options import Options
 9 
def ceil(x, y):
    """Return ``x / y`` rounded up to the nearest integer.

    Uses floor division instead of float division, so the result stays
    exact for arbitrarily large integers and is also correct when the
    quotient is negative.

    Args:
        x: Dividend.
        y: Divisor; must be non-zero.

    Returns:
        The smallest integer that is greater than or equal to ``x / y``.

    Raises:
        ZeroDivisionError: If ``y`` is zero.
    """
    # Floor division rounds toward -infinity, so negating the dividend
    # before dividing and negating the result rounds toward +infinity.
    return int(-(-x // y))
15 
16 
17 
18 
# Create a headless Chrome browser.
chrome_options = Options()
chrome_options.add_argument('--headless')
# `options=` is the supported keyword; `chrome_options=` is deprecated and
# has been removed from current Selenium releases.
dr = webdriver.Chrome(options=chrome_options)

try:
    # Open the chapter index page.
    dr.get('https://doupocangqiong1.com/1/list_piaotian/')

    # Collect every chapter link (<a>) in the chapter list.
    a = dr.find_elements(By.CSS_SELECTOR, '.dirlist > li > a')

    # Connect to the database. Keyword arguments are used because newer
    # PyMySQL releases no longer accept positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="selenium", charset='utf8')
    # Get a cursor.
    cursor = db.cursor()

    # Let the driver bind the values instead of formatting them into the SQL
    # text, so chapter titles containing quotes cannot break the statement.
    sql = "INSERT INTO novel (name,href,content) VALUES (%s,%s,%s)"
    rows = [(link.text, link.get_attribute('href'), '') for link in a]
    if rows:
        cursor.executemany(sql, rows)
    db.commit()
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # even when scraping or the database insert failed.
    dr.quit()
42 
43 
44 
def line(lineName, start, count):
    """Scrape the content of one slice of chapters and store it in the database.

    Opens its own headless Chrome browser and its own MySQL connection, reads
    ``count`` rows of the ``novel`` table starting at offset ``start``, loads
    each chapter page, and writes the HTML-escaped chapter text back into the
    row's ``content`` column.

    Args:
        lineName: Label printed in the progress messages.
        start: Zero-based offset of the first ``novel`` row to process.
        count: Maximum number of rows to process.
    """
    # Create a browser for this worker.
    dr = webdriver.Chrome(options=chrome_options)
    try:
        # Connect to the database (keyword arguments work on old and new
        # PyMySQL releases alike).
        db = pymysql.connect(host="localhost", user="root", password="root",
                             database="selenium", charset='utf8')
        try:
            cursor = db.cursor()
            # ORDER BY makes the LIMIT window deterministic, so slices handed
            # to different workers neither overlap nor skip rows.
            cursor.execute(
                "SELECT id,href FROM novel ORDER BY id LIMIT %s, %s",
                (int(start), int(count)),
            )
            data = cursor.fetchall()
            for chapter_id, href in data:
                dr.get(href)
                # Wait up to 5 seconds, polling every 0.1 seconds, until the
                # placeholder text is no longer present in the content element.
                WebDriverWait(dr, 5, 0.1).until_not(
                    ec.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, '#chaptercontent'),
                        '正在转码,请稍后......',
                    )
                )
                # The text is stored HTML-escaped; html.unescape() restores
                # the original characters when reading it back.
                content = html.escape(
                    dr.find_element(By.CSS_SELECTOR, '#chaptercontent').text
                )
                # Bound parameters: chapter text may contain quotes or
                # backslashes that would break a string-formatted statement.
                cursor.execute(
                    "UPDATE novel SET content = %s WHERE id = %s",
                    (content, chapter_id),
                )
                db.commit()
                print(lineName, '完成了', chapter_id, '的采集')
            cursor.close()
        finally:
            # Release the connection even if a page timed out or a query failed.
            db.close()
    finally:
        # quit() closes every window and stops the driver process.
        dr.quit()
    print(lineName, '完成了采集')
71 
72 
def productLine(func, total, lineCount):
    """Start ``lineCount`` threads that share the rows counted in ``total``.

    ``total[0][0]`` is read as the overall row count. It is divided, rounding
    up, into ``lineCount`` equally sized slices, and ``func`` is started in a
    new thread for each slice as ``func(name, offset, size)``. The function
    returns without waiting for the threads.
    """
    chunk = ceil(total[0][0], lineCount)
    print('every', chunk)
    for idx in range(lineCount):
        print('-------------', idx)
        worker_args = ('line-' + str(idx), idx * chunk, chunk)
        thread_id = _thread.start_new_thread(func, worker_args)
        print(thread_id)
79 
80 
import time

try:
    sql = 'SELECT COUNT(*) FROM novel'
    cursor.execute(sql)  # Run the query.
    total = cursor.fetchall()  # Single row holding the row count.
    print(total)

    productLine(line, total, 5)

except Exception as exc:
    # Report the actual cause; a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide what went wrong.
    print("Error: unable to start thread", exc)


# Threads started through _thread are not waited for at interpreter exit, so
# the main thread has to stay alive. Sleeping instead of spinning keeps it
# from burning a CPU core and competing with the workers for the GIL.
while True:
    time.sleep(1)
View Code