"""Scrape a novel's chapter index and chapter bodies into MySQL.

Flow (executed at module level, as in the original script):

1. Open the chapter index page in headless Chrome and insert one row per
   chapter link (name, href, empty content) into the ``novel`` table.
2. Count the rows in ``novel`` and split them into equally sized slices.
3. Start one worker thread per slice; each worker drives its own browser
   and database connection, fetches every chapter page of its slice and
   stores the HTML-escaped chapter text in ``novel.content``.
4. Wait for all workers to finish, then exit.
"""
import _thread  # kept from the original import list; workers now use `threading`
import html
import threading

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec  # wait conditions
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits

# Connection settings shared by the main thread and every worker.
DB_CONFIG = {
    'host': 'localhost',
    'user': 'root',
    'password': 'root',
    'database': 'selenium',
    'charset': 'utf8',
}

# Options shared by every browser instance (headless Chrome).
chrome_options = Options()
chrome_options.add_argument('--headless')


def ceil(x, y):
    """Return ``x / y`` rounded up to the nearest integer.

    Uses floor division on the negated value instead of float division,
    so the result is exact for large integers and correct for negative
    operands.
    """
    return int(-(-x // y))


def line(lineName, start, count):
    """Worker: collect the content of ``count`` chapters starting at ``start``.

    Args:
        lineName: Label printed in progress messages.
        start: Zero-based row offset of this worker's slice of ``novel``.
        count: Maximum number of rows in the slice.

    The browser and the database connection are always released, even
    when a page load, wait or query raises.
    """
    dr = webdriver.Chrome(options=chrome_options)  # one browser per worker
    try:
        db = pymysql.connect(**DB_CONFIG)  # one connection per worker
        try:
            cursor = db.cursor()
            try:
                # ORDER BY makes the slices deterministic; without it the
                # rows returned for a given offset are unspecified and two
                # workers could receive overlapping slices.
                cursor.execute(
                    "SELECT id,href FROM novel ORDER BY id LIMIT %s, %s",
                    (start, count),
                )
                for novel_id, href in cursor.fetchall():
                    dr.get(href)
                    # Wait up to 5 seconds, polling every 0.1 seconds, until
                    # the "transcoding in progress" placeholder is gone.
                    WebDriverWait(dr, 5, 0.1).until_not(
                        ec.text_to_be_present_in_element(
                            (By.CSS_SELECTOR, '#chaptercontent'),
                            '正在转码,请稍后......',
                        )
                    )
                    # html.escape() encodes special characters;
                    # html.unescape() restores them when reading back.
                    content = html.escape(
                        dr.find_element(By.CSS_SELECTOR, '#chaptercontent').text
                    )
                    # Parameters are bound by the driver, so quotes or
                    # backslashes in the text cannot break the statement.
                    cursor.execute(
                        "UPDATE novel SET content = %s WHERE id = %s",
                        (content, novel_id),
                    )
                    db.commit()
                    print(lineName, '完成了', novel_id, '的采集')
            finally:
                cursor.close()
        finally:
            db.close()
    finally:
        dr.quit()  # ends the browser and its driver process
    print(lineName, '完成了采集')


def productLine(func, total, lineCount):
    """Start ``lineCount`` worker threads that split the rows between them.

    Args:
        func: Worker callable invoked as ``func(name, start, count)``.
        total: Result of ``SELECT COUNT(*)`` as returned by ``fetchall()``;
            the row count is read from ``total[0][0]``.
        lineCount: Number of worker threads to start.

    Returns:
        The list of started ``threading.Thread`` objects, so the caller
        can ``join()`` them.
    """
    every = ceil(total[0][0], lineCount)  # rows per worker, rounded up
    print('every', every)
    workers = []
    for i in range(lineCount):
        print('-------------', i)
        worker = threading.Thread(
            target=func,
            args=('line-' + str(i) + '', i * every, every),
        )
        worker.start()
        print(worker.ident)
        workers.append(worker)
    return workers


# Step 1: read every chapter link from the index page. Text and href are
# extracted while the browser is still alive; the browser is then shut down.
dr = webdriver.Chrome(options=chrome_options)
try:
    dr.get('https://doupocangqiong1.com/1/list_piaotian/')
    a = dr.find_elements(By.CSS_SELECTOR, '.dirlist > li > a')
    chapters = [(i.text, i.get_attribute('href'), '') for i in a]
finally:
    dr.quit()

# Step 2: store the chapter list, then hand the rows out to the workers.
workers = []
db = pymysql.connect(**DB_CONFIG)
try:
    cursor = db.cursor()
    try:
        # Bound parameters keep chapter names containing quotes intact.
        cursor.executemany(
            "INSERT INTO novel (name,href,content) VALUES (%s,%s,%s)",
            chapters,
        )
        db.commit()

        try:
            cursor.execute('SELECT COUNT(*) FROM novel')
            total = cursor.fetchall()
            print(total)

            workers = productLine(line, total, 5)
        except Exception as exc:
            # Report the cause instead of silently swallowing it.
            print("Error: unable to start thread", exc)
    finally:
        cursor.close()
finally:
    db.close()

# Step 3: block until every worker is done instead of spinning forever.
for worker in workers:
    worker.join()