采集电影天堂下载链接
视频地址:邀请你看《Python爬虫教程:零基础学爬虫》https://url.cn/5H9sxCv?sf=uri
使用版本:Python 3.7.2
运行时如果报错提示缺少库(ModuleNotFoundError),解决办法如下:
打开cmd,进入Python安装目录下的Scripts文件夹
cd C:\Python\Python37\Scripts
使用pip install requests
re 是 Python 自带的标准库,无需(也无法)使用 pip install re 安装,直接 import re 即可
使用pip install pymysql
数据库名字dytt8(注意:下方代码中连接的是 db='py',请将代码中的 db 参数与实际数据库名保持一致)
数据库用户名密码 root root
字符集 utf8
SET FOREIGN_KEY_CHECKS=0; DROP TABLE IF EXISTS `dytt8`; CREATE TABLE `dytt8` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` varchar(255) DEFAULT NULL, `content` text, `link` text, PRIMARY KEY (`id`) ) ENGINE=MyISAM AUTO_INCREMENT=428 DEFAULT CHARSET=utf8;
全部代码
"""Scrape movie titles, descriptions and download links from dytt8.net into MySQL.

The script walks list pages 1-15 of the site's movie section, opens every
detail page linked from them, and stores (title, content, link) rows in the
``dytt8`` table.
"""
import re

import pymysql
import requests

# Seconds to wait for the site before giving up; without a timeout a stalled
# connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10

# One (href, title) pair per movie entry on a list page.
LIST_ITEM_RE = re.compile(r'<a href="(.*?)" class="ulink">(.*?)</a>')

# Detail page: group 1 is the description HTML, group 2 is the first link that
# follows the red "download" heading. re.S lets '.' span line breaks.
DETAIL_RE = re.compile(
    r'<div class="co_content8">(.*?)<strong><font color="#ff0000">.*?<a href="(.*?)"',
    re.S,
)

# Placeholders are filled in by the driver, which quotes/escapes every value.
INSERT_SQL = "insert into dytt8(title,content,link) values(%s,%s,%s)"

# NOTE(review): the accompanying notes name the database "dytt8", but this
# connects to "py" — confirm which schema actually holds the dytt8 table.
db = pymysql.connect(host='localhost', port=3306, user='root', passwd='root', db='py', charset='utf8')
cursor = db.cursor()


def getMovieList(page):
    """Return the (url, title) pairs found on list page number *page*.

    Args:
        page: 1-based index of the list page to download.

    Returns:
        A list of ``(relative_url, title)`` tuples; empty if the page
        contains no matching entries.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    print('第', page, "页")
    res = requests.get(
        'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(page),
        timeout=REQUEST_TIMEOUT,
    )
    # The site does not declare its charset reliably; force the real one.
    res.encoding = 'gb2312'
    return LIST_ITEM_RE.findall(res.text)


def getMoveContent(url, title):
    """Download one detail page and insert its content and link into MySQL.

    Pages whose markup does not match the expected layout are skipped with a
    console message instead of aborting the crawl.

    Args:
        url: Site-relative path of the detail page (as returned by
            ``getMovieList``).
        title: Movie title to store alongside the scraped data.

    Returns:
        None.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
        pymysql.MySQLError: if the insert fails.
    """
    res = requests.get('https://www.dytt8.net{}'.format(url), timeout=REQUEST_TIMEOUT)
    res.encoding = 'gb2312'

    matches = DETAIL_RE.findall(res.text)
    if not matches:
        # Unexpected page layout: ignore this entry and carry on.
        print('错误忽略')
        return
    content, link = matches[0]

    # Parameterized query: scraped text may contain quotes or backslashes, so
    # it must never be spliced into the SQL string by hand.
    cursor.execute(INSERT_SQL, (title, content, link))
    # PyMySQL does not autocommit by default; persist the row explicitly.
    db.commit()
    print(link, '已保存')


try:
    for page in range(1, 16):
        for url, title in getMovieList(page):
            getMoveContent(url, title)
finally:
    # Release the database resources even if the crawl is interrupted.
    cursor.close()
    db.close()