使用Beautifulsoup做python网络爬虫

一、python数据爬虫简介

1.爬虫介绍

爬虫，即网络爬虫，我们可以理解为在网络上爬行的蜘蛛，互联网就比作一张大网，而爬虫便是在这张网上爬来爬去的蜘蛛，如果它遇到想要的资源，就会抓取下来。想抓取什么？这个由我们来控制它。

比如我们想抓取一个网页上面的内容，在这个网中就要有一条道路，其实就是指向网页的地址或者超链接，那么它就可以爬到另一张网上来获取数据。这样，整个连在一起的大网对这个蜘蛛来说触手可及，分分钟爬下来对于我们来说不是事儿。

我们在浏览网页的时候一般情况下打开的是html页面，因此，看到的网页实质是由HTML 代码构成的，爬虫爬来的便是这些内容，通过分析和过滤这些 HTML 代码，实现对图片、文字等资源的获取，也就是所谓的爬虫。

2.爬虫的基本流程

用户获取网络数据的方式：

方式1：浏览器提交请求--->下载网页代码--->解析成页面
方式2：模拟浏览器发送请求（获取网页代码）--->提取有用的数据--->存放于数据库或文件中

爬虫要做的就是方式2；

流程：

（1）发起请求
使用http库向目标站点发起请求，即发送一个Request
Request包含：请求头、请求体等

Request模块缺陷：只能取回服务器返回的原始内容，不能像浏览器那样执行JS和CSS代码，因此由JS动态生成的内容无法直接获取

（2）获取响应内容

如果服务器能正常响应，则会得到一个Response

Response包含：html，json，图片，视频等

（3）解析内容

解析html数据：正则表达式（RE模块），第三方解析库如Beautifulsoup，scrapy等
解析json数据：json模块

解析二进制数据:以wb的方式写入文件

（4）保存数据

数据库（MySQL、MongoDB、Redis）

3.python常见爬虫的方法

（1）scrapy爬虫

很强大的爬虫框架，可以满足简单的页面爬取（比如可以明确获知url pattern的情况）。用这个框架可以轻松爬下来如亚马逊商品信息之类的数据。但是对于稍微复杂一点的页面，如weibo的页面信息，这个框架就满足不了需求了
（2）Beautifulsoup爬虫

整合了一些常用爬虫需求。缺点：不能加载JS

二、使用Beautifulsoup做网络爬虫

1.Beautifulsoup爬虫介绍

Beautiful Soup支持各种html解析器，包括python自带的标准库，还有其他的许多第三方库模块。其中一个就是lxml parser。

借助网页的结构和属性等特性来解析网页的工具，有了它我们不用再去写一些复杂的正则，只需要简单的几条语句就可以完成网页中某个元素的提取

安装方式：pip install beautifulsoup4

2.爬虫数据保存（txt、json、excel、MySQL）（以豆瓣电影Top250 https://movie.douban.com/top250 为例）

# -*- coding:utf-8 -*-
#python网络爬虫，网页内容获取，保存成txt，保存成json，excel,mysql，并进行页面展示
#引入模块
import urllib.request	#用来访问url
from bs4 import BeautifulSoup	#爬虫
import os	#文件管理，文件夹管理
import time	#时间模块
import json	#json格式管理模块
import codecs	#编码转换模块，编码转换时，通常需要以unicode作为中间编码，即先将其他编码的字符串解码（decode）成unicode，再从unicode编码（encode）成另一种编码。
import xlwt		#excel写入模块
import xlrd		#excel读取模块
from xlutils.copy import copy	#复制页
import pymysql		#链接管理MySQL模块
# Module-level setting: how many listing pages getdata() crawls.
# The list has 250 entries at 25 per page, i.e. 10 pages (the prompt used to say 20).
# A non-numeric answer raises ValueError, as before.
pages=int(input("豆瓣电影250：请输入你要爬取的页数（每页25条，共10页）：\n"))
def getdata():
	"""Scrape the Douban Top 250 listing and return one record per movie.

	Fetches the first ``pages`` listing pages (``pages`` is the module-level
	value read from the user) and parses every ``div.item`` element on them.

	Returns:
		A list of dicts with the keys ``title``, ``href``, ``quote`` and
		``img``, inserted in that order. ``quote`` is an empty string when
		the entry has no ``p.quote`` / ``span.inq`` element.
	"""
	url="https://movie.douban.com/top250"
	movies=[]
	# Listing pages look like "https://movie.douban.com/top250?start=25&filter=";
	# the start offset advances by 25 per page.
	for i in range(pages):
		myurl=url+"?start="+str(i*25)+"&filter="
		# BeautifulSoup reads the response while it is constructed, so the
		# connection can be closed right after parsing.
		with urllib.request.urlopen(myurl) as res:
			soup=BeautifulSoup(res,"html.parser")
		for item in soup.find_all("div",{"class":"item"}):
			info=item.find("div",{"class":"info"})
			link=info.find("div",{"class":"hd"}).find("a")
			# find() returns None for a missing tag; an entry without a quote
			# used to abort the whole crawl with AttributeError.
			quote_p=info.find("div",{"class":"bd"}).find("p",{"class":"quote"})
			inq=quote_p.find("span",{"class":"inq"}) if quote_p is not None else None
			movie={}
			movie["title"]=link.find("span").get_text()
			movie["href"]=link.get("href")
			movie["quote"]=inq.get_text() if inq is not None else ""
			movie["img"]=item.find("div",{"class":"pic"}).find("a").find("img").get("src")
			movies.append(movie)
	return movies
def savedatatotxt():
	"""Scrape the data and write it to ``data/douban<YYYY_MM_DD>.txt``.

	Each record becomes four ``key:value`` lines (title, href, quote,
	imgUrl). The ``data`` folder is created when it does not exist.

	Raises:
		IOError: if the file cannot be opened or written; the underlying
			error is chained as the cause.
	"""
	mylist=getdata()
	folder_name="data"
	os.makedirs(folder_name,exist_ok=True)
	cur_data=time.strftime("%Y_%m_%d",time.localtime())
	file_name="douban"+cur_data+".txt"

	try:
		# Explicit utf-8 (as createhtml uses) so the Chinese text does not
		# depend on the platform's default encoding. The with-statement
		# closes the file, so no separate close() is needed.
		with open(os.path.join(folder_name,file_name),"w",encoding="utf-8") as fp:
			for i in mylist:
				fp.write("title"+":"+i['title']+"\n")
				fp.write("href"+":"+i['href']+"\n")
				fp.write("quote"+":"+i['quote']+"\n")
				fp.write("imgUrl"+":"+i['img']+"\n")
	except IOError as err:
		# Raising a bare string is itself a TypeError in Python 3; raise a
		# real exception and keep the original as its cause.
		raise IOError("Error:"+str(err)) from err
	print("数据已成功存入",file_name,"中")
def savedatatojson():
	"""Scrape the data and write it to ``data/douban<YYYY_MM_DD>.json``.

	The file holds one JSON object per line (not a single JSON array);
	non-ASCII characters are written as-is. The ``data`` folder is created
	when it does not exist.

	Raises:
		IOError: if the file cannot be opened or written; the underlying
			error is chained as the cause.
	"""
	mylist=getdata()
	folder_name="data"
	os.makedirs(folder_name,exist_ok=True)
	cur_data=time.strftime("%Y_%m_%d",time.localtime())
	file_name="douban"+cur_data+".json"

	try:
		# ensure_ascii=False emits raw Chinese characters, so the file must be
		# opened with an encoding that can represent them on every platform.
		with open(os.path.join(folder_name,file_name),"w",encoding="utf-8") as fp:
			for i in mylist:
				# json.dumps turns the record dict into its string form
				fp.write(json.dumps(i,ensure_ascii=False)+"\n")
	except IOError as err:
		# Raising a bare string is itself a TypeError in Python 3; raise a
		# real exception and keep the original as its cause.
		raise IOError("Error:"+str(err)) from err
	print("数据已成功存入",file_name,"中")
def savedatatoexcel():
	"""Scrape the data and write it to ``data/douban<YYYY-MM-DD>.xls``.

	Row 0 holds the record keys as a red, bold header; every following row
	holds one record, with columns in header order. An empty scrape result
	yields a sheet without rows instead of an IndexError.
	"""
	mylist=getdata()
	folder_name="data"
	os.makedirs(folder_name,exist_ok=True)
	c_time=time.strftime("%Y-%m-%d",time.localtime())
	file_name="douban"+c_time+".xls"
	excelname=os.path.join(folder_name,file_name)
	workbook=xlwt.Workbook(encoding="utf-8")	# workbook handled as UTF-8
	sheet=workbook.add_sheet(u'douban数据')		# single sheet named "douban数据"
	# Column order comes from the first record's keys; no records, no columns.
	headers=list(mylist[0]) if mylist else []
	hStyle=xlwt.easyxf("font:color-index red,bold on")		# header cell style
	for col,header in enumerate(headers):
		sheet.write(0,col,header,hStyle)
	# Data rows start at 1 because row 0 is the header.
	for row,record in enumerate(mylist,start=1):
		for col,header in enumerate(headers):
			sheet.write(row,col,record[header])
	workbook.save(excelname)
	print("数据已成功存入",file_name,"中")
def savedatatomysql():
	"""Scrape the data and insert it into the ``doubaninfo`` MySQL table.

	Connects to the local ``douban`` database, inserts one row per record
	(``null`` for the first column, then title, href, quote, img) and
	commits once after all rows were sent. The connection is closed even
	when scraping or inserting fails; in that case nothing is committed.
	"""
	# NOTE(review): host, user and password are hard-coded here; consider
	# moving them to configuration before sharing or deploying this script.
	c_connect=pymysql.connect(host="localhost",user="root",passwd="236619",db="douban",port=3306,charset="utf8")
	# pymysql.connect raises on failure, so reaching this line means success.
	print("Connection is OK")
	try:
		mylist=getdata()
		# Parameterized statement: the driver substitutes and escapes the values.
		strsql="insert into doubaninfo values (null,%s,%s,%s,%s)"
		rows=[(i['title'],i['href'],i['quote'],i['img']) for i in mylist]
		# One cursor for the whole batch, released by the with-statement.
		with c_connect.cursor() as cur:
			cur.executemany(strsql,rows)
		c_connect.commit()
	finally:
		c_connect.close()
	print("数据已成功存入doubaninfo中")

def createhtml():
	"""Scrape the data and generate a small searchable HTML page for it.

	Writes two files into the ``html`` folder (created when missing):
	``douban.json``, a JSON array with one record per line, and
	``mytry.html``, a page whose inline script loads ``douban.json`` through
	jQuery ``$.ajax``, renders every record and filters them by title when
	the search button is clicked.

	Raises:
		IOError: if either file cannot be opened or written; the underlying
			error is chained as the cause.
	"""
	mylist=getdata()
	folder_name="html"
	os.makedirs(folder_name,exist_ok=True)
	file_name="douban.json"
	# JSON array consumed by the page: records separated by ",\n", with no
	# separator after the last one.
	try:
		with open(os.path.join(folder_name,file_name),"w",encoding="utf-8") as fp:
			fp.write("[")
			fp.write(",\n".join(json.dumps(i,ensure_ascii=False) for i in mylist))
			fp.write("]")
	except IOError as err:
		# Raising a bare string is itself a TypeError in Python 3.
		raise IOError("Error:"+str(err)) from err

	# Page skeleton up to and including the opening inline <script> tag.
	page_head='''
	<!DOCTYPE html>
	<html>
	<head>
		<meta charset="utf-8">
		<title>豆瓣电影排名</title>
		<style type="text/css">
			<!-- *{padding:10;margin:10;} -->
			form{text-align:center;}
			h1{text-align:center;color:blue;}
			div{padding:50;margin:50;}
			#box{
				width:800px;
				margin:10px auto;
			}
		</style>
	</head>
	<body>
		<h1>豆瓣电影排名</h1>
		<form>
			<input id="searTxt" type="text"  name="">
			<input id="searbtn" type="button" value="搜索">
		</form>
		<div style="" id="box">
		</div>
	</body>
		<script type="text/javascript" src="js/jquery1.7.js"></script>
		<script type="text/javascript">
	'''
	# Closes the inline script and the document.
	page_tail='''
	</script>
	</html>
	'''

	# Inline JavaScript: load douban.json, render all records, wire up search.
	page_script='''
	$.ajax({
				url:"douban.json",//文件路径
				type:"get",//获取文件方式 post（上传,根据参数...）
				dataType:"json",
				success:function(datas){
					var str1=""
					for(var i=0;i<datas.length;i++){
						// console.log(datas[i].details);
						str1+="<div>"+"<h3>"+"<a target='"+"_blank"+"'"+" href='"+datas[i].href+"'>"+datas[i].title+"</a>"+"</h3>"+
							"<img width='180' height='180' src='"+datas[i].img+"'/>"+
							"<p>"+datas[i].quote+"</p>"+"</div>";
					}
					$("#box").html(str1);

					searchDatas(datas);
				},
				error:function(){
					console.log("error");
				}
			})
			//点击搜索按钮获取搜索内容
			function searchDatas(datas){
				//datas==》数据
				$("#searbtn").on("click",function(){
					var searTxt=$("#searTxt").val();
					var list1=[];
					//----判断文本框是否有值-----
					if(searTxt.length<=0){
						$("#box").html("文本框为空！！！");
					}else{	
						for(var i=0;i<datas.length;i++){
							if(datas[i].title.indexOf(searTxt)!=-1){
								console.log(datas[i])
								list1.push(datas[i])
							}
						}
						if(list1.length>0){
							var str1=""
							for(var i=0;i<list1.length;i++){
								// console.log(list1[i].details);
								str1+="<div>"+"<h3>"+"<a target='"+"_blank"+"'"+" href='"+list1[i].href+"'>"+list1[i].title+"</a>"+"</h3>"+
							"<img width='180' height='180' src='"+list1[i].img+"'/>"+
							"<p>"+list1[i].quote+"</p>"+"</div>";
							}
							$("#box").html(str1);

						}else{
							$("#box").html("搜索的内容不存在！！！");
						}
					}

				})
			}
	'''
	# Write the HTML page next to the JSON file it loads.
	file_name="mytry.html"
	try:
		with open(os.path.join(folder_name,file_name),"w",encoding="utf-8") as fp:
			fp.write(page_head+page_script+page_tail)
	except IOError as err:
		raise IOError("Error:"+str(err)) from err
	print("已生成",file_name)
# Run every exporter in turn, in the same order as before. Each exporter
# calls getdata() itself, so the site is scraped once per exporter.
for _export in (savedatatojson,savedatatotxt,savedatatoexcel,savedatatomysql,createhtml):
	_export()

运行结果如下：

使用Beautifulsoup做python网络爬虫

其中MySql表结构如图所示：

使用Beautifulsoup做python网络爬虫

网页页面如下：

使用Beautifulsoup做python网络爬虫

秒客网

使用Beautifulsoup做python网络爬虫

相关文章