七月算法课程《Python爬虫》第一课里的示例代码,很简单。
下载地址:http://download.csdn.net/detail/nnnnnnnnnnnny/9715077
jupyter notebook文件贴在这里了。
七月算法 Python爬虫项目班
课后习题一
爬取邮政编码查询网页 http://www.ip138.com/post/ ,提取每个省份邮政编码的开头数字。
import requests #比urllib好用
import xml.etree.ElementTree as ET
from xml.parsers.expat import ParserCreate
class DefaultSaxHandler(object):
    """Expat callback handler that collects ``(title, href)`` pairs.

    Every start tag other than ``map`` that carries both a ``title`` and an
    ``href`` attribute is appended, as a ``(title, href)`` tuple, to the list
    supplied to the constructor.
    """

    def __init__(self, provinces):
        """Remember the list that results are appended to.

        Args:
            provinces: a list; it is mutated in place (no copy is taken).
        """
        self.provinces = provinces

    def start_element(self, name, attrs):
        """Handle an opening tag.

        Args:
            name: the tag name reported by the parser.
            attrs: mapping of attribute name to attribute value.
        """
        # The enclosing <map> tag itself is not an entry.
        if name == 'map':
            return
        # Use .get so an element lacking either attribute is skipped
        # instead of aborting the whole parse with a KeyError.
        title = attrs.get('title')
        href = attrs.get('href')
        if title is None or href is None:
            return
        self.provinces.append((title, href))

    def end_element(self, name):
        """Handle a closing tag; nothing needs to be done."""
        pass

    def char_data(self, text):
        """Handle character data; text content is ignored."""
        pass
def get_province_entry(url, timeout=10):
    """Download a page and extract the entries of its ``map_86`` image map.

    The response body is decoded as GB2312, the ``<map name="map_86" ...>``
    element is cut out of the text and fed to an expat parser whose
    callbacks (``DefaultSaxHandler``) collect the entries.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A list of ``(title, href)`` tuples, in document order. The list is
        empty when the opening or closing map tag is not found in the page.

    Raises:
        UnicodeDecodeError: if the body is not valid GB2312.
        xml.parsers.expat.ExpatError: if the map markup is not well-formed.
        requests.RequestException: on network failure or timeout.
    """
    content = requests.get(url, timeout=timeout).content.decode('gb2312')

    open_tag = '<map name="map_86" id="map_86">'
    close_tag = '</map>'

    start = content.find(open_tag)
    if start == -1:
        return []
    # Look for the closing tag only after the opening one; an earlier
    # </map> elsewhere in the page would otherwise produce a wrong slice.
    end = content.find(close_tag, start)
    if end == -1:
        return []
    markup = content[start:end + len(close_tag)].strip()

    provinces = []
    handler = DefaultSaxHandler(provinces)

    # Wire the handler's callbacks into a fresh expat parser.
    parser = ParserCreate()
    parser.StartElementHandler = handler.start_element
    parser.EndElementHandler = handler.end_element
    parser.CharacterDataHandler = handler.char_data

    # The whole document is passed in one call, so mark it as final; this
    # makes the parser report truncated markup instead of ignoring it.
    parser.Parse(markup, True)

    return provinces
# Fetch the entries from the postcode lookup page and display them.
provinces = get_province_entry(url='http://www.ip138.com/post')
print("provinces:", provinces)
provinces: [('*', '/83/'), ('*', '/85/'), ('青海', '/81/'), ('甘肃', '/73/'), ('四川', '/61/'), ('云南', '/65/'), ('宁夏', '/75/'), ('内蒙古', '/01/'), ('黑龙江', '/15/'), ('吉林', '/13/'), ('辽宁', '/11/'), ('河北', '/50/'), ('北京', '/10/'), ('天津', '/30/'), ('陕西', '/71/'), ('山西', '/03/'), ('山东', '/25/'), ('河南', '/45/'), ('重庆', '/40/'), ('湖北', '/43/'), ('安徽', '/23/'), ('江苏', '/21/'), ('上海', '/20/'), ('贵州', '/55/'), ('广西', '/53/'), ('湖南', '/41/'), ('江西', '/33/'), ('浙江', '/31/'), ('福建', '/35/'), ('广东', '/51/'), ('海南', '/57/'), ('*', '/*g/'), ('澳门', '/aomen/'), ('香港', '/xianggang/')]
课后习题二
爬取 http://hq.sinajs.cn/list= ,查看几家公司的股价信息。
import requests
import threading
def display_info(code, timeout=10):
    """Fetch the quote line for one stock code and print it.

    Args:
        code: stock code appended to the query URL, e.g. ``'sh600004'``.
        timeout: seconds to wait for the server; without it a stalled
            request would block the calling thread indefinitely.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    url = 'http://hq.sinajs.cn/list=' + code
    response = requests.get(url, timeout=timeout)
    # The response text is printed as-is, with no parsing.
    print(response.text)
def single_thread(codes):
    """Print the quote for every stock code in *codes*, one after another.

    Surrounding whitespace is stripped from each code before it is looked up.
    """
    for raw_code in codes:
        display_info(raw_code.strip())
def multi_thread(tasks):
    """Process each list of stock codes in *tasks* on its own thread.

    One thread is created per element of *tasks*; all threads are started
    and the call returns once every one of them has finished.
    """
    workers = []
    for batch in tasks:
        # args must be a tuple, hence the trailing comma: (batch,) passes
        # the whole list as the single positional argument.
        worker = threading.Thread(target=single_thread, args=(batch,))
        workers.append(worker)

    # Start every thread before waiting on any of them.
    for worker in workers:
        worker.start()

    # Block until all threads have finished.
    for worker in workers:
        worker.join()
def split_tasks(items, parts):
    """Split *items* into at most *parts* contiguous, evenly sized chunks.

    Chunk sizes differ by at most one and the original order is preserved.
    Empty chunks are omitted, so fewer than *parts* chunks are returned
    when there are fewer items than parts.

    Args:
        items: a sequence supporting ``len`` and slicing.
        parts: the maximum number of chunks; must be positive.

    Returns:
        A list of slices of *items*.

    Raises:
        ValueError: if *parts* is not positive.
    """
    if parts <= 0:
        raise ValueError("parts must be a positive integer")
    base, extra = divmod(len(items), parts)
    chunks = []
    start = 0
    for index in range(parts):
        # The first `extra` chunks take one additional item each.
        size = base + (1 if index < extra else 0)
        if size:
            chunks.append(items[start:start + size])
        start += size
    return chunks


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    codes = ['sh600004', 'sh600005', 'sh600006', 'sh600007', 'sh600008', 'sh600009']
    # Spread the codes evenly over up to four worker threads, then run them.
    multi_thread(split_tasks(codes, 4))
var hq_str_sh600004="白云机场,13.960,14.040,14.060,14.130,13.950,14.060,14.070,1511419,21244732.000,14186,14.060,257400,14.050,31900,14.040,6300,14.020,2200,14.010,10500,14.070,26100,14.080,81001,14.090,68300,14.100,12000,14.110,2016-12-19,10:40:27,00";
var hq_str_sh600005="武钢股份,3.490,3.510,3.520,3.560,3.460,3.520,3.530,20048507,70516972.000,937000,3.520,454000,3.510,776500,3.500,382300,3.490,678400,3.480,570361,3.530,1529951,3.540,1032710,3.550,1677780,3.560,797700,3.570,2016-12-19,10:40:27,00";
var hq_str_sh600006="东风汽车,6.930,6.940,6.890,6.930,6.800,6.890,6.900,11088555,76080639.000,3400,6.890,46300,6.880,81014,6.870,176900,6.860,130300,6.850,51300,6.900,78300,6.910,65900,6.920,54200,6.930,70800,6.940,2016-12-19,10:40:27,00";
var hq_str_sh600007="中国国贸,17.800,17.890,17.110,17.800,17.090,17.120,17.130,3981125,68517288.000,1500,17.120,33325,17.110,41800,17.100,4100,17.090,21300,17.080,541,17.130,9600,17.140,4000,17.150,11800,17.160,10200,17.170,2016-12-19,10:40:27,00";
var hq_str_sh600008="首创股份,4.110,4.100,4.100,4.120,4.080,4.100,4.110,4128304,16940208.000,505396,4.100,418100,4.090,732200,4.080,345000,4.070,204300,4.060,379200,4.110,561812,4.120,531400,4.130,654600,4.140,399900,4.150,2016-12-19,10:40:18,00";
var hq_str_sh600009="上海机场,26.510,26.620,26.620,26.690,26.350,26.610,26.620,919588,24375139.000,2400,26.610,4400,26.600,700,26.580,3400,26.560,500,26.550,200,26.620,9400,26.630,1500,26.650,9100,26.660,8600,26.680,2016-12-19,10:40:24,00";