这时网址为:http://flights.ctrip.com/booking/can-ctu-day-1.html?ddate1=2018-06-15
其中,can 表示广州,ctu 表示成都,日期 "2018-06-15" 一目了然。一般的爬虫只要替换这几个值,就可以遍历所有航线了。但观察发现,有一个链接可以直接返回当前网页的所有 JSON 格式数据,如下:
在该链接中同样可以看到城市和日期。该链接返回的是 JSON 数据,里面存储的就是当前页面的数据,其中 "fis" 字段就是航班信息。
每次爬取只需替换城市代码和日期即可。城市代码我手动整理了一份:
# Ctrip city code -> Chinese city name, hand-collected by the article author.
#
# The original literal listed five codes twice; in a Python dict literal the
# last occurrence wins, so the earlier entries were silently discarded.
# They are removed here so the literal says what the dict actually contains.
# Names that were shadowed (code -> dropped name):
#   "swa" -> 潮州, "dax" -> 达州, "lum" -> 德宏, "jhg" -> 景洪, "yty" -> 泰州
city = {
    "yie": "阿尔山", "aku": "阿克苏", "rht": "阿拉善右旗", "axf": "阿拉善左旗", "aat": "阿勒泰", "ngq": "阿里", "mfm": "澳门",
    "aqg": "安庆", "ava": "安顺", "aog": "鞍山", "rlk": "巴彦淖尔", "aeb": "百色", "bav": "包头", "bsd": "保山", "bhy": "北海", "bjs": "北京",
    "dbc": "白城", "nbs": "白山", "bfj": "毕节", "bpl": "博乐", "ckg": "重庆", "bpx": "昌都", "cgd": "常德", "czx": "常州",
    "chg": "朝阳", "ctu": "成都", "juh": "池州", "cif": "赤峰", "cgq": "长春", "csx": "长沙", "cih": "长治", "cde": "承德",
    "cwj": "沧源", "dlu": "大理", "dlc": "大连", "dqa": "大庆", "dat": "大同", "ddg": "丹东", "dcy": "稻城", "doy": "东营",
    "dnh": "敦煌", "dax": "达县", "ejn": "额济纳旗", "dsn": "鄂尔多斯", "enh": "恩施", "erl": "二连浩特", "fuo": "佛山",
    "foc": "福州", "fyj": "抚远", "fug": "阜阳", "kow": "赣州", "goq": "格尔木", "gyu": "固原", "gys": "广元", "can": "广州", "kwe": "贵阳",
    "kwl": "桂林", "hrb": "哈尔滨", "hmi": "哈密", "hak": "海口", "hld": "海拉尔", "hdg": "邯郸", "hzg": "汉中", "hgh": "杭州", "hfe": "合肥",
    "htn": "和田", "hek": "黑河", "het": "呼和浩特", "hia": "淮安", "hjj": "怀化", "txn": "黄山", "huz": "惠州", "jxa": "鸡西", "tna": "济南",
    "jng": "济宁", "jgd": "加格达奇", "jmu": "佳木斯", "jgn": "嘉峪关", "swa": "揭阳", "jic": "金昌", "knh": "金门", "jnz": "锦州",
    "cyi": "嘉义", "jsj": "建三江", "jjn": "晋江", "jgs": "井冈山", "jdz": "景德镇", "jiu": "九江", "jzh": "九寨沟", "khg": "喀什",
    "kjh": "凯里", "kgt": "康定", "kry": "克拉玛依", "kca": "库车", "krl": "库尔勒", "kmg": "昆明", "lxa": "拉萨", "lhw": "兰州", "hzh": "黎平",
    "ljg": "丽江", "llb": "荔波", "lyg": "连云港", "lpf": "六盘水", "lfq": "临汾", "lzy": "林芝", "lnj": "临沧", "lyi": "临沂", "lzh": "柳州",
    "lzo": "泸州", "lya": "洛阳", "llv": "吕梁", "jmj": "澜沧", "lcx": "龙岩", "nzh": "满洲里", "lum": "芒市", "mxz": "梅州", "mig": "绵阳",
    "ohe": "漠河", "mdg": "牡丹江", "mfk": "马祖", "khn": "南昌", "nao": "南充", "nkg": "南京", "nng": "南宁", "ntg": "南通", "nny": "南阳",
    "ngb": "宁波", "nlh": "宁蒗", "pzi": "攀枝花", "sym": "普洱", "ndg": "齐齐哈尔", "jiq": "黔江", "iqm": "且末", "bpe": "秦皇岛", "tao": "青岛",
    "iqn": "庆阳", "juz": "衢州", "rkz": "日喀则", "riz": "日照", "syx": "三亚", "xmn": "厦门", "sha": "上海", "szx": "深圳", "hpg": "神农架",
    "she": "沈阳", "sjw": "石家庄", "tcg": "塔城", "hyn": "台州", "tyn": "太原", "tvs": "唐山", "tcz": "腾冲", "tsn": "天津",
    "thq": "天水", "tgo": "通辽", "ten": "铜仁", "tlq": "吐鲁番", "wxn": "万州", "weh": "威海", "wef": "潍坊", "wnz": "温州", "wnh": "文山",
    "wua": "乌海", "hlh": "乌兰浩特", "urc": "乌鲁木齐", "wux": "无锡", "wuz": "梧州", "wuh": "武汉", "wus": "武夷山", "sia": "西安", "xic": "西昌",
    "xnn": "西宁", "jhg": "西双版纳", "xil": "锡林浩特", "dig": "香格里拉(迪庆)", "xfn": "襄阳", "acx": "兴义", "xuz": "徐州", "hkg": "香港",
    "ynt": "烟台", "eny": "延安", "ynj": "延吉", "ynz": "盐城", "yty": "扬州", "lds": "伊春", "yin": "伊宁", "ybp": "宜宾", "yih": "宜昌",
    "yic": "宜春", "yiw": "义乌", "inc": "银川", "llf": "永州", "uyn": "榆林", "yus": "玉树", "ycu": "运城", "zha": "湛江", "dyg": "张家界",
    "zqz": "张家口", "yzy": "张掖", "zat": "昭通", "cgo": "郑州", "zhy": "中卫", "hsn": "舟山", "zuh": "珠海", "wmt": "遵义(茅台)", "zyi": "遵义(新舟)",
}
为了防止频繁请求导致 HTTP 429,可以多准备一些 User-Agent 并让其随机取值。但请求过于频繁时仍会被要求输入验证码,所以每爬取完一个出发城市,就暂停 10 秒钟。
先创建用于存储数据的表,此处使用的是 SQL Server:
-- Storage table for the scraped flight records (SQL Server).
-- Column names match the DataFrame columns the crawler appends with to_sql.
create table kkflight(
    id              int identity(1, 1),             -- auto-increment id
    itinerardate    date,                           -- itinerary date
    airline         varchar(100),                   -- airline name
    airlinecode     varchar(100),                   -- airline code
    flightnumber    varchar(20),                    -- flight number
    flightnumbers   varchar(20),                    -- code-share flight number (actual operating flight)
    aircraft        varchar(50),                    -- aircraft model
    aircraftsize    char(2),                        -- aircraft size (l = large; m = medium; s = small)
    airporttax      decimal(10, 2),                 -- airport construction fee
    fueloiltax      decimal(10, 2),                 -- fuel surcharge
    fromcity        varchar(50),                    -- departure city
    fromcitycode    varchar(10),                    -- departure city code
    fromairport     varchar(50),                    -- departure airport
    fromterminal    varchar(20),                    -- departure terminal
    fromdatetime    datetime,                       -- departure time
    tocity          varchar(50),                    -- arrival city
    tocitycode      varchar(10),                    -- arrival city code
    toairport       varchar(50),                    -- arrival airport
    toterminal      varchar(20),                    -- arrival terminal
    todatetime      datetime,                       -- arrival time
    durationhour    int,                            -- duration, hours part
    durationminute  int,                            -- duration, minutes part
    duration        varchar(20),                    -- duration as a string
    currency        varchar(10),                    -- currency
    ticketprices    decimal(10, 2),                 -- ticket price
    discount        decimal(4, 2),                  -- discount applied
    punctualityrate decimal(4, 2),                  -- punctuality rate
    aircraftcabin   char(1),                        -- cabin class (f = first; c = business; y = economy)
    insertdate      datetime default(getdate())     -- row insertion time
)
因为要爬取所有城市,所以不限制城市,只限制日期范围,即爬取从哪天到哪天的数据。完整脚本如下:
#-*- coding: utf-8 -*-
# python 3.5.0
import json
import time
import random
import datetime
import sqlalchemy
import urllib.request
import pandas as pd
from operator import itemgetter
from dateutil.parser import parse
class flight(object):
    """Crawl Ctrip flight search results and append them to a SQL Server table.

    For every date in a range and every ordered pair of distinct cities in
    ``self.city``, the crawler requests the Ctrip JSON search endpoint, turns
    each entry of the response's ``"fis"`` list into one row, and appends the
    rows to the ``kkflight`` table through ``self.engine``.
    """

    # Ctrip city code -> Chinese city name.
    # The original source kept one extra entry out of the mapping:
    # {"kji": "布尔津"}
    CITY = {
        "aat": "阿勒泰", "acx": "兴义", "aeb": "百色", "aku": "阿克苏", "aog": "鞍山", "aqg": "安庆", "ava": "安顺", "axf": "阿拉善左旗", "bav": "包头", "bfj": "毕节", "bhy": "北海",
        "bjs": "北京", "bpe": "秦皇岛", "bpl": "博乐", "bpx": "昌都", "bsd": "保山", "can": "广州", "cde": "承德", "cgd": "常德", "cgo": "郑州", "cgq": "长春", "chg": "朝阳", "cif": "赤峰",
        "cih": "长治", "ckg": "重庆", "csx": "长沙", "ctu": "成都", "cwj": "沧源", "cyi": "嘉义", "czx": "常州", "dat": "大同", "dax": "达县", "dbc": "白城", "dcy": "稻城", "ddg": "丹东",
        "dig": "香格里拉(迪庆)", "dlc": "大连", "dlu": "大理", "dnh": "敦煌", "doy": "东营", "dqa": "大庆", "dsn": "鄂尔多斯", "dyg": "张家界", "ejn": "额济纳旗", "enh": "恩施",
        "eny": "延安", "erl": "二连浩特", "foc": "福州", "fug": "阜阳", "fuo": "佛山", "fyj": "抚远", "goq": "格尔木", "gys": "广元", "gyu": "固原", "hak": "海口", "hdg": "邯郸",
        "hek": "黑河", "het": "呼和浩特", "hfe": "合肥", "hgh": "杭州", "hia": "淮安", "hjj": "怀化", "hkg": "香港", "hld": "海拉尔", "hlh": "乌兰浩特", "hmi": "哈密", "hpg": "神农架",
        "hrb": "哈尔滨", "hsn": "舟山", "htn": "和田", "huz": "惠州", "hyn": "台州", "hzg": "汉中", "hzh": "黎平", "inc": "银川", "iqm": "且末", "iqn": "庆阳", "jdz": "景德镇",
        "jgd": "加格达奇", "jgn": "嘉峪关", "jgs": "井冈山", "jhg": "西双版纳", "jic": "金昌", "jiq": "黔江", "jiu": "九江", "jjn": "晋江", "jmj": "澜沧", "jmu": "佳木斯", "jng": "济宁",
        "jnz": "锦州", "jsj": "建三江", "juh": "池州", "juz": "衢州", "jxa": "鸡西", "jzh": "九寨沟", "kca": "库车", "kgt": "康定", "khg": "喀什", "khn": "南昌", "kjh": "凯里", "kmg": "昆明",
        "knh": "金门", "kow": "赣州", "krl": "库尔勒", "kry": "克拉玛依", "kwe": "贵阳", "kwl": "桂林", "lcx": "龙岩", "lds": "伊春", "lfq": "临汾", "lhw": "兰州", "ljg": "丽江", "llb": "荔波",
        "llf": "永州", "llv": "吕梁", "lnj": "临沧", "lpf": "六盘水", "lum": "芒市", "lxa": "拉萨", "lya": "洛阳", "lyg": "连云港", "lyi": "临沂", "lzh": "柳州", "lzo": "泸州",
        "lzy": "林芝", "mdg": "牡丹江", "mfk": "马祖", "mfm": "澳门", "mig": "绵阳", "mxz": "梅州", "nao": "南充", "nbs": "白山", "ndg": "齐齐哈尔", "ngb": "宁波", "ngq": "阿里",
        "nkg": "南京", "nlh": "宁蒗", "nng": "南宁", "nny": "南阳", "ntg": "南通", "nzh": "满洲里", "ohe": "漠河", "pzi": "攀枝花", "rht": "阿拉善右旗", "riz": "日照", "rkz": "日喀则",
        "rlk": "巴彦淖尔", "sha": "上海", "she": "沈阳", "sia": "西安", "sjw": "石家庄", "swa": "揭阳", "sym": "普洱", "syx": "三亚", "szx": "深圳", "tao": "青岛", "tcg": "塔城", "tcz": "腾冲",
        "ten": "铜仁", "tgo": "通辽", "thq": "天水", "tlq": "吐鲁番", "tna": "济南", "tsn": "天津", "tvs": "唐山", "txn": "黄山", "tyn": "太原", "urc": "乌鲁木齐", "uyn": "榆林", "wef": "潍坊",
        "weh": "威海", "wmt": "遵义(茅台)", "wnh": "文山", "wnz": "温州", "wua": "乌海", "wuh": "武汉", "wus": "武夷山", "wux": "无锡", "wuz": "梧州", "wxn": "万州", "xfn": "襄阳", "xic": "西昌",
        "xil": "锡林浩特", "xmn": "厦门", "xnn": "西宁", "xuz": "徐州", "ybp": "宜宾", "ycu": "运城", "yic": "宜春", "yie": "阿尔山", "yih": "宜昌", "yin": "伊宁", "yiw": "义乌", "ynj": "延吉",
        "ynt": "烟台", "ynz": "盐城", "yty": "扬州", "yus": "玉树", "yzy": "张掖", "zat": "昭通", "zha": "湛江", "zhy": "中卫", "zqz": "张家口", "zuh": "珠海", "zyi": "遵义(新舟)",
    }

    # Pool of User-Agent values; one is picked at random for each request.
    # The original list was missing the comma after the eighth entry, which
    # fused two strings into a single bogus header value.
    # NOTE(review): every string here is all-lowercase, which may not be the
    # casing real browsers send - confirm against the intended values.
    USER_AGENTS = (
        "mozilla/5.0 (windows nt 6.3; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/66.0.3359.139 safari/537.36",
        "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.7 (khtml, like gecko) chrome/16.0.912.36 safari/535.7",
        "mozilla/5.0 (windows nt 6.2; win64; x64; rv:16.0) gecko/16.0 firefox/16.0",
        "mozilla/5.0 (macintosh; intel mac os x 10_7_3) applewebkit/534.55.3 (khtml, like gecko) version/5.1.3 safari/534.53.10",
        "mozilla/5.0 (compatible; msie 9.0; windows nt 6.1; win64; x64; trident/5.0; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 2.0.50727; media center pc 6.0)",
        "mozilla/5.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0; wow64; trident/4.0; slcc2; .net clr 2.0.50727; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 1.0.3705; .net clr 1.1.4322)",
        "mozilla/5.0 (windows nt 6.2; wow64) applewebkit/537.36 (khtml, like gecko) chrome/27.0.1500.55 safari/537.36",
        "mozilla/5.0 (macintosh; intel mac os x 10_8_2) applewebkit/537.17 (khtml, like gecko) chrome/24.0.1309.0 safari/537.17",
        "mozilla/5.0 (windows nt 6.1; win64; x64; rv:2.0b13pre) gecko/20110307 firefox/4.0b13pre",
        "mozilla/5.0 (x11; ubuntu; linux x86_64; rv:16.0) gecko/20100101 firefox/16.0",
        "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.11 (khtml, like gecko) chrome/23.0.1271.64 safari/537.11",
    )

    # Column order of the rows appended to the kkflight table.
    COLUMNS = (
        'itinerardate', 'airline', 'airlinecode', 'flightnumber', 'flightnumbers', 'aircraft', 'aircraftsize',
        'airporttax', 'fueloiltax', 'fromcity', 'fromcitycode', 'fromairport', 'fromterminal', 'fromdatetime',
        'tocity', 'tocitycode', 'toairport', 'toterminal', 'todatetime', 'durationhour', 'durationminute',
        'duration', 'currency', 'ticketprices', 'discount', 'punctualityrate', 'aircraftcabin',
    )

    def __init__(self):
        """Set up the database engine and the per-instance crawl state."""
        self.airline = {}  # airline code -> airline name, accumulated from responses
        self.engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@hzc/myspider")
        self.url = ''      # URL of the request currently being made
        self.headers = {}  # headers of the request currently being made
        # Per-instance copies, so callers may edit them without touching the class.
        self.city = dict(self.CITY)
        self.useragent = list(self.USER_AGENTS)

    def set_url_headers(self, startdate, enddate):
        """Crawl every city pair for each day from startdate to enddate inclusive.

        Args:
            startdate: First day to crawl, as a 'YYYY-MM-DD' string.
            enddate: Last day to crawl, as a 'YYYY-MM-DD' string.

        Raises:
            ValueError: If either date string does not match 'YYYY-MM-DD'.
        """
        # %Y is the 4-digit year; %y would reject inputs such as '2018-06-16'.
        current = datetime.datetime.strptime(startdate, '%Y-%m-%d')
        last = datetime.datetime.strptime(enddate, '%Y-%m-%d')
        # The mapping is not modified while crawling, so sort it only once.
        cities = sorted(self.city.items(), key=itemgetter(0))
        while current <= last:
            today = current.strftime('%Y-%m-%d')
            for fromcode, fromcity in cities:
                for tocode, tocity in cities:
                    if fromcode == tocode:
                        continue
                    self.url = 'http://flights.ctrip.com/domesticsearch/search/searchfirstrouteflights?dcity1=%s&acity1=%s&searchtype=s&ddate1=%s&isnearairportrecommond=0&logtoken=027e478a47494975ad74857b18283e12&rk=4.381066884522498182534&ck=9fc7881e8f373585c0e5f89152bc143d&r=0.24149333708195565406316' % (fromcode, tocode, today)
                    self.headers = {
                        "host": "flights.ctrip.com",
                        "user-agent": random.choice(self.useragent),
                        "referer": "https://flights.ctrip.com/booking/%s-%s-day-1.html?ddate1=%s" % (fromcode, tocode, today),
                        "connection": "keep-alive",
                    }
                    print("%s : %s(%s) ==> %s(%s) " % (today, fromcity, fromcode, tocity, tocode))
                    try:
                        self.get_parse_json_data(today)
                    except (OSError, ValueError) as err:
                        # Network failures (URLError and timeouts are OSError) and
                        # undecodable or non-JSON bodies (ValueError) should cost
                        # one route, not the whole crawl.
                        print("skipped %s -> %s on %s: %r" % (fromcode, tocode, today, err))
                # Pause once per departure city to reduce throttling.
                time.sleep(10)
            current += datetime.timedelta(days=1)

    @staticmethod
    def _parse_body(body):
        """Decode a response body that wraps a single-quoted, JSON-like payload."""
        # Strip the wrapper characters around the payload, then switch to the
        # double quotes that json.loads requires.
        return json.loads(body.strip("'<>() ").replace("'", '"'))

    def get_one_page_json_data(self):
        """Fetch self.url with self.headers and return the decoded JSON payload."""
        req = urllib.request.Request(self.url, headers=self.headers)
        # Close the connection deterministically instead of leaking it.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = resp.read().decode('gbk')
        return self._parse_body(body)

    def _flight_row(self, data, today):
        """Build one kkflight row (a dict keyed by column name) from a "fis" entry."""
        code = data["alc"].strip()
        # total_seconds() keeps whole days, which timedelta.seconds drops.
        elapsed = int((pd.Timestamp(data["at"]) - pd.Timestamp(data["dt"])).total_seconds())
        hours, remainder = divmod(elapsed, 3600)
        minutes = remainder // 60
        # NOTE(review): "dt"/"at" are stored as departure/arrival time, yet the
        # "a*" fields feed the from-columns and the "d*" fields the to-columns.
        # This looks swapped; kept as in the original - confirm against a
        # real response before changing.
        return {
            'itinerardate': today,                  # itinerary date
            'airline': self.airline.get(code),      # airline name, None if unknown
            'airlinecode': code,                    # airline code
            'flightnumber': data["fn"],             # flight number
            'flightnumbers': data["sdft"],          # code-share (actual) flight number
            'aircraft': data["cf"]["c"],            # aircraft model
            'aircraftsize': data["cf"]["s"],        # aircraft size (l/m/s)
            'airporttax': data["tax"],              # airport construction fee
            'fueloiltax': data["of"],               # fuel surcharge
            'fromcity': data["acn"],                # departure city
            'fromcitycode': data["acc"],            # departure city code
            'fromairport': data["apbn"],            # departure airport
            'fromterminal': data["asmsn"],          # departure terminal
            'fromdatetime': data["dt"],             # departure time
            'tocity': data["dcn"],                  # arrival city
            'tocitycode': data["dcc"],              # arrival city code
            'toairport': data["dpbn"],              # arrival airport
            'toterminal': data["dsmsn"],            # arrival terminal
            'todatetime': data["at"],               # arrival time
            'durationhour': hours,                  # duration, hours part
            'durationminute': minutes,              # duration, minutes part
            'duration': str(hours) + 'h' + str(minutes) + 'm',  # duration as a string
            'currency': None,                       # currency (not collected)
            'ticketprices': data["lp"],             # ticket price
            'discount': None,                       # discount (not collected)
            'punctualityrate': None,                # punctuality rate (not collected)
            'aircraftcabin': None,                  # cabin class (not collected)
        }

    def get_parse_json_data(self, today):
        """Fetch the current page, convert its flights to rows and store them.

        Args:
            today: Itinerary date ('YYYY-MM-DD') recorded on every row.
        """
        jsondata = self.get_one_page_json_data()
        flights = jsondata.get("fis")
        if flights:
            # Remember airline code -> name pairs; names already seen are kept.
            for code, name in (jsondata.get("als") or {}).items():
                self.airline.setdefault(code, name)
            # Build the frame once; growing it cell by cell via df.loc is slow.
            rows = [self._flight_row(item, today) for item in flights]
            df = pd.DataFrame(rows, columns=list(self.COLUMNS))
            df.to_sql("kkflight", self.engine, index=False, if_exists='append')
        print("done!~")
if __name__ == "__main__":
    # Crawl all city pairs for a single day; widen the range to crawl more days.
    fly = flight()
    fly.set_url_headers('2018-06-16', '2018-06-16')
总结
以上就是小编给大家介绍的用 Python 爬取携程所有机票的方法,希望对大家有所帮助。如果大家有任何疑问,请给我留言,小编会及时回复。在此也非常感谢大家对服务器之家网站的支持!
原文链接:https://blog.csdn.net/kk185800961/article/details/80638035