本文实例为大家分享了用 Python 爬虫获取小区经纬度和地址的具体代码,供大家参考,具体内容如下:
通过小区名称调用百度地图 API 可以获取小区的地址以及经纬度。但是由于 API 返回的地址格式并不统一,所以可以先用小区名称进行一轮爬取,获取小区的经纬度,然后再利用经纬度做逆地理编码(Reverse),得到小区的结构化地址。另外,如果小区名称形如“...号”,可以在爬取开始之前在“号”之后加一个“院”字,得到的结果精确度更高。这次写的程序更加便于二次利用,只需要给程序传入一个 DataFrame 就可以坐等结果了。现在程序已经写好了,就等接下来在工作中看看效果如何了。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
class GetAddressInfo:
    """Look up coordinates and an address for each residential compound.

    For every row of the wrapped DataFrame the compound ``name`` is searched
    within ``city`` through the Baidu Place search endpoint, and the first
    latitude / longitude / address found in the response is written back to
    the frame.

    Lookups are best-effort: a row whose request or parsing fails gets
    ``0``, ``0`` and ``'None'`` instead of raising.
    """

    # Baidu API key ("ak"). Replace with your own key, or pass ``ak`` to the
    # constructor.
    my_ak = ''

    # Seconds to wait for a single HTTP request; without a limit one stalled
    # connection would block the whole crawl.
    timeout = 10

    def __init__(self, df, ak=None):
        """Store the frame to enrich.

        Args:
            df: pandas DataFrame with at least the columns ``'name'`` and
                ``'city'``.
            ak: Optional Baidu API key; overrides the class-level ``my_ak``.

        Raises:
            AssertionError: If ``df`` is not a DataFrame or lacks one of the
                required columns.
        """
        import pandas

        valid = (
            isinstance(df, pandas.DataFrame)
            and 'city' in df.columns
            and 'name' in df.columns
        )
        if not valid:
            # Raised explicitly rather than through ``assert`` so the check
            # is not stripped when Python runs with -O.
            raise AssertionError('The dataframe is not vailid')
        self.__data__ = df
        if ak is not None:
            self.my_ak = ak

    def get_address(self):
        """Query every row and add the result columns to the frame.

        Adds (or overwrites) the columns ``'小区经度'`` (longitude),
        ``'小区纬度'`` (latitude) and ``'小区地址'`` (address) on the wrapped
        DataFrame, in place.

        Returns:
            The same DataFrame that was passed to the constructor.
        """
        import numpy as np

        frame = self.__data__
        found = [
            self.__get_neigbour_address__(name, city)
            for name, city in zip(frame['name'], frame['city'])
        ]
        # Whole columns are assigned at once with an explicit dtype: the
        # address column holds strings, so it must not start out as float NaN.
        frame['小区经度'] = np.asarray([row[1] for row in found], dtype=float)
        frame['小区纬度'] = np.asarray([row[0] for row in found], dtype=float)
        frame['小区地址'] = np.asarray([row[2] for row in found], dtype=object)
        return frame

    def __first_number__(self, key, res):
        """Return the first numeric value stored under ``key`` in ``res``, or 0."""
        import re

        import pandas as pd

        # Match only the number itself so the result does not depend on
        # whether the JSON text is pretty-printed or compact.
        pattern = '"' + key + r'":\s*(-?\d+(?:\.\d+)?)'
        try:
            return pd.to_numeric(re.findall(pattern, res)[0])
        except (IndexError, TypeError, ValueError):
            return 0

    def __lat__(self, res):
        """Return the first ``"lat"`` value in the response text, or 0."""
        return self.__first_number__('lat', res)

    def __lng__(self, res):
        """Return the first ``"lng"`` value in the response text, or 0."""
        return self.__first_number__('lng', res)

    def __address__(self, res):
        """Return the first ``"address"`` string in the response text, or 'None'."""
        import re

        try:
            # Non-greedy: stop at the closing quote of this field instead of
            # running on to the last quote on the line.
            return re.findall(r'"address":\s*"(.*?)"', res)[0]
        except (IndexError, TypeError):
            return 'None'

    def __build_url__(self, name, city):
        """Build the Place search URL for ``name`` restricted to ``city``."""
        import urllib.parse as urp

        return (
            'http://api.map.baidu.com/place/v2/search?query=' + urp.quote(name)
            + '&tag=' + urp.quote('住宅区')
            + '&region=' + urp.quote(city)
            + '&output=json&ak=' + self.my_ak
        )

    def __get_neigbour_address__(self, name, city):
        """Fetch one compound and return ``(lat, lng, address)``.

        Returns ``(0, 0, 'None')`` when the URL cannot be built or the
        request fails.
        """
        from urllib import request

        try:
            url = self.__build_url__(name, city)
            with request.urlopen(url, timeout=self.timeout) as req:
                res = req.read().decode()
        except Exception:
            # Best-effort by design: one bad row must not abort the crawl.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            return 0, 0, 'None'
        return self.__lat__(res), self.__lng__(res), self.__address__(res)
class ReverseGetAddress:
    """Turn stored coordinates back into an address string.

    For every row of the wrapped DataFrame a Baidu geocoder URL is built from
    the ``'小区纬度'`` (latitude) and ``'小区经度'`` (longitude) columns, the URL
    is fetched, and the first ``address":"…"`` value in the response is stored
    in ``'小区地址'``.

    Lookups are best-effort: a row whose request or parsing fails gets the
    string ``'None1'`` instead of raising.
    """

    # Baidu API key ("ak"). Replace with your own key, or pass ``ak`` to the
    # constructor.
    my_ak = ''

    # Seconds to wait for a single HTTP request; without a limit one stalled
    # connection would block the whole run.
    timeout = 10

    def __init__(self, data, ak=None):
        """Store the frame to enrich.

        Args:
            data: DataFrame with at least the columns ``'小区纬度'``,
                ``'小区经度'`` and ``'name'``.
            ak: Optional Baidu API key; overrides the class-level ``my_ak``.

        Raises:
            AssertionError: If one of the required columns is missing.
        """
        valid = (
            '小区纬度' in data.columns
            and '小区经度' in data.columns
            and 'name' in data.columns
        )
        if not valid:
            # Raised explicitly rather than through ``assert`` so the check
            # is not stripped when Python runs with -O.
            raise AssertionError('The DataFrame is not vailid')
        self.__data__ = data
        if ak is not None:
            self.my_ak = ak

    def __parse_address__(self, res):
        """Return the first ``address":"…"`` value in ``res``, or 'None1'."""
        import re

        # The pattern has no leading quote, so it also matches keys that
        # merely end in "address".
        hits = re.findall('address":"(.*?)"', res)
        return hits[0] if hits else 'None1'

    def __get_address1__(self, url):
        """Fetch ``url`` and return the address found in it, or 'None1'."""
        from urllib import request

        try:
            with request.urlopen(url, timeout=self.timeout) as req:
                res = req.read().decode()
            return self.__parse_address__(res)
        except Exception:
            # Best-effort by design: one bad row must not abort the run.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            return 'None1'

    def __to_string__(self, arr):
        """Return ``str(arr)``; used to stringify coordinate values."""
        return str(arr)

    def __get_address2__(self):
        """Build one request URL per row.

        Returns:
            A Series of URL strings aligned with the wrapped DataFrame.
        """
        base_url1 = 'http://api.map.baidu.com/geocoder/v2/?callback=renderReverse'
        base_url2 = '&location='
        # NOTE(review): ``pois`` appears twice (0 and 1) in this query string;
        # kept exactly as published - confirm which value is intended.
        base_url3 = '&pois=0&radius=1&output=json&pois=1&ak='
        lat_text = self.__data__['小区纬度'].apply(self.__to_string__)
        lng_text = self.__data__['小区经度'].apply(self.__to_string__)
        # str + Series concatenates element-wise, giving one URL per row.
        url = base_url1 + base_url2 + lat_text + ',' + lng_text + base_url3 + self.my_ak
        return url

    def get_address(self):
        """Fetch every row's URL and store the result in ``'小区地址'``.

        The wrapped DataFrame is modified in place.

        Returns:
            The same DataFrame that was passed to the constructor.
        """
        url = self.__get_address2__()
        self.__data__['小区地址'] = url.apply(self.__get_address1__)
        return self.__data__
|
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/weixin_41968760/article/details/80677954