空气质量数据下载

时间:2021-01-07 22:26:42

从pm25.in网站下载全国各个城市、各个站点逐小时空气质量数据。Github地址:https://github.com/lixiang-ucas/PM25

使用Urllib2和BeautifulSoup库进行数据抓取

 getUrlRespHtml是下载函数,设置了一些HTTP头

 getCities获取城市列表 

getCurrentHour获取当前一个小时的数据

__name__入口函数,每小时执行一次getCurrentHour


# -*- coding: utf-8 -*-
"""
Created on Tue Jan 26 14:28:14 2016


@author: lixiang
"""


import urllib2
import socket
import time
from bs4 import BeautifulSoup    #besutifulsoup的第三版  


import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )


sleep_time=30


def getUrlRespHtml(url):
    heads = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
            'Accept-Charset':'GB2312,utf-8;q=0.7,*;q=0.7', 
            'Accept-Language':'zh-cn,zh;q=0.5', 
            'Cache-Control':'max-age=0', 
            'Connection':'keep-alive', 
            'Host':'John', 
            'Keep-Alive':'115', 
            'Referer':url, 
            'User-Agent':'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.14) Gecko/20110221 Ubuntu/10.10 (maverick) Firefox/3.6.14'}
 
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    urllib2.install_opener(opener) 
    req = urllib2.Request(url)
    opener.addheaders = heads.items()
#    respHtml = opener.open(req).read()
#    return respHtml.decode('gbk').encode('utf-8')
    try:
        respHtml = opener.open(req).read()
    except urllib2.HTTPError, e:
        print e.code
    return respHtml


def getCities():
    site='http://pm25.in'
    html = getUrlRespHtml(site)
    soup = BeautifulSoup(html) 
    hrefs = soup.find("div",{"class":"all"}).find_all('a')
    cs=[]
    for href in hrefs:
        c=href['href']
        c=c[1:len(c)]
        cs.append(c)
#    cs=cs[0:10]
    return list(set(cs))
    
def getCurrentHour():
    socket.setdefaulttimeout(60) # 60 秒钟后超时
    file_c = open('city.txt','w')
    file_s = open('site.txt','w')
#    cities=['beijing','tianjin','shanghai']
    cities=getCities()
    print 'city count is:'+str(len(cities))
#    city='tianjin'
    for city in cities:
        ind=cities.index(city)
        if ind%50==0:
            time.sleep(sleep_time)
        site ='http://pm25.in/' + city + '.html'  
        print str(ind)+' '+site
    #    html = urllib2.urlopen(site)  
        html = getUrlRespHtml(site)
        soup = BeautifulSoup(html) 
        #时间        
        dt = soup.find('div',{'class':'live_data_time'}).find('p').string
        dt = dt.split(':')[1]
        #城市数据
        cityAQI = soup.find_all("div",{"class":"value"})        
        file_c.write(city+'\t')
        file_c.write(dt+'\t')
        for ind_aqi in range(len(cityAQI)-1):
            aqi=cityAQI[ind_aqi]
            temp='-'
            if aqi.string!=None:
                file_c.write(aqi.string.split()[0]+'\t')
            else:
                file_c.write(temp+'\t')
        file_c.write('\n')
        #各个站点数据
        trs=soup.find_all('tr')
        for ind_s in range(1,len(trs)):
            file_s.write(city+'\t')
            file_s.write(dt+'\t')
            tr=trs[ind_s]
            siteAQI=tr.find_all('td')
            for aqi in siteAQI:
                temp='-'
                if aqi.string!=None:
                    file_s.write(aqi.string.split()[0]+'\t')
                else:
                    file_s.write(temp+'\t')
            file_s.write('\n')
        time.sleep(sleep_time) 
    file_c.close()
    file_s.close()


if __name__ == '__main__': 
    while True:
        getCurrentHour()
        time.sleep(3600)