用 Python 抓取最新博客内容并生成 RSS

OSC（开源中国）的 RSS 不是全文输出的，不开心，所以就有了下面这个用 Python 抓取 OSC 最新博客并生成全文 RSS 的脚本：
									# -*- coding: utf-8 -*-

									from bs4 import BeautifulSoup

									import urllib2

									import datetime

									import time

									import PyRSS2Gen

									from email.Utils import formatdate

									import re

									import sys

									import os

									# Python 2 only: make UTF-8 the process-wide default codec used for
									# implicit str <-> unicode conversions (the scraped pages are Chinese).
									# sys.setdefaultencoding is removed from the sys module at interpreter
									# start-up, so the module has to be reloaded to get it back.
									reload(sys)

									sys.setdefaultencoding('utf-8')

									class RssSpider():
									    """Scrape the latest OSChina blog posts into a full-text RSS 2.0 feed.

									    Usage: create an instance, call getcontent() to download every post
									    linked from the blog index, then call SaveRssFile() to write the XML.

									    Attributes:
									        myrss: the PyRSS2Gen.RSS2 channel that collects the feed items.
									        xmlpath: filesystem path the finished feed is written to.
									        baseurl: URL of the blog index page that lists recent posts.
									    """

									    # Publication timestamp as it appears in a post's "BlogStat" block:
									    # 4-digit year, 2-digit month and day, whitespace, 2-digit hour and
									    # minute, each pair separated by a single non-space character.
									    _TIMESTAMP_RE = re.compile(
									        r'(\d{4})\S(\d{2})\S(\d{2})\s(\d{2})\S(\d{2})')

									    def __init__(self):
									        """Create an empty feed channel and set the source URL / output path."""
									        self.myrss = PyRSS2Gen.RSS2(
									            title='OSChina',
									            link='http://my.oschina.net',
									            description=str(datetime.date.today()),
									            pubDate=datetime.datetime.now(),
									            lastBuildDate=datetime.datetime.now(),
									            items=[]
									        )
									        self.xmlpath = r'/var/www/myrss/oschina.xml'
									        self.baseurl = "http://www.oschina.net/blog"

									    def useragent(self, url):
									        """Download *url* with browser-like headers and return the raw body.

									        Args:
									            url: absolute URL to fetch.

									        Returns:
									            The response body exactly as read from the connection.

									        Raises:
									            urllib2.URLError: if the request fails (propagated unchanged).
									        """
									        # The User-Agent must be one single-line value; adjacent literals are
									        # concatenated so the source line can stay short.
									        i_headers = {
									            "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
									                           "AppleWebKit/537.36 (KHTML, like Gecko) "
									                           "Chrome/36.0.1985.125 Safari/537.36"),
									            "Referer": 'http://baidu.com/',
									        }
									        req = urllib2.Request(url, headers=i_headers)
									        response = urllib2.urlopen(req)
									        try:
									            return response.read()
									        finally:
									            # Release the connection even when read() raises.
									            response.close()

									    @staticmethod
									    def _extract_pubdate(text):
									        """Return the first timestamp found in *text* as a datetime.

									        Args:
									            text: text expected to contain something like
									                "2014-08-12 10:30" (any single non-space separator works).

									        Returns:
									            A naive datetime built from the matched digits, or None when
									            no timestamp is present or the digits are not a valid date.
									        """
									        match = RssSpider._TIMESTAMP_RE.search(text)
									        if match is None:
									            return None
									        try:
									            return datetime.datetime(*[int(part) for part in match.groups()])
									        except ValueError:
									            # Digits matched the shape but not a real date (e.g. month 13).
									            return None

									    def enterpage(self, url):
									        """Fetch one blog post and convert it into a feed item.

									        Args:
									            url: URL of the blog post page.

									        Returns:
									            A PyRSS2Gen.RSSItem whose description is the HTML of the post's
									            "BlogContent" div and whose pubDate is the timestamp found in
									            the "BlogStat" div, or the current time when none is found.
									        """
									        rsp = self.useragent(url)
									        soup = BeautifulSoup(rsp)

									        stat = soup.find('div', {'class': 'BlogStat'})
									        stat_text = str(stat).strip().replace('\n', '').decode('utf-8')
									        # A datetime (not a string) is handed to PyRSS2Gen so that it can
									        # render the date itself.
									        # NOTE(review): the scraped time is presumably site-local time,
									        # while PyRSS2Gen labels datetimes as GMT - confirm the offset.
									        published = self._extract_pubdate(stat_text)
									        if published is None:
									            published = datetime.datetime.now()

									        # Fall back to the URL when the page has no <title>.
									        ititle = soup.title.string if soup.title is not None else url

									        content = soup.find('div', {'class': 'BlogContent'})
									        # Avoid publishing the literal text "None" when the body is missing.
									        body_html = str(content) if content is not None else ''

									        return PyRSS2Gen.RSSItem(
									            title=ititle,
									            link=url,
									            description=body_html,
									            pubDate=published
									        )

									    def getcontent(self):
									        """Read the blog index and append one feed item per linked post.

									        Every <li> inside the element with id "RecentBlogs" is inspected;
									        the first link inside its first <div> is fetched via enterpage()
									        and the resulting item is appended to self.myrss.items. Entries
									        without a <div>, a link or an href are skipped.
									        """
									        rsp = self.useragent(self.baseurl)
									        soup = BeautifulSoup(rsp)
									        recent = soup.find('div', {'id': 'RecentBlogs'})
									        if recent is None:
									            # Nothing to iterate over (container missing from the page).
									            return

									        for li in recent.findAll('li'):
									            div = li.find('div')
									            if div is None:
									                continue
									            alink = div.find('a')
									            if alink is None:
									                continue
									            link = alink.get('href')
									            if not link:
									                continue
									            # Progress output; the single-argument call form prints the
									            # same text under Python 2 as the print statement did.
									            print(link)
									            self.myrss.items.append(self.enterpage(link))

									    def SaveRssFile(self, filename):
									        """Serialise the collected feed as UTF-8 XML and write it to disk.

									        Args:
									            filename: accepted for interface compatibility but not used;
									                the feed is always written to self.xmlpath.
									        """
									        finallxml = self.myrss.to_xml(encoding='utf-8')
									        # "with" closes the file even if the write fails.
									        with open(self.xmlpath, 'w') as out:
									            out.write(finallxml)

									if __name__ == '__main__':
									    # Script entry point: download the current posts, then write the feed.
									    rssSpider = RssSpider()
									    rssSpider.getcontent()
									    # The feed is written to the spider's own xmlpath attribute.
									    rssSpider.SaveRssFile('oschina.xml')
以上所述就是本文的全部内容了，希望大家能够喜欢。
秒客网

python抓取最新博客内容并生成Rss

相关文章