python 豆瓣采集

本人是新手，今天刚开始学 Python，代码有点凌乱，只能算是给其他新手参考，见谅。

这是一个简单版本的豆瓣小组美图采集脚本：定时抓取小组里的新帖并下载其中的图片，美女天天有，有木有～

python 豆瓣采集

python 3.4

sqlite3

BeautifulSoup 4.4

 from bs4 import BeautifulSoup

 import urllib.request

 import time,sched,os

 import sqlite3

 import sys

 #sys.exit()

 # Open (or create) the SQLite database that records which topic URLs have
 # already been crawled. `cx` is the connection, `cu` a shared cursor; both
 # are module-level names used by init().
 cx = sqlite3.connect('c:\\sqlite\\test.db')
 cu = cx.cursor()

 # Ask sqlite_master directly whether the `caiji` table exists. Looping over
 # every table name (as before) never created the table on an empty database
 # and tried to create it twice when another table sorted before it.
 cu.execute(
     "select name from sqlite_master where type='table' and name=?",
     ('caiji',),
 )
 if cu.fetchone() is None:
     print("表不存在，开始创建")
     cu.execute(
         "create table caiji ("
         "id INTEGER PRIMARY KEY AUTOINCREMENT,"
         "pid integer,"
         "nickname text NULL); "
     )
     cx.commit()
 else:
     print("存在")

 #t=('grmlmgjsadf',)

 #cx.execute("insert into caiji(nickname) values(?)",t)

 #cx.commit()

 #cu.execute('select * from caiji where nickname=\''+'grmlmgjsadf'+'\'')

 #if cu.fetchall():

 #    print('dsa')

 #cu.close()

 #cx.close()

 # Root directory for downloaded images; dwonloadimg() appends a per-day
 # sub-folder name to this string, so it must end with a path separator.
 path="d:\\imgs\\"

 # strftime pattern (YYYYMMDD) used by dwonloadimg() to name the per-day sub-folder.
 ISOTIMEFORMAT='%Y%m%d'

 def dwonloadimg(uri):
     """Download the resource at *uri* into today's folder under ``path``.

     The target folder is ``path`` followed by the current local date
     formatted with ``ISOTIMEFORMAT``; it is created when missing. The file
     name is everything after the last ``/`` in *uri*.

     Args:
         uri: Absolute URL of the image to download.

     Raises:
         urllib.error.URLError: If the request fails.
         OSError: If the folder or file cannot be created or written.
     """
     folder = path + time.strftime(ISOTIMEFORMAT, time.localtime())
     # exist_ok avoids the check-then-create race of os.path.exists().
     os.makedirs(folder, exist_ok=True)

     name = uri[uri.rfind("/") + 1:]

     # Read the whole body before opening the output file, so a failed
     # download does not leave an empty file behind; the context managers
     # close both handles even when an exception is raised.
     with urllib.request.urlopen(uri) as conn:
         data = conn.read()
     with open(folder + '\\' + name, 'wb') as f:
         f.write(data)

 def Getarticle1(uri):
     """Fetch one topic page and download every image found in it.

     Looks for ``<div class="topic-figure cc">`` elements, takes the ``src``
     of the first ``<img>`` inside each, prints it and passes it to
     ``dwonloadimg``.

     Args:
         uri: URL of the topic page.

     Raises:
         urllib.error.URLError: If the page cannot be fetched.
     """
     with urllib.request.urlopen(uri) as res:
         # The original decoded the bytes but threw the result away; keep the
         # decoded text and tolerate stray invalid bytes instead of crashing.
         html = res.read().decode('utf-8', errors='replace')

     # Name the parser explicitly so the result does not depend on which
     # optional parsers happen to be installed.
     bs = BeautifulSoup(html, 'html.parser')

     for s in bs.find_all('div', class_="topic-figure cc"):
         img = s.find('img')
         # Skip figure blocks without a usable <img src=...>.
         if img is None or not img.get('src'):
             continue
         strc = img['src']
         print('图片：', strc)
         dwonloadimg(strc)

 def init():
     """Run one crawl pass over the group's topic list.

     Fetches the group page, and for every ``<td class="title">`` link whose
     URL is not yet stored in the ``caiji`` table (column ``nickname``),
     records the URL and calls ``Getarticle1`` on it.

     Uses the module-level connection ``cx`` and cursor ``cu``. They are only
     read here, so no ``global`` declaration is needed (the original one came
     after the first use of ``cu``, which is a SyntaxError on Python 3.6+).

     Raises:
         urllib.error.URLError: If the group page cannot be fetched.
     """
     print('开始抓取')
     url = "http://www.douban.com/group/haixiuzu/"

     with urllib.request.urlopen(url) as temp:
         html = temp.read().decode('utf-8', errors='replace')

     bs = BeautifulSoup(html, 'html.parser')

     for s in bs.find_all('td', class_='title'):
         link = s.a
         # Skip title cells that carry no link.
         if link is None or not link.get('href'):
             continue
         uri = link['href']

         # Bind the scraped URL as a parameter; it is untrusted page content
         # and must not be spliced into the SQL text.
         cu.execute('select 1 from caiji where nickname=?', (uri,))
         if cu.fetchone() is None:
             print("新文章")
             cx.execute("insert into caiji(nickname) values(?)", (uri,))
             cx.commit()
             Getarticle1(uri)

     print("结束")

 # Poll forever: run one crawl pass, then wait 60 seconds before the next.
 # There is no exit condition; the script runs until it is interrupted or
 # init() raises.
 while True:

     init()

     time.sleep(60)

秒客网

python 豆瓣采集

相关文章