python 豆瓣采集

时间:2022-01-07 03:37:45

作者是今天刚开始学 Python 的新手,代码有些凌乱,只能算是给新手参考,见谅。

这是一个简单版本的豆瓣美图采集脚本:每隔 60 秒抓取一次小组帖子列表,并下载新帖子中的图片。

python 豆瓣采集

python 3.4

sqlite3

BeautifulSoup 4.4

import os
import sched
import sqlite3
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
import sys

# Open (or create) the SQLite database that remembers which topics have
# already been scraped.  ``cx`` (connection) and ``cu`` (cursor) are
# module-level globals shared with init().
cx = sqlite3.connect('c:\\sqlite\\test.db')
cu = cx.cursor()


def ensure_caiji_table(cursor):
    """Create the ``caiji`` table through *cursor* unless it already exists.

    The table is looked up by name in ``sqlite_master`` rather than by
    scanning the list of all tables, so it is also created on a brand-new
    database that contains no tables at all, and no second ``create table``
    is attempted when other tables sort before ``caiji``.

    Args:
        cursor: an open ``sqlite3`` cursor.

    Returns:
        True if the table had to be created, False if it was already there.
    """
    cursor.execute(
        "select name from sqlite_master where type='table' and name=?",
        ('caiji',),
    )
    if cursor.fetchone() is not None:
        print("存在")
        return False
    print("表不存在,开始创建")
    cursor.execute("create table caiji (id INTEGER PRIMARY KEY AUTOINCREMENT,pid integer,nickname text NULL); ")
    return True


ensure_caiji_table(cu)
# Root directory for downloaded images; one sub-directory per day is created
# below it, named with ISOTIMEFORMAT (e.g. 20220107).
path = "d:\\imgs\\"
ISOTIMEFORMAT = '%Y%m%d'


def dwonloadimg(uri, dest_root=None):
    """Download the resource at *uri* into today's image directory.

    The file is stored as ``<root>/<YYYYMMDD>/<last path segment of uri>``;
    the dated directory is created on demand.

    Args:
        uri: URL of the image to fetch.
        dest_root: directory to store into; defaults to the module-level
            ``path`` when omitted.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: if *uri* has no file name in its path (e.g. it ends
            with ``/``), checked before any network access.
    """
    # Use only the URL's path component so a "?query" or "#fragment" never
    # ends up in the file name ('?' is not allowed in Windows file names).
    name = urllib.parse.urlsplit(uri).path.rsplit("/", 1)[-1]
    if not name:
        raise ValueError("no file name in image URL: %r" % (uri,))

    root = path if dest_root is None else dest_root
    day_dir = os.path.join(root, time.strftime(ISOTIMEFORMAT, time.localtime()))
    os.makedirs(day_dir, exist_ok=True)

    target = os.path.join(day_dir, name)
    # Context managers close both the HTTP response and the file even when
    # the read or the write fails half-way.
    with urllib.request.urlopen(uri) as conn, open(target, 'wb') as out:
        out.write(conn.read())
    return target


def Getarticle1(uri):
    """Download every topic image found on the page at *uri*.

    Fetches the page, finds each ``<div class="topic-figure cc">`` element
    and passes the ``src`` of the ``<img>`` inside it to dwonloadimg().
    Figures without an ``<img>`` or without a ``src`` are skipped.

    Args:
        uri: URL of the topic page to scan.
    """
    with urllib.request.urlopen(uri) as res:
        # Undecodable bytes are replaced rather than aborting the whole page.
        html = res.read().decode('utf-8', errors='replace')
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')
    for figure in bs.find_all('div', class_="topic-figure cc"):
        img = figure.find('img')
        src = img.get('src') if img is not None else None
        if not src:
            continue
        print('图片:', src)
        dwonloadimg(src)
def init():
    """Run one scraping pass over the Douban group's topic list.

    Fetches the group page, and for every topic link in a
    ``<td class="title">`` cell that is not yet recorded in the ``caiji``
    table: records it (in the ``nickname`` column), commits, and calls
    Getarticle1() to download the topic's images.

    Uses the module-level ``cu`` cursor and ``cx`` connection.
    """
    print('开始抓取')
    url = "http://www.douban.com/group/haixiuzu/"
    with urllib.request.urlopen(url) as temp:
        # Undecodable bytes are replaced rather than aborting the pass.
        html = temp.read().decode('utf-8', errors='replace')
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')
    for cell in bs.find_all('td', class_='title'):
        link = cell.a
        if link is None or not link.get("href"):
            continue  # title cell without a usable link
        uri = link["href"]
        # The href comes from a remote page: bind it as a parameter instead
        # of splicing it into the SQL text.
        cu.execute('select * from caiji where nickname=?', (uri,))
        if not cu.fetchall():
            print("新文章")
            cx.execute("insert into caiji(nickname) values(?)", (uri,))
            cx.commit()
            Getarticle1(uri)
    print("结束")


# Poll the group once a minute, forever, when executed as a script.
if __name__ == "__main__":
    while True:
        init()
        time.sleep(60)