爬虫实战计划的第一篇:淘女郎网站图片爬取
爬虫一般都是从爬取妹子图入门的,哈哈。这个算是最简单的爬虫了,只用了标准库 urllib,也是最入门级别的爬虫方法。
ps:附上GitHub链接:
淘女郎网站图片爬取
代码展示:
# ---------------------------------------------------------------------------
# Crawler helpers for the Taobao "tstar" model-search pages.
# Every request sends the same desktop-browser User-Agent so the site serves
# normal content instead of rejecting urllib's default agent string.
# ---------------------------------------------------------------------------

# Shared User-Agent (previously duplicated verbatim in four functions).
_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/55.0.2883.87 '
               'UBrowser/6.2.3964.2 Safari/537.36')

# BUG FIX: these module-level accumulators are appended to by the get_MM_*
# helpers and read by the __main__ script, but were never defined anywhere
# in the original source, which would raise NameError on first use.
user_name = []
user_city = []
user_height = []
user_weight = []


def _request(url, data=None):
    """Build a urllib Request for *url* carrying the shared User-Agent.

    data, when given, is a bytes POST body (the request becomes a POST).
    """
    req = urllib.request.Request(url, data=data)
    req.add_header('User-Agent', _USER_AGENT)
    return req


def open_url(url):
    """GET *url* and return the raw response body as bytes."""
    return urllib.request.urlopen(_request(url)).read()


def get_MM_Data(url, currentPage=0):
    """POST the model-search form and return the 'searchDOList' records.

    currentPage selects which result page to fetch; the endpoint answers
    with GBK-encoded JSON (the original decode->encode round-trip before
    json.loads was redundant and has been dropped).
    """
    form = {
        'q': '',
        'viewFlag': 'A',
        'sortType': 'default',
        'searchStyle': '',
        'searchRegion': 'city:',
        'searchFansNum': '',
        'currentPage': currentPage,
        'pageSize': '100',
    }
    body = urllib.parse.urlencode(form).encode('utf-8')
    response = urllib.request.urlopen(_request(url, data=body))
    payload = json.loads(response.read().decode('gbk'))
    return payload['data']['searchDOList']


def _collect(records, key, acc):
    """Append records[i][key] for every record to *acc*; return *acc*."""
    for record in records:
        acc.append(record[key])
    return acc


def get_MM_ID(data):
    """Return the list of userId values from the search records."""
    return [record['userId'] for record in data]


def get_MM_Name(data):
    """Accumulate each model's realName into the global user_name list."""
    return _collect(data, 'realName', user_name)


def get_MM_city(data):
    """Accumulate each model's city into the global user_city list."""
    return _collect(data, 'city', user_city)


def get_MM_Height(data):
    """Accumulate each model's height into the global user_height list."""
    return _collect(data, 'height', user_height)


def get_MM_Weight(data):
    """Accumulate each model's weight into the global user_weight list."""
    return _collect(data, 'weight', user_weight)


def get_photos_url(url):
    """Scrape an album-list page and return unique 'album_id=NNN' tokens.

    The tokens keep the 'album_id=' prefix because the caller splices them
    verbatim into the photo-list URL as a whole query parameter.
    """
    page = urllib.request.urlopen(_request(url)).read().decode('gbk')
    # dict.fromkeys de-duplicates while preserving first-seen order,
    # matching the original membership-test loop.
    return list(dict.fromkeys(re.findall(r'album_id=[0-9]*', page)))


def get_photos(url):
    """Fetch an album's photo-list JSON and return full https picture URLs."""
    listing = json.loads(open_url(url).decode('gbk'))
    # picUrl values are protocol-relative ('//img...'); prepend the scheme.
    return ['https:' + item['picUrl'] for item in listing['picList']]
主函数:
if __name__ == '__main__':
    # Fetch page 1 of the model search results.
    url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
    data = get_MM_Data(url, 1)
    if not os.path.exists('D://taoMM'):   # create the download root if missing
        os.mkdir('D://taoMM')
    os.chdir('D://taoMM')
    # get_MM_ID returns the ids; the other four populate the module-level
    # user_* lists that are indexed by position below.
    ID = get_MM_ID(data)
    get_MM_city(data)
    get_MM_Name(data)
    get_MM_Weight(data)
    get_MM_Height(data)
    # BUG FIX: the original iterated range(0, len(ID) - 1), silently
    # skipping the last model on every page.
    for j in range(len(ID)):
        folder_name = user_name[j] + '_' + user_city[j] + '_体重' + str(user_weight[j]) + '_身高' + str(user_height[j])
        if not os.path.exists('D:\\taoMM\\' + folder_name):
            os.mkdir('D:\\taoMM\\' + folder_name)
        # NOTE(review): 'user_id%20=' embeds an encoded space before '=' —
        # looks wrong, but is kept byte-for-byte since the original
        # reportedly worked against this endpoint; confirm before changing.
        album = get_photos_url(
            'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={}'.format(ID[j]))
        print('当前模特:{}'.format(user_name[j]))
        print(album)
        # BUG FIX: guard against models with no albums; the original
        # raised IndexError on album[0] here.
        if not album:
            continue
        photos = get_photos(
            'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&{1}&top_pic_id=0&cover=%2F%2Fimg.alicdn.com%2Fimgextra&page=1'.format(
                ID[j], album[0]))
        print(photos)
        for pic in photos:
            name = pic.split('/')[-1]
            # BUG FIX: download *before* opening the output file — the
            # original opened the file first, so an HTTPError left a
            # zero-byte file on disk for every failed download.
            try:
                content = open_url(pic)
            except urllib.error.HTTPError:
                continue
            with open('D://taoMM//' + folder_name + '/' + name, 'wb') as f:
                f.write(content)
效果展示: