Python 多进程爬虫实例
import json
import re
import sys
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Board page URL prefix; the page offset is appended to it.
BOARD_URL = 'http://maoyan.com/board/4?offset='
# Scraped records are appended here, one JSON object per line.
OUTPUT_FILE = 'resul1t.txt'
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 10
# Number of page offsets handed to each worker task.
OFFSETS_PER_TASK = 2


def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request raises a RequestException (which
    includes timing out) or when the status code is not 200.
    """
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Extract one record per ``i.board-index`` element found in *html*.

    Each record is a dict with the keys 'index', 'image', 'title',
    'actor', 'time' and 'score'. The fields are collected as parallel
    lists and joined by position.

    Raises:
        KeyError: if a ``board-img`` image has no ``data-src`` attribute.
        IndexError: if any field list is shorter than the index list.
    """
    soup = BeautifulSoup(html, "lxml")
    index_tags = soup.select('i.board-index')
    images = [
        img['data-src']
        for img in soup.find_all('img', {'class': 'board-img'})
    ]
    name_tags = soup.select('p.name')
    actor_tags = soup.select('p.star')
    time_tags = soup.select('p.releasetime')
    score_tags = soup.select('p.score')

    # Positional indexing is deliberate: a count mismatch raises instead of
    # silently writing misaligned records.
    return [
        {
            'index': index_tag.get_text(),
            'image': images[i],
            'title': name_tags[i].get_text(),
            'actor': actor_tags[i].get_text().strip(),
            'time': time_tags[i].get_text(),
            'score': score_tags[i].get_text(),
        }
        for i, index_tag in enumerate(index_tags)
    ]


def write_to_file(content):
    """Append *content* to OUTPUT_FILE as a single UTF-8 JSON line."""
    # The with-statement closes the file; no explicit close() is needed.
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset_list):
    """Fetch, parse and store the board page for every offset in *offset_list*.

    Pages that could not be fetched are skipped.
    """
    for offset in offset_list:
        html = get_one_page(BOARD_URL + str(offset))
        if not html:
            continue
        for item in parse_one_page(html):
            write_to_file(item)


def _report_worker_error(exc):
    """Print an exception raised inside a pool task to stderr."""
    print('worker task failed: {!r}'.format(exc), file=sys.stderr)


if __name__ == '__main__':
    offset_list = list(range(0, 100, 10))
    # Split the offsets into fixed-size chunks, one chunk per task, so no
    # offset is dropped if the list length changes.
    chunks = [
        offset_list[start:start + OFFSETS_PER_TASK]
        for start in range(0, len(offset_list), OFFSETS_PER_TASK)
    ]

    # Multi-process: each chunk is scraped by a pool worker.
    pool = Pool()
    for chunk in chunks:
        # apply_async discards exceptions unless the result is inspected;
        # the error callback makes worker failures visible.
        pool.apply_async(
            main, args=(chunk,), error_callback=_report_worker_error
        )
    pool.close()
    pool.join()