Python 多进程爬虫实例

import json

import re

import time

from multiprocessing import Pool

import requests

from requests.exceptions import RequestException

from bs4 import BeautifulSoup

def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely, which would
            stall the worker process that is running this call.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None``. ``None`` is also returned when the request raises a
        ``RequestException`` (connection failures and timeouts included).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract the ranked entries from the HTML of one board page.

    The page is queried once per field, producing parallel lists that are
    then combined position by position.

    Args:
        html: HTML markup of the page to parse.

    Returns:
        A list of dicts, one per ``i.board-index`` element, with the keys
        ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score``.
        ``image`` is the ``data-src`` attribute of the matching
        ``img.board-img`` tag; the other values are element text
        (``actor`` is additionally stripped of surrounding whitespace).

    Raises:
        KeyError: If an ``img.board-img`` tag has no ``data-src`` attribute.
        IndexError: If any field has fewer matches than there are
            ``i.board-index`` elements. Failing here is deliberate: a
            shorter list means the fields no longer line up.
    """
    soup = BeautifulSoup(html, "lxml")

    index_tags = soup.select('i.board-index')
    # ``find_all`` is the supported name; ``findAll`` is a deprecated alias.
    image_urls = [
        img['data-src']
        for img in soup.find_all('img', {'class': 'board-img'})
    ]
    name_tags = soup.select('p.name')
    actor_tags = soup.select('p.star')
    time_tags = soup.select('p.releasetime')
    score_tags = soup.select('p.score')

    return [
        {
            'index': index_tag.get_text(),
            'image': image_urls[pos],
            'title': name_tags[pos].get_text(),
            'actor': actor_tags[pos].get_text().strip(),
            'time': time_tags[pos].get_text(),
            'score': score_tags[pos].get_text(),
        }
        for pos, index_tag in enumerate(index_tags)
    ]

def write_to_file(content):
    """Append ``content`` to ``resul1t.txt`` as a single JSON line.

    Args:
        content: Any JSON-serialisable object. Non-ASCII characters are
            written as-is (``ensure_ascii=False``) in UTF-8.

    The file is opened in append mode in the current working directory and
    is created if it does not exist yet.
    """
    # The ``with`` block closes the file on exit, so no explicit close().
    with open('resul1t.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset_list):
    """Fetch, parse and store the board page for each offset.

    Args:
        offset_list: Iterable of ``offset`` query values; one page is
            requested per value.

    Pages that could not be downloaded (or came back empty) are skipped.
    Every entry parsed from the remaining pages is appended to the output
    file via ``write_to_file``.
    """
    base_url = 'http://maoyan.com/board/4?offset='

    for page_offset in offset_list:
        page_html = get_one_page(base_url + str(page_offset))
        if not page_html:
            # Nothing usable came back for this offset; try the next one.
            continue

        for entry in parse_one_page(page_html):
            write_to_file(entry)

if __name__ == '__main__':
    # Offsets 0, 10, ..., 90 are split into fixed-size chunks, and each
    # chunk is handed to the process pool as one ``main`` call.
    offset_list = list(range(0, 100, 10))
    chunk_size = 2

    p = Pool()
    pending = [
        p.apply_async(main, args=(offset_list[start:start + chunk_size],))
        for start in range(0, len(offset_list), chunk_size)
    ]
    p.close()
    p.join()

    # apply_async keeps a worker's exception on its result object and only
    # re-raises it from get(); without this loop failed tasks go unnoticed.
    for result in pending:
        result.get()
秒客网

Python 多进程爬虫实例

相关文章