Tornado框架实现异步爬虫

from urllib.parse import urljoin

from bs4 import BeautifulSoup

from tornado import gen, httpclient, ioloop, queues

# Root URL of the crawl: main() seeds the queue with it and only enqueues
# discovered links that start with this prefix.
base_url = "http://www.tornadoweb.org/en/stable/"

concurrency = 20  # number of worker coroutines main() runs concurrently

async def get_links(url):
    """Download ``url`` and return the absolute targets of its hyperlinks.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        A list of absolute URLs, one per ``<a>`` element that carries an
        ``href`` attribute, in document order. ``#fragment`` suffixes are
        stripped so that anchors into the same page map to a single URL.
        Duplicates are not removed.

    Raises:
        Any exception raised by ``AsyncHTTPClient.fetch`` (for example on a
        non-2xx response or a network failure) propagates to the caller.
    """
    # Local import: the module header only imports ``urljoin``.
    from urllib.parse import urldefrag

    http_client = httpclient.AsyncHTTPClient()
    response = await http_client.fetch(url)

    # A page with a few invalid bytes should still yield its links instead
    # of aborting with UnicodeDecodeError.
    html = response.body.decode("utf8", errors="ignore")

    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results vary by machine.
    soup = BeautifulSoup(html, "html.parser")

    # Relative hrefs must be resolved against the page they appear on (after
    # any redirects), not against the crawl root; otherwise links found on
    # pages in sub-directories point to the wrong location.
    page_url = response.effective_url or url

    return [
        urldefrag(urljoin(page_url, anchor["href"])).url
        for anchor in soup.find_all("a", href=True)
    ]

async def main():
    """Crawl the pages reachable from ``base_url`` using a pool of workers.

    The root URL is placed on a queue and ``concurrency`` worker coroutines
    consume it. Each page is fetched at most once; links that start with
    ``base_url`` are queued for later processing. When the queue has been
    fully processed, one ``None`` sentinel per worker is queued so that every
    worker returns, and the pool is awaited before this coroutine finishes.
    """
    visited = set()
    pending = queues.Queue()

    async def crawl_page(page_url):
        """Fetch ``page_url`` unless already visited and queue its in-site links."""
        if page_url in visited:
            return

        print("获取 {}".format(page_url))
        visited.add(page_url)

        discovered = await get_links(page_url)
        in_site = (link for link in discovered if link.startswith(base_url))
        for link in in_site:
            await pending.put(link)

    async def consume():
        """Process queued URLs until a ``None`` sentinel is received."""
        while True:
            item = await pending.get()

            # Sentinels are queued only after join() has returned, so they
            # are intentionally not acknowledged with task_done().
            if item is None:
                return

            try:
                await crawl_page(item)
            except Exception as err:
                # A failing page must not stop the worker; report and go on.
                print(err)
                print("exec")
            finally:
                # Always acknowledge the item so that join() can complete.
                pending.task_done()

    await pending.put(base_url)

    # gen.multi starts all workers and yields one future covering the pool.
    pool = gen.multi([consume() for _ in range(concurrency)])

    # Wait until every queued URL has been acknowledged.
    await pending.join()

    # One sentinel per worker so that each of them leaves its loop.
    for _ in range(concurrency):
        await pending.put(None)

    await pool

if __name__ == '__main__':
    # Script entry point: run main() to completion on the current IOLoop.
    ioloop.IOLoop.current().run_sync(main)
秒客网

Tornado框架实现异步爬虫

相关文章