使用常规方法爬取猫眼电影

时间:2020-12-12 12:30:54

1:首先确定要爬取的网站:爬取的url链接规律,请求方式是post还是get,

 

2:然后简单书写爬虫进行网页测试:

import requests
from requests.exceptions import RequestException

def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: The page URL to fetch.

    Returns:
        The response body as text when the server answers 200,
        otherwise None (non-200 status or any request failure).
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        # Fix: without a timeout, requests can block forever on a stalled
        # connection; 10 seconds is a generous bound for a single page.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    """Fetch the first Maoyan Top-100 board page and print the raw HTML."""
    page_url = 'http://maoyan.com/board/4?'
    print(get_one_page(page_url))


if __name__ == "__main__":
    main()

3:测试通过后,增加网页循环对爬取内容进行处理,然后方法一保存为txt格式,方法二保存为csv格式:

 

# !/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import time
import json
from requests.exceptions import RequestException

def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: The page URL to fetch.

    Returns:
        The response body as text when the server answers 200,
        otherwise None (non-200 status or any request failure).
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        # Fix: without a timeout, requests can block forever on a stalled
        # connection; 10 seconds is a generous bound for a single page.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Yield one dict per movie entry found in a Maoyan board page.

    re.S makes '.' also match newlines, so each pattern segment can
    span multiple lines of markup; the (.*?) groups capture the fields.
    """
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
        '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, name, actor, showtime, integer, fraction in re.findall(pattern, html):
        yield {
            "index": index,
            "image": image,
            "name": name,
            "actor": actor.strip(),
            "time": showtime.strip(),
            "star": integer + fraction,
        }

def main(offset):
    """Fetch one board page at the given offset, parse it, and persist each row."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_csv(movie)

def write_to_csv(content):
    """Append one record to 猫眼result.csv as a JSON line.

    NOTE: despite the .csv extension, each record is serialized as one
    JSON object per line (ensure_ascii=False keeps Chinese text readable).
    """
    line = json.dumps(content, ensure_ascii=False)
    with open("猫眼result.csv", 'a', encoding='utf-8') as out:
        out.write(line + '\n')

# def write_to_file(content):
#     # a表示追加的方式进行添加
#     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
#         f.write(json.dumps(content, ensure_ascii=False) + '\n')




if __name__ == "__main__":
    # Crawl ten pages (offsets 0, 10, ..., 90), pausing between requests
    # to avoid hammering the server.
    for offset in range(0, 100, 10):
        main(offset=offset)
        time.sleep(1)

 

 

 

使用进程池抓取:

# !/usr/bin/env python
# -*- coding:utf-8 -*-

# !/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import time
import json
from multiprocessing import Pool
from requests.exceptions import RequestException

def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: The page URL to fetch.

    Returns:
        The response body as text when the server answers 200,
        otherwise None (non-200 status or any request failure).
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        # Fix: without a timeout, requests can block forever on a stalled
        # connection; 10 seconds is a generous bound for a single page.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Yield a dict of fields for every <dd> movie entry in the page HTML."""
    # re.S lets '.' match newlines too, so one pattern can cross markup lines;
    # each (.*?) group lazily captures a single field.
    movie_re = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
        '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for match in movie_re.finditer(html):
        yield {
            "index": match.group(1),
            "image": match.group(2),
            "name": match.group(3),
            "actor": match.group(4).strip(),
            "time": match.group(5).strip(),
            "star": match.group(6) + match.group(7),
        }

def main(offset):
    """Fetch one board page at the given offset, parse it, and persist each row."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_csv(movie)

def write_to_csv(content):
    """Append one record to 猫眼进程result.csv as a JSON line.

    NOTE: despite the .csv extension, each record is serialized as one
    JSON object per line (ensure_ascii=False keeps Chinese text readable).
    """
    line = json.dumps(content, ensure_ascii=False)
    with open("猫眼进程result.csv", 'a', encoding='utf-8') as out:
        out.write(line + '\n')

# def write_to_file(content):
#     # a表示追加的方式进行添加
#     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
#         f.write(json.dumps(content, ensure_ascii=False) + '\n')




if __name__ == "__main__":
    # Fix: the original never closed or joined the pool, leaking worker
    # processes; the context manager terminates and reaps them on exit.
    # pool.map blocks until every page has been crawled.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])