"""Scrape the Maoyan movie board and print one record per movie found."""
import re

# NOTE: ``requests`` is imported inside get_html() so that the parsing code
# in this module can be imported and used without the third-party package.

# Browser-like headers sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.102 Safari/537.36'
}

# One match per <dd> entry.  Capture groups, in order:
#   1 rank, 2 movie name, 3 actors, 4 release time,
#   5 score "integer" part, 6 score "fraction" part.
# Raw strings keep ``\d`` / ``\s`` as regex escapes instead of (invalid)
# Python string escapes; re.S lets ``.`` span the newlines inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a'
    r'.*?>(.*?)</a>.*?star">\s+主演:(.*?)\s+</p>.*?releasetime">上映时间:(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',
    re.S,
)


def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status).  The failure
        is reported on standard output.
    """
    import requests

    try:
        r = requests.get(url, headers=_HEADERS, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only request-level failures are handled here; KeyboardInterrupt
        # and programming errors propagate instead of being swallowed.
        print('request failed for {}: {}'.format(url, exc))
        return None
    return r.text


def parse_html(text, info_list):
    """Extract movie records from board HTML and append them to *info_list*.

    Args:
        text: HTML source of one board page.  Empty or ``None`` input
            appends nothing.
        info_list: List that receives one dict per matched entry, with the
            keys ``movie``, ``rank``, ``actors``, ``time`` and ``score``
            (all values are strings; ``score`` is the integer and fraction
            parts concatenated).

    Returns:
        None.  Results are delivered by mutating *info_list*.
    """
    if not text:
        return
    for rank, movie, actors, release_time, integer, fraction in (
            _MOVIE_PATTERN.findall(text)):
        info_list.append({
            'movie': movie,
            'rank': rank,
            'actors': actors,
            'time': release_time,
            'score': integer + fraction,
        })


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    info_list = []
    # Request ten pages, stepping the ``offset`` query parameter by 10.
    for i in range(10):
        path = url + '?offset=' + str(i * 10)
        txt = get_html(path)
        if txt:
            parse_html(txt, info_list)
    for info in info_list:
        print(info)