python3 练习题(爬取电影天堂资源、大学排名、淘宝商品比价、股票数据)

时间:2022-07-12 18:31:41
 1 import requests
2 import re
3
# Crawl the movie list pages and append the first download link found on
# each movie detail page to tddy.txt, one link per line.
url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'

# Open the output file once instead of re-opening it for every link.
with open('tddy.txt', 'a', encoding='utf-8') as f:
    for n in range(1, 2):  # list page numbers to crawl (only page 1 here)
        new_url = url.format(n)
        # A timeout keeps the script from hanging forever on a dead server.
        html_1 = requests.get(new_url, timeout=20)
        html_1.encoding = 'gb2312'
        detil_list = re.findall('<a href="(.*?)" class="ulink">', html_1.text)

        for m in detil_list:
            # Links on the list page are site-relative; prepend the host.
            b_url = 'http://www.ygdy8.net' + m
            html_2 = requests.get(b_url, timeout=20)
            html_2.encoding = 'gb2312'
            ftp = re.findall('<a href="(.*?)">.*?</a></td>', html_2.text)
            if not ftp:
                # No link matched on this detail page: skip it rather than
                # crash with IndexError on ftp[0].
                continue
            f.write(ftp[0] + '\n')

 

大学排名练习

 1 import bs4
2 import requests
3 from bs4 import BeautifulSoup
4
def get_html_text(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the encoding requests detects from the content
    (``apparent_encoding``). The request uses a 20 second timeout.

    Returns:
        The page text, or a single space (" ") when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures mean "no page". A bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return " "
13
14
def fill_univ_list(ulist, html):
    """Append one ``[cell0, cell1, cell3]`` entry per table row to *ulist*.

    Every tag that is a direct child of the first ``<tbody>`` in *html* is
    treated as a row; the ``.string`` of its 1st, 2nd and 4th ``<td>`` is
    stored (presumably rank, school name and total score, matching the header
    printed by print_univ_list -- verify against the page layout).

    Args:
        ulist: list that receives the rows (mutated in place).
        html: page markup; may be empty or contain no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table in the page (e.g. the download failed): nothing to add.
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; only tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for find_all
        if len(tds) < 4:
            # Row without the expected cells; skip instead of IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
21
22
def print_univ_list(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Args:
        ulist: sequence of rows; the first three items of each row are
            printed as rank, school name and total score.
        num: maximum number of rows to print. Fewer rows are printed when
            *ulist* is shorter; zero or negative prints only the header.
    """
    # The name column is padded with the full-width space chr(12288) so that
    # CJK names line up; the other columns use the default ASCII space.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    # Slicing (instead of indexing range(num)) cannot run past the end.
    for u in ulist[:max(num, 0)]:
        # A missing cell is None, which cannot take an alignment spec;
        # print it as an empty field instead of raising TypeError.
        rank, name, score = ("" if cell is None else cell for cell in u[:3])
        print(tplt.format(rank, name, score, chr(12288)))
29
30
def main():
    """Download the 2016 ranking page and print its first 20 rows."""
    rankings = []
    page = get_html_text('http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html')
    fill_univ_list(rankings, page)
    print_univ_list(rankings, 20)


# Run the exercise when the script is executed.
main()

 淘宝商品比价:

 1 import requests
2 import re
3
def get_html_text(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request uses a 30 second timeout.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures mean "no page". A bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return ""
12
13
def parse_page(ilt, html):
    """Append ``[price, title]`` pairs found in *html* to *ilt*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if the counts differ, the surplus entries are ignored.

    Args:
        ilt: list that receives the pairs (mutated in place). Both the price
            and the title are stored as strings.
        html: page text to scan; an empty value adds nothing.
    """
    # Function-scope import so the block carries its own dependency.
    import json

    if not html:
        return

    # NOTE(security): this used to run eval() on text scraped from the page,
    # which executes whatever the server sends. The values are now captured
    # directly by the regex groups. This also fixes titles that contain ':',
    # which the former split(':') truncated, aborting the whole page.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    for price, raw_title in zip(prices, titles):
        try:
            # Decode escapes such as \uXXXX the way a string literal would.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as captured.
            title = raw_title
        ilt.append([price, title])
24
def print_goods_list(ilt):
    """Print a numbered table of the ``[price, title]`` entries in *ilt*.

    A header line is printed first; rows are numbered starting at 1.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for index, item in enumerate(ilt, start=1):
        price, title = item[0], item[1]
        print(row_format.format(index, price, title))
32
def main(goods='减肥餐', depth=2):
    """Search for *goods* and print the prices and titles that were found.

    Args:
        goods: search keyword appended to the search URL.
        depth: number of result pages to fetch.
    """
    start_url = 'http://s.taobao.com/search?q=' + goods
    info_list = []
    for i in range(depth):
        # The "s" query parameter advances in steps of 44 per page
        # (presumably the number of items on one result page).
        url = start_url + '&s=' + str(44 * i)
        try:
            html = get_html_text(url)
            parse_page(info_list, html)
        except Exception as exc:
            # Best effort: a broken page must not stop the remaining pages,
            # but report it instead of swallowing the error silently.
            print("skip page {}: {}".format(i + 1, exc))
            continue
    print_goods_list(info_list)

 股票数据:

 1 import re
2 import traceback
3
4 import requests
5 import sys
6 from bs4 import BeautifulSoup
7
8
def get_html_text(url, code='utf-8'):
    """Fetch *url* with a browser User-Agent and return the body as text.

    Args:
        url: address to download.
        code: encoding used to decode the response body.

    Returns:
        The page text, or an empty string when the request fails, exceeds
        the 20 second timeout or gets an HTTP error status.
    """
    headers ={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    try:
        r = requests.get(url, timeout=20, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures mean "no page". A bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return ""
18
def get_stock_list(lst, stock_url):
    """Collect stock codes from the anchors of the page at *stock_url*.

    For every ``<a>`` tag whose markup contains at least one code of the form
    ``sh``/``sz`` plus six digits, the list of all codes found in that tag is
    appended to *lst* as one entry.
    """
    page = get_html_text(stock_url, 'GB2312')
    anchors = BeautifulSoup(page, 'html.parser').find_all('a')
    code_pattern = re.compile(r'[s][hz]\d{6}')
    for anchor in anchors:
        matches = code_pattern.findall(str(anchor))
        if matches:
            lst.append(matches)
27
28
def get_stock_info(lst, stock_url, fpath):
    """Download the detail page of every stock in *lst* and append it to a file.

    Args:
        lst: entries whose first item is a stock code, as produced by
            get_stock_list.
        stock_url: URL prefix; the page address is prefix + code + '.html'.
        fpath: output file. One ``str(dict)`` line is appended per stock.

    Pages that cannot be downloaded are skipped. Any error while parsing or
    writing a page is printed as a traceback to stdout and the loop moves on.
    """
    total = len(lst)
    count = 0
    for stock in lst:
        url = stock_url + stock[0] + '.html'
        print(url)
        html = get_html_text(url)
        if html == "":
            # Download failed; nothing to parse for this stock.
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            stock_info = soup.find('div', attrs={'class': 'stock-bets'})
            # The first whitespace-separated token of the block is stored as
            # the stock name.
            info_dict = {'股票名称': stock_info.text.split()[0]}

            # Pair each <dt> label with the <dd> value at the same position.
            labels = stock_info.find_all('dt')
            values = stock_info.find_all('dd')
            for label, value in zip(labels, values):
                info_dict[label.text] = value.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info_dict) + '\n')
        except Exception:
            # Best effort: report the problem and go on with the next stock.
            traceback.print_exc(file=sys.stdout)
        count = count + 1
        # end="" must be an argument of print(), not of format(); with "\r"
        # it lets the progress line be overwritten in place.
        print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")
58
def main():
    """Collect the stock codes, then write every stock's details to a file."""
    list_page = 'http://quote.eastmoney.com/stocklist.html'
    detail_prefix = 'http://gupiao.baidu.com/stock/'
    report_path = 'D:/BaiduStockInfo.txt'

    codes = []
    get_stock_list(codes, list_page)
    get_stock_info(codes, detail_prefix, report_path)