1. import requests
2. from scrapy import Selector
3. import pandas as pd
4.
class getSteamInfo():
    """Scrape the Steam global top-sellers search results and save them to Excel.

    Results accumulate in the class-level lists below.
    NOTE(review): class attributes are shared by every instance — acceptable
    for this one-shot script, but move them into __init__ if the class is
    ever instantiated more than once.
    """

    # HTTP headers for the target site; "Host" is a placeholder
    # ("那个网站" = "that site") to be filled in before running.
    headers = {
        "Host": "那个网站",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    }

    # Parallel result lists: one entry per scraped game.
    url = []
    name = []
    sales_date = []
    discount = []
    price = []

    # Fetch a proxy IP from the API.
    def getApiIp(self):
        """Fetch exactly one proxy from the proxy API.

        Returns:
            dict | None: requests-style proxies mapping, or None on failure.
        """
        api_url = 'api地址'  # placeholder for the real proxy-API endpoint
        try:
            # FIX: the get() call itself can raise (timeout, DNS, connection
            # refused, ...) — it must sit inside the try block, not before it.
            res = requests.get(api_url, timeout=5)
            if res.status_code == 200:
                api_data = res.json()['data'][0]
                proxies = {
                    'http': 'http://{}:{}'.format(api_data['ip'], api_data['port']),
                    'https': 'http://{}:{}'.format(api_data['ip'], api_data['port']),
                }
                print(proxies)
                return proxies
            print('获取失败')
        except (requests.RequestException, ValueError, KeyError, IndexError):
            # Narrowed from a bare except: network errors, bad JSON, or an
            # unexpected payload shape all count as "failed to get a proxy".
            print('获取失败')
        return None  # explicit on every failure path

    def getInfo(self):
        """Scrape one page (50 entries) of the global top sellers.

        Records URL, name, release date, discount label and price for every
        game not already present in self.url.
        """
        url = 'https://那个网站/search/results/?query&start=0&count=50&sort_by=_ASC&os=win&snr=1_7_7_globaltopsellers_7&filter=globaltopsellers&infinite=1'
        res = self.getRes(url, self.headers, '', '', 'GET')  # wrapped request helper below
        if res is None:
            # FIX: getRes returns None after three failed attempts; the
            # original would crash here with AttributeError on res.json().
            return
        res = res.json()['results_html']
        sel = Selector(text=res)
        nodes = sel.css('.search_result_row')
        for node in nodes:
            url = node.css('a::attr(href)').extract_first()
            if url not in self.url:
                self.url.append(url)
                name = node.css('a .search_name .title::text').extract_first()
                sales_date = node.css('a .search_released::text').extract_first()
                discount = node.css('.search_discount span::text').extract_first()
                discount = discount if discount else 'no discount'
                # FIX: guard against a missing price node — extract_first()
                # returns None, and None.strip() would raise.
                price = (node.css('a .search_price::text').extract_first() or '').strip()
                discountPrice = node.css('.discounted::text').extract()
                discountPrice = discountPrice[-1] if discountPrice else ''
                # Prefer the discounted price when one is shown.
                price = discountPrice if discountPrice else price
                self.name.append(name)
                self.sales_date.append(sales_date)
                self.discount.append(discount)
                self.price.append(price)
            else:
                print('已存在')
        # self.insert_info()

    def insert_info(self):
        """Persist the collected results to ./steam.xlsx, appending to any
        rows already in the file."""
        data = {
            'URL': self.url, '游戏名': self.name, '发售日': self.sales_date, '是否打折': self.discount, '价格': self.price
        }
        frame = pd.DataFrame(data)
        try:
            # FIX: read_excel raises FileNotFoundError on the first run; the
            # original 'is not None' check could never see that case.
            xlsxFrame = pd.read_excel('./steam.xlsx')
        except FileNotFoundError:
            xlsxFrame = None
        print(xlsxFrame)
        if xlsxFrame is not None:
            print('追加')
            # FIX: DataFrame.append was removed in pandas 2.0 — use concat.
            frame = pd.concat([frame, xlsxFrame], ignore_index=True)
        frame.to_excel('./steam.xlsx', index=False)

    # Request helper: retry through a proxy up to three times; return None
    # after three failures.
    def getRes(self, url, headers, proxies, post_data, method):
        """Send a GET/POST request with proxy support.

        Args:
            url: target URL.
            headers: header dict to send.
            proxies: requests-style proxies dict; falsy means "fetch a fresh
                proxy from the API for each attempt".
            post_data: form payload, used only when method == 'POST'.
            method: 'POST' for a POST request, anything else sends GET.

        Returns:
            requests.Response | None: first truthy response, or None after
            three failed attempts.
        """
        # FIX: the original duplicated the whole retry loop for the
        # with-proxy and without-proxy cases; the only difference is where
        # the proxies for each attempt come from.
        have_proxy = bool(proxies)
        for i in range(3):
            attempt_proxies = proxies if have_proxy else self.getApiIp()
            try:
                if method == 'POST':
                    res = requests.post(url, headers=headers, data=post_data, proxies=attempt_proxies)
                else:
                    res = requests.get(url, headers=headers, proxies=attempt_proxies)
                if res:
                    return res
            except requests.RequestException:
                # Narrowed from a bare except so bugs (TypeError etc.) are
                # not silently swallowed as "request failed".
                print(f'第{i+1}次请求出错')
        return None
113.
if __name__ == '__main__':
    # Script entry point: run a single scrape pass.
    scraper = getSteamInfo()
    scraper.getInfo()