# day24 - JD goods scraping exercise (京东商品作业)
from selenium.webdriver import Chrome, ChromeOptions
import time, csv, os
from bs4 import BeautifulSoup
def analysis_data(html):
    """Parse one JD search-result page and append the goods to files/电脑.csv.

    :param html: page source of a JD search-result page (str)

    Side effect: creates the ``files`` directory / csv file on first use and
    appends one row per goods card; the header row is written only once.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Bug fix: selector was '#J_goodsList>ul>-item' (garbled).
    # Each goods card on JD is an <li class="gl-item"> inside #J_goodsList > ul.
    goods_li = soup.select('#J_goodsList>ul>.gl-item')
    all_data = []
    for li in goods_li:
        name = li.select_one('.p-name>a').attrs['title']
        price = li.select_one('.p-price i').text
        comment_count = li.select_one('.p-commit a').text
        shop_name = li.select_one('.p-shop a').attrs['title']
        # hrefs are protocol-relative (//item.jd.com/...), so prefix the scheme
        goods_url = 'https:' + li.select_one('.p-name>a').attrs['href']
        all_data.append([name, price, comment_count, shop_name, goods_url])
    # Robustness: make sure the target directory exists before opening the file
    os.makedirs('files', exist_ok=True)
    # Write the header only when the csv file does not exist yet
    result = os.path.exists('files/电脑.csv')
    with open('files/电脑.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if not result:
            writer.writerow(['商品名称', '价格', '评论数', '店铺名', '商品详情地址'])
        writer.writerows(all_data)
def get_net_data():
    """Open jd.com, search for '电脑', and scrape 5 pages of results.

    Scrolls each result page in steps so lazily-loaded goods render, hands
    the page source to ``analysis_data``, then clicks through to the next page.
    """
    options = ChromeOptions()
    # Hide the "Chrome is being controlled by automated software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Do not load images - speeds the pages up considerably
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    b = Chrome(options=options)
    b.implicitly_wait(3)
    # Bug fix: the target URL was an empty string
    b.get('https://www.jd.com')
    # find_element_by_* was removed in Selenium 4; find_element('id', ...) works
    # in both Selenium 3 and 4. The trailing '\n' submits the search box.
    b.find_element('id', 'key').send_keys('电脑\n')
    for page in range(5):
        # Scroll down in 800px steps so lazily-loaded goods appear
        for _ in range(10):
            # Bug fix: script was '(0, 800)' - the window.scrollBy call was lost
            b.execute_script('window.scrollBy(0, 800)')
            time.sleep(1)
        # Parse the now fully-rendered page
        analysis_data(b.page_source)
        # Advance to the next result page
        b.find_element('class name', 'pn-next').click()
    b.close()
# Run the scraper only when executed as a script (not on import)
if __name__ == '__main__':
    get_net_data()