day24 - JD.com Product Scraping Assignment

Time: 2025-01-17 10:30:14
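
The assignment script below drives Chrome with Selenium: it opens the JD.com home page, searches for 电脑, scrolls each results page so the lazily loaded product cards render, parses the goods list with BeautifulSoup, and appends each product's name, price, comment count, shop name, and detail URL to files/电脑.csv, repeating for five result pages.
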
```python
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import csv
import os


def analysis_data(html):
    """Parse one results page and append the goods data to files/电脑.csv."""
    soup = BeautifulSoup(html, 'lxml')
    # Each product card in the JD goods list
    goods_li = soup.select('#J_goodsList > ul > li.gl-item')
    all_data = []
    for li in goods_li:
        name = li.select_one('.p-name > a').attrs['title']
        price = li.select_one('.p-price i').text
        comment_count = li.select_one('.p-commit a').text
        shop_name = li.select_one('.p-shop a').attrs['title']
        goods_url = 'https:' + li.select_one('.p-name > a').attrs['href']
        all_data.append([name, price, comment_count, shop_name, goods_url])

    # Write the header row only the first time, then append the data rows
    os.makedirs('files', exist_ok=True)
    file_exists = os.path.exists('files/电脑.csv')
    with open('files/电脑.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(['商品名称', '价格', '评论数', '店铺名', '商品详情地址'])
        writer.writerows(all_data)


def get_net_data():
    options = ChromeOptions()
    # Hide the "Chrome is being controlled by automated software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip loading images to speed up page loads
    options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
    b = Chrome(options=options)
    b.implicitly_wait(3)

    # Open the JD home page and search for "电脑"
    b.get('https://www.jd.com')
    b.find_element(By.ID, 'key').send_keys('电脑\n')

    for page in range(5):
        # Scroll down step by step so the lazily loaded products render
        for _ in range(10):
            b.execute_script('window.scrollBy(0, 800)')
            time.sleep(1)
        # Grab the page source, then parse and save the data
        analysis_data(b.page_source)
        # Go to the next results page
        b.find_element(By.CLASS_NAME, 'pn-next').click()

    b.close()


if __name__ == '__main__':
    get_net_data()
```
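
For a quick sanity check of the scraped output, a minimal sketch like the following (assuming the script above has already produced files/电脑.csv in the working directory) reads the file back with the standard csv module and prints the header plus the first few rows:

```python
import csv

# Minimal sketch: read back the CSV written by the scraper above.
# Assumes files/电脑.csv already exists in the current working directory.
with open('files/电脑.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)  # ['商品名称', '价格', '评论数', '店铺名', '商品详情地址']
    print(header)
    for i, row in enumerate(reader):
        print(row)
        if i >= 4:  # only show the first 5 data rows
            break
```

The fixed time.sleep(1) between scroll steps is a simple way to wait for JD's lazy-loaded cards; an explicit wait (e.g. Selenium's WebDriverWait) would be more robust, but the sleep keeps the assignment code short.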