1. Scraping Taobao and Tmall
1.1 Method 1: Scraping with Selenium
# -*- coding: utf-8 -*-
# Simulate a Taobao login and scrape data
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Configure ChromeOptions to reduce automation detection
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--incognito")  # use incognito mode
# options.add_argument('--headless')  # headless mode; keep this commented out while developing and debugging

# Create the WebDriver instance
driver = webdriver.Chrome(options=options)
# Hide navigator.webdriver before any page script runs
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
wait = WebDriverWait(driver, 10)
def login_taobao(username, password):
    print('Starting login...')
    login_url = 'https://login.taobao.com/'
    try:
        driver.get(login_url)
        # Enter the username
        input_login_id = wait.until(EC.presence_of_element_located((By.ID, 'fm-login-id')))
        input_login_id.send_keys(username)
        # Enter the password
        input_login_password = wait.until(EC.presence_of_element_located((By.ID, 'fm-login-password')))
        input_login_password.send_keys(password)
        # Click the login button
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.fm-button.fm-submit.password-login')))
        submit.click()
        # Wait for the slider CAPTCHA (completed manually by the user)
        print("Please complete the slider verification...")
        time.sleep(20)  # adjust this wait to however long the manual step takes
        # Login is finished once the URL changes away from the login page
        if wait.until(EC.url_changes(login_url)):
            print("Login succeeded!")
            return True
    except TimeoutException:
        print('Login timed out, trying the alternative login button...')
        try:
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.fm-button.fm-submit')))
            submit.click()
            if wait.until(EC.url_changes(login_url)):
                print("Login succeeded!")
                return True
        except TimeoutException:
            pass
        print("Login failed, please retry.")
        return False
def scrape_data(url):
    driver.get(url)
    time.sleep(3)
    try:
        # Grab book titles and prices (the class names are obfuscated and may change)
        titles = driver.find_elements(By.XPATH, '//div[contains(@class, "title--F6pvp_RZ")]/span')
        prices = driver.find_elements(By.XPATH, '//div[contains(@style, "margin-right: 8px;")]')
        if titles and prices:
            titles = [el.text for el in titles]
            prices = [el.text for el in prices]
        else:
            # Fallback: read the text via JavaScript (returns plain strings, not WebElements)
            titles = driver.execute_script("return Array.from(document.querySelectorAll('div.title--F6pvp_RZ > span')).map(el => el.innerText);")
            prices = driver.execute_script("return Array.from(document.querySelectorAll('div[style*=\"margin-right: 8px;\"]')).map(el => el.innerText);")
        count = 0
        for title, price in zip(titles, prices):
            print(f"Title: {title}")
            print(f"Price: {price} yuan")
            print('-' * 40)
            count += 1
        print(f"Total items scraped: {count}")
    except Exception as e:
        print("Error while scraping data:", e)
# Main program
if __name__ == "__main__":
    USERNAME = '123456'
    PASSWORD = '123456'
    SEARCH_URL = 'https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall'
    if login_taobao(USERNAME, PASSWORD):
        print("Logged in, starting to scrape...")
        scrape_data(SEARCH_URL)
    else:
        print("Please check your username and password.")
    driver.quit()
Scraping results:
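Because the slider verification must be completed by hand, every run of this script needs manual help. A common workaround is to persist the session cookies after the first successful login and re-inject them on later runs. Below is a minimal sketch of that idea using Selenium's get_cookies()/add_cookie() API; the cookies.json file name is an arbitrary choice of mine, not part of the original script.

import json

COOKIE_FILE = 'cookies.json'  # hypothetical file name for the saved session

def save_cookies(driver, path=COOKIE_FILE):
    # Call this once after login_taobao() succeeds
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(driver.get_cookies(), f)

def load_cookies(driver, path=COOKIE_FILE):
    # Selenium only accepts cookies for the currently loaded domain,
    # so open the site first, inject the saved cookies, then reload
    driver.get('https://www.taobao.com/')
    with open(path, encoding='utf-8') as f:
        for cookie in json.load(f):
            cookie.pop('expiry', None)  # a stale expiry field can make add_cookie() fail
            driver.add_cookie(cookie)
    driver.refresh()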
1.2 Method 2: Scraping via the Ajax API (packet capture)
import requests
import re
headers = {
"accept": "*/*",
"accept-language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"referer": "https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall",
"sec-ch-ua": "\"Chromium\";v=\"130\", \"Microsoft Edge\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "script",
"sec-fetch-mode": "no-cors",
"sec-fetch-site": "same-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"
}
cookies = {
"cna": "g4EOHzXrgHoCAduYS0E8SWih",
"thw": "cn",
"t": "3a90b40f94a4edfdf06bcece3a2936b9",
"lgc": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
"cancelledSubSites": "empty",
"dnk": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
"tracknick": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
"_hvn_lgc_": "0",
"wk_cookie2": "1680360eb9a9247593a46c95bd771a60",
"wk_unb": "UUphzOZ5IcwwgGM6og%3D%3D",
"sn": "",
"_tb_token_": "3e8678e6d113e",
"xlly_s": "1",
"cookie2": "15adddec5888058788ddaac800fd51d7",
"miid": "6143653262971201208",
"3PcFlag": "1729430368628",
"sgcookie": "E100gliWtzx0CrKEDiJUcKVChKecqwqkovMLjnYjBWn6wBDrK8p7DmSdg7rLCx4VKHOHtvCcWxENA7I4Y0GmGuzQ63GCcrWgnIB%2BNsHQDPCsr%2FA%3D",
"havana_lgc2_0": "eyJoaWQiOjIyMDY0NzY0ODQ4OTQsInNnIjoiOTU4NDcyNmY5NDMyOTg4NzJlNTZlY2IxZWE4OGI5YTYiLCJzaXRlIjowLCJ0b2tlbiI6IjFwLXJvUEJaV3dFMml1WGw1dVRRV29RIn0",
"havana_lgc_exp": "1760534375422",
"cookie3_bak": "15adddec5888058788ddaac800fd51d7",
"cookie3_bak_exp": "1729689575422",
"unb": "2206476484894",
"uc1": "cookie21=URm48syIYB3rzvI4Dim4&cookie14=UoYcCoRfPtPMBw%3D%3D&existShop=false&cookie15=URm48syIIVrSKA%3D%3D&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&pas=0",
"uc3": "nk2=2CJ65MVu169ryY9OLJJRRzmF&id2=UUphzOZ5IcwwgGM6og%3D%3D&vt3=F8dD37r7qaJfa%2FyEHjE%3D&lg2=WqG3DMC9VAQiUQ%3D%3D",
"csg": "9a6bb3e2",
"env_bak": "FM%2BgndCFxn4BgiZj0uopqLQpbgHfkqTbi%2FkiJo7c989A",
"cookie17": "UUphzOZ5IcwwgGM6og%3D%3D",
"skt": "99956fd7ed60ab02",
"existShop": "MTcyOTQzMDM3NQ%3D%3D",
"uc4": "nk4=0%402hdlb0qg8cYmNhhLd8NxcLGZDI46LqnaPc5IJDo%3D&id4=0%40U2grF837Gylo5%2BY1fiFeehizBDfhYMxV",
"_cc_": "U%2BGCWk%2F7og%3D%3D",
"_l_g_": "Ug%3D%3D",
"sg": "%E6%99%BA48",
"_nk_": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
"cookie1": "U7SpdxookD0DS279tPoGcs1OC0jfCJQNwTBe0rQG%2BhY%3D",
"mtop_partitioned_detect": "1",
"_m_h5_tk": "91ba15c4f2fc30abcd89152bfe95b3aa_1729516685684",
"_m_h5_tk_enc": "8ab06684344227fa28e9c4be4713771f",
"_samesite_flag_": "true",
"sdkSilent": "1729594446946",
"havana_sdkSilent": "1729594446946",
"tfstk": "gImnNRvoC2zCBVb_NdrBrCQ2eSTTO9Z7RbI8wuFy75P1vzPJRb4owfarvkHKs7lxZ8nK2001qxMSvpZJdkMQVuRvMnEodvZ5Gq0IMDxaQ-kaLgzyDBHU29OvMnK98OrktIhLEz-Hj-244Jzzz5Jge82FzbzFQPP_F85UUMJiQ5NaU_SF45-ah-XU4ulrQd2SMp8UO0oZbB9Xy9B3oQoaKyVqLWkKpc53JwMQsgI-jV43g3VG4gogLAFwTZsDySub9rFs_hI3qAyoi-ch_CqrIDi07DR1qjciul2s5Qbui4kKl4zMLHkgxW4mjXWfWk0rO04I-T9jQklslSae5CwivmUuGPfwTAHgTr0o9Csz9qDZsrnO6iEZ3VqG42_NuJT5VRJ-ba_78RwgMRdP7Qsh4vVeIdb2Fyy_dSpMIa_78RwgMdvGu3aUCJNA.",
"isg": "BIuL1D8SaEQ7jrWBfj2jbAsSGi91IJ-imqqYBv2IZkogHKt-hfL38t669hzyPPea"
}
url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
params = {
"jsv": "2.7.4",
"appKey": "12574478",
"t": "1729508074199",
"sign": "0ed5130d5dacbc8d430f82e1538eb473",
"api": "mtop.relationrecommend.wirelessrecommend.recommend",
"v": "2.0",
"timeout": "10000",
"type": "jsonp",
"dataType": "jsonp",
"callback": "mtopjsonp8",
"data": "{\"appId\":\"34385\",\"params\":\"{\\\"device\\\":\\\"HMA-AL00\\\",\\\"isBeta\\\":\\\"false\\\",\\\"grayHair\\\":\\\"false\\\",\\\"from\\\":\\\"nt_history\\\",\\\"brand\\\":\\\"HUAWEI\\\",\\\"info\\\":\\\"wifi\\\",\\\"index\\\":\\\"4\\\",\\\"rainbow\\\":\\\"\\\",\\\"schemaType\\\":\\\"auction\\\",\\\"elderHome\\\":\\\"false\\\",\\\"isEnterSrpSearch\\\":\\\"true\\\",\\\"newSearch\\\":\\\"false\\\",\\\"network\\\":\\\"wifi\\\",\\\"subtype\\\":\\\"\\\",\\\"hasPreposeFilter\\\":\\\"false\\\",\\\"prepositionVersion\\\":\\\"v2\\\",\\\"client_os\\\":\\\"Android\\\",\\\"gpsEnabled\\\":\\\"false\\\",\\\"searchDoorFrom\\\":\\\"srp\\\",\\\"debug_rerankNewOpenCard\\\":\\\"false\\\",\\\"homePageVersion\\\":\\\"v7\\\",\\\"searchElderHomeOpen\\\":\\\"false\\\",\\\"search_action\\\":\\\"initiative\\\",\\\"sugg\\\":\\\"_4_1\\\",\\\"sversion\\\":\\\"13.6\\\",\\\"style\\\":\\\"list\\\",\\\"ttid\\\":\\\"600000@taobao_pc_10.7.0\\\",\\\"needTabs\\\":\\\"true\\\",\\\"areaCode\\\":\\\"CN\\\",\\\"vm\\\":\\\"nw\\\",\\\"countryNum\\\":\\\"156\\\",\\\"m\\\":\\\"pc\\\",\\\"page\\\":1,\\\"n\\\":48,\\\"q\\\":\\\"%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D\\\",\\\"qSource\\\":\\\"url\\\",\\\"pageSource\\\":\\\"a21bo.tmall/a.201856.d13\\\",\\\"tab\\\":\\\"mall\\\",\\\"pageSize\\\":48,\\\"totalPage\\\":100,\\\"totalResults\\\":4800,\\\"sourceS\\\":\\\"0\\\",\\\"sort\\\":\\\"_coefp\\\",\\\"bcoffset\\\":\\\"\\\",\\\"ntoffset\\\":\\\"\\\",\\\"filterTag\\\":\\\"\\\",\\\"service\\\":\\\"\\\",\\\"prop\\\":\\\"\\\",\\\"loc\\\":\\\"\\\",\\\"start_price\\\":null,\\\"end_price\\\":null,\\\"startPrice\\\":null,\\\"endPrice\\\":null,\\\"itemIds\\\":null,\\\"p4pIds\\\":null,\\\"p4pS\\\":null,\\\"categoryp\\\":\\\"\\\",\\\"myCNA\\\":\\\"g4EOHzXrgHoCAduYS0E8SWih\\\"}\"}"
}
# Request the mtop search API and pull titles and prices out of the JSONP text with regexes
response = requests.get(url, headers=headers, cookies=cookies, params=params)
names = re.findall(r'"title":"([^"]+)","utLogMap"', response.text)
prices = re.findall(r'"price":"([^"]+)","time"', response.text)
for name, price in zip(names, prices):
    print(f"Name: {name}")
    print(f"Price: {price}")
    print('-' * 40)
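Two parts of this request deserve a note. First, the sign parameter is not static: for Taobao's mtop h5 API it is widely reported to be the MD5 digest of token_t_appKey_data, where token is the part of the _m_h5_tk cookie before the underscore; once t and sign go stale, the server rejects the call with a token-expired error. Second, because the response is JSONP, stripping the callback wrapper and parsing real JSON is more robust than the regexes above. The sketch below illustrates both points under those assumptions; the helper name make_sign and the itemsArray field are mine and should be verified against the actual payload.

import hashlib
import json
import time

def make_sign(m_h5_tk, app_key, t, data_str):
    # Assumed mtop h5 signing scheme: md5("token_timestamp_appKey_data")
    token = m_h5_tk.split('_')[0]
    raw = f"{token}_{t}_{app_key}_{data_str}"
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

# Refresh t and sign right before the request so the token is not stale
t = str(int(time.time() * 1000))
params['t'] = t
params['sign'] = make_sign(cookies['_m_h5_tk'], params['appKey'], t, params['data'])

response = requests.get(url, headers=headers, cookies=cookies, params=params)
# Strip the JSONP wrapper, e.g. "mtopjsonp8(...)", and parse the payload as JSON
text = response.text
payload = json.loads(text[text.index('(') + 1:text.rindex(')')])
for item in payload.get('data', {}).get('itemsArray', []):  # assumed field name
    print(item.get('title'), item.get('price'))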
1.3 Method 3: Scraping with DrissionPage automation (recommended)
Note: you need to log in to Taobao in the browser beforehand:
from DrissionPage import ChromiumPage

page = ChromiumPage()
page.get('https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall')
# Get every product card with this class (the class name is obfuscated and may change)
products = page.eles('@class=doubleCardWrapper--BpyYIb1O')
# Print the title and price of each book
for product in products:
    text = product.text
    # Split the card text into lines
    lines = text.strip().split('\n')
    # Extract the title (assumed to be on the first line)
    book_title = lines[0].strip()
    # Extract the price (assumed to be on the fifth line)
    price = lines[4].strip()
    print(f"Title: {book_title}")
    print(f"Price: {price}\n")
2. Scraping Zhilian Zhaopin job data
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def get_job_details(driver, job_link):
    # Open the link in a new tab
    driver.execute_script(f"window.open('{job_link}');")
    driver.switch_to.window(driver.window_handles[-1])  # switch to the new tab
    time.sleep(5)
    # Grab the job description details
    contents = driver.find_elements(By.XPATH, '//div[@class="describtion__detail-content"]')
    details = [content.text for content in contents]
    # Close the current tab and switch back to the main page
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    return details

driver = webdriver.Chrome()
driver.get('https://www.zhaopin.com/sou/jl719/kwE8M8CQO/p1')
time.sleep(5)
el_lists = driver.find_elements(By.XPATH, '//div[@class="jobinfo__top"]')
for el_list in el_lists:
    name = el_list.find_element(By.XPATH, './a').text
    link = el_list.find_element(By.XPATH, './a').get_attribute('href')
    salary = el_list.find_element(By.XPATH, '../div[1]/p').text  # salary
    company_size = el_list.find_element(By.XPATH, '../../div[2]/div[2]/div[2]').text  # company size
    company_name = el_list.find_element(By.XPATH, '../../div[2]/div[1]/a').text  # company name
    # Fetch the detail page
    job_details = get_job_details(driver, link)
    if job_details:
        for job_detail in job_details:
            print(job_detail)
    print(f"Job title: {name}")
    print(f"Salary: {salary}")
    print(f"Company: {company_name}")
    print(f"Company size: {company_size}")
    print("-" * 40)
driver.quit()
Scraping results:
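Printing to the console is fine for a demo, but in practice the rows are usually wanted on disk. A small sketch of writing the scraped fields to CSV with the standard library; the jobs.csv file name and the column layout are my choices, not from the original script.

import csv

# Collect rows inside the loop above, e.g.:
# rows.append((name, salary, company_name, company_size, ' '.join(job_details)))
rows = []

with open('jobs.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['job', 'salary', 'company', 'size', 'details'])
    writer.writerows(rows)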
3. Multi-page scraping of CNKI data
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://kns.cnki.net/kns8s/AdvSearch')
time.sleep(5)
input_element = driver.find_element(By.XPATH, '//*[@id="gradetxt"]/dd[1]/div[2]/input')
input_element.send_keys('爬虫')
search_button = driver.find_element(By.XPATH, '//*[@id="ModuleSearch"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input')
search_button.click()
time.sleep(5)

# Scrape the data on the current results page
def scrape_page():
    titles = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[2]/a')
    authors = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[3]/a')
    title_times = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[5]')
    for i in range(len(titles)):
        title = titles[i].text
        author = authors[i].text
        title_time = title_times[i].text
        print(f"Title: {title}, Authors: {author}, Published: {title_time}")
        # Open the article link
        article_link = titles[i]
        article_link.click()
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(5)
        # Grab the abstract
        summary = driver.find_element(By.XPATH, '//*[@id="ChDivSummary"]').text
        print(f"Abstract: {summary}")
        # Close the current tab and switch back to the results page
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(3)

# Scrape several pages in a loop
page_limit = 3  # number of pages to scrape
for page in range(page_limit):
    scrape_page()
    # Click "next page"
    next_button = driver.find_element(By.XPATH, '//*[@id="PageNext"]')
    next_button.click()
    time.sleep(5)

# Quit the WebDriver
driver.quit()
Scraping results:
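The fixed time.sleep() calls make this script slow and still flaky on a bad connection. Selenium's explicit waits are the usual replacement: poll for a condition and continue as soon as it holds. A minimal sketch of how the abstract lookup above could be rewritten with WebDriverWait; the 15-second timeout is an arbitrary choice.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 15)  # poll for up to 15 seconds instead of sleeping blindly

# Wait for the results table before scraping a page
wait.until(EC.presence_of_element_located((By.ID, 'gridTable')))

# Wait until the abstract element is present, then read it
summary = wait.until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="ChDivSummary"]'))
).text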