Everyday Web Scraping in Practice

Date: 2024-10-26 17:01:36

1. Scraping Taobao / Tmall

1.1 Method 1: scraping with Selenium

# -*- coding: utf-8 -*-
# Simulate a Taobao login and scrape search results with Selenium
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Configure ChromeOptions to make the automation harder to detect
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--incognito")  # incognito mode
# options.add_argument('--headless')  # headless mode; keep this commented out while developing/debugging

# Create the WebDriver instance; the CDP script below hides navigator.webdriver from detection scripts
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    })
    """
})
wait = WebDriverWait(driver, 10)

def login_taobao(username, password):
    print('Starting login...')
    try:
        login_url = 'https://login.taobao.com/'
        driver.get(login_url)

        # Enter the username
        input_login_id = wait.until(EC.presence_of_element_located((By.ID, 'fm-login-id')))
        input_login_id.send_keys(username)

        # Enter the password
        input_login_password = wait.until(EC.presence_of_element_located((By.ID, 'fm-login-password')))
        input_login_password.send_keys(password)

        # Click the login button
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.fm-button.fm-submit.password-login')))
        submit.click()

        # Wait for the slider verification (completed manually by the user)
        print("Please complete the slider verification...")
        time.sleep(20)  # wait for the user to finish the slider check; adjust the delay as needed

        # Wait for the login to complete
        is_logging = wait.until(EC.url_changes(login_url))
        if is_logging:
            print("Login successful!")
            return True
    except TimeoutException:
        print('Login timed out, trying the alternative login button...')
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.fm-button.fm-submit')))
        submit.click()
        is_logging = wait.until(EC.url_changes(login_url))
        if is_logging:
            print("Login successful!")
            return True
        else:
            print("Login failed, please try again.")
            return False

def scrape_data(url):
    driver.get(url)
    time.sleep(3)

    try:
        # Collect book titles and prices; fall back to JS extraction if the XPaths find nothing
        titles = [el.text for el in driver.find_elements(By.XPATH, '//div[contains(@class, "title--F6pvp_RZ")]/span')]
        prices = [el.text for el in driver.find_elements(By.XPATH, '//div[contains(@style, "margin-right: 8px;")]')]

        if not titles or not prices:
            titles = driver.execute_script("return Array.from(document.querySelectorAll('div.title--F6pvp_RZ > span')).map(el => el.innerText);")
            prices = driver.execute_script("return Array.from(document.querySelectorAll('div[style*=\"margin-right: 8px;\"]')).map(el => el.innerText);")

        count = 0
        for title, price in zip(titles, prices):
            # Both branches above yield plain strings, so no .text access here
            print(f"Title: {title}")
            print(f"Price: {price} yuan")
            print('-' * 40)
            count += 1

        print(f"Total items scraped: {count}")

    except Exception as e:
        print("Error while scraping data:", e)

# Main program
if __name__ == "__main__":
    USERNAME = '123456'
    PASSWORD = '123456'
    SEARCH_URL = 'https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall'

    if login_taobao(USERNAME, PASSWORD):
        print("Login successful, starting to scrape data...")
        scrape_data(SEARCH_URL)
    else:
        print("Login failed, please check your username and password.")

    driver.quit()

Scraping results:

 
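The login flow above still depends on completing the slider verification by hand on every run. A common workaround (not part of the script above) is to save the session cookies once after a successful manual login and reload them on later runs. The sketch below assumes a local file named taobao_cookies.json and drops the expiry field, which add_cookie can be picky about:

import json

COOKIE_FILE = 'taobao_cookies.json'  # hypothetical path, adjust as needed

def save_cookies(driver, path=COOKIE_FILE):
    # Dump the current session cookies to disk after a successful login
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(driver.get_cookies(), f)

def load_cookies(driver, path=COOKIE_FILE):
    # add_cookie() only works once the target domain is already open
    driver.get('https://www.taobao.com/')
    with open(path, encoding='utf-8') as f:
        for cookie in json.load(f):
            cookie.pop('expiry', None)  # avoid type issues with stored expiry values
            driver.add_cookie(cookie)
    driver.refresh()

Call save_cookies(driver) right after login_taobao() succeeds, and try load_cookies(driver) at the start of later runs before falling back to the full login.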

1.2 Method 2: scraping via the captured Ajax (mtop) request

import requests
import re

headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
    "referer": "https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall",
    "sec-ch-ua": "\"Chromium\";v=\"130\", \"Microsoft Edge\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "script",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"
}
cookies = {
    "cna": "g4EOHzXrgHoCAduYS0E8SWih",
    "thw": "cn",
    "t": "3a90b40f94a4edfdf06bcece3a2936b9",
    "lgc": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
    "cancelledSubSites": "empty",
    "dnk": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
    "tracknick": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
    "_hvn_lgc_": "0",
    "wk_cookie2": "1680360eb9a9247593a46c95bd771a60",
    "wk_unb": "UUphzOZ5IcwwgGM6og%3D%3D",
    "sn": "",
    "_tb_token_": "3e8678e6d113e",
    "xlly_s": "1",
    "cookie2": "15adddec5888058788ddaac800fd51d7",
    "miid": "6143653262971201208",
    "3PcFlag": "1729430368628",
    "sgcookie": "E100gliWtzx0CrKEDiJUcKVChKecqwqkovMLjnYjBWn6wBDrK8p7DmSdg7rLCx4VKHOHtvCcWxENA7I4Y0GmGuzQ63GCcrWgnIB%2BNsHQDPCsr%2FA%3D",
    "havana_lgc2_0": "eyJoaWQiOjIyMDY0NzY0ODQ4OTQsInNnIjoiOTU4NDcyNmY5NDMyOTg4NzJlNTZlY2IxZWE4OGI5YTYiLCJzaXRlIjowLCJ0b2tlbiI6IjFwLXJvUEJaV3dFMml1WGw1dVRRV29RIn0",
    "havana_lgc_exp": "1760534375422",
    "cookie3_bak": "15adddec5888058788ddaac800fd51d7",
    "cookie3_bak_exp": "1729689575422",
    "unb": "2206476484894",
    "uc1": "cookie21=URm48syIYB3rzvI4Dim4&cookie14=UoYcCoRfPtPMBw%3D%3D&existShop=false&cookie15=URm48syIIVrSKA%3D%3D&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&pas=0",
    "uc3": "nk2=2CJ65MVu169ryY9OLJJRRzmF&id2=UUphzOZ5IcwwgGM6og%3D%3D&vt3=F8dD37r7qaJfa%2FyEHjE%3D&lg2=WqG3DMC9VAQiUQ%3D%3D",
    "csg": "9a6bb3e2",
    "env_bak": "FM%2BgndCFxn4BgiZj0uopqLQpbgHfkqTbi%2FkiJo7c989A",
    "cookie17": "UUphzOZ5IcwwgGM6og%3D%3D",
    "skt": "99956fd7ed60ab02",
    "existShop": "MTcyOTQzMDM3NQ%3D%3D",
    "uc4": "nk4=0%402hdlb0qg8cYmNhhLd8NxcLGZDI46LqnaPc5IJDo%3D&id4=0%40U2grF837Gylo5%2BY1fiFeehizBDfhYMxV",
    "_cc_": "U%2BGCWk%2F7og%3D%3D",
    "_l_g_": "Ug%3D%3D",
    "sg": "%E6%99%BA48",
    "_nk_": "%5Cu673A%5Cu667A%5Cu5982%5Cu6211%5CuFF0C%5Cu6211%5Cu6700%5Cu673A%5Cu667A",
    "cookie1": "U7SpdxookD0DS279tPoGcs1OC0jfCJQNwTBe0rQG%2BhY%3D",
    "mtop_partitioned_detect": "1",
    "_m_h5_tk": "91ba15c4f2fc30abcd89152bfe95b3aa_1729516685684",
    "_m_h5_tk_enc": "8ab06684344227fa28e9c4be4713771f",
    "_samesite_flag_": "true",
    "sdkSilent": "1729594446946",
    "havana_sdkSilent": "1729594446946",
    "tfstk": "gImnNRvoC2zCBVb_NdrBrCQ2eSTTO9Z7RbI8wuFy75P1vzPJRb4owfarvkHKs7lxZ8nK2001qxMSvpZJdkMQVuRvMnEodvZ5Gq0IMDxaQ-kaLgzyDBHU29OvMnK98OrktIhLEz-Hj-244Jzzz5Jge82FzbzFQPP_F85UUMJiQ5NaU_SF45-ah-XU4ulrQd2SMp8UO0oZbB9Xy9B3oQoaKyVqLWkKpc53JwMQsgI-jV43g3VG4gogLAFwTZsDySub9rFs_hI3qAyoi-ch_CqrIDi07DR1qjciul2s5Qbui4kKl4zMLHkgxW4mjXWfWk0rO04I-T9jQklslSae5CwivmUuGPfwTAHgTr0o9Csz9qDZsrnO6iEZ3VqG42_NuJT5VRJ-ba_78RwgMRdP7Qsh4vVeIdb2Fyy_dSpMIa_78RwgMdvGu3aUCJNA.",
    "isg": "BIuL1D8SaEQ7jrWBfj2jbAsSGi91IJ-imqqYBv2IZkogHKt-hfL38t669hzyPPea"
}
url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
params = {
    "jsv": "2.7.4",
    "appKey": "12574478",
    "t": "1729508074199",
    "sign": "0ed5130d5dacbc8d430f82e1538eb473",
    "api": "mtop.relationrecommend.wirelessrecommend.recommend",
    "v": "2.0",
    "timeout": "10000",
    "type": "jsonp",
    "dataType": "jsonp",
    "callback": "mtopjsonp8",
    "data": "{\"appId\":\"34385\",\"params\":\"{\\\"device\\\":\\\"HMA-AL00\\\",\\\"isBeta\\\":\\\"false\\\",\\\"grayHair\\\":\\\"false\\\",\\\"from\\\":\\\"nt_history\\\",\\\"brand\\\":\\\"HUAWEI\\\",\\\"info\\\":\\\"wifi\\\",\\\"index\\\":\\\"4\\\",\\\"rainbow\\\":\\\"\\\",\\\"schemaType\\\":\\\"auction\\\",\\\"elderHome\\\":\\\"false\\\",\\\"isEnterSrpSearch\\\":\\\"true\\\",\\\"newSearch\\\":\\\"false\\\",\\\"network\\\":\\\"wifi\\\",\\\"subtype\\\":\\\"\\\",\\\"hasPreposeFilter\\\":\\\"false\\\",\\\"prepositionVersion\\\":\\\"v2\\\",\\\"client_os\\\":\\\"Android\\\",\\\"gpsEnabled\\\":\\\"false\\\",\\\"searchDoorFrom\\\":\\\"srp\\\",\\\"debug_rerankNewOpenCard\\\":\\\"false\\\",\\\"homePageVersion\\\":\\\"v7\\\",\\\"searchElderHomeOpen\\\":\\\"false\\\",\\\"search_action\\\":\\\"initiative\\\",\\\"sugg\\\":\\\"_4_1\\\",\\\"sversion\\\":\\\"13.6\\\",\\\"style\\\":\\\"list\\\",\\\"ttid\\\":\\\"600000@taobao_pc_10.7.0\\\",\\\"needTabs\\\":\\\"true\\\",\\\"areaCode\\\":\\\"CN\\\",\\\"vm\\\":\\\"nw\\\",\\\"countryNum\\\":\\\"156\\\",\\\"m\\\":\\\"pc\\\",\\\"page\\\":1,\\\"n\\\":48,\\\"q\\\":\\\"%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D\\\",\\\"qSource\\\":\\\"url\\\",\\\"pageSource\\\":\\\"a21bo.tmall/a.201856.d13\\\",\\\"tab\\\":\\\"mall\\\",\\\"pageSize\\\":48,\\\"totalPage\\\":100,\\\"totalResults\\\":4800,\\\"sourceS\\\":\\\"0\\\",\\\"sort\\\":\\\"_coefp\\\",\\\"bcoffset\\\":\\\"\\\",\\\"ntoffset\\\":\\\"\\\",\\\"filterTag\\\":\\\"\\\",\\\"service\\\":\\\"\\\",\\\"prop\\\":\\\"\\\",\\\"loc\\\":\\\"\\\",\\\"start_price\\\":null,\\\"end_price\\\":null,\\\"startPrice\\\":null,\\\"endPrice\\\":null,\\\"itemIds\\\":null,\\\"p4pIds\\\":null,\\\"p4pS\\\":null,\\\"categoryp\\\":\\\"\\\",\\\"myCNA\\\":\\\"g4EOHzXrgHoCAduYS0E8SWih\\\"}\"}"
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)

# Pull titles and prices out of the JSONP response text with regex
names = re.findall(r'"title":"([^"]+)","utLogMap"', response.text)
prices = re.findall(r'"price":"([^"]+)","time"', response.text)
for name, price in zip(names, prices):
    print(f"Name: {name}")
    print(f"Price: {price}")
    print('-' * 40)
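Note that the t and sign values above are copied from a single captured request and expire quickly. For Taobao's mtop h5 endpoints the sign is generally the MD5 of token&t&appKey&data, where token is the part of the _m_h5_tk cookie before the underscore; treat the sketch below as an assumption to verify against the site's own JS rather than a guaranteed recipe:

import hashlib
import time

def mtop_sign(cookies, app_key, data):
    # token is the first segment of the _m_h5_tk cookie
    token = cookies.get('_m_h5_tk', '').split('_')[0]
    t = str(int(time.time() * 1000))
    raw = f"{token}&{t}&{app_key}&{data}"
    return t, hashlib.md5(raw.encode('utf-8')).hexdigest()

# Refresh the parameters before each request:
# params['t'], params['sign'] = mtop_sign(cookies, params['appKey'], params['data'])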

1.3 Method 3: scraping with DrissionPage automation (recommended)

Note: you need to log in to Taobao in the browser beforehand:

from DrissionPage import ChromiumPage

page = ChromiumPage()
page.get('https://s.taobao.com/search?fromTmallRedirect=true&page=1&q=%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D&spm=a21bo.tmall%2Fa.201856.d13&tab=mall')
# Grab every product card by its (hashed) class name
products = page.eles('@class=doubleCardWrapper--BpyYIb1O')
# Print the title and price of each card
for product in products:
    text = product.text
    # Split the card text into lines
    lines = text.strip().split('\n')
    # Title is assumed to be on the first line, price on the fifth
    book_title = lines[0].strip()
    price = lines[4].strip()
    print(f"Title: {book_title}")
    print(f"Price: {price}\n")
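The parsing above leans on the card text having a fixed line layout: lines[0] for the title and lines[4] for the price. Cards with promotion banners or missing fields will shift those positions or raise an IndexError, so a slightly more defensive version of the loop (same hashed class name, which Taobao rotates periodically) might look like this:

for product in page.eles('@class=doubleCardWrapper--BpyYIb1O'):
    lines = [line.strip() for line in product.text.strip().split('\n') if line.strip()]
    if len(lines) < 5:
        continue  # skip cards that don't match the expected layout
    book_title, price = lines[0], lines[4]
    print(f"Title: {book_title}")
    print(f"Price: {price}\n")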


2. Scraping job listings from Zhaopin

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def get_job_details(driver, job_link):
    # Open the job link in a new tab
    driver.execute_script(f"window.open('{job_link}');")
    driver.switch_to.window(driver.window_handles[-1])  # switch to the new tab
    time.sleep(5)
    # Collect the job description blocks ("describtion" is the site's own misspelled class name)
    contents = driver.find_elements(By.XPATH, '//div[@class="describtion__detail-content"]')
    details = [content.text for content in contents]
    # Close the tab and switch back to the main window
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    return details

driver = webdriver.Chrome()
driver.get('https://www.zhaopin.com/sou/jl719/kwE8M8CQO/p1')
time.sleep(5)

el_lists = driver.find_elements(By.XPATH, '//div[@class="jobinfo__top"]')
for el_list in el_lists:
    name = el_list.find_element(By.XPATH, './a').text
    link = el_list.find_element(By.XPATH, './a').get_attribute('href')
    salary = el_list.find_element(By.XPATH, '../div[1]/p').text  # salary
    company_size = el_list.find_element(By.XPATH, '../../div[2]/div[2]/div[2]').text  # company size
    company_name = el_list.find_element(By.XPATH, '../../div[2]/div[1]/a').text  # company name

    # Fetch the detailed description for this job
    job_details = get_job_details(driver, link)
    if job_details:
        for job_detail in job_details:
            print(job_detail)

    print(f"Job title: {name}")
    print(f"Salary: {salary}")
    print(f"Company: {company_name}")
    print(f"Company size: {company_size}")
    print("-" * 40)

driver.quit()

Scraping results:

 
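The listing URL above ends in /p1, which looks like a page index, so scraping several pages is likely just a matter of looping over that path segment. The sketch below assumes that pattern holds (worth confirming in the browser) and reuses the extraction loop unchanged:

BASE_URL = 'https://www.zhaopin.com/sou/jl719/kwE8M8CQO/p{}'  # assumed pagination pattern

for page_num in range(1, 4):  # pages 1 to 3
    driver.get(BASE_URL.format(page_num))
    time.sleep(5)
    el_lists = driver.find_elements(By.XPATH, '//div[@class="jobinfo__top"]')
    print(f"Page {page_num}: {len(el_lists)} job cards found")
    # ...run the same per-card extraction as above here...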

3. Scraping multiple pages of CNKI results

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://kns.cnki.net/kns8s/AdvSearch')
time.sleep(5)
# Locate the search-term input in the advanced search form
input_element = driver.find_element(By.XPATH, '//*[@id="gradetxt"]/dd[1]/div[2]/input')

input_element.send_keys('爬虫')  # search keyword: "爬虫" (web crawler)
# Click the search button
search_button = driver.find_element(By.XPATH, '//*[@id="ModuleSearch"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input')
search_button.click()
time.sleep(5)

# Scrape the entries on the current results page
def scrape_page():
    titles = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[2]/a')
    authors = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[3]/a')
    title_times = driver.find_elements(By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[5]')

    for i in range(len(titles)):
        title = titles[i].text
        author = authors[i].text
        title_time = title_times[i].text
        print(f"Title: {title}, Authors: {author}, Published: {title_time}")
        # Open the article link
        article_link = titles[i]
        article_link.click()

        # Switch to the newly opened tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(5)
        # Grab the abstract
        summary = driver.find_element(By.XPATH, '//*[@id="ChDivSummary"]').text
        print(f"Abstract: {summary}")
        # Close the tab and switch back to the results page
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(3)


# Loop over multiple result pages
page_limit = 3  # number of pages to scrape
for page in range(page_limit):
    scrape_page()
    # Click the "next page" button
    next_button = driver.find_element(By.XPATH, '//*[@id="PageNext"]')
    next_button.click()
    time.sleep(5)

# Quit the WebDriver
driver.quit()

Scraping results:
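A final note on the fixed time.sleep() calls used throughout these scripts: they make every run slow and can still race a slow page load. Where it matters, explicit waits on the elements you actually need are a safer pattern; a minimal sketch against the same XPaths, assuming the driver from the script above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 15)  # the 15-second timeout is an arbitrary choice

# Wait until the result rows are present before scraping a page
wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//*[@id="gridTable"]/div/div/div/table/tbody/tr/td[2]/a')))

# Wait until the abstract is visible on an article page
summary = wait.until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@id="ChDivSummary"]'))).text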