使用 Puppeteer 绕过 Captcha:实现商家数据自动化采集

时间:2024-11-26 07:54:52
const puppeteer = require('puppeteer');

// Proxy settings (example values for the 16yun crawler proxy, www.16yun.cn).
const proxyConfig = {
  host: 'proxy.16yun.cn', // proxy server address
  port: '12345', // proxy port
  username: 'your_username', // proxy user name
  password: 'your_password', // proxy password
};

// Search-result page that the script scrapes.
const targetUrl = 'https://www.dianping.com/search/keyword/1/0_商家关键字';

/**
 * Creates a new tab and applies the user agent, cookies and proxy
 * credentials that must be in place before the first navigation.
 *
 * @param {object} browser - Browser instance returned by `puppeteer.launch`.
 * @returns {Promise<object>} The configured page.
 */
async function preparePage(browser) {
  const page = await browser.newPage();

  await page.setUserAgent(
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
  );

  // Placeholder cookie; replace with real cookies exported from a logged-in
  // browser session.
  const cookies = [
    { name: 'cookieName', value: 'cookieValue', domain: '.dianping.com' },
  ];
  await page.setCookie(...cookies);

  // The proxy is protected by HTTP Basic Auth; the credentials are supplied
  // here because the --proxy-server flag only carries host and port.
  await page.authenticate({
    username: proxyConfig.username,
    password: proxyConfig.password,
  });

  return page;
}

/**
 * Scrolls the page down in fixed steps until the distance scrolled reaches
 * the document height, giving the page a chance to load more entries.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @param {number} [distance=100] - Pixels scrolled per step.
 * @param {number} [intervalMs=100] - Delay between steps in milliseconds.
 * @returns {Promise<void>} Resolves once the scroll loop has finished.
 */
async function autoScroll(page, distance = 100, intervalMs = 100) {
  await page.evaluate(
    (step, delay) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          // Re-read the height on every tick: it can grow while scrolling.
          const { scrollHeight } = document.body;
          window.scrollBy(0, step);
          totalHeight += step;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
    distance,
    intervalMs
  );
}

/**
 * Reads the shop entries out of the currently loaded result list.
 * Entries without both a name and an address are skipped.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<Array<{name: string, address: string, rating: (string|undefined)}>>}
 *   One record per list item that had a name and an address.
 */
async function extractShops(page) {
  return page.evaluate(() => {
    const result = [];
    // Selectors must be adjusted to the page's actual DOM.
    const items = document.querySelectorAll('.shop-all-list li');
    items.forEach((item) => {
      const name = item.querySelector('.tit a')?.textContent.trim();
      const address = item.querySelector('.addr')?.textContent.trim();
      const rating = item
        .querySelector('.comment-list .item-rank-rst')
        ?.getAttribute('title');
      if (name && address) {
        result.push({ name, address, rating });
      }
    });
    return result;
  });
}

/**
 * Navigates to the target URL, scrolls through the page and prints the
 * collected shop records. Navigation and scraping errors are logged and
 * not rethrown.
 *
 * @param {object} page - Page prepared by `preparePage`.
 * @returns {Promise<void>}
 */
async function scrape(page) {
  try {
    console.log('正在访问目标页面...');
    await page.goto(targetUrl, { waitUntil: 'networkidle2' });
    await autoScroll(page);
    const data = await extractShops(page);
    console.log('采集到的数据:', data);
  } catch (error) {
    console.error('页面加载或采集时出错:', error);
  }
}

/**
 * Launches a browser behind the configured proxy, runs the scrape and always
 * closes the browser afterwards.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const browser = await puppeteer.launch({
    headless: false, // keep the window visible to ease debugging
    args: [`--proxy-server=${proxyConfig.host}:${proxyConfig.port}`],
  });

  // Everything after launch sits under try/finally so a failure while
  // preparing the page cannot leave the browser process running.
  try {
    const page = await preparePage(browser);
    await scrape(page);
  } finally {
    await browser.close();
  }
}

// Launch/setup/close failures would otherwise surface as an unhandled
// rejection; report them and signal failure through the exit code.
main().catch((error) => {
  console.error('脚本执行失败:', error);
  process.exitCode = 1;
});