// 使用 Puppeteer 绕过 Captcha:实现商家数据自动化采集
const puppeteer = require('puppeteer');
// Proxy settings (the defaults use the 16yun crawler proxy, www.16yun.cn, as an example).
// Each field can be overridden through an environment variable so that real
// credentials do not have to be committed to source control; when a variable
// is unset, the original placeholder value is used.
const proxyConfig = {
  host: process.env.PROXY_HOST ?? 'proxy.16yun.cn', // proxy server address
  port: process.env.PROXY_PORT ?? '12345', // proxy port (kept as a string)
  username: process.env.PROXY_USERNAME ?? 'your_username', // proxy user name
  password: process.env.PROXY_PASSWORD ?? 'your_password', // proxy password
};
// Search keyword placeholder; replace it with the real term to look up.
const searchKeyword = '商家关键字';
// Target URL. The keyword is percent-encoded so that keywords containing
// reserved characters (space, '/', '?', '#', ...) cannot alter the URL path.
const targetUrl = `https://www.dianping.com/search/keyword/1/0_${encodeURIComponent(searchKeyword)}`;
/**
 * Script entry point.
 *
 * Launches a browser that routes traffic through the configured proxy, opens
 * `targetUrl`, scrolls to the bottom of the page, extracts the shop entries
 * from the result list and logs them.
 *
 * Navigation and extraction errors are logged and do not abort the script.
 * Errors raised while launching or preparing the page propagate to the
 * trailing `.catch`, which logs them and marks the process as failed. In both
 * cases the browser is closed once it has been launched.
 */
(async () => {
  // Launch Puppeteer and route its traffic through the proxy server.
  const browser = await puppeteer.launch({
    headless: false, // run headed to make debugging easier
    args: [`--proxy-server=${proxyConfig.host}:${proxyConfig.port}`],
  });

  // Everything after a successful launch lives inside try/finally so the
  // browser process is closed even when page setup (newPage, setCookie,
  // authenticate, ...) throws before navigation starts.
  try {
    const page = await browser.newPage();

    // Set the User-Agent header.
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    );

    // Set cookies (valid ones can be copied from a logged-in browser session).
    const cookies = [
      {
        name: 'cookieName',
        value: 'cookieValue',
        domain: '.dianping.com',
      },
    ];
    await page.setCookie(...cookies);

    // Provide the proxy credentials for HTTP authentication.
    await page.authenticate({
      username: proxyConfig.username,
      password: proxyConfig.password,
    });

    try {
      console.log('正在访问目标页面...');
      await page.goto(targetUrl, { waitUntil: 'networkidle2' });

      // Scroll down in small steps so that more data gets loaded.
      // This callback runs inside the page, not in Node.
      await page.evaluate(() => {
        return new Promise((resolve) => {
          let totalHeight = 0;
          const distance = 100; // pixels per step
          const timer = setInterval(() => {
            // Re-read the height on every tick because it can change while scrolling.
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;
            if (totalHeight >= scrollHeight) {
              clearInterval(timer);
              resolve();
            }
          }, 100);
        });
      });

      // Extract the shop entries. This callback also runs inside the page.
      const data = await page.evaluate(() => {
        const result = [];
        // Adjust the selectors below to match the actual DOM.
        const items = document.querySelectorAll('.shop-all-list li');
        items.forEach((item) => {
          const name = item.querySelector('.tit a')?.textContent.trim();
          const address = item.querySelector('.addr')?.textContent.trim();
          const rating = item
            .querySelector('.comment-list .item-rank-rst')
            ?.getAttribute('title');
          // Entries without both a name and an address are skipped.
          if (name && address) {
            result.push({ name, address, rating });
          }
        });
        return result;
      });

      console.log('采集到的数据:', data);
    } catch (error) {
      console.error('页面加载或采集时出错:', error);
    }
  } finally {
    await browser.close();
  }
})().catch((error) => {
  // Reached when launching, page setup or closing the browser fails. Handle
  // the rejection explicitly instead of leaving the promise floating.
  console.error('脚本执行失败:', error);
  process.exitCode = 1;
});