关于node爬虫爬取时被限制(爬取电商网站为例) - litao_blogs

时间:2024-02-18 22:43:52

关于node爬虫爬取时被限制(爬取电商网站为例)

2017-07-14 14:09  litao_blogs  阅读(1319)  评论(0)  编辑  收藏  举报

关于node爬虫爬取时被限制

  最近写了一个 node 爬虫,于是就拿某电商网站测试:开启 10 个线程,间隔时间为 2 毫秒。开始爬取的前几天很正常,几百条商品数据信息几分钟就爬完了。但是从昨天开始,该商城开始限制我的访问了(不知道是通过 ip 限制,还是 cookie 限制)。我在网上找了很多方法都没法解决,其中我认为最理想的是通过更换 ip(使用代理)来访问,代码如下:

// HTTP client used for the crawl, loaded through the superagent-charset package.
var request = require('superagent-charset');

// Hand the client to superagent-proxy; the request chain relies on `.proxy()`,
// presumably added by this call (confirm against the superagent-proxy README).
require('superagent-proxy')(request);

// Address of the HTTP proxy the request is routed through.
// NOTE(review): an error of the form `connect ETIMEDOUT 209.33.162.219:8080`
// names this address, i.e. it is the connection attempt to the proxy itself
// that timed out — presumably the proxy is unreachable; verify it independently.
var proxy = 'http://209.33.162.219:8080';

// Fetch one product detail page through the configured proxy, sending a
// browser cookie and user-agent, and hand the outcome to `onresponse`.
//
// The cookie value is kept on a single line: a quoted JavaScript string
// literal cannot contain a raw line break.
// NOTE(review): this is a logged-in session cookie (`__cn_logon__=true`)
// hard-coded in source — consider loading it from configuration instead.
request
  .get('https://detail.1688.com/offer/44428100590.html')
  .proxy(proxy)
  .set('cookie', 'UM_distinctid=15af8e3d19f66a-03df47b2ba2195-191d7059-1fa400-15af8e3d1a08f6; cna=OxpaEb73A1UCAdpYH+hATXqS; _uab_collina=149023940723768695652655; ali_beacon_id=218.88.31.232.1490596342983.231559.1; hp_ab_is_marked_2016_4=1; l=And3HGwu-SYjjx8Ig-9aQ50Lh2DB5Uue; ali_apache_track="c_ms=1|c_mt=3|c_mid=b2b-3085674767b5a88|c_lid=%E9%85%B8%E8%BE%A3%E5%9C%9F%E8%B1%86%E4%B8%9D520530"; CNZZDATA1261052687=845523286-1490331363-https%253A%252F%252Fdetail.1688.com%252F%7C1496812116; h_keys="%u513f%u7ae5%u8f66#%u88e4%u5b50%u5973%u590f#2017%u6625%u590f%u65b0%u54c1%u5973%u5355%u978b%u6b27%u7f8e%u65f6%u5c1a%u6027%u611f%u6d45%u53e3%u5973%u978b%u4e2d%u7a7a%u5c16%u5934%u9ad8%u8ddf%u978b%u5382%u5bb6#%u62d6%u628a#%u6d74%u888d#%u78e8%u7259%u68d2#%u51cf%u538b%u795e%u5668#%u6574%u4eba%u73a9%u5177#diy#%u683c%u5b50%u94fa"; ad_prefer="2017/07/12 16:42:34"; JSESSIONID=8L78mkZv1-IXwXzw5HvqnIWif3NC-5BwODPQ-RGRC; cookie1=AHgCNgBgBWwjdzREfejrVMwElKFey%2FZGnVu4hZgztXA%3D; cookie2=2a1a2747d1683104d9116705d8e7c151; cookie17=UNDcTy30Wcizjw%3D%3D; uss=VTxxpyqOTnPBIqJrczIE%2FdDIOcN4%2BHek9sqaAJ9V%2B6COmfvh66vNU3it; t=376437ef5372654129d1e0b540765e23; _tb_token_=fb6513301e345; sg=074; m_id=b2b-3085674767b5a88; m_level=PM.PENDING; m_sign=1073741824; __cn_logon__=true; __cn_logon_id__=%E9%85%B8%E8%BE%A3%E5%9C%9F%E8%B1%86%E4%B8%9D520530; ali_apache_tracktmp="c_w_signed=Y"; cn_tmp="Z28mC+GqtZ2WkhcK1VOcprI/yb9MLerE33XGf/X8NtL+/HWRVFZKen+Yz+ZvOUkUYOS6sCBvbMvQXu/DuOz3yKUrWHHBwIR7b8axbqBxKhk+y73fmvvpFF5X9seKegxdudQAz/0tRituLEFPiBcwIQ7cQQSX7KaQcdrHcysZdh3BMs0e3c0ndIuMfM6NDCKaBcnf82yxzeWDE2QjSoe1WTDOMH7vN+JC24iXFbCSp/2M1QtSyUkfVvAhedfyECPU5Ysm0s/QbVKnt/xY3HxWlA=="; _cn_slid_="Pzy3y%2FqYLc"; tbsnid=sxZuk3wDIYHP7PTcRh9em5fA4bfQzXgQPfId%2BSgjZZ86sOlEpJKl9g%3D%3D; LoginUmid="LR9v0g2vFi7tnsIIu%2Bm26YfjYRDwYbxyxBAKzcMuHJJTAhztWl1ziA%3D%3D"; userID="0c%2Fx3DmaRvhTA9tJkESv5%2Bccsgsmdo8q7WgOtXFiibE6sOlEpJKl9g%3D%3D"; last_mid=b2b-3085674767b5a88; unb=3085674767; __last_loginid__="%E9%85%B8%E8%BE%A3%E5%9C%9F%E8%B1%86%E4%B8%9D520530"; login="kFeyVBJLQQI%3D"; _csrf_token=1499907488138; ali_ab=218.88.31.232.1490234427766.6; _is_show_loginId_change_block_=b2b-3085674767b5a88_false; _show_force_unbind_div_=b2b-3085674767b5a88_false; _show_sys_unbind_div_=b2b-3085674767b5a88_false; _show_user_unbind_div_=b2b-3085674767b5a88_false; alicnweb=hp_newbuyerguide%3Dtrue%7Ctouch_tb_at%3D1499912919203%7Clastlogonid%3D%25E9%2585%25B8%25E8%25BE%25A3%25E5%259C%259F%25E8%25B1%2586%25E4%25B8%259D520530%7Cshow_inter_tips%3Dfalse; CNZZDATA1253659577=2085335604-1490234879-https%253A%252F%252Fwww.flh88.cn%252F%7C1499910669; __rn_alert__=false; aliBeacon_bcookie=; userIDNum="XYspvbWbOwNH7KXpkJSlpw%3D%3D"; _nk_=6RcDNQ6r1WCYv5yu62x8bzqw6USkkqX2; isg=AvT0I9aXEUHpxoS4FbudtmxBxbKmZRpyKpiRzI5VgH8C-ZRDtt3oR6q7Dwfa; _tmp_ck_0="bVAqUMwg%2Ba2kNJD2jy9VVAOV4kMaPsGXyncad0zSY%2Bi%2FzEDX4Wm41EGKXIXlI7D3dlmyi4GHvQvGHPVWoCInvPtslMWYB%2BKGWtxorFjpRWhmwpyH2qDLJ0kaW6jlSWn7%2FIJusNFd%2BlG3JeRjrCrSbrNJvHykwGt4I9US9zXOqJ5ITvjKWwlj%2BRjnT7POlIQBMCiE%2BbdmEwTVHT0iARsg6VA9sCGrznW85gR1S6fWTs7WtCFGMWkwFzEpk4JgWf2Nd3aCX6%2FYoPASe6soggnCZQpPvtonyD%2BCm4byoLJ7MPLQVspx2PIpcdwhGw%2BRKirKPlKWVnIRXyBXroxGxqQKYYdHQFCsYiUrMhbsRQP2RDpqIfHG%2FsL01Qmkl3izVvZHqh3kJWwta3F8PlEo8soBlzS1sYc4AzrIhkrcHxYrfWVKfn4Rm%2F6p6thsRNXHXOaLb8jFinM2%2FghVT90U%2Bl3vxwHwRs%2ByYnVjxXedrVPs4TfCpw0gKrX0eHToQDnnDfICJktBz9bGXbkrtT%2F4GFDjiJssrthKoSVMUR7umWgluBv0V1kge8%2BVzW6rZb2k3OuUA2F%2FbZYWFmBMAsg62id2T3kNP%2BqkRLmT"; _umdata=70CF403AFFD707DFF81D0962FFFD4C5F229BBD46966278F0D52F7E86636EA0FBABAB4E3970CEDDE1CD43AD3E795C914C4287A26647753C1187A2D07CC9F9B073')
  .set("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
  .end(onresponse);

/**
 * Completion callback for the request: logs the error when one is given,
 * otherwise logs the response status and headers followed by the body.
 *
 * NOTE(review): for HTML pages the markup may be exposed on `res.text`
 * rather than `res.body` — confirm against the superagent documentation.
 *
 * @param {*} err - Failure reported for the request; any truthy value is
 *   treated as an error and logged as-is.
 * @param {Object} res - Response object; `status`, `headers` and `body`
 *   are read only when `err` is falsy.
 * @returns {undefined}
 */
function onresponse(err, res) {
  // Failure path: report the error and stop, the response is not touched.
  if (err) {
    console.log(err);
    return;
  }

  // Success path: status line and headers first, then the body.
  console.log(res.status, res.headers);
  console.log(res.body);
}

 

但是它一直报错:Error: connect ETIMEDOUT 209.33.162.219:8080。更换了好多 ip,但就是一直报错。有哪位大神知道原因的话,可以指点一下。

 

于是我就采用了另外一种方法:只保留一个线程,每次请求的间隔时间为 10 秒。虽然这样不再被限制访问了,但是效率很低。

后续 我会多测试几种方法。如:定时更换cookie。

我用的是superagent 模块。