JavaScript--模拟网络爬虫

 <!doctype html>

 <html>

  <head>

   <meta charset="UTF-8">

   <title>Document</title>

   <script>

     /**
      * Log the href value of every <a> start tag found in the page body.
      *
      * Reads the serialized markup from document.body.innerHTML, scans it with
      * a global regular expression and writes each non-empty href value to the
      * console in the order it appears. Takes no arguments, returns nothing.
      *
      * NOTE: innerHTML is serialized markup, so special characters inside an
      * attribute value appear in their escaped form (for example "&" is
      * reported as "&amp;").
      */
     function getUrls(){
       // Group 1 holds a double-quoted href value, group 2 a single-quoted one.
       // - The optional (?:[^>]*?\s) prefix requires whitespace right before
       //   "href", so attributes such as data-href are not mistaken for href.
       // - Each alternative stops only at its own quote character, so a value
       //   that contains the other kind of quote is captured in full.
       var reg=
         /<a\s+(?:[^>]*?\s)?href=(?:"([^"]*)"|'([^']*)')[^>]*>/g;
       // Markup of the body element, scanned as plain text.
       var html=document.body.innerHTML;
       var arr=null;
       // With the g flag, exec() resumes after the previous match (lastIndex)
       // and returns null once no further match exists.
       while((arr=reg.exec(html))!==null){
         // arr: [whole start tag, double-quoted value, single-quoted value]
         var url=arr[1]||arr[2];
         // Empty href values are skipped, as before.
         if(url){
           console.log(url);
         }
       }
     }

   </script>

  </head>

  <body>
   <!-- Sample content scanned by getUrls(): two anchors that carry an href, one named anchor without an href, and a link element that must not be reported. -->
   <link href="index.css">
   <a class="header" href="http://tedu.cn">go to tedu</a>
   <h1>welcome</h1>
   <a name="top"></a>
   <div>Hello World</div>
   <a href="http://tmooc.cn" target="_blank">go to tmooc</a>

   <button type="button" onclick="getUrls()">开始爬虫</button>
  </body>

 </html>
秒客网

JavaScript--模拟网络爬虫

相关文章