公司编辑妹子需要爬取网页内容,让我帮忙做了一个简单的爬取工具。
爬取网页内容这件事对大家来说应该都不难,但这里有一些小改动,代码献上,供大家参考。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
/// <summary>
/// Downloads the HTML of the given URL as a string.
/// Tries to decode the body as UTF-8 first; if the request or decoding
/// fails, retries once decoding as GB2312 (common on Chinese sites).
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The page HTML, or string.Empty when both attempts fail.</returns>
private string gethttpwebrequest(string url)
{
    string strhtml = string.Empty;
    try
    {
        strhtml = fetchhtml(url, Encoding.UTF8);
    }
    catch
    {
        try
        {
            // Retry with the legacy Chinese encoding, mirroring the
            // original fallback branch.
            strhtml = fetchhtml(url, Encoding.GetEncoding("gb2312"));
        }
        catch
        {
            // Both attempts failed; return the empty string so callers
            // keep the original best-effort behavior.
        }
    }
    return strhtml;
}

/// <summary>
/// Performs a single GET request and reads the body with the given encoding.
/// A WebException that still carries a response (e.g. a 403/404 page served
/// to scrapers) is read instead of rethrown — the original anti-scraping
/// workaround.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>The decoded response body.</returns>
private static string fetchhtml(string url, Encoding encoding)
{
    // Configure the request BEFORE sending it. The original called
    // GetResponse() on a bare WebRequest first, firing an extra request
    // whose response was never closed.
    HttpWebRequest myreq = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // Header VALUE only — the original wrongly embedded the
    // "user-agent:" header name inside the value.
    myreq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    myreq.Accept = "*/*";
    myreq.KeepAlive = true;
    myreq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    HttpWebResponse result;
    try
    {
        result = (HttpWebResponse)myreq.GetResponse();
    }
    catch (WebException ex)
    {
        // Some servers answer with an error status that still contains
        // usable HTML; use that response when one is attached.
        if (ex.Response == null)
        {
            throw;
        }
        result = (HttpWebResponse)ex.Response;
    }

    // using guarantees the response, stream and reader are released even
    // when reading throws (the original leaked them on failure).
    using (result)
    using (Stream recevicestream = result.GetResponseStream())
    using (StreamReader readerofstream = new StreamReader(recevicestream, encoding))
    {
        return readerofstream.ReadToEnd();
    }
}
|
这是根据 URL 爬取网页源码的方法,有一些小改动:很多网页有不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过简单改动也能爬取。
以下是爬取网页中所有网址链接的方法。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
/// <summary>
/// Extracts all hyperlink URLs from a page's HTML.
/// Root-relative hrefs (href="/x") are resolved against the site root and
/// document-relative hrefs against the URL's directory, then every
/// scheme://… token inside an &lt;a … href=…&gt; tag is collected, with
/// surrounding quotes stripped and duplicates removed.
/// </summary>
/// <param name="htmlcode">Raw HTML of the page.</param>
/// <param name="url">URL the HTML was fetched from; used to absolutize relative links.</param>
/// <returns>Distinct absolute link URLs, in document order.</returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    List<string> weburllistzx = new List<string>();

    // Site root, e.g. "http://example.com/" — used for root-relative hrefs.
    Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
    string wangzhanyuming = reg.Match(url, 0).Value;
    // Current directory, e.g. "http://example.com/news/" — used for
    // document-relative hrefs.
    string wangzhanxiangduilujin = url.Substring(0, url.LastIndexOf("/") + 1);

    // Rewrite root-relative href forms to absolute before matching.
    string productioncontent = htmlcode
        .Replace("href=\"/", "href=\"" + wangzhanyuming)
        .Replace("href='/", "href='" + wangzhanyuming)
        .Replace("href=/", "href=" + wangzhanyuming)
        .Replace("href=\"./", "href=\"" + wangzhanyuming);

    MatchCollection mc = Regex.Matches(productioncontent, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
    foreach (Match m in mc)
    {
        MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        if (mc1.Count > 0)
        {
            // href already absolute.
            addlinks(mc1, weburllistzx);
        }
        else if (m.Value.IndexOf("javascript") == -1)
        {
            // href was document-relative: prefix the current directory
            // and re-scan for an absolute URL. javascript: links are skipped.
            string amstr = m.Value
                .Replace("href=\"", "href=\"" + wangzhanxiangduilujin)
                .Replace("href='", "href='" + wangzhanxiangduilujin);
            addlinks(Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline), weburllistzx);
        }
    }
    return weburllistzx;
}

/// <summary>
/// Strips quote/angle/semicolon residue from each matched URL token and
/// appends it to the list if not already present (preserves first-seen order).
/// </summary>
private static void addlinks(MatchCollection matches, List<string> links)
{
    foreach (Match m1 in matches)
    {
        string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
        if (!links.Contains(linkurlstr))
        {
            links.Add(linkurlstr);
        }
    }
}
|
这块的技术其实就是简单地使用了正则去匹配。接下来献上获取标题,以及存储到 XML 文件的方法。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
/// <summary>
/// Writes the extracted hyperlinks to an XML file. Each link becomes an
/// element named after its domain suffix (see getdomain: "com", "net", …,
/// or "other") whose text is the URL itself.
/// </summary>
/// <param name="strurl">URL the links were extracted from (recorded in an XML comment).</param>
/// <param name="alhyperlinks">Links to persist.</param>
/// <param name="path">Target file; defaults to the original hard-coded location.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks, string path = @"d:\hyperlinks.xml")
{
    // using guarantees the file handle is released even if a write throws
    // (the original leaked the writer on failure).
    using (XmlTextWriter writer = new XmlTextWriter(path, Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strurl + "的超链接");
        // Outer <hyperlinks> plus an inner, timestamped <hyperlinks> —
        // kept as in the original output format.
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());
        foreach (string str in alhyperlinks)
        {
            // Element name = domain suffix, element text = the URL.
            writer.WriteElementString(getdomain(str), null, str);
        }
        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
/// <summary>
/// Returns the domain suffix of a URL: "com", "net", "cn", "org" or "gov"
/// (case preserved as found), or "other" when none of these appears
/// followed by a slash.
/// </summary>
/// <param name="strurl">URL to inspect.</param>
/// <returns>The bare suffix without dot or slash, or "other".</returns>
private static string getdomain(string strurl)
{
    // Grab e.g. ".com/" anywhere in the URL, case-insensitively.
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);
    // Drop the leading dot and the trailing slash, leaving the bare suffix.
    string retval = Regex.Replace(suffix.ToString(), @"\.|/$", "");
    return retval == "" ? "other" : retval;
}
/// <summary>
/// Extracts a page title from HTML. Starts with the &lt;title&gt; text,
/// then prefers the &lt;h1&gt; text when the title begins with it —
/// the h1 is usually cleaner (no site-name suffix).
/// </summary>
/// <param name="html">Raw HTML to search.</param>
/// <returns>The best title found, or the empty string.</returns>
private static string gettitle(string html)
{
    const string clearfilter = @"<.*?>"; // strips any inner markup

    string title = "";
    Match titlematch = Regex.Match(html, @"<title>[\s\S]*?</title>", RegexOptions.IgnoreCase);
    if (titlematch.Success)
    {
        title = Regex.Replace(titlematch.Value, clearfilter, "");
    }

    Match h1match = Regex.Match(html, @"<h1.*?>.*?</h1>", RegexOptions.IgnoreCase);
    if (h1match.Success)
    {
        string h1 = Regex.Replace(h1match.Value, clearfilter, "");
        // Only trust the h1 when the full title starts with it.
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }

    return title;
}
|
这就是所用的全部方法,还是有很多需要改进之处!大家如果有发现不足之处还请指出,谢谢!
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。