基于C#实现网页爬虫

本文实例为大家分享了基于C#实现网页爬虫的详细代码，供大家参考，具体内容如下：

HTTP请求工具类：

功能：

1、获取网页html

2、下载网络图片

代码如下：

									using System;

									using System.Collections.Generic;

									using System.IO;

									using System.Linq;

									using System.Net;

									using System.Text;

									using System.Threading.Tasks;

									using System.Windows.Forms;

									namespace Utils

									{

									  /// <summary>

									  /// HTTP请求工具类

									  /// </summary>

									  public class HttpRequestUtil

									  {

									    /// <summary>

									    /// 获取页面html

									    /// </summary>

									    public static string GetPageHtml(string url)

									    {

									      // 设置参数

									      HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;

									      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

									      //发送请求并获取相应回应数据

									      HttpWebResponse response = request.GetResponse() as HttpWebResponse;

									      //直到request.GetResponse()程序才开始向目标网页发送Post请求

									      Stream responseStream = response.GetResponseStream();

									      StreamReader sr = new StreamReader(responseStream, Encoding.UTF8);

									      //返回结果网页（html）代码

									      string content = sr.ReadToEnd();

									      return content;

									    }

									    /// <summary>

									    /// Http下载文件

									    /// </summary>

									    public static void HttpDownloadFile(string url)

									    {

									      int pos = url.LastIndexOf("/") + 1;

									      string fileName = url.Substring(pos);

									      string path = Application.StartupPath + "\\download";

									      if (!Directory.Exists(path))

									      {

									        Directory.CreateDirectory(path);

									      }

									      string filePathName = path + "\\" + fileName;

									      if (File.Exists(filePathName)) return;

									      // 设置参数

									      HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;

									      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

									      request.Proxy = null;

									      //发送请求并获取相应回应数据

									      HttpWebResponse response = request.GetResponse() as HttpWebResponse;

									      //直到request.GetResponse()程序才开始向目标网页发送Post请求

									      Stream responseStream = response.GetResponseStream();

									      //创建本地文件写入流

									      Stream stream = new FileStream(filePathName, FileMode.Create);

									      byte[] bArr = new byte[1024];

									      int size = responseStream.Read(bArr, 0, (int)bArr.Length);

									      while (size > 0)

									      {

									        stream.Write(bArr, 0, size);

									        size = responseStream.Read(bArr, 0, (int)bArr.Length);

									      }

									      stream.Close();

									      responseStream.Close();

									    }

									  }

									}

多线程爬取网页代码：

完整代码如下：

									using System;

									using System.Collections.Generic;

									using System.ComponentModel;

									using System.Data;

									using System.Drawing;

									using System.IO;

									using System.Linq;

									using System.Text;

									using System.Text.RegularExpressions;

									using System.Threading;

									using System.Threading.Tasks;

									using System.Windows.Forms;

									using Utils;

									namespace 爬虫

									{

									  public partial class Form1 : Form

									  {

									    List<Thread> threadList = new List<Thread>();

									    Thread thread = null;

									    public Form1()

									    {

									      InitializeComponent();

									    }

									    private void button1_Click(object sender, EventArgs e)

									    {

									      DateTime dtStart = DateTime.Now;

									      button3.Enabled = true;

									      button2.Enabled = true;

									      button1.Enabled = false;

									      int page = 0;

									      int count = 0;

									      int personCount = 0;

									      lblPage.Text = "已完成页数：0";

									      int index = 0;

									      for (int i = 1; i <= 10; i++)

									      {

									        thread = new Thread(new ParameterizedThreadStart(delegate(object obj)

									        {

									          for (int j = 1; j <= 10; j++)

									          {

									            try

									            {

									              index = (Convert.ToInt32(obj) - 1) * 10 + j;

									              string pageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com/c44/0/1_" + index.ToString() + ".html");

									              Regex regA = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");

									              Regex regImg = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

									              MatchCollection mc = regA.Matches(pageHtml);

									              foreach (Match match in mc)

									              {

									                int start = match.ToString().IndexOf("href=\"");

									                string url = match.ToString().Substring(start + 6);

									                int end = url.IndexOf("\"");

									                url = url.Substring(0, end);

									                if (url.IndexOf("/") == 0)

									                {

									                  string imgPageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com" + url);

									                  personCount++;

									                  lblPerson.Invoke(new Action(delegate() { lblPerson.Text = "已完成条数：" + personCount.ToString(); }));

									                  MatchCollection mcImgPage = regImg.Matches(imgPageHtml);

									                  foreach (Match matchImgPage in mcImgPage)

									                  {

									                    start = matchImgPage.ToString().IndexOf("src=\"");

									                    string imgUrl = matchImgPage.ToString().Substring(start + 5);

									                    end = imgUrl.IndexOf("\"");

									                    imgUrl = imgUrl.Substring(0, end);

									                    if (imgUrl.IndexOf("http://i1") == 0)

									                    {

									                      try

									                      {

									                        HttpRequestUtil.HttpDownloadFile(imgUrl);

									                        count++;

									                        lblNum.Invoke(new Action(delegate()

									                        {

									                          lblNum.Text = "已下载图片数" + count.ToString();

									                          DateTime dt = DateTime.Now;

									                          double time = dt.Subtract(dtStart).TotalSeconds;

									                          if (time > 0)

									                          {

									                            lblSpeed.Text = "速度：" + (count / time).ToString("0.0") + "张/秒";

									                          }

									                        }));

									                      }

									                      catch { }

									                      Thread.Sleep(1);

									                    }

									                  }

									                }

									              }

									            }

									            catch { }

									            page++;

									            lblPage.Invoke(new Action(delegate() { lblPage.Text = "已完成页数：" + page.ToString(); }));

									            if (page == 100)

									            {

									              button1.Invoke(new Action(delegate() { button1.Enabled = true; }));

									              MessageBox.Show("完成！");

									            }

									          }

									        }));

									        thread.Start(i);

									        threadList.Add(thread);

									      }

									    }

									    private void button2_Click(object sender, EventArgs e)

									    {

									      button1.Invoke(new Action(delegate()

									      {

									        foreach (Thread thread in threadList)

									        {

									          if (thread.ThreadState == ThreadState.Suspended)

									          {

									            thread.Resume();

									          }

									          thread.Abort();

									        }

									        button1.Enabled = true;

									        button2.Enabled = false;

									        button3.Enabled = false;

									        button4.Enabled = false;

									      }));

									    }

									    private void Form1_FormClosing(object sender, FormClosingEventArgs e)

									    {

									      foreach (Thread thread in threadList)

									      {

									        thread.Abort();

									      }

									    }

									    private void button3_Click(object sender, EventArgs e)

									    {

									      foreach (Thread thread in threadList)

									      {

									        if (thread.ThreadState == ThreadState.Running)

									        {

									          thread.Suspend();

									        }

									      }

									      button3.Enabled = false;

									      button4.Enabled = true;

									    }

									    private void button4_Click(object sender, EventArgs e)

									    {

									      foreach (Thread thread in threadList)

									      {

									        if (thread.ThreadState == ThreadState.Suspended)

									        {

									          thread.Resume();

									        }

									      }

									      button3.Enabled = true;

									      button4.Enabled = false;

									    }

									  }

									}

截图：

基于C#实现网页爬虫

以上就是本文的全部内容，希望对大家的学习有所帮助。

秒客网

基于C#实现网页爬虫

相关文章