Implementing a Web Crawler in C#

Date: 2022-03-08 12:06:49

This post shares the complete code for a simple web crawler written in C#, for your reference. The details are as follows.

HTTP request utility class:

It provides two features:

1. Fetching a page's html

2. Downloading an image from the web

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
 
namespace Utils
{
  /// <summary>
  /// HTTP请求工具类
  /// </summary>
  public class HttpRequestUtil
  {
    /// <summary>
    /// 获取页面html
    /// </summary>
    public static string GetPageHtml(string url)
    {
      // Set up the request; the MSIE user agent makes the server treat the crawler like an ordinary browser
      HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
      // Nothing is sent over the network until GetResponse() is called
      using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
      using (Stream responseStream = response.GetResponseStream())
      using (StreamReader sr = new StreamReader(responseStream, Encoding.UTF8))
      {
        // Return the page's html source; the using blocks dispose the response and streams
        return sr.ReadToEnd();
      }
    }
 
    /// <summary>
    /// Http下载文件
    /// </summary>
    public static void HttpDownloadFile(string url)
    {
      // Derive the local file name from the last segment of the url
      int pos = url.LastIndexOf("/") + 1;
      string fileName = url.Substring(pos);
      string path = Application.StartupPath + "\\download";
      if (!Directory.Exists(path))
      {
        Directory.CreateDirectory(path);
      }
      string filePathName = path + "\\" + fileName;
      if (File.Exists(filePathName)) return; // skip files that were already downloaded

      // Set up the request
      HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
      request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
      request.Proxy = null; // skip proxy auto-detection, which slows down the first request
      // Nothing is sent over the network until GetResponse() is called
      using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
      using (Stream responseStream = response.GetResponseStream())
      // Write the response to the local file in 1 KB chunks
      using (Stream stream = new FileStream(filePathName, FileMode.Create))
      {
        byte[] bArr = new byte[1024];
        int size = responseStream.Read(bArr, 0, bArr.Length);
        while (size > 0)
        {
          stream.Write(bArr, 0, size);
          size = responseStream.Read(bArr, 0, bArr.Length);
        }
      }
    }
  }
}
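
For reference, here is a minimal sketch of how the two helpers can be combined; the URL and the generic <img> regex below are illustrative placeholders, not taken from the original post:

using System.Text.RegularExpressions;
using Utils;

class Demo
{
  static void Main()
  {
    // Fetch a listing page, then download every absolute image URL found on it
    string html = HttpRequestUtil.GetPageHtml("http://example.com/gallery.html");
    foreach (Match m in Regex.Matches(html, "<img[\\s]+src=\"([^\"]*?)\""))
    {
      string imgUrl = m.Groups[1].Value;
      if (imgUrl.StartsWith("http"))
      {
        HttpRequestUtil.HttpDownloadFile(imgUrl);
      }
    }
  }
}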

Multi-threaded crawling code:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;
 
namespace 爬虫
{
  public partial class Form1 : Form
  {
    List<Thread> threadList = new List<Thread>();
    Thread thread = null;
 
    public Form1()
    {
      InitializeComponent();
    }
 
    private void button1_Click(object sender, EventArgs e)
    {
      DateTime dtStart = DateTime.Now;
      button3.Enabled = true;
      button2.Enabled = true;
      button1.Enabled = false;
      // Shared progress counters, updated from the worker threads
      int page = 0;        // list pages finished
      int count = 0;       // images downloaded
      int personCount = 0; // detail (item) pages visited
      lblPage.Text = "Pages done: 0";

      // Start 10 worker threads; thread i crawls list pages (i - 1) * 10 + 1 through i * 10
      for (int i = 1; i <= 10; i++)
      {
        thread = new Thread(new ParameterizedThreadStart(delegate(object obj)
        {
          for (int j = 1; j <= 10; j++)
          {
            try
            {
              // The global list-page number this thread handles on this iteration
              // (declared locally so the threads do not race on a shared variable)
              int index = (Convert.ToInt32(obj) - 1) * 10 + j;
              string pageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com/c44/0/1_" + index.ToString() + ".html");
              // Anchors that link to an item's detail page, and the <img> tag on that page
              Regex regA = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
              Regex regImg = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");
              MatchCollection mc = regA.Matches(pageHtml);
              foreach (Match match in mc)
              {
                int start = match.ToString().IndexOf("href=\"");
                string url = match.ToString().Substring(start + 6);
                int end = url.IndexOf("\"");
                url = url.Substring(0, end);
                if (url.IndexOf("/") == 0)
                {
                  string imgPageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com" + url);
                  int persons = Interlocked.Increment(ref personCount); // thread-safe increment
                  lblPerson.Invoke(new Action(delegate() { lblPerson.Text = "Items done: " + persons.ToString(); }));
                  MatchCollection mcImgPage = regImg.Matches(imgPageHtml);
                  foreach (Match matchImgPage in mcImgPage)
                  {
                    start = matchImgPage.ToString().IndexOf("src=\"");
                    string imgUrl = matchImgPage.ToString().Substring(start + 5);
                    end = imgUrl.IndexOf("\"");
                    imgUrl = imgUrl.Substring(0, end);
                    if (imgUrl.IndexOf("http://i1") == 0)
                    {
                      try
                      {
                        HttpRequestUtil.HttpDownloadFile(imgUrl);
                        int downloaded = Interlocked.Increment(ref count); // thread-safe increment
                        lblNum.Invoke(new Action(delegate()
                        {
                          lblNum.Text = "Images downloaded: " + downloaded.ToString();
                          double time = DateTime.Now.Subtract(dtStart).TotalSeconds;
                          if (time > 0)
                          {
                            lblSpeed.Text = "Speed: " + (downloaded / time).ToString("0.0") + " images/sec";
                          }
                        }));
                      }
                      catch { } // ignore failures on individual images and keep going
                      Thread.Sleep(1); // yield briefly so the UI thread stays responsive
                    }
                  }
                }
              }
            }
            catch { } // ignore list pages that fail to load
            int donePages = Interlocked.Increment(ref page); // thread-safe page counter
            lblPage.Invoke(new Action(delegate() { lblPage.Text = "Pages done: " + donePages.ToString(); }));

            if (donePages == 100) // all 10 threads x 10 pages are finished
            {
              button1.Invoke(new Action(delegate() { button1.Enabled = true; }));
              MessageBox.Show("Done!");
            }
          }
        }));
        thread.Start(i);
        threadList.Add(thread);
      }
    }
 
    private void button2_Click(object sender, EventArgs e)
    {
      button1.Invoke(new Action(delegate()
      {
        foreach (Thread thread in threadList)
        {
          // A suspended thread must be resumed before it can be aborted.
          // Note: Suspend/Resume/Abort are obsolete thread APIs; a cooperative
          // alternative is sketched after this listing.
          if (thread.ThreadState == ThreadState.Suspended)
          {
            thread.Resume();
          }
          thread.Abort();
        }
        button1.Enabled = true;
        button2.Enabled = false;
        button3.Enabled = false;
        button4.Enabled = false;
      }));
    }
 
    private void Form1_FormClosing(object sender, FormClosingEventArgs e)
    {
      foreach (Thread thread in threadList)
      {
        thread.Abort();
      }
    }
 
    private void button3_Click(object sender, EventArgs e)
    {
      foreach (Thread thread in threadList)
      {
        if (thread.ThreadState == ThreadState.Running)
        {
          thread.Suspend();
        }
      }
      button3.Enabled = false;
      button4.Enabled = true;
    }
 
    private void button4_Click(object sender, EventArgs e)
    {
      foreach (Thread thread in threadList)
      {
        if (thread.ThreadState == ThreadState.Suspended)
        {
          thread.Resume();
        }
      }
      button3.Enabled = true;
      button4.Enabled = false;
    }
  }
}
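
A note on the thread control used above: Thread.Suspend, Thread.Resume, and Thread.Abort are obsolete in current .NET versions, because they can stop a thread at an arbitrary point and leave shared state corrupted. Below is a minimal sketch of the same pause/resume/stop behaviour built on cooperative primitives instead; it is an alternative pattern, not the original author's code, and the CrawlerControl name is made up for illustration:

using System.Threading;

public class CrawlerControl
{
  readonly CancellationTokenSource cts = new CancellationTokenSource();
  // Set = running, Reset = paused; worker threads block on Wait() while paused
  readonly ManualResetEventSlim pauseGate = new ManualResetEventSlim(true);

  public void Pause() { pauseGate.Reset(); }
  public void Resume() { pauseGate.Set(); }
  public void Stop() { cts.Cancel(); pauseGate.Set(); } // also wake paused workers so they can exit

  // Runs on each worker thread; checks the gate and the token between pages
  public void CrawlPages(int first, int last)
  {
    for (int page = first; page <= last; page++)
    {
      pauseGate.Wait();                        // blocks here while Pause() is in effect
      if (cts.IsCancellationRequested) return; // cooperative stop instead of Abort()
      // ... fetch and parse one list page here ...
    }
  }
}

The button handlers would then call Pause(), Resume(), and Stop() instead of touching the Thread objects directly.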

Screenshot:

[Screenshot: the crawler's form showing pages done, items done, images downloaded, and download speed]

That's all for this article. I hope it helps with your studies.