C#基于正则表达式实现获取网页中所有信息的网页抓取类实例

时间:2022-06-08 07:48:53

本文实例讲述了C#基于正则表达式实现获取网页中所有信息的网页抓取类。分享给大家供大家参考,具体如下:

类的代码:

using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;
/// <summary>
/// Downloads one web page and exposes its title, raw HTML, plain text and links,
/// all of which are extracted with regular expressions.
/// </summary>
/// <remarks>
/// The page is fetched synchronously by the constructor. The constructor does not throw on
/// download problems; check <see cref="IsGood"/> before relying on the content properties,
/// which stay empty when the page could not be loaded.
/// </remarks>
public class WebPage
{
    #region Private members
    /// <summary>Responses whose declared Content-Length exceeds this many bytes are rejected.</summary>
    private const int MaxContentLength = 1 << 22;

    // Group names are case-sensitive: both link patterns use "url" so one reader serves both.
    private static readonly Regex s_anchorRegex = new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    private static readonly Regex s_frameRegex = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);
    // Removes markup, whitespace, non-breaking spaces, '&' and '"' from anchor text.
    private static readonly Regex s_linkTextNoise = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    private static readonly Regex s_titleRegex = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    // Accepts both content="text/html; charset=utf-8" and the HTML5 form charset="utf-8".
    private static readonly Regex s_metaCharsetRegex = new Regex(@"charset\s*=\s*[""']?(?<cding>[\w\-]+)", RegexOptions.IgnoreCase);

    private Uri m_uri;                                  // address of the page (final address after redirects)
    private List<Link> m_links = new List<Link>();      // links found on the page
    private string m_title = "";                        // cached <title> text
    private string m_html = "";                         // decoded HTML source
    private string m_outstr = "";                       // cached plain text of the page
    private bool m_good;                                // true only after a successful download
    private int m_pagesize;                             // length of m_html in characters
    // One cookie container per host, shared by every WebPage instance.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
    #endregion

    #region Properties
    /// <summary>
    /// Absolute URL of the page, or an empty string when the URL given to the constructor
    /// could not be parsed. Read-only.
    /// </summary>
    public string URL
    {
        get
        {
            return m_uri == null ? "" : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Trimmed content of the page's title element, or an empty string when none is found. Read-only.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = s_titleRegex.Match(M_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Decoded HTML source of the page; never null. Read-only.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// All links (anchors and frame/iframe sources) found on the page. Read-only.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            return getLinks();
        }
    }

    /// <summary>
    /// The complete plain text of the page (markup removed). Read-only.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Length of the decoded HTML in characters.
    /// </summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// Links whose URL points at the same host as this page, over http or https.
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // Escape the host (its dots are regex wildcards otherwise) and require the host
            // to end right after the match so "example.com" does not match "example.com.cn".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>
    /// True when the page was downloaded and decoded successfully.
    /// </summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host name of the page's URL, or an empty string when the URL could not be parsed.
    /// </summary>
    public string Host
    {
        get
        {
            return m_uri == null ? "" : m_uri.Host;
        }
    }
    #endregion

    #region Private helpers
    /// <summary>
    /// Parses the links out of the HTML on first use and returns the cached list afterwards.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            collectLinks(s_anchorRegex, true);
            collectLinks(s_frameRegex, false);
        }
        return m_links;
    }

    /// <summary>
    /// Adds one <see cref="Link"/> to the link list for every match of <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Pattern with a "url" group and, when <paramref name="captureText"/> is true, a "text" group.</param>
    /// <param name="captureText">Whether the link text is taken from the "text" group (otherwise it is empty).</param>
    private void collectLinks(Regex pattern, bool captureText)
    {
        for (Match match = pattern.Match(M_html); match.Success; match = match.NextMatch())
        {
            try
            {
                // Resolve relative references against the page address.
                Uri absolute = new Uri(m_uri, match.Groups["url"].Value);
                Link link = new Link();
                link.Text = captureText ? s_linkTextNoise.Replace(match.Groups["text"].Value, "") : "";
                link.NavigateUrl = HttpUtility.UrlDecode(absolute.AbsoluteUri);
                m_links.Add(link);
            }
            catch (UriFormatException ex)
            {
                // A malformed href must not abort the scan; report it and continue.
                Console.WriteLine(ex.Message);
            }
        }
    }

    /// <summary>
    /// Extracts plain text from HTML and returns at most <paramref name="firstN"/> characters of it.
    /// </summary>
    /// <remarks>
    /// The extracted text is cached in the instance, so <paramref name="instr"/> and
    /// <paramref name="withLink"/> only have an effect on the first call.
    /// </remarks>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether the text inside anchor elements is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr;
            text = strip(text, @"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", "");
            text = strip(text, @"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", "");
            text = strip(text, @"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", "");
            if (!withLink)
            {
                text = strip(text, @"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", "");
            }
            // Drop the remaining tags and non-breaking spaces (entity or character). Ordinary
            // spaces are kept so that words stay separated; runs are collapsed below.
            text = strip(text, "(<[^>]+?>)|&nbsp;|\u00A0", "");
            text = strip(text, "(\\s)+", " ");
            m_outstr = text;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>Replaces every match of <paramref name="pattern"/> in <paramref name="input"/>.</summary>
    private static string strip(string input, string pattern, string replacement)
    {
        return new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, replacement);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression (applied with Multiline and IgnoreCase).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchUrl">True to test the link URL, false to test the link text.</param>
    private List<Link> filterLinks(string pattern, int count, bool matchUrl)
    {
        List<Link> selected = new List<Link>();
        // Build the expression once instead of once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link link in getLinks())
        {
            if (selected.Count >= count)
            {
                break;
            }
            string candidate = matchUrl ? link.NavigateUrl : link.Text;
            if (filter.Match(candidate).Success)
            {
                selected.Add(link);
            }
        }
        return selected;
    }

    /// <summary>Reads <paramref name="input"/> to its end and returns the bytes.</summary>
    private static byte[] readAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes <paramref name="raw"/> with <paramref name="encoding"/> via a StreamReader.</summary>
    private static string decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Looks up an encoding by name; returns null when the name is not recognised.</summary>
    private static Encoding tryGetEncoding(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Extracts the charset value of a Content-Type header, without quotes or trailing parameters.
    /// </summary>
    /// <returns>The charset name, or null when the header carries no charset.</returns>
    private static string charsetFromContentType(string contentType)
    {
        int ix = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return null;
        }
        string name = contentType.Substring(ix + "charset=".Length);
        int semicolon = name.IndexOf(';');
        if (semicolon != -1)
        {
            name = name.Substring(0, semicolon);
        }
        return name.Trim().Trim('"', '\'');
    }

    /// <summary>
    /// Decodes the response body. A charset in the Content-Type header wins; otherwise the
    /// charset declared inside the HTML is used, with Encoding.Default as the fallback.
    /// </summary>
    private static string decodeBody(byte[] raw, string contentType)
    {
        string headerCharset = charsetFromContentType(contentType);
        if (headerCharset != null)
        {
            Encoding fromHeader = tryGetEncoding(headerCharset);
            return decode(raw, fromHeader ?? Encoding.Default);
        }

        // No charset in the header: decode provisionally to find the declaration in the markup,
        // then decode the ORIGINAL bytes again (re-encoding the provisional text would be lossy).
        string provisional = decode(raw, Encoding.Default);
        Match declared = s_metaCharsetRegex.Match(provisional);
        Encoding fromMeta = declared.Success ? tryGetEncoding(declared.Groups["cding"].Value) : null;
        if (fromMeta == null)
        {
            return provisional;
        }
        string html = decode(raw, fromMeta);
        // Heuristic kept from the original code: many '?' suggest the declared charset was wrong.
        // NOTE(review): literal question marks in the page (e.g. query strings) count as well.
        if (html.Split('?').Length > 100)
        {
            return provisional;
        }
        return html;
    }
    #endregion

    #region Public methods
    /// <summary>
    /// Returns the first <paramref name="firstN"/> characters of the page's plain text,
    /// including the text of links.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>The (possibly truncated) plain text.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(M_html, firstN, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link URL.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link text.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression and returns the first capture group.
    /// </summary>
    /// <param name="pattern">Regular expression; the value of its group 1 is returned.</param>
    /// <returns>The text of group 1 of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(Int16.MaxValue);
        }
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
        {
            return mc.Groups[1].Value;
        }
        return string.Empty;
    }
    #endregion

    #region Construction
    /// <summary>
    /// Resets the instance and downloads the page at <paramref name="_url"/>.
    /// Never throws: any failure leaves the content empty and <see cref="IsGood"/> false.
    /// </summary>
    private void Init(string _url)
    {
        // Put every field into a safe state first so the properties work even if the
        // URL cannot be parsed or the download fails.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;
        try
        {
            m_uri = new Uri(_url);
            // Skip obvious binary downloads without issuing a request.
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;
            lock (WebPage.webcookies)
            {
                CookieContainer jar;
                if (!WebPage.webcookies.TryGetValue(m_uri.Host, out jar))
                {
                    jar = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = jar;
                }
                rqst.CookieContainer = jar;
            }

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = readAllBytes(sm);
                }

                // Some pages additionally need HttpUtility.HtmlDecode here, depending on the site.
                m_html = decodeBody(raw, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: the constructor must not throw. The failure is
            // surfaced through IsGood == false and traced for diagnostics.
            m_good = false;
            System.Diagnostics.Trace.WriteLine("WebPage: failed to load '" + _url + "': " + ex.Message);
        }
    }

    /// <summary>
    /// Creates the page object and downloads <paramref name="_url"/> immediately.
    /// </summary>
    /// <param name="_url">Absolute URL of the page; percent-escapes are unescaped before use.</param>
    public WebPage(string _url)
    {
        if (!string.IsNullOrEmpty(_url))
        {
            try
            {
                _url = Uri.UnescapeDataString(_url);
            }
            catch (UriFormatException)
            {
                // Keep the URL as given when it cannot be unescaped.
            }
        }
        Init(_url);
    }
    #endregion
}

调用:

// Download the page, then read the extracted content. A bare property access is not a
// valid C# statement, so the values are assigned to locals.
WebPage webInfo = new WebPage("http://hovertree.net/");
string text = webInfo.Context; // all page content with the HTML tags removed
string html = webInfo.M_html;  // the page content including the HTML tags

希望本文所述对大家C#程序设计有所帮助。