本文实例讲述了C#使用正则表达式抓取网站信息的方法。分享给大家供大家参考,具体如下:
这里以抓取京东商城商品详情为例。
1、创建JdRobber.cs程序类(需引用 System 与 System.Text.RegularExpressions 命名空间)
/// <summary>
/// Scrapes the title, main image and price of a product from a JD.com
/// product-detail page and prints them to the console.
/// </summary>
public class JdRobber
{
    /// <summary>
    /// Checks whether an address is a JD.com product-detail link of the form
    /// <c>http://item.jd.com/{digits}.html</c>.
    /// </summary>
    /// <param name="url">Address to check; may be null or empty.</param>
    /// <returns><c>true</c> when the address matches that form; otherwise <c>false</c>.</returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        // The dots are escaped so they match a literal '.' only; an unescaped '.'
        // would also accept hosts such as "itemXjd.com".
        return Regex.IsMatch(url, @"^http://item\.jd\.com/\d+\.html$");
    }

    /// <summary>
    /// Downloads the product page at <paramref name="url"/>, extracts the title,
    /// the main image address and the price, and writes them to the console.
    /// Nothing is printed when the address is not a JD.com product link or the
    /// page body comes back empty.
    /// </summary>
    /// <param name="url">Product-detail address accepted by <see cref="ValidationUrl"/>.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product id: the digits between the host and ".html" (named group "Object").
        string sourceWebID = WebHandler.GetRegexText(url, @"http://item\.jd\.com/(?<Object>\d+)\.html");

        // Title: text of the <h1> that follows the element with id="name".
        string title = WebHandler.GetRegexText(htmlStr, @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>");

        // Image: narrow the search to the "spec-n1" container before matching <img>,
        // so an unrelated image elsewhere on the page is not picked up.
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(subPicHtml, @"<img.*src=\""(?<Object>.*?)\"".*/>");
            }
        }

        // Price: not part of the page markup; it is requested from the price
        // endpoint by product id and read from the "p" field of the response.
        decimal price = 0;
        if (sourceWebID != "")
        {
            string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string priceText = WebHandler.GetRegexText(priceJson, @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""");
            price = WebHandler.GetValidPrice(priceText);
        }

        Console.WriteLine("商品名称:{0}", title);
        Console.WriteLine("图片:{0}", picName);
        Console.WriteLine("价格:{0}", price);
    }
}
2、创建WebHandler.cs公共方法类(需引用 System、System.IO、System.Net、System.Text 与 System.Text.RegularExpressions 命名空间)
/// <summary>
/// Shared helpers for downloading a page, extracting text with a regular
/// expression, and validating a price string.
/// </summary>
public class WebHandler
{
    // Non-negative number with an optional fraction of one or two digits.
    private static readonly Regex PricePattern = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as text.
    /// </summary>
    /// <param name="url">Address to request; null or empty yields an empty string.</param>
    /// <param name="encoding">
    /// <c>"UTF8"</c> decodes the body as UTF-8; any other value (including
    /// <c>"Default"</c>) uses <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>The response body, or an empty string when the request or the read fails.</returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;
        try
        {
            WebRequest request = WebRequest.Create(url);
            // using blocks release the response, stream and reader even when
            // the read throws part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: callers treat an empty string as
            // "page unavailable", so any failure is reported that way.
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> in the first
    /// case-insensitive match of <paramref name="pattern"/> within <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a group named <c>Object</c>.</param>
    /// <returns>
    /// The captured text, or an empty string when either argument is null or
    /// empty or when there is no match.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string to a decimal.
    /// </summary>
    /// <param name="strPrice">
    /// Text expected to be a non-negative number with at most two fraction
    /// digits and <c>.</c> as the decimal separator.
    /// </param>
    /// <returns>The parsed price, or 0 when the text is null, empty, malformed or out of range.</returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PricePattern.IsMatch(strPrice))
        {
            return 0;
        }

        // Parse with the invariant culture: the validated text always uses '.'
        // as the decimal separator, whereas the current culture may treat '.'
        // as a group separator and turn "12.34" into 1234.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}
希望本文所述对大家C#程序设计有所帮助。