之前都是用正则抓取页面,但本人正则水平一般,有些内容用正则抓取起来很费劲,呵呵
在网上看到别人推荐一个 HtmlAgilityPack 的东西,网上找了资料,自己写了个抓取网页的例子,框架用的ASP.NET MVC 4,先看看效果
演示地址:http://www.5imvc.com/Html/cnblogs
首先下载插件,NuGet里就有
创建 model
/// <summary>
/// One scraped entry from the page: link, title, avatar image and summary text.
/// </summary>
public class Result
{
    /// <summary>
    /// Link to the full post.
    /// </summary>
    public string url { get; set; }

    /// <summary>
    /// Post title.
    /// </summary>
    public string title { get; set; }

    /// <summary>
    /// Address of the avatar image shown next to the post.
    /// </summary>
    public string img { get; set; }

    /// <summary>
    /// Body / summary text of the post.
    /// </summary>
    public string content { get; set; }
}
Controllers:
导入命名空间:
using HtmlAgilityPack;
/// <summary>
/// Renders the view with the list of scraped posts as its model.
/// </summary>
public ActionResult Index()
{
    return View(getList());
}

/// <summary>
/// Downloads the cnblogs.com home page and extracts one <see cref="Result"/>
/// per entry of the article list (link, title, avatar and summary).
/// </summary>
/// <returns>
/// The scraped posts in the order they appear on the page; an empty list when
/// the article-list markup cannot be found.
/// </returns>
public List<Result> getList()
{
    List<Result> list = new List<Result>();

    // HtmlAgilityPack replaces the earlier WebRequest + regex approach:
    // XPath queries are far less brittle than matching raw markup.
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument htmlDoc = htmlWeb.Load(@"http://www.cnblogs.com/");

    // Select the article list on the home page. SelectNodes returns null
    // (not an empty collection) when nothing matches, so guard against it.
    HtmlNodeCollection posts = htmlDoc.DocumentNode.SelectNodes(
        "//div[@id='post_list']/div[@class='post_item']");
    if (posts == null)
    {
        return list;
    }

    // A plain sequential loop keeps the posts in page order; the previous
    // AsParallel().ToList().ForEach(...) did no parallel work and could
    // return the items in arbitrary order.
    foreach (HtmlNode post in posts)
    {
        HtmlNode titleLink = post.SelectSingleNode(".//a[@class='titlelnk']");
        if (titleLink == null)
        {
            // Without a title link there is nothing useful to show; skip it.
            continue;
        }

        HtmlNode summary = post.SelectSingleNode(".//p[@class='post_item_summary']");

        // Not every post has an avatar image, so this node may be null.
        HtmlNode avatar = post.SelectSingleNode(".//p[@class='post_item_summary']/a/img");
        string img = avatar == null ? null : avatar.GetAttributeValue("src", null);
        if (string.IsNullOrEmpty(img))
        {
            // Fall back to the site's default avatar.
            img = VirtualPathUtility.ToAbsolute("~/Content/img/avatar.png");
        }

        list.Add(new Result
        {
            url = titleLink.GetAttributeValue("href", string.Empty),
            // InnerText keeps HTML entities encoded; decode them here because
            // the Razor view HTML-encodes again on output.
            title = HtmlEntity.DeEntitize(titleLink.InnerText),
            img = img,
            content = summary == null ? string.Empty : HtmlEntity.DeEntitize(summary.InnerText)
        });
    }

    return list;
}
View:
@model IEnumerable<Result>
@* Renders one "newsitem" card per scraped post: avatar, linked title, summary and a "read more" link. *@
@foreach (var item in Model)
{
    <div class="newsitem">
        <div>
            <img src="@item.img" class="hoverimg" alt="News" />
            @* The URL comes from a scraped third-party page and opens in a new tab,
               so rel="noopener noreferrer" keeps it from reaching window.opener. *@
            <h3><a href="@item.url" target="_blank" rel="noopener noreferrer">@item.title</a>
            </h3>
            <p>
                @item.content
            </p>
        </div>
        <div>
            <p>
                <a href="@item.url" title="">查看全文</a>
            </p>
        </div>
    </div>
}
原文转自:http://www.cnblogs.com/linfei721/archive/2013/05/08/3066697.html