使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

Web 前端代码

代码如下：

	
	<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %> 

	<%-- Front-end page: renders a three-column table (title, author, publish time). --%>
	<%-- The rows come from whatever list is data-bound to Repeater1; the page class is _Default in Default.aspx.cs. --%>

	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 

	<html xmlns="http://www.w3.org/1999/xhtml"> 

	<head runat="server"> 

	<title></title> 

	</head> 

	<body> 

	<form id="form1" runat="server"> 

	<div> 

	<table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center"> 

	<%-- The Repeater emits the table rows: the header template once, then the item template once per bound item. --%>

	<asp:Repeater ID="Repeater1" runat="server"> 

	<%-- Header row with the column captions (title / author / publish time). --%>

	<HeaderTemplate> 

	<tr> 

	<td> 

	标题 

	</td> 

	<td> 

	发布作者 

	</td> 

	<td> 

	发布时间 

	</td> 

	</tr> 

	</HeaderTemplate> 

	<%-- Data row. Eval looks up the members named url, title, authorUrl, author and updatetime on the bound item. --%>

	<ItemTemplate> 

	<tr bgcolor="#ffffff"> 

	<td align="left"> 

	<%-- Post title, linked to the post; target="_blank" opens it in a new window. --%>

	<a href='<%#Eval("url") %>' target="_blank"> 

	<%#Eval("title") %> 

	</a> 

	</td> 

	<td> 

	<%-- Author name, linked to the author's page. --%>

	<a href='<%#Eval("authorUrl") %>' target="_blank"> 

	<%#Eval("author") %> 

	</a> 

	</td> 

	<td> 

	<%-- Publish time, output as the string stored in updatetime. --%>

	<%#Eval("updatetime") %> 

	</td> 

	</tr> 

	</ItemTemplate> 

	</asp:Repeater> 

	</table> 

	</div> 

	</form> 

	</body> 

	</html>

cs 后台代码：

代码如下：

	
	using System; 

	using System.Collections.Generic; 

	using System.Linq; 

	using System.Web; 

	using System.Web.UI; 

	using System.Web.UI.WebControls; 

	using S1; 

	using System.Net; 

	using System.IO; 

	using System.Text; 

	using HtmlAgilityPack; 

	/// <summary>
	/// Code-behind for Default.aspx. On the first (non-postback) request it downloads a
	/// cnblogs.com listing page, extracts the posts with HtmlAgilityPack XPath queries
	/// and binds the result to <c>Repeater1</c>.
	/// </summary>
	public partial class _Default : System.Web.UI.Page
	{
	    /// <summary>Root address of the site being scraped.</summary>
	    private const string BaseAddress = "http://www.cnblogs.com";

	    /// <summary>
	    /// Label removed from the text node that follows the author link, leaving the
	    /// publish time.
	    /// </summary>
	    private const string PublishedPrefix = "发布于";

	    /// <summary>
	    /// Loads the post list and binds it to the repeater. Does nothing on postbacks.
	    /// </summary>
	    /// <param name="sender">Event source (unused).</param>
	    /// <param name="e">Event data (unused).</param>
	    /// <exception cref="WebException">The listing page could not be downloaded.</exception>
	    protected void Page_Load(object sender, EventArgs e)
	    {
	        if (IsPostBack)
	        {
	            return;
	        }

	        string address = BaseAddress;

	        // Paging: the "p" query-string value is appended as a path segment (p=p2, p=p3, ...).
	        // NOTE(review): the value is caller-supplied and is not validated here.
	        string pageSegment = Request.QueryString["p"];
	        if (!string.IsNullOrEmpty(pageSegment))
	        {
	            address += "/" + pageSegment;
	        }

	        string html = DownloadHtml(address);

	        this.Repeater1.DataSource = ParsePosts(html);
	        this.Repeater1.DataBind();
	    }

	    /// <summary>Downloads <paramref name="address"/> and returns the body decoded as UTF-8.</summary>
	    /// <param name="address">Absolute URL to fetch.</param>
	    /// <returns>The response body as a string.</returns>
	    private static string DownloadHtml(string address)
	    {
	        // The client, the response stream and the reader are all IDisposable;
	        // the using blocks release them even when the read throws.
	        using (WebClient client = new WebClient())
	        using (Stream stream = client.OpenRead(address))
	        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
	        {
	            return reader.ReadToEnd();
	        }
	    }

	    /// <summary>Extracts the posts found under the element with id <c>post_list</c>.</summary>
	    /// <param name="html">Markup of the listing page.</param>
	    /// <returns>
	    /// The parsed posts; an empty list when the container or the title links are missing.
	    /// </returns>
	    private static IList<Cnblogs> ParsePosts(string html)
	    {
	        IList<Cnblogs> posts = new List<Cnblogs>();

	        HtmlDocument doc = new HtmlDocument();
	        doc.LoadHtml(html);

	        // Locate the container node by its id.
	        HtmlNode postList = doc.GetElementbyId("post_list");
	        if (postList == null)
	        {
	            return posts;
	        }

	        // The leading "." keeps the query inside post_list; a bare "//" would search the
	        // whole document. XPath positions are 1-based, so div[2] is the SECOND div child
	        // of its parent. SelectNodes returns null (not an empty collection) on no match.
	        HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
	        if (titleLinks == null)
	        {
	            return posts;
	        }

	        foreach (HtmlNode titleLink in titleLinks)
	        {
	            Cnblogs post = new Cnblogs();
	            post.url = GetHref(titleLink);   // link to the post
	            post.title = titleLink.InnerText; // post title
	            posts.Add(post);
	        }

	        HtmlNodeCollection authorLinks = postList.SelectNodes(".//div[2]/div/a");
	        if (authorLinks == null)
	        {
	            return posts;
	        }

	        // Titles and author links are paired by position; stop at the shorter collection
	        // so a mismatch cannot index past the end.
	        int pairCount = Math.Min(posts.Count, authorLinks.Count);
	        for (int i = 0; i < pairCount; i++)
	        {
	            HtmlNode authorLink = authorLinks[i];
	            posts[i].author = authorLink.InnerText;
	            posts[i].authorUrl = GetHref(authorLink);

	            // The publish time is read from the node right after the author link.
	            HtmlNode timeNode = authorLink.NextSibling;
	            posts[i].updatetime = timeNode == null
	                ? string.Empty
	                : timeNode.InnerText.Replace(PublishedPrefix, "").Trim();
	        }

	        return posts;
	    }

	    /// <summary>Returns the node's <c>href</c> attribute value, or an empty string when absent.</summary>
	    /// <param name="node">Anchor node to read.</param>
	    private static string GetHref(HtmlNode node)
	    {
	        HtmlAttribute href = node.Attributes["href"];
	        return href == null ? string.Empty : href.Value;
	    }

	    /// <summary>
	    /// One scraped post. The lower-case property names are the ones the page's
	    /// <c>Eval(...)</c> data-binding expressions look up, so they must not be renamed.
	    /// </summary>
	    public class Cnblogs
	    {
	        /// <summary>Post title text.</summary>
	        public string title { get; set; }

	        /// <summary>Address of the post.</summary>
	        public string url { get; set; }

	        /// <summary>Author display name.</summary>
	        public string author { get; set; }

	        /// <summary>Address of the author's page.</summary>
	        public string authorUrl { get; set; }

	        /// <summary>Publish time as scraped text (not parsed into a date).</summary>
	        public string updatetime { get; set; }
	    }
	}

秒客网

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

相关文章