使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

时间:2022-01-03 06:08:13

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

Web 前端代码

代码如下:

<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %> 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
<html xmlns="http://www.w3.org/1999/xhtml"> 
<head runat="server"> 
<title></title> 
</head> 
<body> 
<form id="form1" runat="server"> 
<div> 
<%-- Three-column table (title, author, publish time); one row per item bound to Repeater1. --%>
<table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center"> 
<asp:Repeater ID="Repeater1" runat="server"> 
<HeaderTemplate> 
<%-- Column captions: title / author / publish time. --%>
<tr> 
<td> 
标题 
</td> 
<td> 
发布作者 
</td> 
<td> 
发布时间 
</td> 
</tr> 
</HeaderTemplate> 
<ItemTemplate> 
<%-- Each data item must expose url, title, authorUrl, author and updatetime; --%>
<%-- the names are resolved by string through Eval, so they are case-sensitive contracts. --%>
<tr bgcolor="#ffffff"> 
<td align="left"> 
<a href='<%#Eval("url") %>' target="_blank"> 
<%#Eval("title") %> 
</a> 
</td> 
<td> 
<a href='<%#Eval("authorUrl") %>' target="_blank"> 
<%#Eval("author") %> 
</a> 
</td> 
<td> 
<%#Eval("updatetime") %> 
</td> 
</tr> 
</ItemTemplate> 
</asp:Repeater> 
</table> 
</div> 
</form> 
</body> 
</html> 


C# 后台代码(Default.aspx.cs):

代码如下:

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Web; 
using System.Web.UI; 
using System.Web.UI.WebControls; 
using S1; 
using System.Net; 
using System.IO; 
using System.Text; 
using HtmlAgilityPack; 
/// <summary>
/// Code-behind for Default.aspx. On the first (non-postback) request it downloads
/// a cnblogs.com listing page, extracts the posts with HtmlAgilityPack XPath
/// queries and binds them to <c>Repeater1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Base address of the site whose post list is scraped.</summary>
    private const string BlogHomeUrl = "http://www.cnblogs.com";

    /// <summary>
    /// Fetches and binds the post list. Does nothing on postbacks.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            return;
        }

        string address = BlogHomeUrl;

        // Paging: ?p=p2, ?p=p3, ... is appended as a path segment.
        string pageSegment = Request.QueryString["p"];
        if (!string.IsNullOrEmpty(pageSegment))
        {
            // The value comes straight from the request, so escape it to keep it
            // a single path segment (no extra '/', '?', '#', ...).
            address += "/" + Uri.EscapeDataString(pageSegment);
        }

        string html = DownloadHtml(address);

        this.Repeater1.DataSource = ParsePosts(html);
        this.Repeater1.DataBind();
    }

    /// <summary>Downloads <paramref name="address"/> and returns its body decoded as UTF-8.</summary>
    /// <param name="address">Absolute URL to fetch.</param>
    /// <returns>The response body as text.</returns>
    private static string DownloadHtml(string address)
    {
        // All three objects are IDisposable; release them even if the read throws.
        using (WebClient wc = new WebClient())
        using (Stream stream = wc.OpenRead(address))
        using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the posts found under the element with id <c>post_list</c>.
    /// </summary>
    /// <param name="html">Markup of the listing page.</param>
    /// <returns>
    /// The parsed posts; an empty list when the expected nodes are not present.
    /// </returns>
    private static IList<Cnblogs> ParsePosts(string html)
    {
        IList<Cnblogs> posts = new List<Cnblogs>();

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Locate the container node by its id.
        HtmlNode postList = doc.GetElementbyId("post_list");
        if (postList == null)
        {
            return posts;
        }

        // div[2] is the SECOND div child (XPath positions are 1-based); the title
        // link sits in its h3. The leading "." keeps the query relative to
        // post_list -- a bare "//" is an absolute path that scans the whole document.
        HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
        if (titleLinks == null)
        {
            return posts;
        }

        foreach (HtmlNode titleLink in titleLinks)
        {
            Cnblogs post = new Cnblogs();
            // Link target of the post; empty when the anchor has no href.
            post.url = titleLink.GetAttributeValue("href", string.Empty);
            // Link text is the post title.
            post.title = titleLink.InnerText;
            posts.Add(post);
        }

        HtmlNodeCollection authorLinks = postList.SelectNodes(".//div[2]/div/a");
        if (authorLinks == null)
        {
            return posts;
        }

        // The two node lists are matched by position; stop at the shorter one so a
        // layout mismatch cannot index out of range.
        int pairCount = Math.Min(posts.Count, authorLinks.Count);
        for (int i = 0; i < pairCount; i++)
        {
            HtmlNode authorLink = authorLinks[i];
            posts[i].author = authorLink.InnerText;
            posts[i].authorUrl = authorLink.GetAttributeValue("href", string.Empty);

            // The node right after the author link carries the publish time,
            // prefixed with the "发布于" label, which is stripped here.
            HtmlNode timeNode = authorLink.NextSibling;
            if (timeNode != null)
            {
                posts[i].updatetime = timeNode.InnerText.Replace("发布于", "").Trim();
            }
        }

        return posts;
    }
}
/// <summary>
/// One scraped blog post row. Property names are lowercase on purpose: the page
/// markup binds them by string (<c>Eval("url")</c>, <c>Eval("title")</c>, ...),
/// so renaming them would break the binding.
/// </summary>
public class Cnblogs
{
    /// <summary>Post title (text of the title link).</summary>
    public string title { get; set; }

    /// <summary>Address of the post (href of the title link).</summary>
    public string url { get; set; }

    /// <summary>Author display name (text of the author link).</summary>
    public string author { get; set; }

    /// <summary>Address of the author's page (href of the author link).</summary>
    public string authorUrl { get; set; }

    /// <summary>Publish time text as scraped from the page.</summary>
    public string updatetime { get; set; }
}