使用 HtmlAgilityPack 解析 HTML（非常好用）

时间:2022-05-22 12:36:51
 /// <summary>
/// Form hosting a WebBrowser control that loads a query page and extracts the
/// result table with HtmlAgilityPack.
/// Designed to be built as a stand-alone exe to work around the WebBrowser
/// control's memory-leak problem.
/// </summary>
public partial class MainForm : Form
{
    /// <summary>
    /// Number of "td/div" cells a table row must contain to be mapped onto a RowData.
    /// </summary>
    private const int ExpectedCellCount = 10;

    /// <summary>
    /// Whether the current query has finished processing.
    /// The WebBrowser control can only run on the UI thread, so instead of a wait
    /// handle this flag is polled by <see cref="Query"/>. It is volatile because
    /// Navigate() supports callers on a non-UI thread, in which case the flag is
    /// written and read on different threads.
    /// </summary>
    private volatile bool isCompleted;

    /// <summary>
    /// Rows collected for the query currently in progress.
    /// </summary>
    private readonly List<RowData> executeResult = new List<RowData>();

    private static readonly MainForm instance = new MainForm();

    /// <summary>
    /// Singleton instance.
    /// </summary>
    public static MainForm Instance { get { return instance; } }

    private MainForm()
    {
        InitializeComponent();
        webBrowser.DocumentCompleted += webBrowser_DocumentCompleted;
    }

    /// <summary>
    /// Handles the browser's DocumentCompleted event: once the control reports
    /// ReadyState.Complete and the page title is "选择", parses the body HTML and
    /// stores the extracted rows, then marks the query as completed.
    /// </summary>
    private void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        if (webBrowser.ReadyState != WebBrowserReadyState.Complete)
        {
            return;
        }

        var document = webBrowser.Document;
        if (document == null || document.Body == null)
        {
            // Nothing to parse; Query() will keep waiting until its timeout.
            return;
        }

        if (document.Title != "选择")
        {
            // Not the result page; keep waiting.
            return;
        }

        executeResult.AddRange(ExtractData(document.Body.InnerHtml));

        // Set the flag last so a poller that sees it also sees the added rows.
        isCompleted = true;
    }

    /// <summary>
    /// Parses the page HTML and maps every table row under the element with id
    /// "div" to a <see cref="RowData"/>.
    /// </summary>
    /// <param name="html">Inner HTML of the page body; may be null or empty.</param>
    /// <returns>
    /// The extracted rows. Empty when the HTML is empty, the container element is
    /// missing, or it contains no rows. Rows with fewer than
    /// <see cref="ExpectedCellCount"/> cells are skipped.
    /// </returns>
    private List<RowData> ExtractData(string html)
    {
        List<RowData> rows = new List<RowData>();
        if (string.IsNullOrEmpty(html))
        {
            return rows;
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        HtmlAgilityPack.HtmlNode container = doc.GetElementbyId("div");
        if (container == null)
        {
            return rows;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var trNodes = container.SelectNodes("tbody/tr");
        if (trNodes == null)
        {
            return rows;
        }

        foreach (var trNode in trNodes)
        {
            var cells = trNode.SelectNodes("td/div");
            if (cells == null || cells.Count < ExpectedCellCount)
            {
                // Row without the expected cells: skip instead of throwing.
                continue;
            }

            RowData row = new RowData();
            row.航班 = cells[0].InnerText;
            row.出发时间 = cells[1].InnerText;
            row.到达时间 = cells[2].InnerText;
            row.机场 = cells[3].InnerText;
            row.机型 = cells[4].InnerText;
            row.头等 = cells[5].InnerText;
            row.公务 = cells[6].InnerText;
            row.全价 = cells[7].InnerText;
            row.折扣 = cells[8].InnerText;
            row.特价 = cells[9].InnerText;
            rows.Add(row);
        }

        return rows;
    }

    /// <summary>
    /// Queries data by navigating the browser and waiting until the result page
    /// has been parsed or the timeout elapses.
    /// </summary>
    /// <param name="fromCity">Departure city code.</param>
    /// <param name="toCity">Arrival city code.</param>
    /// <param name="date">Departure date.</param>
    /// <param name="timeout">Maximum time to wait for the result.</param>
    /// <returns>
    /// A new list with the ticket information collected so far; empty when the
    /// wait timed out before any result page was parsed. The list is a copy, so
    /// a later call to this method does not clear results already handed out.
    /// </returns>
    [MethodImpl(MethodImplOptions.Synchronized)]
    public List<RowData> Query(string fromCity, string toCity, DateTime date, TimeSpan timeout)
    {
        isCompleted = false;
        executeResult.Clear();

        // NOTE(review): the template below contains no format items, so the
        // arguments passed to string.Format are currently ignored. Replace it with
        // the real URL template ({0}..{4}) when wiring up an actual site.
        string urlTemplate = "http://www.xxx.com";
        string url = string.Format(urlTemplate, fromCity, date.Month, date.Day, date.Year, toCity);
        Navigate(url);

        // Stopwatch is unaffected by system clock changes and, unlike
        // DateTime.Add, cannot overflow for very large timeouts.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();

        // Wait while processing has not finished and the timeout has not elapsed.
        while (!isCompleted && stopwatch.Elapsed < timeout)
        {
            Thread.Sleep(100);
            Application.DoEvents();
        }

        return new List<RowData>(executeResult);
    }

    /// <summary>
    /// Navigates the browser to the given URL, marshalling the call to the form's
    /// thread via BeginInvoke when invoked from another thread.
    /// </summary>
    /// <param name="url">Address to load.</param>
    private void Navigate(string url)
    {
        if (InvokeRequired)
        {
            BeginInvoke(new Action<string>(Navigate), url);
            return;
        }

        webBrowser.Navigate(url);
    }
}
/// <summary>
/// Corresponds to one row of data on the page.
/// The property names are Chinese; rename them yourself if you prefer otherwise.
/// </summary>
public class RowData
{
/// <summary>Text of the "flight" column.</summary>
public string 航班 { get; set; }
/// <summary>Text of the "departure time" column.</summary>
public string 出发时间 { get; set; }
/// <summary>Text of the "arrival time" column.</summary>
public string 到达时间 { get; set; }
/// <summary>Text of the "airport" column.</summary>
public string 机场 { get; set; }
/// <summary>Text of the "aircraft type" column.</summary>
public string 机型 { get; set; }
/// <summary>Text of the "first class" column (presumably a fare; confirm against the page).</summary>
public string 头等 { get; set; }
/// <summary>Text of the "business class" column (presumably a fare; confirm against the page).</summary>
public string 公务 { get; set; }
/// <summary>Text of the "full price" column.</summary>
public string 全价 { get; set; }
/// <summary>Text of the "discount" column.</summary>
public string 折扣 { get; set; }
/// <summary>Text of the "special offer" column.</summary>
public string 特价 { get; set; }
}