C#: using HtmlAgilityPack and ScrapySharp to read a URL and find text

时间:2022-10-01 09:58:26

https://github.com/exaphaser/ScrapySharp

https://github.com/zzzprojects/html-agility-pack

https://github.com/atifaziz/Fizzler

https://archive.codeplex.com/?p=fizzlerex

https://github.com/aspnet/blazor

https://github.com/SteveSanderson/Blazor

https://www.mathjax.org/#samples 数学公式

https://github.com/Ivony/Jumony

https://github.com/GeReV/NSoup

https://github.com/robinvanderknaap/MvcJqGrid

http://www.defenseinnovationmarketplace.mil/strategy.html

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Collections;
using ScrapySharp;
using ScrapySharp.Network;
using ScrapySharp.Core;
using HtmlAgilityPack; namespace HtmlAgilityPackDemo
{ /// <summary>
/// HTML解析利器HtmlAgilityPack
/// geovindu
/// 涂聚文
/// 20180305
/// </summary>
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. All set-up is delegated to InitializeComponent, which is
/// defined in another part of this partial class (not in this listing).
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Pre-fills textBox1 with "ln". button1_Click later reads that box and passes
/// its text to ParsePageByArea as the area code.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Form1_Load(object sender, EventArgs e)
{
    const string defaultAreaCode = "ln";
    textBox1.Text = defaultAreaCode;
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its whole body as text.
/// </summary>
/// <param name="url">Address to read; passed unchanged to <see cref="WebClient.OpenRead(string)"/>.</param>
/// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
public static string GetWebClient(string url)
{
    // The using blocks release the client, the response stream and the reader
    // even when OpenRead or ReadToEnd throws. Previously only the stream was
    // closed, and only when the read succeeded.
    using (WebClient client = new WebClient())
    using (Stream body = client.OpenRead(url))
    // Mind the encoding: Encoding.Default is the platform default, so the
    // decoded text depends on the machine running this code.
    // NOTE(review): presumably chosen to match the target site's charset on the
    // author's machine - confirm before running elsewhere.
    using (StreamReader reader = new StreamReader(body, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Downloads the index page of one area and collects every city link found in
/// the &lt;dl&gt;/&lt;dd&gt;/&lt;a&gt; lists of its content container.
/// </summary>
/// <param name="cityCode">Area code substituted into http://www.tianqihoubao.com/lishi/{0}.htm.</param>
/// <param name="listcity">Receives one <see cref="CityList"/> per usable link; empty (never null) when nothing is found.</param>
/// <returns>One "\r\n"-prefixed "name:code" entry per city, or an empty string when nothing is found.</returns>
public string ParsePageByArea(String cityCode, out List<CityList> listcity)
{
    StringBuilder summary = new StringBuilder();
    List<CityList> cities = new List<CityList>();
    // Assign the out parameter up front so every return path leaves it set.
    listcity = cities;

    // Build the URL from the link format and the area code, then download and parse the page.
    String url = String.Format("http://www.tianqihoubao.com/lishi/{0}.htm", cityCode);
    var docText = GetWebClient(url);
    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(docText);

    // Absolute path to the element whose <dl> children hold the city links.
    var container = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
    if (container == null)
    {
        return "";
    }

    // With HtmlAgilityPack's default settings SelectNodes returns null, not an
    // empty collection, when nothing matches, so null has to be checked too.
    var groups = container.SelectNodes(@"dl");
    if (groups == null || groups.Count < 1)
    {
        return "";
    }

    foreach (var group in groups)
    {
        var dd = group.SelectSingleNode(@"dd");
        var links = dd == null ? null : dd.SelectNodes("a");
        if (links == null)
        {
            continue; // this <dl> has no <dd> or no links
        }

        foreach (var link in links)
        {
            var href = link.Attributes["href"];
            if (href == null)
            {
                continue; // anchor without a target
            }

            // The code is the second-to-last piece of the href once it is split
            // on '/' and '.', i.e. the page name without its extension.
            var pieces = href.Value.Trim().Split('/', '.');
            if (pieces.Length < 2)
            {
                continue;
            }

            var name = link.InnerText.Trim();
            var code = pieces[pieces.Length - 2];

            summary.Append("\r\n" + string.Format("{0}:{1}", name, code));
            cities.Add(new CityList { CityName = name, CityCode = code });
        }
    }

    return summary.ToString();
}
/// <summary>
/// Downloads one city's monthly history page and parses its weather table.
/// Example page: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
/// </summary>
/// <param name="cityCode">City code used as the path segment after /lishi/.</param>
/// <param name="year">Year, written as-is into the page name.</param>
/// <param name="month">Month, zero-padded to two digits in the page name.</param>
/// <param name="wea">Receives one <see cref="WeatherList"/> per four-cell row; empty (never null) when no table or rows are found.</param>
/// <returns>The "date:climate,temperature,wind" text of every parsed row, appended with no separator between rows.</returns>
/// <exception cref="FormatException">A row's date cell cannot be parsed by <see cref="DateTime.Parse(string)"/>.</exception>
public string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month,out List<WeatherList> wea)
{
    StringBuilder summary = new StringBuilder();
    List<WeatherList> days = new List<WeatherList>();
    // Assign the out parameter up front so every return path leaves it set.
    wea = days;

    // Build the URL from the city code and the year/month, then download and parse the page.
    String url = String.Format("http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html", cityCode, year, month);
    var docText = GetWebClient(url);
    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(docText);

    // Absolute path to the weather table.
    var table = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
    if (table == null)
    {
        return "";
    }

    // SelectNodes returns null (HtmlAgilityPack default) when nothing matches.
    var rows = table.SelectNodes(@"tr");
    if (rows == null)
    {
        return "";
    }

    // The first row is the table header. Skip(1) passes over it without
    // editing the collection, unlike the earlier RemoveAt(0).
    foreach (var row in rows.Skip(1))
    {
        var cells = row.SelectNodes(@"td");
        // Expected layout: date - climate - temperature - wind.
        if (cells == null || cells.Count != 4)
        {
            continue;
        }

        var date = CleanCellText(cells[0]);
        var climate = CleanCellText(cells[1]);
        var temperature = CleanCellText(cells[2]);
        var wind = CleanCellText(cells[3]);

        summary.Append(string.Format("{0}:{1},{2},{3}", date, climate, temperature, wind));

        WeatherList day = new WeatherList();
        day.Climate = climate;
        // NOTE(review): DateTime.Parse uses the current culture; whether the
        // page's date text parses therefore depends on the machine - confirm.
        day.Date = DateTime.Parse(date);
        day.Temperature = temperature;
        day.WindDirection = wind;
        days.Add(day);
    }

    return summary.ToString();
}

/// <summary>
/// Returns a cell's inner text with every "\r\n" pair and every space removed, then trimmed.
/// </summary>
/// <param name="cell">Table cell to read.</param>
/// <returns>The cleaned text.</returns>
private static string CleanCellText(HtmlNode cell)
{
    return cell.InnerText.Replace("\r\n", "").Replace(" ", "").Trim();
}
/// <summary>
/// Downloads a page with ScrapySharp's ScrapingBrowser and returns the text of
/// its &lt;title&gt; element(s), followed by the inner HTML of any nodes matched
/// by the second lookup. Example address from the original notes:
/// http://www.dusystem.com/geovindu.html
/// </summary>
/// <param name="url">Absolute address of the page; must be accepted by the <see cref="Uri"/> constructor.</param>
/// <returns>The collected text; empty when nothing matches.</returns>
public string getHtmlTitle(string url)
{
    StringBuilder collected = new StringBuilder();

    var address = new Uri(url);
    var browser = new ScrapingBrowser();
    var pageSource = browser.DownloadString(address);

    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(pageSource);
    var root = doc.DocumentNode;

    // "//title" searches the whole document. The earlier relative path "title"
    // only considered direct children of the document node, so on an ordinary
    // page (title under html/head) SelectNodes returned null and the loop threw.
    var titles = root.SelectNodes("//title");
    if (titles != null)
    {
        foreach (var titleNode in titles)
        {
            collected.Append(titleNode.InnerText);
        }
    }

    // SelectNodes returns null when nothing matches, so guard before enumerating.
    var paragraphs = root.SelectNodes("p");
    if (paragraphs != null)
    {
        // NOTE(review): Elements() filters by element name, so the CSS-style
        // "div#endText" presumably never matches anything. The original hint
        // "CssSelect CssSelectAncestors" suggests ScrapySharp's CSS selector
        // extensions were intended here - confirm and replace if so.
        foreach (var fragment in paragraphs.Elements("div#endText"))
        {
            collected.Append(fragment.InnerHtml);
        }
    }

    return collected.ToString();
}
/// <summary>
/// Looks up the cities for the area code typed in textBox1, shows the text
/// summary in richTextBox1 and binds the city list to comboBox1
/// (display: CityName, value: CityCode).
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    List<CityList> cities = new List<CityList>();
    string areaCode = textBox1.Text.Trim();
    string summary = ParsePageByArea(areaCode, out cities);

    richTextBox1.Text = summary;

    comboBox1.DataSource = cities;
    comboBox1.DisplayMember = "CityName";
    comboBox1.ValueMember = "CityCode";
}
/// <summary>
/// Loads last month's weather for the city selected in comboBox1, shows the
/// text summary in richTextBox2 and binds the parsed rows to dataGridView1.
/// Does nothing when no city is selected.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button2_Click(object sender, EventArgs e)
{
    // SelectedValue is null while the combo box has no selection (for example
    // before button1_Click has bound the city list); calling ToString() on it
    // would throw.
    object selectedCode = comboBox1.SelectedValue;
    if (selectedCode == null)
    {
        return;
    }

    // AddMonths(-1) rolls January back to December of the previous year.
    // "Month - 1" produced month 0 of the current year in January.
    DateTime previousMonth = DateTime.Now.AddMonths(-1);

    List<WeatherList> days;
    richTextBox2.Text = ParsePageByCityMonth(selectedCode.ToString(), previousMonth.Year, previousMonth.Month, out days);
    dataGridView1.DataSource = days;
}
}
/// <summary>
/// One city link taken from an area index page: the link text plus the code
/// extracted from the link's href.
/// </summary>
public class CityList
{
    /// <summary>
    /// Link text of the city; bound as the combo box DisplayMember.
    /// </summary>
    public string CityName { get; set; }

    /// <summary>
    /// Code extracted from the link's href; bound as the combo box ValueMember
    /// and passed to ParsePageByCityMonth.
    /// </summary>
    public string CityCode { get; set; }
}
/// <summary>
/// One parsed row of a monthly weather table: climate, temperature, wind
/// direction and the date the row belongs to.
/// </summary>
public class WeatherList
{
    /// <summary>
    /// Climate text, taken from the row's second cell.
    /// </summary>
    public string Climate { get; set; }

    /// <summary>
    /// Temperature text, taken from the row's third cell.
    /// </summary>
    public string Temperature { get; set; }

    /// <summary>
    /// Wind direction text, taken from the row's fourth cell.
    /// </summary>
    public string WindDirection { get; set; }

    /// <summary>
    /// Date of the row, parsed from the row's first cell.
    /// </summary>
    public DateTime Date { get; set; }
}
}

  

  /// <summary>
  /// Demo handler: downloads one fixed monthly weather page, strips its script
  /// and style elements, runs a few sample XPath lookups and writes structural
  /// details of the first element to the console.
  /// </summary>
  /// <remarks>
  /// The lookups are illustrative; most of their results are not used further.
  /// Lookups that find nothing are skipped instead of throwing.
  /// NOTE(review): this method calls GetWebClient, so it presumably belongs
  /// inside Form1 - confirm where it should live.
  /// </remarks>
  /// <param name="sender">Event source (not used).</param>
  /// <param name="e">Event data (not used).</param>
  private void button3_Click(object sender, EventArgs e)
  {
      string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
      var docText = GetWebClient(url);

      HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
      document.LoadHtml(docText);
      // document.OptionOutputAsXml = true;

      // Sample lookups. SelectSingleNode returns null when nothing matches.
      var divname = document.DocumentNode.Descendants("div").FirstOrDefault();
      var body = document.DocumentNode.SelectSingleNode("//body");
      var ta = document.DocumentNode.SelectSingleNode("//table");

      // ToArray() snapshots the matches so that removing nodes does not
      // disturb the enumeration.
      foreach (var script in document.DocumentNode.Descendants("script").ToArray())
      {
          script.Remove();
      }
      foreach (var style in document.DocumentNode.Descendants("style").ToArray())
      {
          style.Remove();
      }
      // Further options left disabled in the original notes:
      // foreach (var comment in document.DocumentNode.SelectNodes("//comment()").ToArray())
      //     comment.Remove();
      // document.DocumentNode.SelectSingleNode("//div[@id='myTrips']").SelectNodes(".//li");

      // Rows of the weather table ("b" is the table's class name).
      // Variant: "//table[contains(@class, 'b')]//tr".
      HtmlNodeCollection rowNodes = document.DocumentNode.SelectNodes("//table[@class='b']//tr");
      List<string> paragraphs = rowNodes == null
          ? new List<string>()
          : rowNodes.Select(paragraphNode => paragraphNode.InnerHtml).ToList();

      HtmlNode inputNode = document.DocumentNode.SelectSingleNode("//td/input");
      HtmlAttribute valueAttribute = inputNode == null ? null : inputNode.Attributes["value"];
      string name = valueAttribute == null ? null : valueAttribute.Value;

      // XPath: /html[1]/body[1]/form[1]/div[2]/div[6]/div[1]/div[1]/table[1]/tr[1]
      // Looking a node up by XPath works much like it does with XmlNode.
      HtmlNode tablenode = document.DocumentNode.SelectSingleNode("//table[@class='b']//tr");

      HtmlNode node = document.DocumentNode.SelectSingleNode("//*");
      if (node != null)
      {
          DumpNodeInfo(node);
      }

      if (body != null)
      {
          // A path starting with "//" is evaluated from the document root,
          // even when called on body.
          HtmlNodeCollection divs = body.SelectNodes("//div");
          if (divs != null)
          {
              foreach (HtmlAgilityPack.HtmlNode div in divs)
              {
                  var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;
                  if (classValue == "first")
                  {
                      //write innerText into a table at place [i][column1]
                  }
                  else if (classValue == "second")
                  {
                      //write innerText into the same table in [i][column2]
                  }
              }

              // FirstOrDefault instead of Single: the page may contain no div
              // (or several) with class "first".
              HtmlNode firstDiv = divs.FirstOrDefault(n => n.Attributes.Any(a => a.Name == "class" && a.Value == "first"));
              string innerText1 = firstDiv == null ? null : firstDiv.InnerText;
          }
      }
  }

  /// <summary>
  /// Writes structural details of <paramref name="node"/> to the console:
  /// ancestors, attributes, children, position information and, when the node
  /// has a div child, that child's siblings and parent.
  /// </summary>
  /// <param name="node">Node to describe; must not be null.</param>
  private static void DumpNodeInfo(HtmlNode node)
  {
      // All ancestors of the node.
      IEnumerable<HtmlNode> nodeList = node.Ancestors();
      foreach (HtmlNode item in nodeList)
      {
          Console.Write(item.Name + " ");
      }

      HtmlAttributeCollection attrs = node.Attributes;
      foreach (var item in attrs)
      {
          Console.WriteLine(item.Name + " : " + item.Value);
      }

      // All child nodes; text nodes count as children too.
      HtmlNodeCollection CNodes = node.ChildNodes;
      foreach (HtmlNode item in CNodes)
      {
          Console.WriteLine(item.Name + "-" + item.InnerText);
      }

      // Attributes written on the closing tag, e.g. </ul class="">.
      HtmlAttributeCollection attrs1 = node.ClosingAttributes;
      Console.WriteLine(attrs1.Count);

      // First and last child are often whitespace text nodes, not elements.
      HtmlNode node1 = node.FirstChild;
      if (node1 != null)
      {
          Console.WriteLine(node1.NodeType);
      }
      HtmlNode node3 = node.LastChild;
      if (node3 != null)
      {
          Console.WriteLine(node3.NodeType);
      }

      // First div child of the node, if any; XPath is generated from the node.
      HtmlNode node2 = node.SelectSingleNode("child::div[1]");
      if (node2 != null)
      {
          Console.WriteLine(node2.XPath);
      }

      Console.WriteLine(node.HasAttributes);        // does the node have attributes
      Console.WriteLine(node.HasChildNodes);        // does the node have children
      Console.WriteLine(node.HasClosingAttributes); // does the closing tag have attributes
      Console.WriteLine(node.Line);                 // line of the start tag in the source
      Console.WriteLine(node.LinePosition);         // column of the start tag
      Console.WriteLine(node.NodeType);
      Console.WriteLine(node.OriginalName);

      HtmlNode node4 = node.SelectSingleNode("child::div[1]");
      if (node4 != null)
      {
          Console.WriteLine(node4.InnerText);

          // Two hops to reach the next element: the direct sibling is usually
          // the line-break text node between the tags.
          HtmlNode node5 = node4.NextSibling == null ? null : node4.NextSibling.NextSibling;
          if (node5 != null)
          {
              Console.WriteLine(node5.InnerText);

              // Two hops back for the same reason.
              HtmlNode node6 = node5.PreviousSibling.PreviousSibling;
              Console.WriteLine(node6.InnerText);

              HtmlNode node7 = node6.ParentNode;
              Console.WriteLine(node7.Name);
          }
      }

      string str = node.OuterHtml;
      Console.WriteLine(str);
      // Position of the node in the stream, relative to the start of the document.
      Console.WriteLine(node.StreamPosition);

      HtmlAgilityPack.HtmlDocument doc1 = node.OwnerDocument;
  }

  

csharp: using HtmlAgilityPack and ScrapySharp reading Url find text的更多相关文章

  1. 爬虫技术 -- 进阶学习(十)网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp)

    最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决定试一试~ 于是到https://www.nuget.org/packages/Scrapy ...

  2. HtmlAgilityPack搭配 ScrapySharp或HtmlAgilityPack.CssSelectors

    Html Agility Pack 源码中的类大概有28个左右,其实不算一个很复杂的类库,但它的功能确不弱,为解析DOM已经提供了足够强大的功能支持,可以跟jQuery操作DOM媲 美:)Html A ...

  3. 网易新闻页面信息抓取 -- htmlagilitypack搭配scrapysharp

    最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决定试一试~ 于是到https://www.nuget.org/packages/Scrapy ...

  4. 网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp)

    转自原文 网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp) 最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决 ...

  5. C#中的解析HTML组件 -- (HtmlAgilityPack,Jumony,ScrapySharp,NSoup,Fizzler)

    做数据抓取,网络爬虫方面的开发,自然少不了解析HTML源码的操作.那么问题来了,到底.NET如何来解析HTML,有哪些解析HTML源码的好用的,有效的组件呢?   作者在开始做这方面开发的时候就被这些 ...

  6. 使用HtmlAgilityPack和ScrapySharp抓取网页数据遇到的几个问题解决方法——格式编码问题

    需要用到对应市区县街道居委会的区域编码,于是找到统计局的网页,对这些数据进行抓取,用到了HtmlAgilityPack和ScrapySharp,由于也是第一次从网页抓取数据,所以对于HtmlAgili ...

  7. C#+HtmlAgilityPack+XPath带你采集数据(以采集天气数据为例子)

    第一次接触HtmlAgilityPack是在5年前,一些意外,让我从技术部门临时调到销售部门,负责建立一些流程和寻找潜在客户,最后在阿里巴巴找到了很多客户信息,非常全面,刚开始是手动复制到Excel, ...

  8. Net处理html页面元素工具类(HtmlAgilityPack.dll)的使用

    现在,在不少应用场合中都希望做到数据抓取,特别是基于网页部分的抓取.其实网页抓取的过程实际上是通过编程的方法,去抓取不同网站网页后,再进行 分析筛选的过程.比如,有的比较购物网站,会同时去抓取不同购物 ...

  9. C#:使用HtmlAgilityPack解析Html

    推荐阅读: HtmlAgilityPack 入门教程1 HtmlAgilityPack入门教程2 向HtmlAgilityPack道歉:解析HTML还是你好用 获取html中meta标签中的conte ...

随机推荐

  1. 批量创建SQL Server分区文件

    ) declare @i int set @table = 'v3_yqsd_report' begin exec('alter database '+@table+' add filegroup O ...

  2. Spring初始化 Map 和 解析Json value

    单独定义Map数据结构的bean: <bean id= "expToLevelMap" class="org.springframework.beans.facto ...

  3. win32控制台实现按任意键退出的功能

    win7之后的五win32 控制台出现了程序运行完之后就立即结束的问题,程序员根本无法看输出的结果.未来让控制台运行完之后能够等待程序员的操作.可以使用: system("PAUSE&quo ...

  4. IOS反地理编码取得城市名称

    // 获取当前所在的城市名 CLGeocoder *reverseGeocoder=[[CLGeocoder alloc] init]; [reverseGeocoder reverseGeocode ...

  5. 通过设置cookie实现单点登录

    最近要做个登录一个客户端跳转到另一个网站不用再登录,有两种方法,第一种就是写接口通过客户端传值账号直接到目标网站,另一种是写入cookie到目标网站.由于目标网站之前就是通过cookie实现单点登录, ...

  6. [刷题]算法竞赛入门经典(第2版) 5-14/UVa1598 - Exchange

    题意:模拟买卖,当出售价bid等于或低于出售价ask,则交易. 代码:(Accepted,0.330s) //UVa1598 - Exchange //Accepted 0.330s //#defin ...

  7. 20160225.CCPP体系详解(0035天)

    程序片段(01):CircleList.h+CircleList.c+main.c 内容概要:环形链表 ///CircleList.h #pragma once #include <stdio. ...

  8. SSH网上商城---使用ajax完成用户名是否存在异步校验

    小伙伴在上网的时候,需要下载或者观看某些视频资料,更或者是在逛淘宝的时候,我们都需要注册一个用户,当我们填写好各种信息,点击确定的时候,提示用户名已经存在,小编就想,为什么当我们填写完用户名的时候,她 ...

  9. lambda表达式初步

    // Lambda_test20140801.cpp : 定义控制台应用程序的入口点. // #include "stdafx.h" #include <algorithm& ...

  10. c++基础学习

    1.输入输出函数(cout,cin) #include<iostream> int main() { using namespace std; cout<<"Come ...