在项目中会经常用正则表达式过滤html,比如得到Body里面的内容,获取网页中的img,a标签,或者得到纯文本等等。
下面的Demo 实现对Html的过滤
主要用到的类:
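1、System.Text.RegularExpressions; // 正则表达式
2、System.IO; // IO流(StreamReader)
3、System.Net; // 网络请求(HttpWebRequest / HttpWebResponse)
4、System.Text; // 字符编码(Encoding)与 StringBuilder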
第一步:搭建简易前台页面
<%--
    Demo page: a URL input, a multi-line result box and one button per filter.
    The control IDs (TextBox1, tbResult) and the OnClick handler names below are
    the exact members the code-behind declares; handler names are case-sensitive.
--%>
<form id="form1" runat="server">
    <div>
        目标源地址:<asp:TextBox ID="TextBox1" runat="server"></asp:TextBox>
    </div>
    <br />
    <asp:TextBox runat="server" TextMode="MultiLine" Width="500px" Height="500px" ID="tbResult"></asp:TextBox>
    <br />
    <asp:Button ID="btnRetrieveAll" runat="server" Text="搜索整个Html源码" OnClick="btnRetrieveAll_Click" />
    <asp:Button ID="btnRetrievePureTxt" runat="server" Text="搜索纯文本" OnClick="btnRetrievePureText_Click" />
    <asp:Button ID="btnRetrieveLink" runat="server" Text="搜索链接标签" OnClick="btnRetrievelink_Click" />
    <asp:Button ID="btnRetrieveImg" runat="server" Text="搜索图片标签" OnClick="btnRetrieveImage_Click" />
    <asp:Button ID="btnRetriveScript" runat="server" Text="搜索脚本" OnClick="btnRetrieveSriptCode_Click" />
</form>
第二步:定义类级变量
// Message shown in the result box whenever the target page could not be downloaded.
private const string MsgPageRetrieveFailed = "对不起,网页运行失败!";

// Address of the page to download; assigned by the click handlers.
private string strUrl = "";

// Text returned by the most recent GetWholeHtmlCode call.
private string strWholeHtml = "";

// Starts as true; GetWholeHtmlCode sets it to false when a download fails.
private bool flgPageRetrieved = true;
第三步:根据目标源取目标html源码
/// <summary>
/// Downloads the resource at <paramref name="url"/> using HttpWebRequest /
/// HttpWebResponse and returns the response body decoded as UTF-8.
/// </summary>
/// <remarks>
/// When the download fails (any exception, or a status code other than 200 OK)
/// the <c>flgPageRetrieved</c> field is set to <c>false</c> and an error message
/// is returned in place of the page content.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded response body, or an error message on failure.</returns>
public string GetWholeHtmlCode(string url)
{
    // The flag describes the outcome of this call, so clear any earlier failure first.
    flgPageRetrieved = true;

    try
    {
        // Use the argument, not the strUrl field, so the method fetches what the caller asked for.
        HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
        wrqContent.Timeout = 300000; // milliseconds

        // using blocks release the response and reader on every exit path.
        using (HttpWebResponse wrpContent = (HttpWebResponse)wrqContent.GetResponse())
        {
            if (wrpContent.StatusCode != HttpStatusCode.OK)
            {
                // Return here so the failure message is not replaced by the response body.
                flgPageRetrieved = false;
                return "对不起,网页运行失败";
            }

            using (StreamReader strReader = new StreamReader(wrpContent.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return strReader.ReadToEnd();
            }
        }
    }
    catch (Exception e)
    {
        // Invalid URLs, timeouts and protocol errors all end up here; the caller
        // detects the failure through flgPageRetrieved.
        flgPageRetrieved = false;
        return e.Message;
    }
}
目标URL源html码
/// <summary>
/// Click handler that downloads the page whose address is in TextBox1 and shows
/// its complete HTML source in tbResult, or the failure message if the download
/// did not succeed.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnRetrieveAll_Click(object sender, EventArgs e)
{
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    tbResult.Text = flgPageRetrieved ? strWholeHtml : MsgPageRetrieveFailed;
}
Html源纯文本
/// <summary>
/// Click handler that downloads the page whose address is in TextBox1, isolates
/// its &lt;body&gt; element and shows that element with every tag removed.
/// Only the first &lt;body&gt; element found is used; if none is found the
/// result box is left empty.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnRetrievePureText_Click(object sender, EventArgs e)
{
    // Take the address from the input box, as the other handlers do; without
    // this strUrl may still hold its initial empty value and the download fails.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // [\s\S] matches any character including newlines, like the former (\w|\W),
    // but without creating a capture for every character consumed.
    string strRegexBody = @"<body[^>]*>[\s\S]*?</body[^>]*>";
    string strRegexTag = @"<[^>]*>";

    Match matchBody = Regex.Match(strWholeHtml, strRegexBody, RegexOptions.IgnoreCase);

    // Value is the empty string when nothing matched, so no Success check is needed.
    tbResult.Text = Regex.Replace(matchBody.Value, strRegexTag, string.Empty);
}
获取脚本代码
/// <summary>
/// Click handler that downloads the page whose address is in TextBox1 and lists
/// the content of every &lt;script&gt; element, one entry per element, with the
/// surrounding tags removed.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnRetrieveSriptCode_Click(object sender, EventArgs e)
{
    // Take the address from the input box, as the other handlers do; without
    // this strUrl may still hold its initial empty value and the download fails.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // [\s\S] matches any character including newlines, like the former (\w|\W),
    // but without creating a capture for every character consumed.
    string strRegexScript = @"<script[^>]*>[\s\S]*?</script[^>]*>";
    string strRegexTag = @"<[^>]*>";

    MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase);
    StringBuilder strbScriptList = new StringBuilder();

    foreach (Match matchSingleScript in matchList)
    {
        // Remove the opening and closing tags (and anything else tag-shaped inside).
        string strSingleScriptText = Regex.Replace(matchSingleScript.Value, strRegexTag, string.Empty);
        strbScriptList.Append(strSingleScriptText).Append("\r\n");
    }

    tbResult.Text = strbScriptList.ToString();
}
获取图片img
/// <summary>
/// Click handler that downloads the page whose address is in TextBox1 and lists
/// every &lt;img&gt; tag found in its HTML, one tag per line.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnRetrieveImage_Click(object sender, EventArgs e)
{
    // Take the address from the input box, as the other handlers do; without
    // this strUrl may still hold its initial empty value and the download fails.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // (?is): case-insensitive, and '.' also matches newlines so tags that span
    // several lines are found.
    string strRegexImg = @"(?is)<img.*?>";
    MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexImg, RegexOptions.IgnoreCase);

    StringBuilder strbImageList = new StringBuilder();
    foreach (Match matchSingleImage in matchList)
    {
        strbImageList.Append(matchSingleImage.Value).Append("\r\n");
    }

    tbResult.Text = strbImageList.ToString();
}
html链接
/// <summary>
/// Click handler that downloads the page whose address is in TextBox1 and lists
/// the opening tag of every &lt;a&gt; element that carries attributes, one tag
/// per line.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnRetrievelink_Click(object sender, EventArgs e)
{
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // \s instead of a literal space: the tag name may be followed by a tab or a
    // line break as well. Requiring whitespace keeps tags such as <abbr> or
    // <article> from matching.
    string strRegexLink = @"<a\s[^>]*>";
    MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexLink, RegexOptions.IgnoreCase);

    StringBuilder strbLinkList = new StringBuilder();
    foreach (Match matchSingleLink in matchList)
    {
        strbLinkList.Append(matchSingleLink.Value).Append("\r\n");
    }

    tbResult.Text = strbLinkList.ToString();
}
这个Demo能满足大多数的过滤Html 需求。