1. 代表任意多个字符:(.*?)
2. 匹配网页中的任意 HTML 标签(例如 <body>、</body>);把匹配到的标签替换为空以后,网页源码就只剩纯文本:<[^>]*>
3. 代表网页中的空格:&nbsp;
4. 四位0~9的数字,一般用来匹配验证码:[0-9]{4}
5. 手机号正则式:\d{11}
6. 密码正则式:[.*|\s*]\w{6,16}$
7. 替换掉所有的换行空格等字符
8. htmltext = htmltext.Replace("\t", "").Replace("\r", "").Replace("\n", "").Replace(" ", "");
9. htmltext = Regex.Replace(htmltext, "<[^>]*>", "");
10. \d:匹配一个任意数字(0~9);\d*:匹配任意多个(包括零个)数字
// Load the page source into an HtmlDocument.
HtmlDocument parsedPage = new HtmlDocument();
parsedPage.LoadHtml(htmltext);

// Text content of the whole document.
string htmldoctext = parsedPage.DocumentNode.InnerText;

// Inner HTML of the element whose id is "busi_success".
HtmlNode node = parsedPage.GetElementbyId("busi_success");
string divtext = node.InnerHtml;

// Nodes selected by the XPath //table//tr//td.
HtmlNodeCollection nodes = parsedPage.DocumentNode.SelectNodes("//table//tr//td");

// Add a reference to the Microsoft.mshtml assembly, then write the following code:
IHTMLDocument2 doc = new HTMLDocumentClass();
doc.write(new object[] { pageSource });
doc.close();
Title = doc.title;
Body = doc.body.innerText;

// Helper signatures listed with this snippet:
//   public ArrayList GetMathList(string htmltext, string pattern)
//   public string GetFirstMathString(string htmltext, string pattern, bool isCut)
//   public string GetFirstMathString(string htmltext, string pattern)
/// <summary>
/// Runs a regular expression over the page text and collects the text of every match.
/// </summary>
/// <param name="htmltext">Page content to search.</param>
/// <param name="pattern">Regular expression pattern to apply.</param>
/// <returns>
/// An <see cref="ArrayList"/> holding the matched strings in order of appearance
/// (empty when nothing matches), or <c>null</c> when an argument is null or the
/// pattern is not a valid regular expression.
/// </returns>
public ArrayList GetMathList(string htmltext, string pattern)
{
    ArrayList list = new ArrayList();
    try
    {
        // Use the caller-supplied pattern. The previous version ignored the
        // parameter and always matched a hard-coded "(.*?)".
        // Singleline lets '.' span line breaks; IgnoreCase makes matching case-insensitive.
        Regex regex = new Regex(pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);

        foreach (Match match in regex.Matches(htmltext))
        {
            list.Add(match.Value);
        }
    }
    catch (ArgumentException)
    {
        // Null input or malformed pattern: keep the existing contract of returning null.
        return null;
    }

    return list;
}
// Single-match mode.
Regex singleRegex = new Regex("(.*?)", RegexOptions.Singleline);
// htmltext is the source string to match against.
Match firstMatch = singleRegex.Match(htmltext);
// On a successful match, take the result.
if (firstMatch.Success)
{
    string matchedText = firstMatch.ToString();
    // Replace the non-target parts of the matched text.
    string trimmedText = Regex.Replace(matchedText, "(.*?)", "");
    // rs is the target string.
    rs = Regex.Replace(trimmedText, "(.*?)", "");
}

// Multi-match mode.
// (RegexOptions.Multiline is the alternative multi-line matching mode.)
Regex multiRegex = new Regex("(.*?)", RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Find every match in the input string.
MatchCollection allMatches = multiRegex.Matches(groupcodeHtml);
foreach (Match current in allMatches)
{
    // Handle each matched item in turn.
    string groupcode = current.Value.ToString();
    // Processing for the matched item goes here.
}
/// <summary>
/// Single-match helper: finds the first match of <paramref name="pattern"/> in
/// <paramref name="htmltext"/> and returns the extracted text.
/// </summary>
/// <remarks>
/// When the pattern contains the wildcard group <c>(.*?)</c>, the text captured by the
/// pattern's first capturing group is returned, so the surrounding prefix and suffix are
/// dropped. Otherwise the whole match is returned. Matching uses
/// <see cref="RegexOptions.Singleline"/> and <see cref="RegexOptions.IgnoreCase"/>.
/// </remarks>
/// <param name="htmltext">Page content to search.</param>
/// <param name="pattern">Regular expression pattern.</param>
/// <param name="result">
/// Receives the extracted text, or an empty string when nothing was extracted.
/// </param>
/// <returns><c>true</c> when a non-empty string was extracted; otherwise <c>false</c>.</returns>
public static bool GetMatchStr(string htmltext, string pattern, out string result)
{
    result = "";

    // Nothing can be extracted from missing input or with a missing pattern.
    if (string.IsNullOrEmpty(htmltext) || string.IsNullOrEmpty(pattern))
    {
        return false;
    }

    try
    {
        Match match = Regex.Match(htmltext, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            return false;
        }

        // Read the capture group instead of stripping the literal prefix/suffix from the
        // match: literal stripping was case-sensitive (unlike the match itself), broke on
        // escaped metacharacters, and also removed occurrences inside the captured text.
        bool hasWildcardGroup = pattern.Contains("(.*?)");
        if (hasWildcardGroup && match.Groups.Count > 1 && match.Groups[1].Success)
        {
            result = match.Groups[1].Value;
        }
        else
        {
            result = match.Value;
        }
    }
    catch (ArgumentException)
    {
        // The pattern is not a valid regular expression.
        result = "";
        return false;
    }

    return !string.IsNullOrEmpty(result);
}
字符 描述 \