Or is it that my scraping rules are incorrect?
Also, the Chinese text that comes out of the scrape is garbled.
What could the problem be?
24 replies
#1
Incomplete? That sounds like a problem with your scraping rules.
Garbled text? That's an encoding issue.
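For reference, a minimal sketch of picking the decoding charset automatically rather than hard-coding it; this is an illustration built on assumptions (the CharsetHelper class and DecodeWithDetectedCharset method are made up, not code from this thread). It downloads the raw bytes first, looks for a charset in the Content-Type response header, then in a <meta> tag, and falls back to GB2312:

using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

static class CharsetHelper
{
    // Hypothetical helper: fetch raw bytes, then decode with whatever charset
    // the server or page declares, falling back to GB2312 if nothing is found.
    public static string DecodeWithDetectedCharset(string url)
    {
        using (WebClient wc = new WebClient())
        {
            byte[] data = wc.DownloadData(url);

            // 1. Try the HTTP Content-Type header, e.g. "text/html; charset=utf-8"
            string contentType = wc.ResponseHeaders[HttpResponseHeader.ContentType] ?? "";
            Match m = Regex.Match(contentType, @"charset=([\w-]+)", RegexOptions.IgnoreCase);

            // 2. Otherwise look for charset=... in a provisional ASCII decode of the page
            if (!m.Success)
            {
                string probe = Encoding.ASCII.GetString(data);
                m = Regex.Match(probe, @"charset\s*=\s*[""']?([\w-]+)", RegexOptions.IgnoreCase);
            }

            Encoding enc;
            try { enc = m.Success ? Encoding.GetEncoding(m.Groups[1].Value) : Encoding.GetEncoding("GB2312"); }
            catch (ArgumentException) { enc = Encoding.GetEncoding("GB2312"); }

            return enc.GetString(data); // decode the bytes only once the charset is known
        }
    }
}

With something like this the encoding no longer has to be hard-coded for each site.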
#2
Where's the code?
#3
If you apply any filtering or regular expressions while scraping, you should check whether the filter or regex is actually correct.
#4
// 1. Fetch the page's plain HTML text
HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create("http://community.csdn.net/");
myReq.Headers["Accept-Language"] = "zh-cn"; // Accept-Language is a header, not the Accept property
myReq.Referer = "http://quote.morningstar.com/fund/f.aspx?Country=USA&Symbol=AGDA";
myReq.MaximumAutomaticRedirections = 1;
myReq.AllowAutoRedirect = true;
HttpWebResponse myres = (HttpWebResponse)myReq.GetResponse();
Stream resStream = myres.GetResponseStream();
StreamReader sr = new StreamReader(resStream, System.Text.Encoding.GetEncoding("GB2312"));
System.Text.StringBuilder sb = new System.Text.StringBuilder();
char[] buff = new char[20000];
int c = 0; // number of characters actually read
while ((c = sr.Read(buff, 0, buff.Length)) > 0) // read the page chunk by chunk
{
    sb.Append(buff, 0, c); // keep every chunk, not just the last one, so long pages stay complete
    Response.Write(buff, 0, c);
}
// 2. Strip all HTML tags and JavaScript
// Htmlstring is the full HTML obtained above
string Htmlstring = sb.ToString();
// strip scripts
Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
// strip HTML tags
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
// added 2006-12-30
Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "", RegexOptions.IgnoreCase); // strip any remaining tags
Htmlstring = Regex.Replace(Htmlstring, @"<script.*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<td.*?>", "", RegexOptions.IgnoreCase); // strip <td>
Htmlstring = Regex.Replace(Htmlstring, @"</td>", "", RegexOptions.IgnoreCase); // strip </td>
Htmlstring = Regex.Replace(Htmlstring, @"<div.*?>", "", RegexOptions.IgnoreCase); // strip <div>
Htmlstring = Regex.Replace(Htmlstring, @"</div>", "", RegexOptions.IgnoreCase); // strip </div>
Htmlstring = Htmlstring.Replace("<", ""); // String.Replace returns a new string, so the result must be assigned
Htmlstring = Htmlstring.Replace(">", "");
Htmlstring = Htmlstring.Replace("\r\n", "");
Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
string clearScriptPattern = @"<script[^>]*>(.|\n)*?</script>"; // (.|\n)*? in the middle is a non-greedy match
string clearStylePattern = @"<style[^>]*>(.|\n)*?</style>";
string clearHtmlPattern = @"<[^>]*>";
string clearSpacePattern = @"&nbsp;|\s"; // collapse entities and whitespace into single spaces
RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled;
string parseResult = Regex.Replace(Htmlstring, clearScriptPattern, "", options);
parseResult = Regex.Replace(parseResult, clearStylePattern, "", options);
parseResult = Regex.Replace(parseResult, clearHtmlPattern, "", options);
parseResult = Regex.Replace(parseResult, clearSpacePattern, " ", options);
// 3. Extract the page's links, including href, frame and iframe
//Get_url_Array(userinput, webtext);
// 4. Extract the page title and similar tags (other tags work the same way; the regex is analogous)
string url = "http://www.cftea.com/images/logo.gif"; // note: should point at an HTML page for the charset/title parsing below
WebClient wc = new WebClient();
wc.Credentials = CredentialCache.DefaultCredentials;
Byte[] pages = wc.DownloadData(url);
string pageHtml = Encoding.GetEncoding("GB2312").GetString(pages);
Match charSetMatch = Regex.Match(pageHtml, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
string getcode = charSetMatch.Groups[2].Value;
pageHtml = Encoding.GetEncoding(getcode).GetString(pages); // re-decode with the charset the page declares
Match title = Regex.Match(pageHtml, "<title>(.*)</title>");
this.NE_Title.Text = title.Groups[1].Value; // page title
string metaRegex = "<meta" + @"\s+" + "name=\"description\"" + @"\s+" + "content=\"(?<content>[^\"" + @"\<\>" + "]*)\"";
Match description = Regex.Match(pageHtml, metaRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
Label1.Text = description.Groups["content"].Value; // page description
// 5. Simple form submission and cookie persistence can also be implemented
#5
My code does convert the encoding, but it doesn't seem to make any difference?
#6
static class WebFunc
{
private static CookieContainer cookie = new CookieContainer();
private static string contentType = "application/x-www-form-urlencoded";
private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
public static string GetHtmlEx(string url, Encoding encoding)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.UserAgent = userAgent;
request.ContentType = contentType;
request.CookieContainer = cookie;
request.Accept = accept;
request.Method = "get";
WebResponse response = request.GetResponse();
Stream responseStream = response.GetResponseStream();
StreamReader reader = new StreamReader(responseStream, encoding);
String html = reader.ReadToEnd();
response.Close();
return html;
}
}
// usage: pass the target URL and that site's encoding
string html = WebFunc.GetHtmlEx("url", Encoding.UTF8);
#7
Your version doesn't come out garbled, but every site uses a different encoding. How can I make one piece of code work for every site?
#8
Also, the scraped HTML is incomplete; only the beginning of the page comes back.
#9
Your version doesn't come out garbled, but every site uses a different encoding. How can I make one piece of code work for every site?
==
You can't, really. Keep an XML file that stores each site's configuration (including its encoding) and read that file when scraping.
For example:
<Media>
<ClassName>xxx</ClassName>
<Name>xxx</Name>
<Site>http://www.xxx.com</Site>
<Encoding>UTF-8</Encoding>
<Enable>true</Enable>
<IsGatherAllData>false</IsGatherAllData>
<GatherPageCount>10</GatherPageCount>
<DelayMilliSeconds>0</DelayMilliSeconds>
</Media>
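A minimal sketch of reading that kind of config; the MediaConfig class and the file name "media.config.xml" are made-up illustrations, assuming one <Media> element per site as shown above:

using System.Text;
using System.Xml;

// Hypothetical holder for one <Media> entry; property names mirror the XML above.
class MediaConfig
{
    public string Name;
    public string Site;
    public Encoding Encoding;
    public bool Enable;

    // Load a single <Media> element from the config file.
    public static MediaConfig Load(string path)
    {
        XmlDocument doc = new XmlDocument();
        doc.Load(path);
        XmlNode media = doc.SelectSingleNode("//Media");

        MediaConfig cfg = new MediaConfig();
        cfg.Name = media.SelectSingleNode("Name").InnerText;
        cfg.Site = media.SelectSingleNode("Site").InnerText;
        cfg.Encoding = Encoding.GetEncoding(media.SelectSingleNode("Encoding").InnerText);
        cfg.Enable = bool.Parse(media.SelectSingleNode("Enable").InnerText);
        return cfg;
    }
}

// Usage: pick the right encoding per site before calling GetHtmlEx from reply #6.
// MediaConfig cfg = MediaConfig.Load("media.config.xml");
// string html = WebFunc.GetHtmlEx(cfg.Site, cfg.Encoding);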
#10
OK, thanks. So what would cause the scraped page to come back incomplete?
#11
#12
I can now scrape the complete HTML. How should step 5 from the list above be implemented?
#13
Just passing by.
#14
Does nobody have code for step 5 of reply #4?
#15
Could someone please help and post some code? Much appreciated!
#16
OP, what I posted is exactly what 高歌 wrote, and now the original author has shown up to explain it to you.
#17
Haha, you're really good. I still have a lot to learn!
#18
#19
What? What do you mean?!
#20
How strange, where did you get my code from?
#21
You once replied to a regex question, which is extremely, extremely rare for you...
#22
// 5. Simple form submission and cookie persistence can also be implemented
When simulating a form submission, set request.Method = "post";
and reuse the cookies you obtained earlier (the same CookieContainer).
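A minimal sketch of that idea, written as an extra method for the WebFunc class from reply #6 (so the same using directives and the static cookie field are assumed; the method name PostForm and its parameters are illustrative, not the original poster's code):

// Rough sketch: POST form data and keep cookies in the same CookieContainer
// used for the earlier GET, so the session carries over between requests.
public static string PostForm(string url, string postData, Encoding encoding)
{
    byte[] body = encoding.GetBytes(postData); // e.g. "user=xxx&pwd=yyy"

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = body.Length;
    request.CookieContainer = cookie; // the static cookie field of WebFunc

    using (Stream reqStream = request.GetRequestStream())
    {
        reqStream.Write(body, 0, body.Length);
    }

    using (WebResponse response = request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
    {
        return reader.ReadToEnd(); // any Set-Cookie headers land in the container automatically
    }
}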
#23
Haha, it seems so.
Since neither you nor 过客 had replied, I went ahead and humbly posted my attempt.
#24
Just passing by.