Or is it that my scraping rules are incorrect?
Also, the Chinese text that comes out of the scrape is garbled.
What could the problem be?
24 replies
#1
Incomplete? That sounds like a problem with your scraping rules.
Garbled text? That's an encoding issue.
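For reference, a minimal sketch of picking the decoding charset automatically rather than hard-coding it; this is an illustration built on assumptions (the CharsetHelper class and DecodeWithDetectedCharset method are made up, not code from this thread). It downloads the raw bytes first, looks for a charset in the Content-Type response header, then in a <meta> tag, and falls back to GB2312:

using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

static class CharsetHelper
{
    // Hypothetical helper: fetch raw bytes, then decode with whatever charset
    // the server or page declares, falling back to GB2312 if nothing is found.
    public static string DecodeWithDetectedCharset(string url)
    {
        using (WebClient wc = new WebClient())
        {
            byte[] data = wc.DownloadData(url);

            // 1. Try the HTTP Content-Type header, e.g. "text/html; charset=utf-8"
            string contentType = wc.ResponseHeaders[HttpResponseHeader.ContentType] ?? "";
            Match m = Regex.Match(contentType, @"charset=([\w-]+)", RegexOptions.IgnoreCase);

            // 2. Otherwise look for charset=... in a provisional ASCII decode of the page
            if (!m.Success)
            {
                string probe = Encoding.ASCII.GetString(data);
                m = Regex.Match(probe, @"charset\s*=\s*[""']?([\w-]+)", RegexOptions.IgnoreCase);
            }

            Encoding enc;
            try { enc = m.Success ? Encoding.GetEncoding(m.Groups[1].Value) : Encoding.GetEncoding("GB2312"); }
            catch (ArgumentException) { enc = Encoding.GetEncoding("GB2312"); }

            return enc.GetString(data); // decode the bytes only once the charset is known
        }
    }
}

With something like this the encoding no longer has to be hard-coded for each site.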
#2
Where's the code?
#3
If you apply any filtering or regular expressions while scraping, you should check whether the filter or regex is actually correct.
#4
// 1. Fetch the page's plain HTML text
HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create("http://community.csdn.net/");
myReq.Headers["Accept-Language"] = "zh-cn"; // Accept-Language is a header, not the Accept property
myReq.Referer = "http://quote.morningstar.com/fund/f.aspx?Country=USA&Symbol=AGDA";
myReq.MaximumAutomaticRedirections = 1;
myReq.AllowAutoRedirect = true;
HttpWebResponse myres = (HttpWebResponse)myReq.GetResponse();
Stream resStream = myres.GetResponseStream();
StreamReader sr = new StreamReader(resStream, System.Text.Encoding.GetEncoding("GB2312"));
System.Text.StringBuilder sb = new System.Text.StringBuilder();
char[] buff = new char[20000];
int c = 0; // number of characters actually read
while ((c = sr.Read(buff, 0, buff.Length)) > 0) // read the page chunk by chunk
{
    sb.Append(buff, 0, c); // keep every chunk, not just the last one, so long pages stay complete
    Response.Write(buff, 0, c);
}
// 2. Strip all HTML tags and JavaScript
// Htmlstring is the full HTML obtained above
string Htmlstring = sb.ToString();
// strip scripts
Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
// strip HTML tags
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
// added 2006-12-30
Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "", RegexOptions.IgnoreCase); // strip any remaining tags
Htmlstring = Regex.Replace(Htmlstring, @"<script.*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<td.*?>", "", RegexOptions.IgnoreCase); // strip <td>
Htmlstring = Regex.Replace(Htmlstring, @"</td>", "", RegexOptions.IgnoreCase); // strip </td>
Htmlstring = Regex.Replace(Htmlstring, @"<div.*?>", "", RegexOptions.IgnoreCase); // strip <div>
Htmlstring = Regex.Replace(Htmlstring, @"</div>", "", RegexOptions.IgnoreCase); // strip </div>
Htmlstring = Htmlstring.Replace("<", ""); // String.Replace returns a new string, so the result must be assigned
Htmlstring = Htmlstring.Replace(">", "");
Htmlstring = Htmlstring.Replace("\r\n", "");
Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
string clearScriptPattern = @"<script[^>]*>(.|\n)*?</script>"; // (.|\n)*? in the middle is a non-greedy match
string clearStylePattern = @"<style[^>]*>(.|\n)*?</style>";
string clearHtmlPattern = @"<[^>]*>";
string clearSpacePattern = @"&nbsp;|\s"; // collapse entities and whitespace into single spaces
RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled;
string parseResult = Regex.Replace(Htmlstring, clearScriptPattern, "", options);
parseResult = Regex.Replace(parseResult, clearStylePattern, "", options);
parseResult = Regex.Replace(parseResult, clearHtmlPattern, "", options);
parseResult = Regex.Replace(parseResult, clearSpacePattern, " ", options);
// 3. Extract the page's links, including href, frame and iframe
//Get_url_Array(userinput, webtext);
// 4. Extract the page title and similar tags (other tags work the same way; the regex is analogous)
string url = "http://www.cftea.com/images/logo.gif"; // note: should point at an HTML page for the charset/title parsing below
WebClient wc = new WebClient();
wc.Credentials = CredentialCache.DefaultCredentials;
Byte[] pages = wc.DownloadData(url);
string pageHtml = Encoding.GetEncoding("GB2312").GetString(pages);
Match charSetMatch = Regex.Match(pageHtml, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
string getcode = charSetMatch.Groups[2].Value;
pageHtml = Encoding.GetEncoding(getcode).GetString(pages); // re-decode with the charset the page declares
Match title = Regex.Match(pageHtml, "<title>(.*)</title>");
this.NE_Title.Text = title.Groups[1].Value; // page title
string metaRegex = "<meta" + @"\s+" + "name=\"description\"" + @"\s+" + "content=\"(?<content>[^\"" + @"\<\>" + "]*)\"";
Match description = Regex.Match(pageHtml, metaRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
Label1.Text = description.Groups["content"].Value; // page description
// 5. Simple form submission and cookie persistence can also be implemented
#5
My code does convert the encoding, but it doesn't seem to make any difference?
#6
static class WebFunc
{
private static CookieContainer cookie = new CookieContainer();
private static string contentType = "application/x-www-form-urlencoded";
private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
public static string GetHtmlEx(string url, Encoding encoding)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.UserAgent = userAgent;
request.ContentType = contentType;
request.CookieContainer = cookie;
request.Accept = accept;
request.Method = "get";
WebResponse response = request.GetResponse();
Stream responseStream = response.GetResponseStream();
StreamReader reader = new StreamReader(responseStream, encoding);
String html = reader.ReadToEnd();
response.Close();
return html;
}
}
// usage: pass the target URL and that site's encoding
string html = WebFunc.GetHtmlEx("url", Encoding.UTF8);
#7
Your version doesn't come out garbled, but every site uses a different encoding. How can I make one piece of code work for every site?
#8
Also, the scraped HTML is incomplete; only the beginning of the page comes back.
#9
Your version doesn't come out garbled, but every site uses a different encoding. How can I make one piece of code work for every site?
==
You can't, really. Keep an XML file that stores each site's configuration (including its encoding) and read that file when scraping.
For example:
<Media>
<ClassName>xxx</ClassName>
<Name>xxx</Name>
<Site>http://www.xxx.com</Site>
<Encoding>UTF-8</Encoding>
<Enable>true</Enable>
<IsGatherAllData>false</IsGatherAllData>
<GatherPageCount>10</GatherPageCount>
<DelayMilliSeconds>0</DelayMilliSeconds>
</Media>
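A minimal sketch of reading that kind of config; the MediaConfig class and the file name "media.config.xml" are made-up illustrations, assuming one <Media> element per site as shown above:

using System.Text;
using System.Xml;

// Hypothetical holder for one <Media> entry; property names mirror the XML above.
class MediaConfig
{
    public string Name;
    public string Site;
    public Encoding Encoding;
    public bool Enable;

    // Load a single <Media> element from the config file.
    public static MediaConfig Load(string path)
    {
        XmlDocument doc = new XmlDocument();
        doc.Load(path);
        XmlNode media = doc.SelectSingleNode("//Media");

        MediaConfig cfg = new MediaConfig();
        cfg.Name = media.SelectSingleNode("Name").InnerText;
        cfg.Site = media.SelectSingleNode("Site").InnerText;
        cfg.Encoding = Encoding.GetEncoding(media.SelectSingleNode("Encoding").InnerText);
        cfg.Enable = bool.Parse(media.SelectSingleNode("Enable").InnerText);
        return cfg;
    }
}

// Usage: pick the right encoding per site before calling GetHtmlEx from reply #6.
// MediaConfig cfg = MediaConfig.Load("media.config.xml");
// string html = WebFunc.GetHtmlEx(cfg.Site, cfg.Encoding);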
#10
OK, thanks. So what would cause the scraped page to come back incomplete?
#11
#12
I can now scrape the complete HTML. How should step 5 from the list above be implemented?
#13
Just passing by.
#14
Does nobody have code for step 5 of reply #4?
#15
Could someone please help and post some code? Much appreciated!
#16
OP, what I posted is exactly what 高歌 wrote, and now the original author has shown up to explain it to you.
#17
Haha, you're really good. I still have a lot to learn!
#18
#19
What? What do you mean?!
#20
How strange, where did you get my code from?
#21
You once replied to a regex question, which is extremely, extremely rare for you...
#22
// 5. Simple form submission and cookie persistence can also be implemented
When simulating a form submission, set request.Method = "post";
and reuse the cookies you obtained earlier (the same CookieContainer).
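A minimal sketch of that idea, written as an extra method for the WebFunc class from reply #6 (so the same using directives and the static cookie field are assumed; the method name PostForm and its parameters are illustrative, not the original poster's code):

// Rough sketch: POST form data and keep cookies in the same CookieContainer
// used for the earlier GET, so the session carries over between requests.
public static string PostForm(string url, string postData, Encoding encoding)
{
    byte[] body = encoding.GetBytes(postData); // e.g. "user=xxx&pwd=yyy"

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = body.Length;
    request.CookieContainer = cookie; // the static cookie field of WebFunc

    using (Stream reqStream = request.GetRequestStream())
    {
        reqStream.Write(body, 0, body.Length);
    }

    using (WebResponse response = request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
    {
        return reader.ReadToEnd(); // any Set-Cookie headers land in the container automatically
    }
}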
#23
Haha, it seems so.
Since neither you nor 过客 had replied, I went ahead and humbly posted my attempt.
#24
Just passing by.