一段多编码兼容的 C# 网页读取关键代码

时间:2023-01-10 20:54:24

/// <summary>
/// Matches a charset declaration inside a meta tag. Covers both the legacy
/// <c>content="text/html; charset=xxx"</c> form and the HTML5 <c>&lt;meta charset="xxx"&gt;</c>
/// form, with double, single or no quotes, case-insensitively.
/// </summary>
private static readonly Regex s_metaCharsetRegex = new Regex(
    @"<meta\s[^<>]*?charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Downloads a web page and returns its content decoded as text.
/// </summary>
/// <param name="url">URL of the web page.</param>
/// <param name="useProxy">True to use the proxy server from the program configuration (only when NetConfig.UseProxy is also set); false to leave the request's proxy settings untouched.</param>
/// <param name="streamEnc">Text encoding to use. When null, the encoding is looked up in the Content-Type response header and then (if <paramref name="readMetaData"/> is true) in the page's meta tags; Encoding.Default is used when neither yields a usable encoding.</param>
/// <param name="readMetaData">Whether to scan the page's meta tags for a charset declaration when the response header does not provide one. This may slightly lengthen processing time.</param>
/// <returns>The content of the web page, or null if any error occurred (the exception is passed to Debugger.LogException).</returns>
public static string GetPageContent(
    string url,
    bool useProxy,
    Encoding streamEnc,
    bool readMetaData)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // Route through the configured proxy only when both the global
        // configuration and the caller ask for it.
        if (NetConfig.UseProxy && useProxy)
        {
            WebProxy proxy = new WebProxy(NetConfig.ProxyServer, NetConfig.ProxyPort);
            proxy.BypassProxyOnLocal = true;
            proxy.UseDefaultCredentials = false;

            if (NetConfig.UseProxyCredential)
            {
                proxy.Credentials = new NetworkCredential(NetConfig.ProxyUser, NetConfig.ProxyPass);
            }

            request.Proxy = proxy;
        }

        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        {
            // Caller supplied the encoding: decode straight from the network stream.
            if (streamEnc != null)
            {
                using (StreamReader reader = new StreamReader(stream, streamEnc))
                {
                    return reader.ReadToEnd();
                }
            }

            // The encoding is unknown, so buffer the raw bytes first; they may
            // have to be inspected (meta tags) before the final decode.
            byte[] content = ReadResponseBytes(stream);

            // 1) Content-Type response header.
            Encoding detected = TryResolveEncoding(
                ExtractCharsetFromContentType(response.Headers[HttpResponseHeader.ContentType]));

            // 2) Meta tags. The bytes are decoded provisionally with Encoding.Default
            //    purely so the markup can be searched for a charset declaration.
            if (detected == null && readMetaData)
            {
                string provisional = Encoding.Default.GetString(content);
                Match match = s_metaCharsetRegex.Match(provisional);
                if (match.Success)
                {
                    detected = TryResolveEncoding(match.Groups["charset"].Value);
                }
            }

            // 3) Fall back to Encoding.Default when nothing usable was found.
            return DecodePageBytes(content, detected ?? Encoding.Default);
        }
    }
    catch (Exception ex)
    {
        Debugger.LogException(ex);
        return null;
    }
}

/// <summary>
/// Reads <paramref name="stream"/> to its end and returns everything that was read.
/// </summary>
private static byte[] ReadResponseBytes(Stream stream)
{
    using (MemoryStream collected = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
        {
            collected.Write(chunk, 0, read);
        }

        return collected.ToArray();
    }
}

/// <summary>
/// Extracts the charset parameter value from a Content-Type header value,
/// dropping any following parameters and surrounding quotes.
/// Returns null when the header is missing or carries no charset.
/// </summary>
private static string ExtractCharsetFromContentType(string contentType)
{
    const string Key = "charset=";

    if (string.IsNullOrEmpty(contentType))
    {
        return null;
    }

    int start = contentType.IndexOf(Key, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return null;
    }

    string value = contentType.Substring(start + Key.Length);

    // Parameters are ';'-separated, so anything after the next ';' belongs
    // to a different parameter.
    int end = value.IndexOf(';');
    if (end >= 0)
    {
        value = value.Substring(0, end);
    }

    // The value may be quoted, e.g. charset="utf-8".
    value = value.Trim().Trim('"', '\'').Trim();
    return value.Length == 0 ? null : value;
}

/// <summary>
/// Maps a charset name to an Encoding. Returns null when the name is empty
/// or not recognised, so the caller can try the next detection source.
/// </summary>
private static Encoding TryResolveEncoding(string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charset.Trim());
    }
    catch (ArgumentException)
    {
        // Unknown or invalid charset name: deliberately best-effort.
        return null;
    }
    catch (NotSupportedException)
    {
        // Code page not available on this platform: deliberately best-effort.
        return null;
    }
}

/// <summary>
/// Decodes <paramref name="bytes"/> with <paramref name="encoding"/>. Decoding goes
/// through a StreamReader with byte-order-mark detection enabled, so a leading BOM
/// is consumed rather than returned as part of the text, matching the behaviour of
/// the path where the caller supplies the encoding.
/// </summary>
private static string DecodePageBytes(byte[] bytes, Encoding encoding)
{
    using (MemoryStream source = new MemoryStream(bytes))
    using (StreamReader reader = new StreamReader(source, encoding, true))
    {
        return reader.ReadToEnd();
    }
}