实现从Web网页提取文本之前,首先要识别网页的编码,有时候还需要进一步识别网页所使用的语言。因为同一种编码可能对应多种语言,例如UTF-8编码可能对应英文或中文等语言。
识别编码整体流程如下:
(1)从WEB服务器返回的content type头信息中提取编码,如果是GB2312的编码要当GBK处理。
(2)从网页mate标签中识别字符编码,如果content type中的编码不一致,以meta中声明的编码为准。
(3)如果仍然无法确定网页所使用的字符集,需要从返回流的二进制格式判断。
(4)确定网页所使用的语言,往往采用统计的方法来估计网页的语言。
判断编码的完整过程如下:(c#代码)
/// <summary>
/// 函数名称:GetDataFromUrl
/// 功能说明:获取url指定的网页的源码
/// 参数:string url用于指定 url
/// 参数:ref Encoding encode用来获取网页中的字符集编码
/// </summary>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
string str = string.Empty;
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url); //设置http头
request.AllowAutoRedirect = true;
request.AllowWriteStreamBuffering = true;
request.Referer = "";
request.Timeout = * ;
request.UserAgent = "";
HttpWebResponse response = null;
response = (HttpWebResponse)request.GetResponse(); //根据http应答的http头来判断编码
string characterSet = response.CharacterSet;
//Encoding encode;
if (characterSet != "")
{
if (characterSet == "ISO-8859-1")
{
characterSet = "gb2312";
}
encode = Encoding.GetEncoding(characterSet);
}
else
{
encode = Encoding.Default;
} //声明一个内存流来保存http应答流
Stream receiveStream = response.GetResponseStream();
MemoryStream mStream = new MemoryStream(); byte[] bf = new byte[];
int count = receiveStream.Read(bf, , );
while (count > )
{
mStream.Write(bf, , count);
count = receiveStream.Read(bf, , );
}
receiveStream.Close(); mStream.Seek(, SeekOrigin.Begin); //从内存流里读取字符串
StreamReader reader = new StreamReader(mStream, encode);
char[] buffer = new char[];
count = reader.Read(buffer, , );
while (count > )
{
str += new String(buffer, , count);
count = reader.Read(buffer, , );
} //从解析出的字符串里判断charset,如果和http应答的编码不一直
//那么以页面声明的为准,再次从内存流里重新读取文本
Regex reg =
new Regex(@"<meta[\s\S]+?charset=(.*?)""[\s\S]+?>",
RegexOptions.Multiline | RegexOptions.IgnoreCase);
MatchCollection mc = reg.Matches(str);
if (mc.Count > )
{
string tempCharSet = mc[].Result("$1");
if (string.Compare(tempCharSet, characterSet, true) != )
{
encode = Encoding.GetEncoding(tempCharSet);
str = string.Empty;
mStream.Seek(, SeekOrigin.Begin);
reader = new StreamReader(mStream, encode);
buffer = new char[];
count = reader.Read(buffer, , );
while (count > )
{
str += new String(buffer, , count);
count = reader.Read(buffer, , );
}
}
}
reader.Close();
mStream.Close();
if (response != null)
response.Close(); return str; }