如何获得文件的编码格式

我想做一个小工具，能够将文件夹下所有文件中的指定字符进行替换。可是替换后的文件经常有乱码。
别人告诉我应该先获得文件的编码格式，然后再读取文件，
请问如何获得文件的编码格式？文件在重写之前系统内都是能够正常显示的，没有乱码。
但是我用StreamReader objReader读进来是带有乱码的，因为有的文件是UTF8,有的不是

我当初是这样设计的，

使用StreamReader objReader逐行读取文件，将所有的字符放到一个ArrayList arrsource中，然后删除原文件，将ArrayList写入文件

8 个解决方案

#1

UTF8编码的文件（带BOM时）以EF BB BF开头

#2

还真没有过多关注过文件编码的问题，马克学习之。

#3

判断完全是不可能的, u8 的也不是一定有那个 BOM 头,
碰到这样的, 我现在用 ICSharpCode.TextEditor 中的一个辅助类,
代码是:



using System;

using System.IO;

using System.Text;


namespace ICSharpCode.TextEditor.Util
{
	/// <summary>
	/// Class that can open text files with auto-detection of the encoding.
	/// Files that start with a byte order mark are left to <see cref="StreamReader"/>;
	/// files without one are scanned to decide between UTF-8 and a caller-supplied
	/// default encoding.
	/// </summary>
	public static class FileReader
	{
		/// <summary>Maximum number of bytes inspected when guessing the encoding (about 500 KB).</summary>
		const int MaxDetectionBytes = 500000;

		/// <summary>
		/// Tells whether <paramref name="encoding"/> is one of the UTF encodings
		/// (UTF-7, UTF-8, UTF-16 LE/BE, UTF-32 LE/BE), judged by its code page.
		/// </summary>
		/// <param name="encoding">The encoding to test.</param>
		/// <returns><c>true</c> for a UTF code page; otherwise <c>false</c>.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
		public static bool IsUnicode(Encoding encoding)
		{
			if (encoding == null) {
				throw new ArgumentNullException("encoding");
			}
			switch (encoding.CodePage) {
				case 65000: // UTF-7
				case 65001: // UTF-8
				case 1200:  // UTF-16 LE
				case 1201:  // UTF-16 BE
				case 12000: // UTF-32 LE
				case 12001: // UTF-32 BE
					return true;
				default:
					return false;
			}
		}

		/// <summary>
		/// Reads the whole file as text, detecting its encoding.
		/// </summary>
		/// <param name="fileName">Path of the file to read.</param>
		/// <param name="encoding">
		/// On input: the encoding suggested for very short files (may be null).
		/// On output: the encoding that was actually used to decode the file.
		/// </param>
		/// <param name="defaultEncoding">
		/// Encoding used when the content is plain ASCII or is not valid UTF-8.
		/// </param>
		/// <returns>The decoded file content.</returns>
		public static string ReadFileContent(string fileName, ref Encoding encoding, Encoding defaultEncoding)
		{
			using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read)) {
				using (StreamReader reader = OpenStream(fs, encoding, defaultEncoding)) {
					string content = reader.ReadToEnd();
					// StreamReader only performs BOM detection on the first read, so
					// CurrentEncoding must be queried AFTER reading; asking earlier would
					// report UTF-8 even for UTF-16/UTF-32 files.
					encoding = reader.CurrentEncoding;
					return content;
				}
			}
		}

		/// <summary>
		/// Wraps <paramref name="fs"/> in a <see cref="StreamReader"/> that uses the detected encoding.
		/// </summary>
		/// <param name="fs">Seekable stream over the file.</param>
		/// <param name="suggestedEncoding">
		/// Encoding used for files of three bytes or fewer, which are too short to analyse (may be null).
		/// </param>
		/// <param name="defaultEncoding">
		/// Encoding used when the content is plain ASCII or is not valid UTF-8.
		/// </param>
		/// <returns>A reader over <paramref name="fs"/>; disposing it also closes the stream.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
		public static StreamReader OpenStream(FileStream fs, Encoding suggestedEncoding, Encoding defaultEncoding)
		{
			if (fs == null) {
				throw new ArgumentNullException("fs");
			}
			if (fs.Length > 3) {
				// the autodetection of StreamReader is not capable of detecting the difference
				// between ISO-8859-1 and UTF-8 without BOM.
				int firstByte = fs.ReadByte();
				int secondByte = fs.ReadByte();
				switch ((firstByte << 8) | secondByte) {
					case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
					case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
					case 0xfeff: // UTF-16 BE BOM
					case 0xefbb: // start of UTF-8 BOM
						// StreamReader autodetection works
						fs.Position = 0;
						return new StreamReader(fs);
					default:
						return AutoDetect(fs, (byte)firstByte, (byte)secondByte, defaultEncoding);
				}
			} else {
				if (suggestedEncoding != null) {
					return new StreamReader(fs, suggestedEncoding);
				} else {
					return new StreamReader(fs);
				}
			}
		}

		/// <summary>
		/// Scans a BOM-less file to decide whether it is valid UTF-8. The first two bytes
		/// have already been consumed by the caller and are passed in; scanning continues
		/// from the stream's current position.
		/// </summary>
		/// <returns>
		/// A UTF-8 reader when multi-byte UTF-8 sequences were found and no invalid byte
		/// was seen; otherwise a reader using <paramref name="defaultEncoding"/> (or the
		/// system encoding if that is null or a Unicode encoding).
		/// </returns>
		static StreamReader AutoDetect(FileStream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
		{
			long length = fs.Length;
			int max = (int)Math.Min(length, MaxDetectionBytes); // look at max. 500 KB
			const int ASCII = 0;        // only bytes < 0x80 seen so far
			const int Error = 1;        // a byte sequence that is invalid in UTF-8 was seen
			const int UTF8 = 2;         // at least one complete multi-byte sequence seen
			const int UTF8Sequence = 3; // currently inside a multi-byte sequence
			int state = ASCII;
			int sequenceLength = 0;     // continuation bytes still expected
			for (int i = 0; i < max; i++) {
				byte b;
				if (i == 0) {
					b = firstByte;
				} else if (i == 1) {
					b = secondByte;
				} else {
					b = (byte)fs.ReadByte();
				}
				if (b < 0x80) {
					// normal ASCII character; not allowed in the middle of a sequence
					if (state == UTF8Sequence) {
						state = Error;
						break;
					}
				} else if (b < 0xc0) {
					// 10xxxxxx : continues UTF8 byte sequence
					if (state == UTF8Sequence) {
						--sequenceLength;
						if (sequenceLength < 0) {
							state = Error;
							break;
						} else if (sequenceLength == 0) {
							state = UTF8;
						}
					} else {
						state = Error;
						break;
					}
				} else if (b >= 0xc2 && b < 0xf5) {
					// beginning of byte sequence
					if (state == UTF8 || state == ASCII) {
						state = UTF8Sequence;
						if (b < 0xe0) {
							sequenceLength = 1; // one more byte following
						} else if (b < 0xf0) {
							sequenceLength = 2; // two more bytes following
						} else {
							sequenceLength = 3; // three more bytes following
						}
					} else {
						state = Error;
						break;
					}
				} else {
					// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
					state = Error;
					break;
				}
			}
			// A sequence left unfinished is only acceptable when the scan was cut off by
			// the size limit. If the whole file was scanned, the file ends in a truncated
			// sequence and is therefore not UTF-8 (e.g. Latin-1 text ending in 0xE9).
			if (state == UTF8Sequence && length <= MaxDetectionBytes) {
				state = Error;
			}
			fs.Position = 0;
			switch (state) {
				case ASCII:
				case Error:
					// when the file seems to be ASCII or non-UTF8,
					// we read it using the user-specified encoding so it is saved again
					// using that encoding.
					if (defaultEncoding == null || IsUnicode(defaultEncoding)) {
						// the file is not Unicode, so don't read it using Unicode even if the
						// user has chosen Unicode as the default encoding (otherwise a byte
						// order mark would end up being added to ASCII files on save).
						// A missing default also falls back to the system encoding.
						defaultEncoding = Encoding.Default;
					}
					return new StreamReader(fs, defaultEncoding);
				default:
					return new StreamReader(fs);
			}
		}
	}
}

#4

我有个类直接用就行
直接使用类的静态函数TxtFileEncoding.GetEncoding(string fileName)

using System;
using System.Text;
using System.IO;
namespace BEST.Public
{
    /// <summary>
    /// Determines the encoding of a text file by inspecting its leading bytes
    /// (byte order mark). Files without a recognisable preamble yield the
    /// caller-supplied default encoding.
    /// </summary>
    public class TxtFileEncoding
    {
        /// <summary>
        /// Creates an instance. All functionality is exposed through static methods;
        /// the constructor is kept only so existing callers keep compiling.
        /// </summary>
        public TxtFileEncoding()
        {
        }

        /// <summary>
        /// Gets the encoding of a text file. <see cref="Encoding.Default"/> is returned
        /// when no valid preamble is found at the start of the file.
        /// </summary>
        /// <param name="fileName">Path of the file.</param>
        /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
        public static Encoding GetEncoding(string fileName)
        {
            return GetEncoding(fileName, Encoding.Default);
        }

        /// <summary>
        /// Gets the encoding of a text file stream. <see cref="Encoding.Default"/> is
        /// returned when no valid preamble is found.
        /// </summary>
        /// <param name="stream">Stream over the text file.</param>
        /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
        public static Encoding GetEncoding(FileStream stream)
        {
            return GetEncoding(stream, Encoding.Default);
        }

        /// <summary>
        /// Gets the encoding of a text file.
        /// </summary>
        /// <param name="fileName">Path of the file.</param>
        /// <param name="defaultEncoding">
        /// Encoding returned when no valid preamble is found, or when the file cannot
        /// be opened or read.
        /// </param>
        /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
        public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
        {
            // This method is deliberately best-effort: the failures a FileStream can
            // raise for a bad or inaccessible path map to the default encoding rather
            // than propagating, but unrelated exceptions are no longer swallowed.
            try
            {
                // Open read-only (so read-only files work) and allow other processes to
                // keep the file open; detection never writes.
                using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    return GetEncoding(fs, defaultEncoding);
                }
            }
            catch (IOException)
            {
                return defaultEncoding;
            }
            catch (UnauthorizedAccessException)
            {
                return defaultEncoding;
            }
            catch (ArgumentException)
            {
                return defaultEncoding;
            }
            catch (NotSupportedException)
            {
                return defaultEncoding;
            }
            catch (System.Security.SecurityException)
            {
                return defaultEncoding;
            }
        }

        /// <summary>
        /// Gets the encoding of a text file stream by examining its first four bytes.
        /// The stream's position is restored before returning.
        /// </summary>
        /// <param name="stream">Stream over the text file; may be null.</param>
        /// <param name="defaultEncoding">
        /// Encoding returned when the stream is null, shorter than two bytes, or has no
        /// recognisable preamble.
        /// </param>
        /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
        public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
        {
            if (stream == null || stream.Length < 2)
            {
                return defaultEncoding;
            }

            // Remember where the caller was. Seek() returns the NEW position, so the
            // current position has to be read from Position before seeking.
            long origPos = stream.Position;
            byte[] header = new byte[4]; // bytes that could not be read stay 0
            int headerLength;
            try
            {
                stream.Seek(0, SeekOrigin.Begin);
                headerLength = ReadHeader(stream, header);
            }
            finally
            {
                // Restore the caller's position even if reading failed.
                stream.Seek(origPos, SeekOrigin.Begin);
            }

            if (headerLength < 2)
            {
                return defaultEncoding;
            }

            byte byte1 = header[0];
            byte byte2 = header[1];
            byte byte3 = header[2];
            byte byte4 = header[3];

            // UTF-32 must be tested first: its little-endian BOM (FF FE 00 00) starts
            // with the UTF-16 LE BOM, and its big-endian BOM (00 00 FE FF) would
            // otherwise hit the "second byte is zero" heuristic below.
            if (headerLength >= 4 && byte1 == 0xFF && byte2 == 0xFE && byte3 == 0x00 && byte4 == 0x00)
            {
                return Encoding.UTF32;
            }
            if (headerLength >= 4 && byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF)
            {
                return new UTF32Encoding(true, true);
            }
            if (byte1 == 0xFE && byte2 == 0xFF) // UTF-16 big endian BOM
            {
                return Encoding.BigEndianUnicode;
            }
            // NOTE(review): the original excludes FF FE FF here; reason unknown, kept as is.
            if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF) // UTF-16 little endian BOM
            {
                return Encoding.Unicode;
            }
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) // UTF-8 BOM
            {
                return Encoding.UTF8;
            }
            if (byte2 == 0x0) // heuristic: BOM-less UTF-16 LE text starting with a Latin character
            {
                return Encoding.Unicode;
            }
            return defaultEncoding;
        }

        /// <summary>
        /// Fills <paramref name="buffer"/> from the current position of
        /// <paramref name="stream"/>, stopping early at end of stream.
        /// </summary>
        /// <returns>The number of bytes actually read.</returns>
        private static int ReadHeader(Stream stream, byte[] buffer)
        {
            int total = 0;
            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);
                if (read <= 0)
                {
                    break;
                }
                total += read;
            }
            return total;
        }
    }
}

#5

如果文件内容是由windows自带的notepad保存的，那么它会在文件的前几个字节加上bom头之类的字节。用.net程序在读的时候，不管你设置的编码是什么，它都会根据文件头几个字节正确的读出文本的内容。但是，并不是所有的文本编辑器都会像notepad一样，有些直接将文本保存到文件里。这时候用程序读的时候，就要明确的指定文本的编码类型。编码其实是一个很烦人的问题，可能网上有一些根据文本内容来判断其编码的程序，但并不能完全正确的判断出所有的编码类型。

http://blog.csdn.net/JGood/archive/2009/09/10/4540466.aspx

#6

#7

3楼Dobzhansky介绍的代码很好用，非常感谢

#8

该回复于2010-12-15 17:47:29被版主删除

#1