python 修改文件编码方式

 import chardet

 import os

 def strJudgeCode(str):
     """Run chardet's encoding detection on ``str``.

     Args:
         str: The data to inspect; handed to ``chardet.detect`` unchanged.

     Returns:
         The result of ``chardet.detect``: a dict whose ``encoding`` entry
         names the detected encoding and whose ``confidence`` entry gives
         the detection accuracy.
     """
     detection = chardet.detect(str)
     return detection

 def readFile(path):
     """Return the raw bytes stored in the file at ``path``.

     The file is opened in binary mode so the content reaches the encoding
     detector untouched: no newline translation and no implicit decoding.

     Args:
         path: Location of the file to read.

     Returns:
         The complete file content as a byte string.

     Raises:
         IOError/OSError: If the file cannot be opened or read.
     """
     # ``with`` always closes the handle and lets a failed open() raise its
     # own error instead of tripping over an unassigned file variable.
     with open(path, 'rb') as f:
         return f.read()


 def WriteFile(str, path):
     """Write ``str`` to the file at ``path``, replacing any previous content.

     Args:
         str: Data to write. Byte strings are written verbatim in binary
             mode; anything else is written through a text-mode handle.
         path: Destination file.

     Raises:
         IOError/OSError: If the file cannot be opened or written.
     """
     # Already-encoded bytes must not pass through text mode, which would
     # translate newlines on Windows and reject bytes on Python 3.
     mode = 'wb' if isinstance(str, bytes) else 'w'
     with open(path, mode) as f:
         f.write(str)


 def converCode(path):
     """Re-encode the file at ``path`` from UTF-8 to GBK, in place.

     The file is rewritten only when chardet reports its encoding as UTF-8;
     every other file is left untouched.

     Args:
         path: File to convert.

     Raises:
         UnicodeEncodeError: If the text contains characters GBK cannot
             represent. The file is not modified in that case because the
             write only happens after a successful encode.
     """
     file_con = readFile(path)
     result = strJudgeCode(file_con)

     # The detected encoding may be None when chardet cannot decide, so
     # fall back to '' and compare the name case-insensitively.
     detected = (result['encoding'] or '').lower()
     if detected != 'utf-8':
         return

     gbk_bytes = file_con.decode('utf-8').encode('gbk')
     WriteFile(gbk_bytes, path)

 def listDirFile(dir):
     """Walk ``dir`` recursively and convert every non-directory entry.

     For each entry returned by ``os.listdir``: sub-directories are
     descended into, anything else has its name printed and is passed to
     ``converCode``.

     Args:
         dir: Directory to process.
     """
     for entry in os.listdir(dir):
         filepath = os.path.join(dir, entry)
         if not os.path.isdir(filepath):
             print(entry)
             converCode(filepath)
             continue
         listDirFile(filepath)

 if __name__ == '__main__':
     # Build the path with os.path.join rather than a literal backslash:
     # u'.\TRMD' depended on the invalid escape sequence "\T" and only
     # named the intended directory on Windows.
     listDirFile(os.path.join(u'.', u'TRMD'))

详细解释：


 import chardet

 import os

 def strJudgeCode(str):
     r"""Run chardet's encoding detection on ``str``.

     ``chardet.detect()`` returns a dict in which ``confidence`` is the
     detection accuracy and ``encoding`` is the detected encoding, e.g.::

         {'confidence': 0.98999999999999999, 'encoding': 'GB2312'}

     Detecting the encoding of a web page (the example uses the Python 2
     ``urllib`` API)::

         import urllib
         rawdata = urllib.urlopen('http://www.google.cn/').read()
         import chardet
         chardet.detect(rawdata)
         # -> {'confidence': 0.98999999999999999, 'encoding': 'GB2312'}

     Detecting the encoding of a file::

         import chardet
         tt = open('c:\\111.txt', 'rb')
         ff = tt.readline()
         # read() also works here, but readlines() raises an error
         enc = chardet.detect(ff)
         print(enc['encoding'])
         tt.close()

     Args:
         str: The data to inspect; handed to ``chardet.detect`` unchanged.

     Returns:
         The dict produced by ``chardet.detect``.
     """
     return chardet.detect(str)

 def readFile(path):
     """Return the raw bytes stored in the file at ``path``.

     The file is opened in binary mode so the content reaches the encoding
     detector untouched: no newline translation and no implicit decoding.

     Args:
         path: Location of the file to read.

     Returns:
         The complete file content as a byte string.

     Raises:
         IOError/OSError: If the file cannot be opened or read.
     """
     # ``with`` always closes the handle and lets a failed open() raise its
     # own error instead of tripping over an unassigned file variable.
     with open(path, 'rb') as f:
         return f.read()


 def WriteFile(str, path):
     """Write ``str`` to the file at ``path``, replacing any previous content.

     Args:
         str: Data to write. Byte strings are written verbatim in binary
             mode; anything else is written through a text-mode handle.
         path: Destination file.

     Raises:
         IOError/OSError: If the file cannot be opened or written.
     """
     # Already-encoded bytes must not pass through text mode, which would
     # translate newlines on Windows and reject bytes on Python 3.
     mode = 'wb' if isinstance(str, bytes) else 'w'
     with open(path, mode) as f:
         f.write(str)


 def converCode(path):
     """Re-encode the file at ``path`` from UTF-8 to GBK, in place.

     The file is rewritten only when chardet reports its encoding as UTF-8;
     every other file is left untouched.

     Args:
         path: File to convert.

     Raises:
         UnicodeEncodeError: If the text contains characters GBK cannot
             represent. The file is not modified in that case because the
             write only happens after a successful encode.
     """
     file_con = readFile(path)
     result = strJudgeCode(file_con)

     # The detected encoding may be None when chardet cannot decide, so
     # fall back to '' and compare the name case-insensitively.
     detected = (result['encoding'] or '').lower()
     if detected != 'utf-8':
         return

     # decode() and encode() convert between bytes and text:
     #   text = u'<some Chinese text>'
     #   data = text.encode('gb2312')   # bytes holding the gb2312 encoding
     #   data.decode('gb2312')          # round-trips to the original text
     #   data.decode('utf-8')           # wrong codec: the original text
     #                                  # cannot be restored this way
     a_unicode = file_con.decode('utf-8')
     gbk_bytes = a_unicode.encode('gbk')
     WriteFile(gbk_bytes, path)

 def listDirFile(dir):
     """Walk ``dir`` recursively and convert every non-directory entry.

     Sub-directories are descended into; anything else has its name printed
     and is passed to ``converCode``.

     Args:
         dir: Directory to process.
     """
     # os.listdir returns the names of the files and folders inside the
     # given path.
     for entry in os.listdir(dir):
         # os.path.join is used to assemble paths. For example,
         # os.path.join("home", "me", "mywork") returns "home/me/mywork" on
         # Linux and "home\me\mywork" on Windows: the correct separator
         # ("/" or "\") is chosen automatically for the current system.
         filepath = os.path.join(dir, entry)

         # os.path.isdir reports whether a path is a directory.
         if not os.path.isdir(filepath):
             print(entry)
             converCode(filepath)
             continue
         listDirFile(filepath)

 if __name__ == '__main__':
     # u'string' marks the literal as an already-unicode string. A header
     # such as "# -*- coding: UTF-8 -*-" tells Python that the program text
     # is UTF-8 encoded so the source can be read as UTF-8. Prefixing
     # Chinese text with u tells Python that what follows is unicode and
     # should be stored as unicode.
     #
     # The path is built with os.path.join rather than a literal backslash:
     # u'.\TRMD' depended on the invalid escape sequence "\T" and only
     # named the intended directory on Windows.
     listDirFile(os.path.join(u'.', u'TRMD'))

秒客网

python 修改文件编码方式

相关文章