多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)

时间:2021-11-26 14:07:07

# coding=utf-8
# author:Jeffrey Ma
# version:0.1
# build 2
# created on:2015年3月31日
# description: 1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码
# 2. 支持指定目录下所有的文件的转换,包括子目录中的文件
# 3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换
# 4. 支持只转换指定扩展名的编码
# 5. 支持多线程转换和控制台输出
# 6. 支持控制台显示线程池的状态
# 7. 支持日志记录
# usage: python gbk2utf8.py -s <directory>
# args : path of an existing directory; matching files in it and in its sub-directories are converted
# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。

from __future__ import division
import sys
import os
import getopt
import logging
import logging.config
import Queue
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Adopt the user's default locale for every locale category.
locale.setlocale(locale.LC_ALL, "")

# Names shared between the functions below. A `global` statement at module
# level does not create anything: `logger` is bound by initLogger(), `pool`
# by doWork(), and `stdscr` a few lines further down.
global logger
global stdscr
global pool

# NOTE: curses is initialised as a side effect of importing/running this
# module; print_result() draws on this window and doWork() calls endwin().
stdscr = curses.initscr()

def GBK2UTF8(filename):
    """Convert one file from a GB-family encoding to UTF-8, in place.

    The file is read as raw bytes and its encoding is guessed with
    ``chardet``. Only when the detected encoding name starts with ``GB`` is
    the content decoded and the file overwritten with the UTF-8 bytes; any
    other detection result leaves the file untouched.

    Args:
        filename: Path of the file to inspect and, if needed, rewrite.

    Returns:
        A dict with the keys ``tName`` (name of the calling thread),
        ``encodingName`` (the name reported by chardet, possibly ``None``),
        ``filename`` and ``result`` (a human-readable status line).
    """
    threadName = threading.current_thread().name

    with open(filename, 'rb') as source:
        raw = source.read()
    encodingName = chardet.detect(raw)['encoding']

    # chardet reports None when it cannot guess an encoding; treat that like
    # any other non-GB result instead of failing on None.startswith().
    if encodingName is not None and encodingName.startswith('GB'):
        try:
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            # Nothing has been written yet, so the file is still intact.
            # Keep this message: it must not be replaced by the "done" text.
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # The payload is already encoded bytes, so write it in binary
            # mode, mirroring the 'rb' read above.
            with open(filename, 'wb') as target:
                target.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # Not detected as a GB encoding: leave the file as it is.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {
        "tName": threadName,
        "encodingName": encodingName,
        "filename": filename,
        "result": message,
    }

def initLogger():
    """Load the logging configuration and bind the shared ``logger``.

    Reads the configuration from the file named ``logging.conf`` (a relative
    path) and then rebinds the module-level ``logger`` to the logger called
    ``GBK2UTF8``.
    """
    global logger
    config_path = 'logging.conf'
    logging.config.fileConfig(config_path)
    logger = logging.getLogger("GBK2UTF8")

def _leave_curses():
    """Hand the terminal back if it is still in curses mode."""
    try:
        if not curses.isendwin():
            curses.endwin()
    except curses.error:
        # Best effort only: curses may not have been initialised at all.
        pass


def main():
    """Parse the command line and convert the files below the source directory.

    Recognised options:
        -s, --src <dir>   directory whose files are converted (required)
        -d, --dest        accepted, but currently has no effect

    When the options cannot be parsed, or the source is missing or not a
    directory, the usage line is printed and the function returns without
    converting anything.
    """
    initLogger()
    usage = "Usage: python gbk2utf8.py -s [file full path]"

    try:
        opts, args = getopt.getopt(sys.argv[1:], 's:d', ['src=', 'dest'])
    except getopt.GetoptError as err:
        # curses.initscr() already ran at import time, so give the terminal
        # back before printing plain text and returning.
        _leave_curses()
        print(str(err))  # e.g. "option -a not recognized"
        print(usage)
        return

    srcPath = None
    for option, value in opts:
        if option in ("-s", "--src"):
            srcPath = value
        elif option in ("-d", "--dest"):
            # Accepted for compatibility; the flag takes no value and nothing
            # in this script reads it.
            pass
        else:
            assert False, "unhandled option"

    # isdir() is already False for paths that do not exist.
    if srcPath is None or not os.path.isdir(srcPath):
        _leave_curses()
        print(usage)
        return

    doWork(srcPath)

def doWork(sPath):
    """Convert every file with a whitelisted extension below ``sPath``.

    Walks ``sPath`` recursively, queues one ``GBK2UTF8`` request per matching
    file on a pool of 10 worker threads (``print_result`` is the completion
    callback) and waits until all requests are done. Files with any other
    extension are logged as skipped.

    Args:
        sPath: Root directory to walk.

    Side effects:
        Rebinds the module-level ``pool``; toggles the curses echo/cbreak
        modes and ends curses mode before returning.
    """
    global pool
    pool = threadpool.ThreadPool(10)

    extFilters = ('xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl')
    arrFiles = []
    for root, dirs, files in os.walk(sPath):
        for name in files:
            sFilePath = os.path.join(root, name)
            # Extension without the leading dot; '' when there is none.
            extension = os.path.splitext(sFilePath)[1][1:]
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s', sFilePath)
    print('waiting...job')

    curses.noecho()
    curses.cbreak()
    try:
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Always give the terminal back, even if queuing or waiting raised.
        curses.nocbreak()
        curses.echo()
        curses.endwin()

    print('end job')

def print_result(request, result):
    """Log a finished conversion and show it on its worker's screen row.

    Used as the completion callback for the requests queued in ``doWork``.

    Args:
        request: The work request that finished (not used here).
        result: The dict returned by ``GBK2UTF8``; its ``tName`` and
            ``result`` entries are read.
    """
    text = result["result"]
    # Log first and unpadded, so the record is written even when the screen
    # update below fails or is skipped.
    logger.info(text)

    # Workers are shown on rows 1..N in pool order; row 0 means "no match".
    row = 0
    for position, worker in enumerate(pool.workers, 1):
        if worker.name == result["tName"]:
            row = position
            break
    if not row:
        return

    try:
        height, width = stdscr.getmaxyx()
        # Pad to the window width so leftovers of a longer, earlier message
        # on the same row are overwritten.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Drawing is best effort; the result has already been logged.
        pass

# Script entry point: run the converter only when executed directly.
if __name__ == "__main__":
    main()







# coding=utf-8
# author:Jeffrey Ma
# version:0.1
# build 2
# created on:2015年3月31日
# description: 1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码
# 2. 支持指定目录下所有的文件的转换,包括子目录中的文件
# 3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换
# 4. 支持只转换指定扩展名的编码
# 5. 支持多线程转换和控制台输出
# 6. 支持控制台显示线程池的状态
# 7. 支持日志记录
# usage: python gbk2utf8.py -s <directory>
# args : path of an existing directory; matching files in it and in its sub-directories are converted
# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。

from __future__ import division
import sys
import os
import getopt
import logging
import logging.config
import Queue
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Adopt the user's default locale for every locale category.
locale.setlocale(locale.LC_ALL, "")

# Names shared between the functions below. A `global` statement at module
# level does not create anything: `logger` is bound by initLogger(), `pool`
# by doWork(), and `stdscr` a few lines further down.
global logger
global stdscr
global pool

# NOTE: curses is initialised as a side effect of importing/running this
# module; print_result() draws on this window and doWork() calls endwin().
stdscr = curses.initscr()

def GBK2UTF8(filename):
    """Convert one file from a GB-family encoding to UTF-8, in place.

    The file is read as raw bytes and its encoding is guessed with
    ``chardet``. Only when the detected encoding name starts with ``GB`` is
    the content decoded and the file overwritten with the UTF-8 bytes; any
    other detection result leaves the file untouched.

    Args:
        filename: Path of the file to inspect and, if needed, rewrite.

    Returns:
        A dict with the keys ``tName`` (name of the calling thread),
        ``encodingName`` (the name reported by chardet, possibly ``None``),
        ``filename`` and ``result`` (a human-readable status line).
    """
    threadName = threading.current_thread().name

    with open(filename, 'rb') as source:
        raw = source.read()
    encodingName = chardet.detect(raw)['encoding']

    # chardet reports None when it cannot guess an encoding; treat that like
    # any other non-GB result instead of failing on None.startswith().
    if encodingName is not None and encodingName.startswith('GB'):
        try:
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            # Nothing has been written yet, so the file is still intact.
            # Keep this message: it must not be replaced by the "done" text.
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # The payload is already encoded bytes, so write it in binary
            # mode, mirroring the 'rb' read above.
            with open(filename, 'wb') as target:
                target.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # Not detected as a GB encoding: leave the file as it is.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {
        "tName": threadName,
        "encodingName": encodingName,
        "filename": filename,
        "result": message,
    }

def initLogger():
    """Load the logging configuration and bind the shared ``logger``.

    Reads the configuration from the file named ``logging.conf`` (a relative
    path) and then rebinds the module-level ``logger`` to the logger called
    ``GBK2UTF8``.
    """
    global logger
    config_path = 'logging.conf'
    logging.config.fileConfig(config_path)
    logger = logging.getLogger("GBK2UTF8")

def _leave_curses():
    """Hand the terminal back if it is still in curses mode."""
    try:
        if not curses.isendwin():
            curses.endwin()
    except curses.error:
        # Best effort only: curses may not have been initialised at all.
        pass


def main():
    """Parse the command line and convert the files below the source directory.

    Recognised options:
        -s, --src <dir>   directory whose files are converted (required)
        -d, --dest        accepted, but currently has no effect

    When the options cannot be parsed, or the source is missing or not a
    directory, the usage line is printed and the function returns without
    converting anything.
    """
    initLogger()
    usage = "Usage: python gbk2utf8.py -s [file full path]"

    try:
        opts, args = getopt.getopt(sys.argv[1:], 's:d', ['src=', 'dest'])
    except getopt.GetoptError as err:
        # curses.initscr() already ran at import time, so give the terminal
        # back before printing plain text and returning.
        _leave_curses()
        print(str(err))  # e.g. "option -a not recognized"
        print(usage)
        return

    srcPath = None
    for option, value in opts:
        if option in ("-s", "--src"):
            srcPath = value
        elif option in ("-d", "--dest"):
            # Accepted for compatibility; the flag takes no value and nothing
            # in this script reads it.
            pass
        else:
            assert False, "unhandled option"

    # isdir() is already False for paths that do not exist.
    if srcPath is None or not os.path.isdir(srcPath):
        _leave_curses()
        print(usage)
        return

    doWork(srcPath)

def doWork(sPath):
    """Convert every file with a whitelisted extension below ``sPath``.

    Walks ``sPath`` recursively, queues one ``GBK2UTF8`` request per matching
    file on a pool of 10 worker threads (``print_result`` is the completion
    callback) and waits until all requests are done. Files with any other
    extension are logged as skipped.

    Args:
        sPath: Root directory to walk.

    Side effects:
        Rebinds the module-level ``pool``; toggles the curses echo/cbreak
        modes and ends curses mode before returning.
    """
    global pool
    pool = threadpool.ThreadPool(10)

    extFilters = ('xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl')
    arrFiles = []
    for root, dirs, files in os.walk(sPath):
        for name in files:
            sFilePath = os.path.join(root, name)
            # Extension without the leading dot; '' when there is none.
            extension = os.path.splitext(sFilePath)[1][1:]
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s', sFilePath)
    print('waiting...job')

    curses.noecho()
    curses.cbreak()
    try:
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Always give the terminal back, even if queuing or waiting raised.
        curses.nocbreak()
        curses.echo()
        curses.endwin()

    print('end job')

def print_result(request, result):
    """Log a finished conversion and show it on its worker's screen row.

    Used as the completion callback for the requests queued in ``doWork``.

    Args:
        request: The work request that finished (not used here).
        result: The dict returned by ``GBK2UTF8``; its ``tName`` and
            ``result`` entries are read.
    """
    text = result["result"]
    # Log first and unpadded, so the record is written even when the screen
    # update below fails or is skipped.
    logger.info(text)

    # Workers are shown on rows 1..N in pool order; row 0 means "no match".
    row = 0
    for position, worker in enumerate(pool.workers, 1):
        if worker.name == result["tName"]:
            row = position
            break
    if not row:
        return

    try:
        height, width = stdscr.getmaxyx()
        # Pad to the window width so leftovers of a longer, earlier message
        # on the same row are overwritten.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Drawing is best effort; the result has already been logged.
        pass

# Script entry point: run the converter only when executed directly.
if __name__ == "__main__":
    main()