代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- #2018/05/31 检测文件是否是utf-8无bom格式的 import sys, codecs def detectUTF8(file_name): state = 0 line_num = 0 file_obj = open(file_name) all_lines = file_obj.readlines() file_obj.close() for line in all_lines: line_num += 1 line_len = len(line) for index in range(line_len): if state == 0: if ord(line[index]) & 0x80 == 0x00: # 上表中的第一种情况 state = 0 elif ord(line[index]) & 0xE0 == 0xC0: # 上表中的第二种情况 state = 1 elif ord(line[index]) & 0xF0 == 0xE0: # 第三种 state = 2 elif ord(line[index]) & 0xF8 == 0xF0: # 第四种 state = 3 else: print "%s isn't a utf8 file,line:\t" % file_name + str(line_num) sys.exit(1) else: if not ord(line[index]) & 0xC0 == 0x80: print "%s isn't a utf8 file in line:\t" % file_name + str(line_num) sys.exit(1) state -= 1 if existBOM(file_name): print "%s isn't a standard utf8 file,include BOM header." % file_name sys.exit(1) def existBOM(file_name): file_obj = open(file_name, 'r') code = file_obj.read(3) file_obj.close() if code == codecs.BOM_UTF8: # 判断是否包含EF BB BF return True return False if __name__ == "__main__": file_name = 'new 2.txt' detectUTF8(file_name)借鉴了一下其他小伙伴的经验,如有侵权请留言告知。