城东书院 www.cdsy.xyz
import codecs

# Print the number of characters: a Chinese character, an English letter and
# a digit each count as exactly one character in a unicode string.
print(len(u"我是谁第1部abc"))

# Decoding UTF-8 bytes yields one unicode character per Chinese character.
# The bytes are produced explicitly so the result does not depend on the
# encoding of this source file or on the interpreter's str type.
utf8_bytes = u"我是谁".encode("utf-8")
print(len(utf8_bytes.decode("utf-8")))

# Read the file content as UTF-8; the file must have been saved as UTF-8.
# The context manager closes the handle even if read() raises.
with codecs.open('c:\\test.txt', 'r', 'utf-8') as test_file:
    content = test_file.read()
print(content)
-
- import chardet
- from chardet.universaldetector import UniversalDetector
def detectFile(file_name):
    """Detect the character encoding of a file with chardet.

    The file is fed to a ``UniversalDetector`` line by line until the
    detector reports that it is done or the file is exhausted.

    Args:
        file_name: Path of the file to inspect.

    Returns:
        The detector's result dict. Its "encoding" entry is replaced by
        ``config.DEFAULT_ENCODING`` when no encoding was detected, and
        "gb2312" (any letter case) is reported as "gbk".
    """
    detector = UniversalDetector()
    # chardet analyses raw bytes, so the file is opened in binary mode; the
    # context manager closes it even if feeding raises.
    with open(file_name, 'rb') as file_obj:
        # Iterate lazily instead of readlines() so the early exit below
        # really avoids reading the rest of the file.
        for line in file_obj:
            detector.feed(line)
            if detector.done:
                break
    # Finalise detection so that detector.result is populated.
    detector.close()

    result = detector.result
    # chardet keeps the "encoding" key and sets it to None when detection
    # fails, so test the value rather than the key's presence.
    if not result.get("encoding"):
        # NOTE(review): `config` is not defined in the visible part of this
        # file -- confirm it is imported elsewhere.
        result["encoding"] = config.DEFAULT_ENCODING
    # Report GB2312 as GBK (presumably because GBK is a superset of it).
    if str(result["encoding"]).lower() == "gb2312":
        result["encoding"] = "gbk"
    return result
-
-
def get_encoding_type_of_file(full_path):
    """Return the encoding chardet guesses for the whole content of a file.

    Args:
        full_path: Path of the file to inspect.

    Returns:
        The value of the "encoding" entry of ``chardet.detect``'s result;
        ``None`` if that entry is absent.
    """
    # chardet.detect() works on bytes, so read the file in binary mode
    # instead of letting text mode translate or decode the content.
    with open(full_path, 'rb') as file_handler:
        file_content = file_handler.read()
    return chardet.detect(file_content).get("encoding")
-
def translateFile(full_path):
    """Load a file through ``codecs.EncodedFile`` into two buffers.

    The whole content is stored in ``self.FileBuffer`` and the list of its
    lines in ``self.FileLinesBuffer``.

    NOTE(review): ``self`` and ``isutf8encode`` are not defined in this
    function or anywhere in the visible file; this looks like a method body
    lifted out of a class -- confirm where both names come from.

    Args:
        full_path: Path of the file to load.
    """
    with codecs.open(full_path, 'r') as filehandler:
        if isutf8encode:
            # file_encoding=None makes EncodedFile fall back to data_encoding.
            data_encoding, file_encoding = 'utf-8', None
        else:
            # NOTE(review): with these arguments the file is read as UTF-8
            # and returned as GBK; if the intent is to turn a GBK file into
            # UTF-8 the two encodings are swapped -- confirm before changing.
            data_encoding, file_encoding = 'gbk', 'utf-8'

        self.FileBuffer = codecs.EncodedFile(
            filehandler, data_encoding, file_encoding).read()
        # Rewind so the second pass starts from the beginning of the file.
        filehandler.seek(0)
        self.FileLinesBuffer = codecs.EncodedFile(
            filehandler, data_encoding, file_encoding).readlines()
城东书院 www.cdsy.xyz