From a0c3e83e2f2ba7ca8df8b2255ccf0a3f134cbe82 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:27:51 +0100 Subject: [PATCH] Detect file encoding with a priority list --- patacrep/encoding.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index fd58fc2f..4cecd11a 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -16,15 +16,32 @@ def open_read(filename, mode='r', encoding=None): If `encoding` is set, use it as the encoding (do not guess). """ if encoding is None: - with open(filename, 'rb') as file: - fileencoding = chardet.detect(file.read())['encoding'] - else: - fileencoding = encoding + encoding = detect_encoding(filename) with codecs.open( filename, mode=mode, - encoding=fileencoding, + encoding=encoding, errors='replace', ) as fileobject: yield fileobject + +def detect_encoding(filename): + """Return the most likely encoding of the file + """ + encodings = ['utf-8', 'windows-1250', 'windows-1252'] + for e in encodings: + try: + fh = codecs.open(filename, 'r', encoding=e) + fh.readlines() + fh.seek(0) + except UnicodeDecodeError: + pass + else: + if e != 'utf-8': + print('Opening `%s` with `%s` encoding' % (filename, e)) + return e + finally: + fh.close() + + raise Exception('Not suitable encoding found for {}'.format(filename))