Detect file encoding with a priority list

10 years ago · a0c3e83e2f
1 changed file with 22 additions and 5 deletions
--- a/patacrep/encoding.py
+++ b/patacrep/encoding.py
@@ -16,15 +16,32 @@ def open_read(filename, mode='r', encoding=None):
    If `encoding` is set, use it as the encoding (do not guess).
    """
    if encoding is None:
-        with open(filename, 'rb') as file:
+        encoding = detect_encoding(filename)
            fileencoding = chardet.detect(file.read())['encoding']
    else:
        fileencoding = encoding
    with codecs.open(
        filename,
        mode=mode,
-        encoding=fileencoding,
+        encoding=encoding,
        errors='replace',
        ) as fileobject:
        yield fileobject
 def detect_encoding(filename):
     """Return the most likely encoding of the file.

     Candidate encodings are tried in priority order and the first one that
     decodes the whole file without error is returned.  When the selected
     encoding is not 'utf-8', a notice is printed to standard output.

     Arguments:
     - filename: path of the file to inspect.

     Returns the name of the first candidate encoding that fits.

     Raises:
     - OSError (e.g. FileNotFoundError) if the file cannot be opened or read;
     - Exception if none of the candidate encodings can decode the file.
     """
     encodings = ['utf-8', 'windows-1250', 'windows-1252']
     # Read the raw bytes once: every candidate is then tested in memory
     # instead of re-opening and re-reading the file for each encoding.  The
     # `with` block also guarantees the handle is closed, and lets a failure
     # to open the file surface as the original OSError.
     with open(filename, 'rb') as fh:
         content = fh.read()
     for e in encodings:
         try:
             content.decode(e)
         except UnicodeDecodeError:
             # This candidate cannot represent the file; try the next one.
             continue
         if e != 'utf-8':
             print('Opening `%s` with `%s` encoding' % (filename, e))
         return e
     raise Exception('Not suitable encoding found for {}'.format(filename))