Detect file encoding with a priority list

9 years ago · a0c3e83e2f
1 changed file with 22 additions and 5 deletions
--- a/patacrep/encoding.py
+++ b/patacrep/encoding.py
@@ -16,15 +16,32 @@ def open_read(filename, mode='r', encoding=None):
    If `encoding` is set, use it as the encoding (do not guess).
    """
    if encoding is None:
-        with open(filename, 'rb') as file:
-            fileencoding = chardet.detect(file.read())['encoding']
-    else:
-        fileencoding = encoding
+        encoding = detect_encoding(filename)

    with codecs.open(
        filename,
        mode=mode,
-        encoding=fileencoding,
+        encoding=encoding,
        errors='replace',
        ) as fileobject:
        yield fileobject
+
+def detect_encoding(filename):
+    """Return the first candidate encoding that decodes the whole file.
+
+    Tries 'utf-8', 'windows-1250', 'windows-1252' in that order; raises
+    `Exception` if none fits. Errors from opening propagate unchanged.
+    """
+    encodings = ['utf-8', 'windows-1250', 'windows-1252']
+    for e in encodings:
+        try:
+            # `with` closes the handle and is safe even if the open fails.
+            with codecs.open(filename, 'r', encoding=e) as fh:
+                fh.read()
+        except UnicodeDecodeError:
+            continue
+        if e != 'utf-8':
+            print('Opening `%s` with `%s` encoding' % (filename, e))
+        return e
+
+    raise Exception('Not suitable encoding found for {}'.format(filename))