Browse Source

Detect file encoding with a priority list

pull/161/head
Oliverpool 9 years ago
parent
commit
a0c3e83e2f
  1. 27
      patacrep/encoding.py

27
patacrep/encoding.py

@ -16,15 +16,32 @@ def open_read(filename, mode='r', encoding=None):
If `encoding` is set, use it as the encoding (do not guess). If `encoding` is set, use it as the encoding (do not guess).
""" """
if encoding is None: if encoding is None:
with open(filename, 'rb') as file: encoding = detect_encoding(filename)
fileencoding = chardet.detect(file.read())['encoding']
else:
fileencoding = encoding
with codecs.open( with codecs.open(
filename, filename,
mode=mode, mode=mode,
encoding=fileencoding, encoding=encoding,
errors='replace', errors='replace',
) as fileobject: ) as fileobject:
yield fileobject yield fileobject
def detect_encoding(filename):
    """Return the most likely encoding of the file.

    The candidate encodings are tried in priority order ('utf-8', then
    'windows-1250', then 'windows-1252'); the first one that decodes the
    whole file without error is returned. When the chosen encoding is not
    'utf-8', a notice is printed.

    Arguments:
    - filename: path of the file to inspect.

    Return: the name of the first candidate encoding that decodes the file.

    Raise:
    - OSError (e.g. FileNotFoundError) if the file cannot be opened;
    - Exception if none of the candidate encodings can decode the file.
    """
    encodings = ['utf-8', 'windows-1250', 'windows-1252']
    for candidate in encodings:
        try:
            # The context manager closes the handle on success and on
            # failure, and never touches an unbound name when the open
            # itself fails (which would otherwise hide the real error).
            with codecs.open(filename, 'r', encoding=candidate) as handle:
                # Decode the whole file: a decoding error may occur anywhere.
                handle.read()
        except UnicodeDecodeError:
            # This candidate does not fit; try the next one.
            continue
        if candidate != 'utf-8':
            print('Opening `%s` with `%s` encoding' % (filename, candidate))
        return candidate
    raise Exception('Not suitable encoding found for {}'.format(filename))

Loading…
Cancel
Save