From a0c3e83e2f2ba7ca8df8b2255ccf0a3f134cbe82 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:27:51 +0100 Subject: [PATCH 1/8] Detect file encoding with a priority list --- patacrep/encoding.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index fd58fc2f..4cecd11a 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -16,15 +16,32 @@ def open_read(filename, mode='r', encoding=None): If `encoding` is set, use it as the encoding (do not guess). """ if encoding is None: - with open(filename, 'rb') as file: - fileencoding = chardet.detect(file.read())['encoding'] - else: - fileencoding = encoding + encoding = detect_encoding(filename) with codecs.open( filename, mode=mode, - encoding=fileencoding, + encoding=encoding, errors='replace', ) as fileobject: yield fileobject + +def detect_encoding(filename): + """Return the most likely encoding of the file + """ + encodings = ['utf-8', 'windows-1250', 'windows-1252'] + for e in encodings: + try: + fh = codecs.open(filename, 'r', encoding=e) + fh.readlines() + fh.seek(0) + except UnicodeDecodeError: + pass + else: + if e != 'utf-8': + print('Opening `%s` with `%s` encoding' % (filename, e)) + return e + finally: + fh.close() + + raise Exception('Not suitable encoding found for {}'.format(filename)) From ac2363ca37cd7d61467da2db01a62d8cc9788dda Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:31:29 +0100 Subject: [PATCH 2/8] Correct encoding of control file --- test/test_compilation/syntax.tex.control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_compilation/syntax.tex.control b/test/test_compilation/syntax.tex.control index 2794e2fa..6f213ef2 100644 --- a/test/test_compilation/syntax.tex.control +++ b/test/test_compilation/syntax.tex.control @@ -92,7 +92,7 @@ guitar, \selectlanguage{english} -\beginsong{Song with Sharp in musicnote}[ +\beginsong{Song with Sharp in musicnote}[ by={ }, ] From d483c87d8b2bc4fb6f43ec119db27a7ec5c19029 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:37:00 +0100 Subject: [PATCH 3/8] chardet package is not required anymore --- Requirements.txt | 1 - patacrep/encoding.py | 1 - setup.py | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Requirements.txt b/Requirements.txt index 1673cecb..1fd1ebb8 100644 --- a/Requirements.txt +++ b/Requirements.txt @@ -1,4 +1,3 @@ ply Jinja2==2.7.3 -chardet==2.2.1 unidecode>=0.04.16 diff --git a/patacrep/encoding.py b/patacrep/encoding.py index 4cecd11a..20783111 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -1,7 +1,6 @@ """Dealing with encoding problems.""" import codecs -import chardet import logging import contextlib diff --git a/setup.py b/setup.py index 023201eb..85979a31 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( packages=find_packages(exclude=["test*"]), license="GPLv2 or any later version", install_requires=[ - "unidecode", "jinja2", "chardet", "ply", + "unidecode", "jinja2", "ply", ], setup_requires=["hgtools"], package_data={'patacrep': [ From 46ceecd1c01d19b3c824e07872fad8d6d5e182f7 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:39:49 +0100 Subject: [PATCH 4/8] Improve variable names (pylint) --- patacrep/encoding.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index 20783111..9f0018da 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -29,18 +29,18 @@ def detect_encoding(filename): """Return the most likely encoding of the file """ encodings = ['utf-8', 'windows-1250', 'windows-1252'] - for e in encodings: + for encoding in encodings: try: - fh = codecs.open(filename, 'r', encoding=e) - fh.readlines() - fh.seek(0) + filehandler = codecs.open(filename, 'r', encoding=encoding) + filehandler.readlines() + filehandler.seek(0) except UnicodeDecodeError: pass else: - if e != 'utf-8': - print('Opening `%s` with `%s` encoding' % (filename, e)) - return e + if encoding != 'utf-8': + print('Opening `%s` with `%s` encoding' % (filename, encoding)) + return encoding finally: - fh.close() + filehandler.close() raise Exception('Not suitable encoding found for {}'.format(filename)) From b220d6d36aa6b215dba11b67462107dbce41308c Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 11:39:59 +0100 Subject: [PATCH 5/8] Improve exception type --- patacrep/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index 9f0018da..d38daf60 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -43,4 +43,4 @@ def detect_encoding(filename): finally: filehandler.close() - raise Exception('Not suitable encoding found for {}'.format(filename)) + raise UnicodeDecodeError('Not suitable encoding found for {}'.format(filename)) From 4c63e6fe707cba64494fb0e2d0048bf374403b28 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 12:59:15 +0100 Subject: [PATCH 6/8] Use logger instead of print --- patacrep/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index d38daf60..189db9a3 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -38,7 +38,7 @@ def detect_encoding(filename): pass else: if encoding != 'utf-8': - print('Opening `%s` with `%s` encoding' % (filename, encoding)) + LOGGER.info('Opening `%s` with `%s` encoding' % (filename, encoding)) return encoding finally: filehandler.close() From d3046339d1633c399eac26fb8aed460acffb2a2f Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 13:53:40 +0100 Subject: [PATCH 7/8] pylint - use format for string formatting --- patacrep/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index 189db9a3..1ed15b94 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -38,7 +38,7 @@ def detect_encoding(filename): pass else: if encoding != 'utf-8': - LOGGER.info('Opening `%s` with `%s` encoding' % (filename, encoding)) + LOGGER.info('Opening `{}` with `{}` encoding'.format(filename, encoding)) return encoding finally: filehandler.close() From f22b726484f12292d91bf711c77a55456476c7b3 Mon Sep 17 00:00:00 2001 From: Oliverpool Date: Sat, 7 Nov 2015 14:14:56 +0100 Subject: [PATCH 8/8] Correct Exception for unknown encoding --- patacrep/encoding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/patacrep/encoding.py b/patacrep/encoding.py index 1ed15b94..3f1512cf 100644 --- a/patacrep/encoding.py +++ b/patacrep/encoding.py @@ -42,5 +42,4 @@ def detect_encoding(filename): return encoding finally: filehandler.close() - - raise UnicodeDecodeError('Not suitable encoding found for {}'.format(filename)) + raise UnicodeError('Not suitable encoding found for {}'.format(filename))