diff --git a/patacrep/authors.py b/patacrep/authors.py index 3a3e8600..8cb66962 100644 --- a/patacrep/authors.py +++ b/patacrep/authors.py @@ -3,14 +3,27 @@ """Authors string management.""" +import logging import re +LOGGER = logging.getLogger(__name__) + DEFAULT_AUTHWORDS = { "after": ["by"], "ignore": ["unknown"], "sep": ["and"], } +def to_utf8(string): + """Convert a string (encoded in unicode or iso-8859-1) to utf-8""" + if type(string) is unicode: + return string.encode('utf-8') + elif type(string) is str: + return string.decode('iso-8859-1').encode('utf-8') + else: + LOGGER.warning("Ignoring a word I can not decode...") + return None + def compile_authwords(authwords): """Convert strings of authwords to compiled regular expressions. @@ -30,6 +43,7 @@ def compile_authwords(authwords): re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) for word in ([" %s" % word for word in authwords['sep']] + [',']) ] + authwords['ignore'] = [to_utf8(word) for word in authwords['ignore'] if to_utf8(word)] return authwords diff --git a/patacrep/index.py b/patacrep/index.py index 9ee5e99d..07c1d558 100755 --- a/patacrep/index.py +++ b/patacrep/index.py @@ -41,9 +41,9 @@ def process_sxd(filename): Return an Index object. """ data = [] - with codecs.open(filename, 'r', 'utf-8') as index_file: + with codecs.open(filename, 'r', 'iso-8859-1') as index_file: for line in index_file: - data.append(line.strip()) + data.append(line.strip().encode('utf-8')) i = 1 idx = Index(data[0])