|
|
@ -3,14 +3,27 @@ |
|
|
|
|
|
|
|
"""Authors string management.""" |
|
|
|
|
|
|
|
import logging |
|
|
|
import re |
|
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__) |
|
|
|
|
|
|
|
# Default author keywords, grouped under the "after", "ignore" and "sep" keys.
# NOTE(review): presumably the default argument for compile_authwords(), which
# reads the 'sep' and 'ignore' lists -- confirm against callers.
DEFAULT_AUTHWORDS = dict(
    after=["by"],
    ignore=["unknown"],
    sep=["and"],
)
|
|
|
|
|
|
|
def to_utf8(string):
    """Convert a string (unicode text or iso-8859-1 bytes) to utf-8.

    Unicode text is encoded to utf-8 directly. Byte strings are always
    interpreted as iso-8859-1 and re-encoded as utf-8.

    Returns the utf-8 encoded byte string, or None (after logging a
    warning) when the argument is neither text nor a byte string.
    """
    # `unicode` only exists on Python 2; on Python 3 every `str` is text.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    if isinstance(string, text_type):
        return string.encode('utf-8')
    # `bytes` is an alias of `str` on Python 2, so this branch covers the
    # original byte-string case on both major versions.
    if isinstance(string, bytes):
        return string.decode('iso-8859-1').encode('utf-8')

    LOGGER.warning("Ignoring a word I can not decode...")
    return None
|
|
|
|
|
|
|
def compile_authwords(authwords): |
|
|
|
"""Convert strings of authwords to compiled regular expressions. |
|
|
|
|
|
|
@ -30,6 +43,7 @@ def compile_authwords(authwords): |
|
|
|
re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) |
|
|
|
for word in ([" %s" % word for word in authwords['sep']] + [',']) |
|
|
|
] |
|
|
|
authwords['ignore'] = [to_utf8(word) for word in authwords['ignore'] if to_utf8(word)] |
|
|
|
|
|
|
|
return authwords |
|
|
|
|
|
|
|