Browse Source

Correcting encoding problems #50

pull/54/head
Louis 11 years ago
parent
commit
3455204f6d
  1. 14
      patacrep/authors.py
  2. 4
      patacrep/index.py

14
patacrep/authors.py

@ -3,14 +3,27 @@
"""Authors string management."""
import logging
import re
# Logger for this module, registered under the module's own name.
LOGGER = logging.getLogger(__name__)

# Default author-related keywords, grouped by role. Each value is a list of
# words.
# NOTE(review): "sep" words are turned into separator regular expressions and
# "ignore" words are converted to utf-8 by compile_authwords(); the exact role
# of "after" is not visible from here -- confirm against the author parser.
DEFAULT_AUTHWORDS = {
    "after": ["by"],
    "ignore": ["unknown"],
    "sep": ["and"],
}
def to_utf8(string):
    """Convert a string (unicode text or iso-8859-1 bytes) to utf-8 bytes.

    Arguments:
    - string: either a text (unicode) string, or a byte string which is
      assumed to be encoded in iso-8859-1.

    Return the utf-8 encoded byte string. If `string` is neither a text
    string nor a byte string, log a warning and return None.
    """
    # `type(u"")` is `unicode` on Python 2 and `str` on Python 3, while
    # `bytes` is an alias of `str` on Python 2: the two branches therefore
    # match the same types as before on Python 2, and keep working on
    # Python 3 where the name `unicode` no longer exists.
    # `isinstance` (rather than an exact type comparison) also accepts
    # subclasses of the text and byte string types.
    if isinstance(string, type(u"")):
        return string.encode('utf-8')
    if isinstance(string, bytes):
        # Every byte value is valid iso-8859-1, so this decode cannot fail.
        return string.decode('iso-8859-1').encode('utf-8')
    LOGGER.warning("Ignoring a word I can not decode...")
    return None
def compile_authwords(authwords):
"""Convert strings of authwords to compiled regular expressions.
@ -30,6 +43,7 @@ def compile_authwords(authwords):
re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
for word in ([" %s" % word for word in authwords['sep']] + [','])
]
authwords['ignore'] = [to_utf8(word) for word in authwords['ignore'] if to_utf8(word)]
return authwords

4
patacrep/index.py

@ -41,9 +41,9 @@ def process_sxd(filename):
Return an Index object.
"""
data = []
with codecs.open(filename, 'r', 'utf-8') as index_file:
with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
for line in index_file:
data.append(line.strip())
data.append(line.strip().encode('utf-8'))
i = 1
idx = Index(data[0])

Loading…
Cancel
Save