mirror of https://github.com/patacrep/patacrep.git
Browse Source
Every manipulated string is unicode. * We guess the encoding of files we read before opening them, and strings read from them are converted to unicode. * We guess the encoding of strings obtained from other modules (plasTeX), and they are converted to unicode. pull/54/head
Louis
11 years ago
16 changed files with 136 additions and 75 deletions
@ -0,0 +1,24 @@ |
|||
{ |
|||
"bookoptions" : [ |
|||
"importantdiagramonly", |
|||
"repeatchords", |
|||
"lilypond", |
|||
"pictures" |
|||
], |
|||
"booktype" : "chorded", |
|||
"lang" : "french", |
|||
"authwords" : { |
|||
"sep" : ["and", "et", "À"], |
|||
"ignore" : ["À"], |
|||
"after" : ["À"] |
|||
}, |
|||
"titleprefixwords": ["À"], |
|||
"datadir" : ".", |
|||
"content" : [["section", "Traditional"], |
|||
"chevaliers_de_la_table_ronde.sg", |
|||
"greensleeves.sg", |
|||
"vent_frais.sg", |
|||
["section", "Example"], |
|||
"example-fr.sg", |
|||
"example-en.sg"] |
|||
} |
@ -0,0 +1,49 @@ |
|||
#!/usr/bin/python |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
"""Dealing with encoding problems.""" |
|||
|
|||
import codecs |
|||
import chardet |
|||
import logging |
|||
from unidecode import unidecode as unidecode_orig |
|||
|
|||
LOGGER = logging.getLogger(__name__) |
|||
|
|||
def open_read(filename, mode='r'):
    """Open a file for reading, guessing the right encoding.

    The file is first read in binary mode so that chardet can inspect its raw
    bytes; it is then opened through codecs.open() with the guessed encoding.

    Arguments:
    - filename: path of the file to open.
    - mode: mode passed on to codecs.open() (default: 'r').

    Return a fileobject, reading unicode strings. Bytes that cannot be
    decoded are replaced (errors='replace') instead of raising an exception.
    """
    # Read the raw bytes for detection, and close this handle right away
    # instead of leaving it to the garbage collector.
    with open(filename, "rb") as raw_file:
        encoding = chardet.detect(raw_file.read())['encoding']
    # chardet gives None when it cannot guess (e.g. empty file). With
    # encoding=None, codecs.open() would hand back undecoded byte strings,
    # so fall back to UTF-8 to keep the "unicode strings" promise.
    return codecs.open(
        filename,
        mode=mode,
        encoding=encoding or 'utf-8',
        errors='replace',
    )
|||
|
|||
def basestring2unicode(arg):
    """Return the unicode version of the argument, guessing original encoding.

    - A unicode object is returned unchanged.
    - A byte string is decoded with the encoding guessed by chardet (UTF-8
      when chardet cannot guess); undecodable bytes are replaced.
    - For any other type, a warning is logged and an empty unicode string is
      returned.
    """
    if isinstance(arg, unicode):
        return arg
    if isinstance(arg, basestring):
        # chardet gives None when it cannot guess (e.g. empty string), and
        # decode() rejects a None encoding: fall back to UTF-8.
        encoding = chardet.detect(arg)['encoding'] or 'utf-8'
        return arg.decode(encoding, 'replace')
    LOGGER.warning("Cannot decode string %s. Ignored.", arg)
    # Return a unicode object, as in the other branches.
    return u""
|||
|
|||
def list2unicode(arg):
    """Convert every item of a list to unicode, guessing original encodings.

    Each item of `arg` is passed through basestring2unicode(), and the
    results are returned as a new list, in the same order.
    """
    converted = []
    for element in arg:
        converted.append(basestring2unicode(element))
    return converted
|||
|
|||
def unidecode(arg):
    """Apply the `unidecode` library function to `arg`.

    The result is returned as a unicode object.
    """
    decoded = unidecode_orig(arg)
    return unicode(decoded)
Loading…
Reference in new issue