mirror of https://github.com/patacrep/patacrep.git
Browse Source
Every manipulated string is unicode. * We guess the encoding of files before opening them, and strings read from them are converted to unicode. * We guess the encoding of strings obtained from other modules (plasTeX), and they are converted to unicode. pull/54/head
Louis
11 years ago
16 changed files with 136 additions and 75 deletions
@ -0,0 +1,24 @@ |
|||||
|
{ |
||||
|
"bookoptions" : [ |
||||
|
"importantdiagramonly", |
||||
|
"repeatchords", |
||||
|
"lilypond", |
||||
|
"pictures" |
||||
|
], |
||||
|
"booktype" : "chorded", |
||||
|
"lang" : "french", |
||||
|
"authwords" : { |
||||
|
"sep" : ["and", "et", "À"], |
||||
|
"ignore" : ["À"], |
||||
|
"after" : ["À"] |
||||
|
}, |
||||
|
"titleprefixwords": ["À"], |
||||
|
"datadir" : ".", |
||||
|
"content" : [["section", "Traditional"], |
||||
|
"chevaliers_de_la_table_ronde.sg", |
||||
|
"greensleeves.sg", |
||||
|
"vent_frais.sg", |
||||
|
["section", "Example"], |
||||
|
"example-fr.sg", |
||||
|
"example-en.sg"] |
||||
|
} |
@ -0,0 +1,49 @@ |
|||||
|
#!/usr/bin/python |
||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
"""Dealing with encoding problems.""" |
||||
|
|
||||
|
import codecs |
||||
|
import chardet |
||||
|
import logging |
||||
|
from unidecode import unidecode as unidecode_orig |
||||
|
|
||||
|
LOGGER = logging.getLogger(__name__) |
||||
|
|
||||
|
def open_read(filename, mode='r'):
    """Open a file for reading, guessing the right encoding.

    The file is first read in binary mode so that `chardet` can guess its
    encoding; it is then reopened with `codecs.open` using that encoding.
    Bytes that cannot be decoded are replaced instead of raising an error.

    Arguments:
    - filename: path of the file to open;
    - mode: mode passed on to `codecs.open` (defaults to 'r').

    Return a fileobject, reading unicode strings.
    """
    # Read the raw bytes for detection, and close this handle right away
    # instead of leaving it to the garbage collector.
    with open(filename, 'rb') as raw_file:
        encoding = chardet.detect(raw_file.read())['encoding']
    return codecs.open(
        filename,
        mode=mode,
        # chardet may report None when it cannot guess (e.g. empty file).
        # With encoding=None, codecs.open skips its decoding wrapper, so
        # fall back on UTF-8 to keep returning unicode strings.
        encoding=encoding or 'utf-8',
        errors='replace',
    )
||||
|
|
||||
|
def basestring2unicode(arg):
    """Return the unicode version of the argument, guessing original encoding.

    - unicode objects are returned unchanged;
    - byte strings are decoded using the encoding guessed by `chardet`
      (bytes that cannot be decoded are replaced);
    - any other object is reported with a warning, and an empty unicode
      string is returned.
    """
    if isinstance(arg, unicode):
        return arg
    if isinstance(arg, basestring):
        # chardet may report None (e.g. for an empty string), which
        # `decode` does not accept as an encoding: fall back on UTF-8.
        encoding = chardet.detect(arg)['encoding'] or 'utf-8'
        return arg.decode(encoding, 'replace')
    LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
    # Return a unicode object, so that every return path has the same type.
    return u""
||||
|
|
||||
|
def list2unicode(arg):
    """Convert every item of an iterable of strings to unicode.

    Each item is passed through `basestring2unicode`, which guesses its
    original encoding. Items that are not strings are replaced by an empty
    string. The results are returned as a new list, in the same order.
    """
    converted = []
    for item in arg:
        converted.append(basestring2unicode(item))
    return converted
||||
|
|
||||
|
def unidecode(arg):
    """Apply the original `unidecode` function and return a unicode object."""
    transliterated = unidecode_orig(arg)
    return unicode(transliterated)
Loading…
Reference in new issue