From 21d4f0c24557cc80f5b1a6c1895bf58fdde90b53 Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 4 Jul 2014 17:37:56 +0200 Subject: [PATCH] Sanitize encoding of manipulated strings Every manipulated string is unicode. * We guess the encoding of the files we read before opening them, and strings read from them are converted to unicode. * We guess the encoding of strings obtained from other modules (plasTeX), and they are converted to unicode. --- patacrep/authors.py | 22 ++-------- patacrep/content/__init__.py | 2 +- patacrep/content/section.py | 4 +- patacrep/content/song.py | 6 +-- patacrep/content/songsection.py | 2 +- patacrep/content/sorted.py | 5 ++- patacrep/content/tex.py | 2 +- patacrep/data/examples/example_encoding.sb | 24 +++++++++++ patacrep/encoding.py | 49 ++++++++++++++++++++++ patacrep/index.py | 36 ++++++++-------- patacrep/plastex.py | 6 +-- patacrep/plastex_chord.py | 2 +- patacrep/plastex_songs.py | 14 ++++--- patacrep/songs.py | 5 +-- patacrep/templates.py | 25 ++++++----- songbook | 7 +++- 16 files changed, 136 insertions(+), 75 deletions(-) create mode 100644 patacrep/data/examples/example_encoding.sb create mode 100644 patacrep/encoding.py diff --git a/patacrep/authors.py b/patacrep/authors.py index 9f8911f9..15ef07b1 100644 --- a/patacrep/authors.py +++ b/patacrep/authors.py @@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = { "sep": ["and"], } -def to_utf8(string): - """Convert a string (encoded in unicode or iso-8859-1 to utf-8""" - if type(string) is unicode: - return string.encode('utf-8') - elif type(string) is str: - return string.decode('iso-8859-1').encode('utf-8') - else: - try: - return string.encode('utf-8') - except: - LOGGER.warning("Ignoring a word I can not decode...") - return "" - def compile_authwords(authwords): """Convert strings of authwords to compiled regular expressions. 
@@ -46,10 +33,7 @@ def compile_authwords(authwords): re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) for word in ([" %s" % word for word in authwords['sep']] + [',']) ] - authwords['ignore'] = [ - to_utf8(word) - for word in authwords['ignore'] if to_utf8(word) - ] + authwords['ignore'] = [word for word in authwords['ignore'] if word] return authwords @@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore): for author in authors_list: ignored = False for ignoreword in ignore: - if author.find(str(ignoreword)) != -1: + if author.find(ignoreword) != -1: ignored = True break if not ignored: @@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list): for author in authors_list: first, last = split_author_names(author) if first: - dest.append(r"\indexauthor{{{first}}}{{{last}}}".format( + dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format( first=first.strip(), last=last.strip(), )) diff --git a/patacrep/content/__init__.py b/patacrep/content/__init__.py index 6e363f85..2f4fea0e 100755 --- a/patacrep/content/__init__.py +++ b/patacrep/content/__init__.py @@ -225,7 +225,7 @@ def process_content(content, config=None): """ contentlist = [] plugins = load_plugins(config) - keyword_re = re.compile(r'^ *(?P\w*) *(\((?P.*)\))? *$') + keyword_re = re.compile(ur'^ *(?P\w*) *(\((?P.*)\))? 
*$') if not content: content = [["song"]] for elem in content: diff --git a/patacrep/content/section.py b/patacrep/content/section.py index b800ea5f..2bde2eb2 100755 --- a/patacrep/content/section.py +++ b/patacrep/content/section.py @@ -26,9 +26,9 @@ class Section(Content): def render(self, __context): if self.short is None: - return r'\{}{{{}}}'.format(self.keyword, self.name) + return ur'\{}{{{}}}'.format(self.keyword, self.name) else: - return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name) + return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name) #pylint: disable=unused-argument def parse(keyword, argument, contentlist, config): diff --git a/patacrep/content/song.py b/patacrep/content/song.py index d6dc4886..9965c861 100755 --- a/patacrep/content/song.py +++ b/patacrep/content/song.py @@ -26,15 +26,15 @@ class SongRenderer(Content, Song): indexes = context.resolve("indexes") if isinstance(indexes, jinja2.runtime.Undefined): indexes = "" - return r'\begin{songs}{%s}' % indexes + return ur'\begin{songs}{%s}' % indexes def end_block(self, __context): """Return the string to end a block.""" - return r'\end{songs}' + return ur'\end{songs}' def render(self, context): """Return the string that will render the song.""" - return r'\input{{{}}}'.format(files.relpath( + return ur'\input{{{}}}'.format(files.relpath( self.path, os.path.dirname(context['filename']) )) diff --git a/patacrep/content/songsection.py b/patacrep/content/songsection.py index 2d67c35a..b4c9d446 100755 --- a/patacrep/content/songsection.py +++ b/patacrep/content/songsection.py @@ -19,7 +19,7 @@ class SongSection(Content): def render(self, __context): """Render this section or chapter.""" - return r'\{}{{{}}}'.format(self.keyword, self.name) + return ur'\{}{{{}}}'.format(self.keyword, self.name) #pylint: disable=unused-argument def parse(keyword, argument, contentlist, config): diff --git a/patacrep/content/sorted.py b/patacrep/content/sorted.py index 96ec7af4..12c63189 
100755 --- a/patacrep/content/sorted.py +++ b/patacrep/content/sorted.py @@ -11,6 +11,7 @@ import locale import logging from patacrep import files +from patacrep import encoding from patacrep.content import ContentError from patacrep.content.song import OnlySongsError, process_songs @@ -26,7 +27,7 @@ def normalize_string(string): - lower case; - passed through locale.strxfrm(). """ - return locale.strxfrm(string.lower().strip()) + return locale.strxfrm(encoding.unidecode(string.lower().strip())) def normalize_field(field): """Return a normalized field, it being a string or a list of strings.""" @@ -62,7 +63,7 @@ def key_generator(sort): files.relpath(song.path), ) ) - field = "" + field = u"" songkey.append(normalize_field(field)) return songkey return ordered_song_keys diff --git a/patacrep/content/tex.py b/patacrep/content/tex.py index a934de48..1e18ecfd 100755 --- a/patacrep/content/tex.py +++ b/patacrep/content/tex.py @@ -18,7 +18,7 @@ class LaTeX(Content): self.filename = filename def render(self, context): - return r'\input{{{}}}'.format(files.relpath( + return ur'\input{{{}}}'.format(files.relpath( self.filename, os.path.dirname(context['filename']), )) diff --git a/patacrep/data/examples/example_encoding.sb b/patacrep/data/examples/example_encoding.sb new file mode 100644 index 00000000..0519b144 --- /dev/null +++ b/patacrep/data/examples/example_encoding.sb @@ -0,0 +1,24 @@ +{ +"bookoptions" : [ + "importantdiagramonly", + "repeatchords", + "lilypond", + "pictures" + ], +"booktype" : "chorded", +"lang" : "french", +"authwords" : { + "sep" : ["and", "et", "À"], + "ignore" : ["À"], + "after" : ["À"] + }, + "titleprefixwords": ["À"], + "datadir" : ".", + "content" : [["section", "Traditional"], + "chevaliers_de_la_table_ronde.sg", + "greensleeves.sg", + "vent_frais.sg", + ["section", "Example"], + "example-fr.sg", + "example-en.sg"] +} diff --git a/patacrep/encoding.py b/patacrep/encoding.py new file mode 100644 index 00000000..d3c32a84 --- /dev/null +++ 
b/patacrep/encoding.py @@ -0,0 +1,49 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +"""Dealing with encoding problems.""" + +import codecs +import chardet +import logging +from unidecode import unidecode as unidecode_orig + +LOGGER = logging.getLogger(__name__) + +def open_read(filename, mode='r'): + """Open a file for reading, guessing the right encoding. + + Return a fileobject, reading unicode strings. + """ + return codecs.open( + filename, + mode=mode, + encoding=chardet.detect(open(filename, "r").read())['encoding'], + errors='replace', + ) + +def basestring2unicode(arg): + """Return the unicode version of the argument, guessing original encoding. + """ + if isinstance(arg, unicode): + return arg + elif isinstance(arg, basestring): + return arg.decode( + encoding=chardet.detect(arg)['encoding'], + errors='replace', + ) + else: + LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg))) + return "" + +def list2unicode(arg): + """Return the unicode version of the argument, guessing original encoding. + + Argument is a list of strings. If an item is of another type, it is + silently ignored (an empty string is returned). + """ + return [basestring2unicode(item) for item in arg] + +def unidecode(arg): + """Return a unicode version of a unidecoded string.""" + return unicode(unidecode_orig(arg)) diff --git a/patacrep/index.py b/patacrep/index.py index 07c1d558..b921a65d 100755 --- a/patacrep/index.py +++ b/patacrep/index.py @@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx) from a file generated by the latex compilation of the songbook (.sxd). 
""" -from unidecode import unidecode import locale import re -import codecs from patacrep import authors +from patacrep import encoding from patacrep.plastex import simpleparse EOL = u"\n" # Pattern set to ignore latex command in title prefix -KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE) -FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE) +KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE) +FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE) def sortkey(value): @@ -31,7 +30,7 @@ def sortkey(value): the sort with latex escape sequences. """ return locale.strxfrm( - unidecode(simpleparse(value).replace(' ', 'A')).lower() + encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower() ) @@ -41,9 +40,12 @@ def process_sxd(filename): Return an Index object. """ data = [] - with codecs.open(filename, 'r', 'iso-8859-1') as index_file: + try: + index_file = encoding.open_read(filename, 'r') for line in index_file: - data.append(line.strip().encode('utf-8')) + data.append(line.strip()) + finally: + index_file.close() i = 1 idx = Index(data[0]) @@ -82,7 +84,7 @@ class Index(object): def get_first_letter(key): """Return the uppercase first letter of key.""" letter = FIRST_LETTER_PATTERN.match(key).group(1) - if re.match(r'\d', letter): + if re.match(ur'\d', letter): letter = '0-9' return letter.upper() @@ -98,9 +100,9 @@ class Index(object): if 'prefix' in self.keywords: for prefix in self.keywords['prefix']: self.prefix_patterns.append(re.compile( - r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix), - re.LOCALE - )) + ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix), + re.LOCALE + )) if self.indextype == "AUTHOR": self.authwords = authors.compile_authwords(self.keywords) @@ -126,10 +128,10 @@ class Index(object): if self.indextype == "TITLE": # Removing prefixes before titles for pattern in self.prefix_patterns: - match = pattern.match(key.encode('utf-8')) + match = pattern.match(key) if 
match: self._raw_add( - r"\indextitle{{{}}}{{{}}}".format( + ur"\indextitle{{{}}}{{{}}}".format( match.group(1).strip(), (match.group(2) + match.group(3)).strip(), ), @@ -149,12 +151,10 @@ class Index(object): @staticmethod def ref_to_str(ref): """Return the LaTeX code corresponding to the reference.""" - return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref) + return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref) def entry_to_str(self, key, entry): """Return the LaTeX code corresponding to the entry.""" - if not isinstance(key, unicode): - key = unicode(key, "UTF-8") return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format( key, ur'\\'.join([self.ref_to_str(ref) for ref in entry]), @@ -166,10 +166,10 @@ class Index(object): Here, an index block is a letter, and all data beginning with this letter. """ - string = r'\begin{idxblock}{' + letter + '}' + EOL + string = ur'\begin{idxblock}{' + letter + '}' + EOL for key in sorted(entries.keys(), key=sortkey): string += self.entry_to_str(key, entries[key]) - string += r'\end{idxblock}' + EOL + string += ur'\end{idxblock}' + EOL return string def entries_to_str(self): diff --git a/patacrep/plastex.py b/patacrep/plastex.py index c4c81e62..b1c906b2 100644 --- a/patacrep/plastex.py +++ b/patacrep/plastex.py @@ -6,11 +6,11 @@ from plasTeX.TeX import TeX from plasTeX.Base.LaTeX import Sentences -import codecs import locale import os import sys +from patacrep import encoding def process_unbr_spaces(node): #pylint: disable=line-too-long @@ -39,8 +39,6 @@ def simpleparse(text): """Parse a simple LaTeX string. 
""" tex = TeX() - if not isinstance(text, unicode): - text = text.decode("utf-8") tex.input(text) doc = tex.parse() return process_unbr_spaces(doc.textContent) @@ -66,7 +64,7 @@ class SongParser(object): def parse(cls, filename): """Parse a TeX file, and return its plasTeX representation.""" tex = cls.create_tex() - tex.input(codecs.open(filename, 'r', 'utf-8', 'replace')) + tex.input(encoding.open_read(filename, 'r')) return tex.parse() diff --git a/patacrep/plastex_chord.py b/patacrep/plastex_chord.py index 6b213335..4fc0afe5 100644 --- a/patacrep/plastex_chord.py +++ b/patacrep/plastex_chord.py @@ -74,7 +74,7 @@ class Chord(Command): @property def source(self): """Return chord LaTeX code.""" - return r'\[{}]'.format(self.chord) + return ur'\[{}]'.format(self.chord) class BeginChordOrDisplayMath(BeginDisplayMath): r"""Wrapper to BeginDisplayMath diff --git a/patacrep/plastex_songs.py b/patacrep/plastex_songs.py index 43c31986..9bc0eba2 100755 --- a/patacrep/plastex_songs.py +++ b/patacrep/plastex_songs.py @@ -6,6 +6,7 @@ import plasTeX +from patacrep import encoding from patacrep.plastex import process_unbr_spaces @@ -28,8 +29,9 @@ def split_linebreak(texlist): return_list.append(current) current = [] else: - current.append( - process_unbr_spaces(token).textContent.encode('utf-8')) + current.append(encoding.basestring2unicode( + process_unbr_spaces(token).textContent + )) if current: return_list.append(current) return return_list @@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public titles = [] for tokens in split_linebreak(self.attributes['titles'].allChildNodes): titles.append("".join(tokens)) - self.attributes['titles'] = titles + self.attributes['titles'] = encoding.list2unicode(titles) # Parsing keyval arguments args = {} for (key, val) in self.attributes['args'].iteritems(): if isinstance(val, plasTeX.DOM.Element): - args[key] = process_unbr_spaces(val).textContent.encode('utf-8') + args[key] = 
encoding.basestring2unicode( + process_unbr_spaces(val).textContent + ) elif isinstance(val, basestring): - args[key] = val.encode('utf-8') + args[key] = encoding.basestring2unicode(val) else: args[key] = unicode(val) self.attributes['args'] = args diff --git a/patacrep/songs.py b/patacrep/songs.py index 2cc49846..a3adc856 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -3,7 +3,6 @@ """Song management.""" -from unidecode import unidecode import re from patacrep.authors import processauthors @@ -19,7 +18,7 @@ class Song(object): self.titles = data['titles'] self.unprefixed_titles = [ unprefixed_title( - unidecode(unicode(title, "utf-8")), + title, config['titleprefixwords'] ) for title @@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes): """Remove the first prefix of the list in the beginning of title (if any). """ for prefix in prefixes: - match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title) + match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title) if match: return match.group(2) return title diff --git a/patacrep/templates.py b/patacrep/templates.py index 9ac415a1..1030aaa1 100755 --- a/patacrep/templates.py +++ b/patacrep/templates.py @@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \ TemplateNotFound, nodes from jinja2.ext import Extension from jinja2.meta import find_referenced_templates as find_templates -import codecs import os import re import json +from patacrep import encoding from patacrep import errors _LATEX_SUBS = ( - (re.compile(r'\\'), r'\\textbackslash'), - (re.compile(r'([{}_#%&$])'), r'\\\1'), - (re.compile(r'~'), r'\~{}'), - (re.compile(r'\^'), r'\^{}'), - (re.compile(r'"'), r"''"), - (re.compile(r'\.\.\.+'), r'\\ldots'), + (re.compile(ur'\\'), ur'\\textbackslash'), + (re.compile(ur'([{}_#%&$])'), ur'\\\1'), + (re.compile(ur'~'), ur'\~{}'), + (re.compile(ur'\^'), ur'\^{}'), + (re.compile(ur'"'), ur"''"), + (re.compile(ur'\.\.\.+'), ur'\\ldots'), ) -_VARIABLE_REGEXP 
= re.compile(r""" +_VARIABLE_REGEXP = re.compile(ur""" \(\*\ *variables\ *\*\) # Match (* variables *) ( # Match and capture the following: (?: # Start of non-capturing group, used to match a single character @@ -178,11 +178,8 @@ class TexRenderer(object): subvariables = {} templatename = self.texenv.get_template(template).filename - with codecs.open( - templatename, - 'r', - 'utf-8' - ) as template_file: + try: + template_file = encoding.open_read(templatename, 'r') content = template_file.read() subtemplates = list(find_templates(self.texenv.parse(content))) match = re.findall(_VARIABLE_REGEXP, content) @@ -202,6 +199,8 @@ class TexRenderer(object): jsonstring=var, ) ) + finally: + template_file.close() return (subvariables, subtemplates) diff --git a/songbook b/songbook index 53d707a1..f9aee049 100755 --- a/songbook +++ b/songbook @@ -15,6 +15,7 @@ import sys from patacrep.build import SongbookBuilder, DEFAULT_STEPS from patacrep import __STR_VERSION__ from patacrep import errors +from patacrep import encoding # Logging configuration logging.basicConfig(level=logging.INFO) @@ -101,12 +102,14 @@ def main(): basename = os.path.basename(songbook_path)[:-3] try: - with open(songbook_path) as songbook_file: - songbook = json.load(songbook_file) + songbook_file = encoding.open_read(songbook_path) + songbook = json.load(songbook_file) except Exception as error: # pylint: disable=broad-except LOGGER.error(error) LOGGER.error("Error while loading file '{}'.".format(songbook_path)) sys.exit(1) + finally: + songbook_file.close() # Gathering datadirs datadirs = []