Sanitize encoding of manipulated strings

Every manipulated string is unicode. * We guess the encoding of the files we read before opening them, and the strings read from them are converted to unicode. * We guess the encoding of strings obtained from other modules (plasTeX), and convert them to unicode.
11 years ago · 21d4f0c245
16 changed files with 136 additions and 75 deletions
--- a/patacrep/authors.py
+++ b/patacrep/authors.py
@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = {
        "sep": ["and"],
        }
 def to_utf8(string):
    """Convert a string to a UTF-8 encoded byte string.

    The argument may be a unicode object, or a byte string (which is assumed
    to be encoded in iso-8859-1). Any other object is converted by calling
    its own encode() method.

    Return the UTF-8 encoded string. If the argument cannot be encoded, a
    warning is logged and an empty string is returned.
    """
    if isinstance(string, unicode):
        return string.encode('utf-8')
    if isinstance(string, str):
        return string.decode('iso-8859-1').encode('utf-8')
    try:
        return string.encode('utf-8')
    except Exception: # pylint: disable=broad-except
        # Best effort: a word that cannot be encoded is dropped. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        LOGGER.warning("Ignoring a word I can not decode...")
        return ""
 def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.
@ -46,10 +33,7 @@ def compile_authwords(authwords):
            re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
            for word in ([" %s" % word for word in authwords['sep']] + [','])
            ]
-    authwords['ignore'] = [
+    authwords['ignore'] = [word for word in authwords['ignore'] if word]
            to_utf8(word)
            for word in authwords['ignore'] if to_utf8(word)
            ]
    return authwords
@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore):
    for author in authors_list:
        ignored = False
        for ignoreword in ignore:
-            if author.find(str(ignoreword)) != -1:
+            if author.find(ignoreword) != -1:
                ignored = True
                break
        if not ignored:
@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list):
    for author in authors_list:
        first, last = split_author_names(author)
        if first:
-            dest.append(r"\indexauthor{{{first}}}{{{last}}}".format(
+            dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
                first=first.strip(),
                last=last.strip(),
                ))
--- a/patacrep/content/init.py
+++ b/patacrep/content/init.py
@ -225,7 +225,7 @@ def process_content(content, config=None):
    """
    contentlist = []
    plugins = load_plugins(config)
-    keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
+    keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
    if not content:
        content = [["song"]]
    for elem in content:
--- a/patacrep/content/section.py
+++ b/patacrep/content/section.py
@ -26,9 +26,9 @@ class Section(Content):
    def render(self, __context):
        if self.short is None:
-            return r'\{}{{{}}}'.format(self.keyword, self.name)
+            return ur'\{}{{{}}}'.format(self.keyword, self.name)
        else:
-            return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
+            return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/song.py
+++ b/patacrep/content/song.py
@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
        indexes = context.resolve("indexes")
        if isinstance(indexes, jinja2.runtime.Undefined):
            indexes = ""
-        return r'\begin{songs}{%s}' % indexes
+        return ur'\begin{songs}{%s}' % indexes
    def end_block(self, __context):
        """Return the string to end a block."""
-        return r'\end{songs}'
+        return ur'\end{songs}'
    def render(self, context):
        """Return the string that will render the song."""
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.path,
            os.path.dirname(context['filename'])
            ))
--- a/patacrep/content/songsection.py
+++ b/patacrep/content/songsection.py
@ -19,7 +19,7 @@ class SongSection(Content):
    def render(self, __context):
        """Render this section or chapter."""
-        return r'\{}{{{}}}'.format(self.keyword, self.name)
+        return ur'\{}{{{}}}'.format(self.keyword, self.name)
 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/sorted.py
+++ b/patacrep/content/sorted.py
@ -11,6 +11,7 @@ import locale
 import logging
 from patacrep import files
 from patacrep import encoding
 from patacrep.content import ContentError
 from patacrep.content.song import OnlySongsError, process_songs
@ -26,7 +27,7 @@ def normalize_string(string):
    - lower case;
    - passed through locale.strxfrm().
    """
-    return locale.strxfrm(string.lower().strip())
+    return locale.strxfrm(encoding.unidecode(string.lower().strip()))
 def normalize_field(field):
    """Return a normalized field, it being a string or a list of strings."""
@ -62,7 +63,7 @@ def key_generator(sort):
                                files.relpath(song.path),
                                )
                            )
-                    field = ""
+                    field = u""
            songkey.append(normalize_field(field))
        return songkey
    return ordered_song_keys
--- a/patacrep/content/tex.py
+++ b/patacrep/content/tex.py
@ -18,7 +18,7 @@ class LaTeX(Content):
        self.filename = filename
    def render(self, context):
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.filename,
            os.path.dirname(context['filename']),
            ))
--- a/patacrep/data/examples/example_encoding.sb
+++ b/patacrep/data/examples/example_encoding.sb
@ -0,0 +1,24 @@
 {
 "bookoptions" : [
    "importantdiagramonly",
    "repeatchords",
    "lilypond",
    "pictures"
  ],
 "booktype" : "chorded",
 "lang" : "french",
 "authwords" : {
  "sep" : ["and", "et", "À"],
  "ignore" : ["À"],
  "after" : ["À"]
  },
  "titleprefixwords": ["À"],
 "datadir" : ".",
 "content" : [["section", "Traditional"],
                "chevaliers_de_la_table_ronde.sg",
                "greensleeves.sg",
                "vent_frais.sg",
                ["section", "Example"],
                "example-fr.sg",
                "example-en.sg"]
 }               
--- a/patacrep/encoding.py
+++ b/patacrep/encoding.py
@ -0,0 +1,49 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """Dealing with encoding problems."""
 import codecs
 import chardet
 import logging
 from unidecode import unidecode as unidecode_orig
 LOGGER = logging.getLogger(__name__)
 def open_read(filename, mode='r'):
    """Open a file for reading, guessing the right encoding.

    Arguments:
    - filename: path of the file to open;
    - mode: opening mode, passed to codecs.open().

    The content of the file is first read as raw bytes, and given to chardet
    to guess its encoding. If no encoding can be guessed (e.g. the file is
    empty), UTF-8 is used. Undecodable bytes are replaced instead of raising
    an error.

    Return a fileobject, reading unicode strings. The caller is responsible
    for closing it.
    """
    # Read the sample in binary mode so that chardet sees the untouched
    # bytes, and close this temporary handle as soon as detection is done.
    with open(filename, "rb") as rawfile:
        guessed = chardet.detect(rawfile.read())['encoding']
    return codecs.open(
            filename,
            mode=mode,
            # With encoding=None, codecs.open() would return a plain file
            # object reading byte strings: fall back to UTF-8 instead.
            encoding=guessed or 'utf-8',
            errors='replace',
            )
 def basestring2unicode(arg):
    """Return the unicode version of the argument, guessing original encoding.

    - If the argument is a unicode object, it is returned unchanged.
    - If it is a byte string, its encoding is guessed using chardet, and it
      is decoded (undecodable bytes are replaced). If no encoding can be
      guessed (e.g. the string is empty), UTF-8 is used.
    - Otherwise, a warning is logged, and an empty unicode string is
      returned.
    """
    if isinstance(arg, unicode):
        return arg
    elif isinstance(arg, basestring):
        # chardet reports None as the encoding when it cannot guess;
        # decoding with encoding=None would raise a TypeError.
        guessed = chardet.detect(arg)['encoding'] or 'utf-8'
        return arg.decode(
                encoding=guessed,
                errors='replace',
                )
    else:
        LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
        return u""
 def list2unicode(arg):
    """Return a new list holding the unicode version of each item.

    Argument is an iterable of strings. Each item is converted by
    basestring2unicode(), in order; whatever that function returns for an
    item takes the place of this item in the returned list.
    """
    converted = []
    for element in arg:
        converted.append(basestring2unicode(element))
    return converted
 def unidecode(arg):
    """Apply the unidecode library to the argument.

    The result of the library call is converted to a unicode object before
    being returned.
    """
    transliterated = unidecode_orig(arg)
    return unicode(transliterated)
--- a/patacrep/index.py
+++ b/patacrep/index.py
@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
 from a file generated by the latex compilation of the songbook (.sxd).
 """
 from unidecode import unidecode
 import locale
 import re
 import codecs
 from patacrep import authors
 from patacrep import encoding
 from patacrep.plastex import simpleparse
 EOL = u"\n"
 # Pattern set to ignore latex command in title prefix
-KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE)
+KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
-FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
+FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
 def sortkey(value):
@ -31,7 +30,7 @@ def sortkey(value):
    the sort with  latex escape sequences.
    """
    return locale.strxfrm(
-            unidecode(simpleparse(value).replace(' ', 'A')).lower()
+            encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
            )
@ -41,9 +40,12 @@ def process_sxd(filename):
    Return an Index object.
    """
    data = []
-    with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
+    try:
        index_file = encoding.open_read(filename, 'r')
        for line in index_file:
-            data.append(line.strip().encode('utf-8'))
+            data.append(line.strip())
    finally:
        index_file.close()
    i = 1
    idx = Index(data[0])
@ -82,7 +84,7 @@ class Index(object):
    def get_first_letter(key):
        """Return the uppercase first letter of key."""
        letter = FIRST_LETTER_PATTERN.match(key).group(1)
-        if re.match(r'\d', letter):
+        if re.match(ur'\d', letter):
            letter = '0-9'
        return letter.upper()
@ -98,9 +100,9 @@ class Index(object):
            if 'prefix' in self.keywords:
                for prefix in self.keywords['prefix']:
                    self.prefix_patterns.append(re.compile(
-                            r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
+                        ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
-                            re.LOCALE
+                        re.LOCALE
-                            ))
+                        ))
        if self.indextype == "AUTHOR":
            self.authwords = authors.compile_authwords(self.keywords)
@ -126,10 +128,10 @@ class Index(object):
        if self.indextype == "TITLE":
            # Removing prefixes before titles
            for pattern in self.prefix_patterns:
-                match = pattern.match(key.encode('utf-8'))
+                match = pattern.match(key)
                if match:
                    self._raw_add(
-                            r"\indextitle{{{}}}{{{}}}".format(
+                            ur"\indextitle{{{}}}{{{}}}".format(
                                match.group(1).strip(),
                                (match.group(2) + match.group(3)).strip(),
                                ),
@ -149,12 +151,10 @@ class Index(object):
    @staticmethod
    def ref_to_str(ref):
        """Return the LaTeX code corresponding to the reference."""
-        return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
+        return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
    def entry_to_str(self, key, entry):
        """Return the LaTeX code corresponding to the entry."""
        if not isinstance(key, unicode):
            key = unicode(key, "UTF-8")
        return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
                key,
                ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@ -166,10 +166,10 @@ class Index(object):
        Here, an index block is a letter, and all data beginning with this
        letter.
        """
-        string = r'\begin{idxblock}{' + letter + '}' + EOL
+        string = ur'\begin{idxblock}{' + letter + '}' + EOL
        for key in sorted(entries.keys(), key=sortkey):
            string += self.entry_to_str(key, entries[key])
-        string += r'\end{idxblock}' + EOL
+        string += ur'\end{idxblock}' + EOL
        return string
    def entries_to_str(self):
--- a/patacrep/plastex.py
+++ b/patacrep/plastex.py
@ -6,11 +6,11 @@
 from plasTeX.TeX import TeX
 from plasTeX.Base.LaTeX import Sentences
 import codecs
 import locale
 import os
 import sys
 from patacrep import encoding
 def process_unbr_spaces(node):
    #pylint: disable=line-too-long
@ -39,8 +39,6 @@ def simpleparse(text):
    """Parse a simple LaTeX string.
    """
    tex = TeX()
    if not isinstance(text, unicode):
        text = text.decode("utf-8")
    tex.input(text)
    doc = tex.parse()
    return process_unbr_spaces(doc.textContent)
@ -66,7 +64,7 @@ class SongParser(object):
    def parse(cls, filename):
        """Parse a TeX file, and return its plasTeX representation."""
        tex = cls.create_tex()
-        tex.input(codecs.open(filename, 'r', 'utf-8', 'replace'))
+        tex.input(encoding.open_read(filename, 'r'))
        return tex.parse()
--- a/patacrep/plastex_chord.py
+++ b/patacrep/plastex_chord.py
@ -74,7 +74,7 @@ class Chord(Command):
    @property
    def source(self):
        """Return chord LaTeX code."""
-        return r'\[{}]'.format(self.chord)
+        return ur'\[{}]'.format(self.chord)
 class BeginChordOrDisplayMath(BeginDisplayMath):
    r"""Wrapper to BeginDisplayMath
--- a/patacrep/plastex_songs.py
+++ b/patacrep/plastex_songs.py
@ -6,6 +6,7 @@
 import plasTeX
 from patacrep import encoding
 from patacrep.plastex import process_unbr_spaces
@ -28,8 +29,9 @@ def split_linebreak(texlist):
            return_list.append(current)
            current = []
        else:
-            current.append(
+            current.append(encoding.basestring2unicode(
-                    process_unbr_spaces(token).textContent.encode('utf-8'))
+                process_unbr_spaces(token).textContent
                ))
    if current:
        return_list.append(current)
    return return_list
@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
        titles = []
        for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
            titles.append("".join(tokens))
-        self.attributes['titles'] = titles
+        self.attributes['titles'] = encoding.list2unicode(titles)
        # Parsing keyval arguments
        args = {}
        for (key, val) in self.attributes['args'].iteritems():
            if isinstance(val, plasTeX.DOM.Element):
-                args[key] = process_unbr_spaces(val).textContent.encode('utf-8')
+                args[key] = encoding.basestring2unicode(
                        process_unbr_spaces(val).textContent
                        )
            elif isinstance(val, basestring):
-                args[key] = val.encode('utf-8')
+                args[key] = encoding.basestring2unicode(val)
            else:
                args[key] = unicode(val)
        self.attributes['args'] = args
--- a/patacrep/songs.py
+++ b/patacrep/songs.py
@ -3,7 +3,6 @@
 """Song management."""
 from unidecode import unidecode
 import re
 from patacrep.authors import processauthors
@ -19,7 +18,7 @@ class Song(object):
        self.titles = data['titles']
        self.unprefixed_titles = [
                unprefixed_title(
-                    unidecode(unicode(title, "utf-8")),
+                    title,
                    config['titleprefixwords']
                    )
                for title
@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes):
    """Remove the first prefix of the list in the beginning of title (if any).
    """
    for prefix in prefixes:
-        match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
+        match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
        if match:
            return match.group(2)
    return title
--- a/patacrep/templates.py
+++ b/patacrep/templates.py
@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
        TemplateNotFound, nodes
 from jinja2.ext import Extension
 from jinja2.meta import find_referenced_templates as find_templates
 import codecs
 import os
 import re
 import json
 from patacrep import encoding
 from patacrep import errors
 _LATEX_SUBS = (
-    (re.compile(r'\\'), r'\\textbackslash'),
+    (re.compile(ur'\\'), ur'\\textbackslash'),
-    (re.compile(r'([{}_#%&$])'), r'\\\1'),
+    (re.compile(ur'([{}_#%&$])'), ur'\\\1'),
-    (re.compile(r'~'), r'\~{}'),
+    (re.compile(ur'~'), ur'\~{}'),
-    (re.compile(r'\^'), r'\^{}'),
+    (re.compile(ur'\^'), ur'\^{}'),
-    (re.compile(r'"'), r"''"),
+    (re.compile(ur'"'), ur"''"),
-    (re.compile(r'\.\.\.+'), r'\\ldots'),
+    (re.compile(ur'\.\.\.+'), ur'\\ldots'),
 )
-_VARIABLE_REGEXP = re.compile(r"""
+_VARIABLE_REGEXP = re.compile(ur"""
    \(\*\ *variables\ *\*\)    # Match (* variables *)
    (                          # Match and capture the following:
    (?:                        # Start of non-capturing group, used to match a single character
@ -178,11 +178,8 @@ class TexRenderer(object):
        subvariables = {}
        templatename = self.texenv.get_template(template).filename
-        with codecs.open(
+        try:
-                templatename,
+            template_file = encoding.open_read(templatename, 'r')
                'r',
                'utf-8'
                ) as template_file:
            content = template_file.read()
            subtemplates = list(find_templates(self.texenv.parse(content)))
            match = re.findall(_VARIABLE_REGEXP, content)
@ -202,6 +199,8 @@ class TexRenderer(object):
                                    jsonstring=var,
                                    )
                                )
        finally:
            template_file.close()
        return (subvariables, subtemplates)
--- a/7
+++ b/7
@ -15,6 +15,7 @@ import sys
 from patacrep.build import SongbookBuilder, DEFAULT_STEPS
 from patacrep import __STR_VERSION__
 from patacrep import errors
 from patacrep import encoding
 # Logging configuration
 logging.basicConfig(level=logging.INFO)
@ -101,12 +102,14 @@ def main():
    basename = os.path.basename(songbook_path)[:-3]
    try:
-        with open(songbook_path) as songbook_file:
+        songbook_file = encoding.open_read(songbook_path)
-            songbook = json.load(songbook_file)
+        songbook = json.load(songbook_file)
    except Exception as error: # pylint: disable=broad-except
        LOGGER.error(error)
        LOGGER.error("Error while loading file '{}'.".format(songbook_path))
        sys.exit(1)
    finally:
        songbook_file.close()
    # Gathering datadirs
    datadirs = []