Sanitize encoding of manipulated strings

Every manipulated string is unicode.

* We guess the encoding of the files we read before opening them, and the strings read from them are converted to unicode.
* We guess the encoding of strings obtained from other modules (plasTeX), and they are converted to unicode.
11 years ago · 21d4f0c245
16 changed files with 136 additions and 75 deletions
--- a/patacrep/authors.py
+++ b/patacrep/authors.py
@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = {
        "sep": ["and"],
        }

-def to_utf8(string):
-    """Convert a string (encoded in unicode or iso-8859-1 to utf-8"""
-    if type(string) is unicode:
-        return string.encode('utf-8')
-    elif type(string) is str:
-        return string.decode('iso-8859-1').encode('utf-8')
-    else:
-        try:
-            return string.encode('utf-8')
-        except:
-            LOGGER.warning("Ignoring a word I can not decode...")
-            return ""
-
 def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

@ -46,10 +33,7 @@ def compile_authwords(authwords):
            re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
            for word in ([" %s" % word for word in authwords['sep']] + [','])
            ]
-    authwords['ignore'] = [
-            to_utf8(word)
-            for word in authwords['ignore'] if to_utf8(word)
-            ]
+    authwords['ignore'] = [word for word in authwords['ignore'] if word]

    return authwords

@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore):
    for author in authors_list:
        ignored = False
        for ignoreword in ignore:
-            if author.find(str(ignoreword)) != -1:
+            if author.find(ignoreword) != -1:
                ignored = True
                break
        if not ignored:
@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list):
    for author in authors_list:
        first, last = split_author_names(author)
        if first:
-            dest.append(r"\indexauthor{{{first}}}{{{last}}}".format(
+            dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
                first=first.strip(),
                last=last.strip(),
                ))
--- a/patacrep/content/__init__.py
+++ b/patacrep/content/__init__.py
@ -225,7 +225,7 @@ def process_content(content, config=None):
    """
    contentlist = []
    plugins = load_plugins(config)
-    keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
+    keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
    if not content:
        content = [["song"]]
    for elem in content:
--- a/patacrep/content/section.py
+++ b/patacrep/content/section.py
@ -26,9 +26,9 @@ class Section(Content):

    def render(self, __context):
        if self.short is None:
-            return r'\{}{{{}}}'.format(self.keyword, self.name)
+            return ur'\{}{{{}}}'.format(self.keyword, self.name)
        else:
-            return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
+            return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)

 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/song.py
+++ b/patacrep/content/song.py
@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
        indexes = context.resolve("indexes")
        if isinstance(indexes, jinja2.runtime.Undefined):
            indexes = ""
-        return r'\begin{songs}{%s}' % indexes
+        return ur'\begin{songs}{%s}' % indexes

    def end_block(self, __context):
        """Return the string to end a block."""
-        return r'\end{songs}'
+        return ur'\end{songs}'

    def render(self, context):
        """Return the string that will render the song."""
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.path,
            os.path.dirname(context['filename'])
            ))
--- a/patacrep/content/songsection.py
+++ b/patacrep/content/songsection.py
@ -19,7 +19,7 @@ class SongSection(Content):

    def render(self, __context):
        """Render this section or chapter."""
-        return r'\{}{{{}}}'.format(self.keyword, self.name)
+        return ur'\{}{{{}}}'.format(self.keyword, self.name)

 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/sorted.py
+++ b/patacrep/content/sorted.py
@ -11,6 +11,7 @@ import locale
 import logging

 from patacrep import files
+from patacrep import encoding
 from patacrep.content import ContentError
 from patacrep.content.song import OnlySongsError, process_songs

@ -26,7 +27,7 @@ def normalize_string(string):
    - lower case;
    - passed through locale.strxfrm().
    """
-    return locale.strxfrm(string.lower().strip())
+    return locale.strxfrm(encoding.unidecode(string.lower().strip()))

 def normalize_field(field):
    """Return a normalized field, it being a string or a list of strings."""
@ -62,7 +63,7 @@ def key_generator(sort):
                                files.relpath(song.path),
                                )
                            )
-                    field = ""
+                    field = u""
            songkey.append(normalize_field(field))
        return songkey
    return ordered_song_keys
--- a/patacrep/content/tex.py
+++ b/patacrep/content/tex.py
@ -18,7 +18,7 @@ class LaTeX(Content):
        self.filename = filename

    def render(self, context):
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.filename,
            os.path.dirname(context['filename']),
            ))
--- a/patacrep/data/examples/example_encoding.sb
+++ b/patacrep/data/examples/example_encoding.sb
@ -0,0 +1,24 @@
+{
+"bookoptions" : [
+    "importantdiagramonly",
+    "repeatchords",
+    "lilypond",
+    "pictures"
+  ],
+"booktype" : "chorded",
+"lang" : "french",
+"authwords" : {
+  "sep" : ["and", "et", "À"],
+  "ignore" : ["À"],
+  "after" : ["À"]
+  },
+  "titleprefixwords": ["À"],
+ "datadir" : ".",
+ "content" : [["section", "Traditional"],
+                "chevaliers_de_la_table_ronde.sg",
+                "greensleeves.sg",
+                "vent_frais.sg",
+                ["section", "Example"],
+                "example-fr.sg",
+                "example-en.sg"]
+}               
--- a/patacrep/encoding.py
+++ b/patacrep/encoding.py
@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""Dealing with encoding problems."""
+
+import codecs
+import chardet
+import logging
+from unidecode import unidecode as unidecode_orig
+
+LOGGER = logging.getLogger(__name__)
+
+def open_read(filename, mode='r'):
+    """Open a file for reading, guessing the right encoding.
+
+    Return a fileobject, reading unicode strings.
+    """
+    return codecs.open(
+            filename,
+            mode=mode,
+            encoding=chardet.detect(open(filename, "r").read())['encoding'],
+            errors='replace',
+            )
+
+def basestring2unicode(arg):
+    """Return the unicode version of the argument, guessing original encoding.
+    """
+    if isinstance(arg, unicode):
+        return arg
+    elif isinstance(arg, basestring):
+        return arg.decode(
+                encoding=chardet.detect(arg)['encoding'],
+                errors='replace',
+                )
+    else:
+        LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
+        return ""
+
+def list2unicode(arg):
+    """Return the items of `arg`, each converted by basestring2unicode().
+
+    Argument is a list of strings.  If an item is of another type, a warning
+    is logged and an empty string takes its place in the returned list.
+    """
+    return [basestring2unicode(item) for item in arg]
+
+def unidecode(arg):
+    """Return the result of unidecode.unidecode(arg), coerced to unicode."""
+    return unicode(unidecode_orig(arg))
--- a/patacrep/index.py
+++ b/patacrep/index.py
@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
 from a file generated by the latex compilation of the songbook (.sxd).
 """

-from unidecode import unidecode
 import locale
 import re
-import codecs

 from patacrep import authors
+from patacrep import encoding
 from patacrep.plastex import simpleparse

 EOL = u"\n"

 # Pattern set to ignore latex command in title prefix
-KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE)
-FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
+KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
+FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)


 def sortkey(value):
@ -31,7 +30,7 @@ def sortkey(value):
    the sort with  latex escape sequences.
    """
    return locale.strxfrm(
-            unidecode(simpleparse(value).replace(' ', 'A')).lower()
+            encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
            )


@ -41,9 +40,12 @@ def process_sxd(filename):
    Return an Index object.
    """
    data = []
-    with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
+    try:
+        index_file = encoding.open_read(filename, 'r')
        for line in index_file:
-            data.append(line.strip().encode('utf-8'))
+            data.append(line.strip())
+    finally:
+        index_file.close()

    i = 1
    idx = Index(data[0])
@ -82,7 +84,7 @@ class Index(object):
    def get_first_letter(key):
        """Return the uppercase first letter of key."""
        letter = FIRST_LETTER_PATTERN.match(key).group(1)
-        if re.match(r'\d', letter):
+        if re.match(ur'\d', letter):
            letter = '0-9'
        return letter.upper()

@ -98,9 +100,9 @@ class Index(object):
            if 'prefix' in self.keywords:
                for prefix in self.keywords['prefix']:
                    self.prefix_patterns.append(re.compile(
-                            r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
-                            re.LOCALE
-                            ))
+                        ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
+                        re.LOCALE
+                        ))

        if self.indextype == "AUTHOR":
            self.authwords = authors.compile_authwords(self.keywords)
@ -126,10 +128,10 @@ class Index(object):
        if self.indextype == "TITLE":
            # Removing prefixes before titles
            for pattern in self.prefix_patterns:
-                match = pattern.match(key.encode('utf-8'))
+                match = pattern.match(key)
                if match:
                    self._raw_add(
-                            r"\indextitle{{{}}}{{{}}}".format(
+                            ur"\indextitle{{{}}}{{{}}}".format(
                                match.group(1).strip(),
                                (match.group(2) + match.group(3)).strip(),
                                ),
@ -149,12 +151,10 @@ class Index(object):
    @staticmethod
    def ref_to_str(ref):
        """Return the LaTeX code corresponding to the reference."""
-        return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
+        return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)

    def entry_to_str(self, key, entry):
        """Return the LaTeX code corresponding to the entry."""
-        if not isinstance(key, unicode):
-            key = unicode(key, "UTF-8")
        return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
                key,
                ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@ -166,10 +166,10 @@ class Index(object):
        Here, an index block is a letter, and all data beginning with this
        letter.
        """
-        string = r'\begin{idxblock}{' + letter + '}' + EOL
+        string = ur'\begin{idxblock}{' + letter + '}' + EOL
        for key in sorted(entries.keys(), key=sortkey):
            string += self.entry_to_str(key, entries[key])
-        string += r'\end{idxblock}' + EOL
+        string += ur'\end{idxblock}' + EOL
        return string

    def entries_to_str(self):
--- a/patacrep/plastex.py
+++ b/patacrep/plastex.py
@ -6,11 +6,11 @@
 from plasTeX.TeX import TeX
 from plasTeX.Base.LaTeX import Sentences

-import codecs
 import locale
 import os
 import sys

+from patacrep import encoding

 def process_unbr_spaces(node):
    #pylint: disable=line-too-long
@ -39,8 +39,6 @@ def simpleparse(text):
    """Parse a simple LaTeX string.
    """
    tex = TeX()
-    if not isinstance(text, unicode):
-        text = text.decode("utf-8")
    tex.input(text)
    doc = tex.parse()
    return process_unbr_spaces(doc.textContent)
@ -66,7 +64,7 @@ class SongParser(object):
    def parse(cls, filename):
        """Parse a TeX file, and return its plasTeX representation."""
        tex = cls.create_tex()
-        tex.input(codecs.open(filename, 'r', 'utf-8', 'replace'))
+        tex.input(encoding.open_read(filename, 'r'))
        return tex.parse()


--- a/patacrep/plastex_chord.py
+++ b/patacrep/plastex_chord.py
@ -74,7 +74,7 @@ class Chord(Command):
    @property
    def source(self):
        """Return chord LaTeX code."""
-        return r'\[{}]'.format(self.chord)
+        return ur'\[{}]'.format(self.chord)

 class BeginChordOrDisplayMath(BeginDisplayMath):
    r"""Wrapper to BeginDisplayMath
--- a/patacrep/plastex_songs.py
+++ b/patacrep/plastex_songs.py
@ -6,6 +6,7 @@

 import plasTeX

+from patacrep import encoding
 from patacrep.plastex import process_unbr_spaces


@ -28,8 +29,9 @@ def split_linebreak(texlist):
            return_list.append(current)
            current = []
        else:
-            current.append(
-                    process_unbr_spaces(token).textContent.encode('utf-8'))
+            current.append(encoding.basestring2unicode(
+                process_unbr_spaces(token).textContent
+                ))
    if current:
        return_list.append(current)
    return return_list
@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
        titles = []
        for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
            titles.append("".join(tokens))
-        self.attributes['titles'] = titles
+        self.attributes['titles'] = encoding.list2unicode(titles)

        # Parsing keyval arguments
        args = {}
        for (key, val) in self.attributes['args'].iteritems():
            if isinstance(val, plasTeX.DOM.Element):
-                args[key] = process_unbr_spaces(val).textContent.encode('utf-8')
+                args[key] = encoding.basestring2unicode(
+                        process_unbr_spaces(val).textContent
+                        )
            elif isinstance(val, basestring):
-                args[key] = val.encode('utf-8')
+                args[key] = encoding.basestring2unicode(val)
            else:
                args[key] = unicode(val)
        self.attributes['args'] = args
--- a/patacrep/songs.py
+++ b/patacrep/songs.py
@ -3,7 +3,6 @@

 """Song management."""

-from unidecode import unidecode
 import re

 from patacrep.authors import processauthors
@ -19,7 +18,7 @@ class Song(object):
        self.titles = data['titles']
        self.unprefixed_titles = [
                unprefixed_title(
-                    unidecode(unicode(title, "utf-8")),
+                    title,
                    config['titleprefixwords']
                    )
                for title
@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes):
    """Remove the first prefix of the list in the beginning of title (if any).
    """
    for prefix in prefixes:
-        match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
+        match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
        if match:
            return match.group(2)
    return title
--- a/patacrep/templates.py
+++ b/patacrep/templates.py
@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
        TemplateNotFound, nodes
 from jinja2.ext import Extension
 from jinja2.meta import find_referenced_templates as find_templates
-import codecs
 import os
 import re
 import json

+from patacrep import encoding
 from patacrep import errors

 _LATEX_SUBS = (
-    (re.compile(r'\\'), r'\\textbackslash'),
-    (re.compile(r'([{}_#%&$])'), r'\\\1'),
-    (re.compile(r'~'), r'\~{}'),
-    (re.compile(r'\^'), r'\^{}'),
-    (re.compile(r'"'), r"''"),
-    (re.compile(r'\.\.\.+'), r'\\ldots'),
+    (re.compile(ur'\\'), ur'\\textbackslash'),
+    (re.compile(ur'([{}_#%&$])'), ur'\\\1'),
+    (re.compile(ur'~'), ur'\~{}'),
+    (re.compile(ur'\^'), ur'\^{}'),
+    (re.compile(ur'"'), ur"''"),
+    (re.compile(ur'\.\.\.+'), ur'\\ldots'),
 )

-_VARIABLE_REGEXP = re.compile(r"""
+_VARIABLE_REGEXP = re.compile(ur"""
    \(\*\ *variables\ *\*\)    # Match (* variables *)
    (                          # Match and capture the following:
    (?:                        # Start of non-capturing group, used to match a single character
@ -178,11 +178,8 @@ class TexRenderer(object):

        subvariables = {}
        templatename = self.texenv.get_template(template).filename
-        with codecs.open(
-                templatename,
-                'r',
-                'utf-8'
-                ) as template_file:
+        try:
+            template_file = encoding.open_read(templatename, 'r')
            content = template_file.read()
            subtemplates = list(find_templates(self.texenv.parse(content)))
            match = re.findall(_VARIABLE_REGEXP, content)
@ -202,6 +199,8 @@ class TexRenderer(object):
                                    jsonstring=var,
                                    )
                                )
+        finally:
+            template_file.close()

        return (subvariables, subtemplates)

--- a/7
+++ b/7
@ -15,6 +15,7 @@ import sys
 from patacrep.build import SongbookBuilder, DEFAULT_STEPS
 from patacrep import __STR_VERSION__
 from patacrep import errors
+from patacrep import encoding

 # Logging configuration
 logging.basicConfig(level=logging.INFO)
@ -101,12 +102,14 @@ def main():
    basename = os.path.basename(songbook_path)[:-3]

    try:
-        with open(songbook_path) as songbook_file:
-            songbook = json.load(songbook_file)
+        songbook_file = encoding.open_read(songbook_path)
+        songbook = json.load(songbook_file)
    except Exception as error: # pylint: disable=broad-except
        LOGGER.error(error)
        LOGGER.error("Error while loading file '{}'.".format(songbook_path))
        sys.exit(1)
+    finally:
+        songbook_file.close()

    # Gathering datadirs
    datadirs = []