From 21d4f0c24557cc80f5b1a6c1895bf58fdde90b53 Mon Sep 17 00:00:00 2001
From: Louis <spalax@gresille.org>
Date: Fri, 4 Jul 2014 17:37:56 +0200
Subject: [PATCH] Sanitize encoding of manipulated strings

Every manipulated string is unicode.

* We guess the encoding of files before opening them, and strings
  read from them are converted to unicode.
* We guess the encoding of strings obtained from other modules (plasTeX),
  and they are converted to unicode.
---
 patacrep/authors.py                        | 22 ++--------
 patacrep/content/__init__.py               |  2 +-
 patacrep/content/section.py                |  4 +-
 patacrep/content/song.py                   |  6 +--
 patacrep/content/songsection.py            |  2 +-
 patacrep/content/sorted.py                 |  5 ++-
 patacrep/content/tex.py                    |  2 +-
 patacrep/data/examples/example_encoding.sb | 24 +++++++++++
 patacrep/encoding.py                       | 49 ++++++++++++++++++++++
 patacrep/index.py                          | 36 ++++++++--------
 patacrep/plastex.py                        |  6 +--
 patacrep/plastex_chord.py                  |  2 +-
 patacrep/plastex_songs.py                  | 14 ++++---
 patacrep/songs.py                          |  5 +--
 patacrep/templates.py                      | 25 ++++++-----
 songbook                                   |  7 +++-
 16 files changed, 136 insertions(+), 75 deletions(-)
 create mode 100644 patacrep/data/examples/example_encoding.sb
 create mode 100644 patacrep/encoding.py
diff --git a/patacrep/authors.py b/patacrep/authors.py
index 9f8911f9..15ef07b1 100644
--- a/patacrep/authors.py
+++ b/patacrep/authors.py
@@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = {
         "sep": ["and"],
         }
 
-def to_utf8(string):
-    """Convert a string (encoded in unicode or iso-8859-1 to utf-8"""
-    if type(string) is unicode:
-        return string.encode('utf-8')
-    elif type(string) is str:
-        return string.decode('iso-8859-1').encode('utf-8')
-    else:
-        try:
-            return string.encode('utf-8')
-        except:
-            LOGGER.warning("Ignoring a word I can not decode...")
-            return ""
-
 def compile_authwords(authwords):
     """Convert strings of authwords to compiled regular expressions.
 
@@ -46,10 +33,7 @@ def compile_authwords(authwords):
             re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
             for word in ([" %s" % word for word in authwords['sep']] + [','])
             ]
-    authwords['ignore'] = [
-            to_utf8(word)
-            for word in authwords['ignore'] if to_utf8(word)
-            ]
+    authwords['ignore'] = [word for word in authwords['ignore'] if word]
 
     return authwords
 
@@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore):
     for author in authors_list:
         ignored = False
         for ignoreword in ignore:
-            if author.find(str(ignoreword)) != -1:
+            if author.find(ignoreword) != -1:
                 ignored = True
                 break
         if not ignored:
@@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list):
     for author in authors_list:
         first, last = split_author_names(author)
         if first:
-            dest.append(r"\indexauthor{{{first}}}{{{last}}}".format(
+            dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
                 first=first.strip(),
                 last=last.strip(),
                 ))
diff --git a/patacrep/content/__init__.py b/patacrep/content/__init__.py
index 6e363f85..2f4fea0e 100755
--- a/patacrep/content/__init__.py
+++ b/patacrep/content/__init__.py
@@ -225,7 +225,7 @@ def process_content(content, config=None):
     """
     contentlist = []
     plugins = load_plugins(config)
-    keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
+    keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
     if not content:
         content = [["song"]]
     for elem in content:
diff --git a/patacrep/content/section.py b/patacrep/content/section.py
index b800ea5f..2bde2eb2 100755
--- a/patacrep/content/section.py
+++ b/patacrep/content/section.py
@@ -26,9 +26,9 @@ class Section(Content):
 
     def render(self, __context):
         if self.short is None:
-            return r'\{}{{{}}}'.format(self.keyword, self.name)
+            return ur'\{}{{{}}}'.format(self.keyword, self.name)
         else:
-            return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
+            return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
 
 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
diff --git a/patacrep/content/song.py b/patacrep/content/song.py
index d6dc4886..9965c861 100755
--- a/patacrep/content/song.py
+++ b/patacrep/content/song.py
@@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
         indexes = context.resolve("indexes")
         if isinstance(indexes, jinja2.runtime.Undefined):
             indexes = ""
-        return r'\begin{songs}{%s}' % indexes
+        return ur'\begin{songs}{%s}' % indexes
 
     def end_block(self, __context):
         """Return the string to end a block."""
-        return r'\end{songs}'
+        return ur'\end{songs}'
 
     def render(self, context):
         """Return the string that will render the song."""
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
             self.path,
             os.path.dirname(context['filename'])
             ))
diff --git a/patacrep/content/songsection.py b/patacrep/content/songsection.py
index 2d67c35a..b4c9d446 100755
--- a/patacrep/content/songsection.py
+++ b/patacrep/content/songsection.py
@@ -19,7 +19,7 @@ class SongSection(Content):
 
     def render(self, __context):
         """Render this section or chapter."""
-        return r'\{}{{{}}}'.format(self.keyword, self.name)
+        return ur'\{}{{{}}}'.format(self.keyword, self.name)
 
 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
diff --git a/patacrep/content/sorted.py b/patacrep/content/sorted.py
index 96ec7af4..12c63189 100755
--- a/patacrep/content/sorted.py
+++ b/patacrep/content/sorted.py
@@ -11,6 +11,7 @@ import locale
 import logging
 
 from patacrep import files
+from patacrep import encoding
 from patacrep.content import ContentError
 from patacrep.content.song import OnlySongsError, process_songs
 
@@ -26,7 +27,7 @@ def normalize_string(string):
     - lower case;
     - passed through locale.strxfrm().
     """
-    return locale.strxfrm(string.lower().strip())
+    return locale.strxfrm(encoding.unidecode(string.lower().strip()))
 
 def normalize_field(field):
     """Return a normalized field, it being a string or a list of strings."""
@@ -62,7 +63,7 @@ def key_generator(sort):
                                 files.relpath(song.path),
                                 )
                             )
-                    field = ""
+                    field = u""
             songkey.append(normalize_field(field))
         return songkey
     return ordered_song_keys
diff --git a/patacrep/content/tex.py b/patacrep/content/tex.py
index a934de48..1e18ecfd 100755
--- a/patacrep/content/tex.py
+++ b/patacrep/content/tex.py
@@ -18,7 +18,7 @@ class LaTeX(Content):
         self.filename = filename
 
     def render(self, context):
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
             self.filename,
             os.path.dirname(context['filename']),
             ))
diff --git a/patacrep/data/examples/example_encoding.sb b/patacrep/data/examples/example_encoding.sb
new file mode 100644
index 00000000..0519b144
--- /dev/null
+++ b/patacrep/data/examples/example_encoding.sb
@@ -0,0 +1,24 @@
+{
+"bookoptions" : [
+    "importantdiagramonly",
+    "repeatchords",
+    "lilypond",
+    "pictures"
+  ],
+"booktype" : "chorded",
+"lang" : "french",
+"authwords" : {
+  "sep" : ["and", "et", "À"],
+  "ignore" : ["À"],
+  "after" : ["À"]
+  },
+  "titleprefixwords": ["À"],
+ "datadir" : ".",
+ "content" : [["section", "Traditional"],
+                "chevaliers_de_la_table_ronde.sg",
+                "greensleeves.sg",
+                "vent_frais.sg",
+                ["section", "Example"],
+                "example-fr.sg",
+                "example-en.sg"]
+}               
diff --git a/patacrep/encoding.py b/patacrep/encoding.py
new file mode 100644
index 00000000..d3c32a84
--- /dev/null
+++ b/patacrep/encoding.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""Dealing with encoding problems."""
+
+import codecs
+import chardet
+import logging
+from unidecode import unidecode as unidecode_orig
+
+LOGGER = logging.getLogger(__name__)
+
+def open_read(filename, mode='r'):
+    """Open a file for reading, guessing its encoding with chardet.
+
+    Return a file object reading unicode strings (bad bytes are replaced).
+    """
+    with open(filename, "rb") as raw_file:  # probe handle is always closed
+        guessed = chardet.detect(raw_file.read())['encoding']
+    # chardet reports None when it cannot guess (e.g. empty file); fall back
+    # to utf-8 so codecs.open() still returns a unicode-decoding reader.
+    return codecs.open(
+            filename, mode=mode, encoding=guessed or 'utf-8', errors='replace')
+
+def basestring2unicode(arg):
+    """Return the unicode version of the argument, guessing original encoding.
+
+    Anything that is not a string is logged and replaced by u"".
+    """
+    if isinstance(arg, unicode):
+        return arg
+    if isinstance(arg, basestring):
+        # chardet may yield None (e.g. for ""), which decode() rejects.
+        guessed = chardet.detect(arg)['encoding'] or 'utf-8'
+        return arg.decode(guessed, 'replace')
+    LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
+    return u""
+
+def list2unicode(arg):
+    """Return the unicode version of the argument, guessing original encoding.
+
+    Argument is a list of strings.  An item of another type is replaced by
+    an empty string (basestring2unicode() logs a warning for it).
+    """
+    return [basestring2unicode(item) for item in arg]
+
+def unidecode(arg):
+    """Return the output of the `unidecode` package for arg, as unicode."""
+    return unicode(unidecode_orig(arg))
diff --git a/patacrep/index.py b/patacrep/index.py
index 07c1d558..b921a65d 100755
--- a/patacrep/index.py
+++ b/patacrep/index.py
@@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
 from a file generated by the latex compilation of the songbook (.sxd).
 """
 
-from unidecode import unidecode
 import locale
 import re
-import codecs
 
 from patacrep import authors
+from patacrep import encoding
 from patacrep.plastex import simpleparse
 
 EOL = u"\n"
 
 # Pattern set to ignore latex command in title prefix
-KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE)
-FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
+KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
+FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
 
 
 def sortkey(value):
@@ -31,7 +30,7 @@ def sortkey(value):
     the sort with  latex escape sequences.
     """
     return locale.strxfrm(
-            unidecode(simpleparse(value).replace(' ', 'A')).lower()
+            encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
             )
 
 
@@ -41,9 +40,12 @@ def process_sxd(filename):
     Return an Index object.
     """
     data = []
-    with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
+    index_file = encoding.open_read(filename, 'r')
+    try:  # opened above, so `finally` always has a handle to close
         for line in index_file:
-            data.append(line.strip().encode('utf-8'))
+            data.append(line.strip())
+    finally:
+        index_file.close()
 
     i = 1
     idx = Index(data[0])
@@ -82,7 +84,7 @@ class Index(object):
     def get_first_letter(key):
         """Return the uppercase first letter of key."""
         letter = FIRST_LETTER_PATTERN.match(key).group(1)
-        if re.match(r'\d', letter):
+        if re.match(ur'\d', letter):
             letter = '0-9'
         return letter.upper()
 
@@ -98,9 +100,9 @@ class Index(object):
             if 'prefix' in self.keywords:
                 for prefix in self.keywords['prefix']:
                     self.prefix_patterns.append(re.compile(
-                            r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
-                            re.LOCALE
-                            ))
+                        ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
+                        re.LOCALE
+                        ))
 
         if self.indextype == "AUTHOR":
             self.authwords = authors.compile_authwords(self.keywords)
@@ -126,10 +128,10 @@ class Index(object):
         if self.indextype == "TITLE":
             # Removing prefixes before titles
             for pattern in self.prefix_patterns:
-                match = pattern.match(key.encode('utf-8'))
+                match = pattern.match(key)
                 if match:
                     self._raw_add(
-                            r"\indextitle{{{}}}{{{}}}".format(
+                            ur"\indextitle{{{}}}{{{}}}".format(
                                 match.group(1).strip(),
                                 (match.group(2) + match.group(3)).strip(),
                                 ),
@@ -149,12 +151,10 @@ class Index(object):
     @staticmethod
     def ref_to_str(ref):
         """Return the LaTeX code corresponding to the reference."""
-        return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
+        return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
 
     def entry_to_str(self, key, entry):
         """Return the LaTeX code corresponding to the entry."""
-        if not isinstance(key, unicode):
-            key = unicode(key, "UTF-8")
         return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
                 key,
                 ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@@ -166,10 +166,10 @@ class Index(object):
         Here, an index block is a letter, and all data beginning with this
         letter.
         """
-        string = r'\begin{idxblock}{' + letter + '}' + EOL
+        string = ur'\begin{idxblock}{' + letter + '}' + EOL
         for key in sorted(entries.keys(), key=sortkey):
             string += self.entry_to_str(key, entries[key])
-        string += r'\end{idxblock}' + EOL
+        string += ur'\end{idxblock}' + EOL
         return string
 
     def entries_to_str(self):
diff --git a/patacrep/plastex.py b/patacrep/plastex.py
index c4c81e62..b1c906b2 100644
--- a/patacrep/plastex.py
+++ b/patacrep/plastex.py
@@ -6,11 +6,11 @@
 from plasTeX.TeX import TeX
 from plasTeX.Base.LaTeX import Sentences
 
-import codecs
 import locale
 import os
 import sys
 
+from patacrep import encoding
 
 def process_unbr_spaces(node):
     #pylint: disable=line-too-long
@@ -39,8 +39,6 @@ def simpleparse(text):
     """Parse a simple LaTeX string.
     """
     tex = TeX()
-    if not isinstance(text, unicode):
-        text = text.decode("utf-8")
     tex.input(text)
     doc = tex.parse()
     return process_unbr_spaces(doc.textContent)
@@ -66,7 +64,7 @@ class SongParser(object):
     def parse(cls, filename):
         """Parse a TeX file, and return its plasTeX representation."""
         tex = cls.create_tex()
-        tex.input(codecs.open(filename, 'r', 'utf-8', 'replace'))
+        tex.input(encoding.open_read(filename, 'r'))
         return tex.parse()
 
 
diff --git a/patacrep/plastex_chord.py b/patacrep/plastex_chord.py
index 6b213335..4fc0afe5 100644
--- a/patacrep/plastex_chord.py
+++ b/patacrep/plastex_chord.py
@@ -74,7 +74,7 @@ class Chord(Command):
     @property
     def source(self):
         """Return chord LaTeX code."""
-        return r'\[{}]'.format(self.chord)
+        return ur'\[{}]'.format(self.chord)
 
 class BeginChordOrDisplayMath(BeginDisplayMath):
     r"""Wrapper to BeginDisplayMath
diff --git a/patacrep/plastex_songs.py b/patacrep/plastex_songs.py
index 43c31986..9bc0eba2 100755
--- a/patacrep/plastex_songs.py
+++ b/patacrep/plastex_songs.py
@@ -6,6 +6,7 @@
 
 import plasTeX
 
+from patacrep import encoding
 from patacrep.plastex import process_unbr_spaces
 
 
@@ -28,8 +29,9 @@ def split_linebreak(texlist):
             return_list.append(current)
             current = []
         else:
-            current.append(
-                    process_unbr_spaces(token).textContent.encode('utf-8'))
+            current.append(encoding.basestring2unicode(
+                process_unbr_spaces(token).textContent
+                ))
     if current:
         return_list.append(current)
     return return_list
@@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
         titles = []
         for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
             titles.append("".join(tokens))
-        self.attributes['titles'] = titles
+        self.attributes['titles'] = encoding.list2unicode(titles)
 
         # Parsing keyval arguments
         args = {}
         for (key, val) in self.attributes['args'].iteritems():
             if isinstance(val, plasTeX.DOM.Element):
-                args[key] = process_unbr_spaces(val).textContent.encode('utf-8')
+                args[key] = encoding.basestring2unicode(
+                        process_unbr_spaces(val).textContent
+                        )
             elif isinstance(val, basestring):
-                args[key] = val.encode('utf-8')
+                args[key] = encoding.basestring2unicode(val)
             else:
                 args[key] = unicode(val)
         self.attributes['args'] = args
diff --git a/patacrep/songs.py b/patacrep/songs.py
index 2cc49846..a3adc856 100755
--- a/patacrep/songs.py
+++ b/patacrep/songs.py
@@ -3,7 +3,6 @@
 
 """Song management."""
 
-from unidecode import unidecode
 import re
 
 from patacrep.authors import processauthors
@@ -19,7 +18,7 @@ class Song(object):
         self.titles = data['titles']
         self.unprefixed_titles = [
                 unprefixed_title(
-                    unidecode(unicode(title, "utf-8")),
+                    title,
                     config['titleprefixwords']
                     )
                 for title
@@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes):
     """Remove the first prefix of the list in the beginning of title (if any).
     """
     for prefix in prefixes:
-        match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
+        match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
         if match:
             return match.group(2)
     return title
diff --git a/patacrep/templates.py b/patacrep/templates.py
index 9ac415a1..1030aaa1 100755
--- a/patacrep/templates.py
+++ b/patacrep/templates.py
@@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
         TemplateNotFound, nodes
 from jinja2.ext import Extension
 from jinja2.meta import find_referenced_templates as find_templates
-import codecs
 import os
 import re
 import json
 
+from patacrep import encoding
 from patacrep import errors
 
 _LATEX_SUBS = (
-    (re.compile(r'\\'), r'\\textbackslash'),
-    (re.compile(r'([{}_#%&$])'), r'\\\1'),
-    (re.compile(r'~'), r'\~{}'),
-    (re.compile(r'\^'), r'\^{}'),
-    (re.compile(r'"'), r"''"),
-    (re.compile(r'\.\.\.+'), r'\\ldots'),
+    (re.compile(ur'\\'), ur'\\textbackslash'),
+    (re.compile(ur'([{}_#%&$])'), ur'\\\1'),
+    (re.compile(ur'~'), ur'\~{}'),
+    (re.compile(ur'\^'), ur'\^{}'),
+    (re.compile(ur'"'), ur"''"),
+    (re.compile(ur'\.\.\.+'), ur'\\ldots'),
 )
 
-_VARIABLE_REGEXP = re.compile(r"""
+_VARIABLE_REGEXP = re.compile(ur"""
     \(\*\ *variables\ *\*\)    # Match (* variables *)
     (                          # Match and capture the following:
     (?:                        # Start of non-capturing group, used to match a single character
@@ -178,11 +178,8 @@ class TexRenderer(object):
 
         subvariables = {}
         templatename = self.texenv.get_template(template).filename
-        with codecs.open(
-                templatename,
-                'r',
-                'utf-8'
-                ) as template_file:
+        template_file = encoding.open_read(templatename, 'r')
+        try:  # opened above, so `finally` always has a handle to close
             content = template_file.read()
             subtemplates = list(find_templates(self.texenv.parse(content)))
             match = re.findall(_VARIABLE_REGEXP, content)
@@ -202,6 +199,8 @@ class TexRenderer(object):
                                     jsonstring=var,
                                     )
                                 )
+        finally:
+            template_file.close()
 
         return (subvariables, subtemplates)
 
diff --git a/songbook b/songbook
index 53d707a1..f9aee049 100755
--- a/songbook
+++ b/songbook
@@ -15,6 +15,7 @@ import sys
 from patacrep.build import SongbookBuilder, DEFAULT_STEPS
 from patacrep import __STR_VERSION__
 from patacrep import errors
+from patacrep import encoding
 
 # Logging configuration
 logging.basicConfig(level=logging.INFO)
@@ -101,12 +102,14 @@ def main():
     basename = os.path.basename(songbook_path)[:-3]
 
     try:
-        with open(songbook_path) as songbook_file:
-            songbook = json.load(songbook_file)
+        # `with` closes the file itself; a `finally` closing songbook_file
+        # would raise UnboundLocalError (hiding the error) if opening failed.
+        with encoding.open_read(songbook_path) as songbook_file:
+            songbook = json.load(songbook_file)
     except Exception as error: # pylint: disable=broad-except
         LOGGER.error(error)
         LOGGER.error("Error while loading file '{}'.".format(songbook_path))
         sys.exit(1)
 
     # Gathering datadirs
     datadirs = []