Merge branch 'master' into cache (mainly managing string encoding)

Conflicts: patacrep/content/song.py patacrep/songs.py
11 years ago · e3edc2b9e7
19 changed files with 242 additions and 86 deletions
--- a/Requirements.txt
+++ b/Requirements.txt
@ -1,3 +1,4 @@
 Jinja2==2.7.3
 argparse==1.2.1
-e git+https://github.com/tiarno/plastex#egg=plasTeX
+chardet==2.2.1
+https://github.com/tiarno/plastex/archive/master.zip
--- a/patacrep/authors.py
+++ b/patacrep/authors.py
@ -14,16 +14,6 @@ DEFAULT_AUTHWORDS = {
        "sep": ["and"],
        }

-def to_utf8(string):
-    """Convert a string (encoded in unicode or iso-8859-1 to utf-8"""
-    if type(string) is unicode:
-        return string.encode('utf-8')
-    elif type(string) is str:
-        return string.decode('iso-8859-1').encode('utf-8')
-    else:
-        LOGGER.warning("Ignoring a word I can not decode...")
-        return None
-
 def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

@ -43,7 +33,6 @@ def compile_authwords(authwords):
            re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
            for word in ([" %s" % word for word in authwords['sep']] + [','])
            ]
-    authwords['ignore'] = [to_utf8(word) for word in authwords['ignore'] if to_utf8(word)]

    return authwords

@ -154,7 +143,7 @@ def processauthors_ignore_authors(authors_list, ignore):
    for author in authors_list:
        ignored = False
        for ignoreword in ignore:
-            if author.find(str(ignoreword)) != -1:
+            if author.find(ignoreword) != -1:
                ignored = True
                break
        if not ignored:
@ -182,7 +171,7 @@ def processauthors_invert_names(authors_list):
    for author in authors_list:
        first, last = split_author_names(author)
        if first:
-            dest.append(r"\indexauthor{{{first}}}{{{last}}}".format(
+            dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
                first=first.strip(),
                last=last.strip(),
                ))
--- a/patacrep/content/__init__.py
+++ b/patacrep/content/__init__.py
@ -225,7 +225,7 @@ def process_content(content, config=None):
    """
    contentlist = []
    plugins = load_plugins(config)
-    keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
+    keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
    if not content:
        content = [["song"]]
    for elem in content:
--- a/patacrep/content/include.py
+++ b/patacrep/content/include.py
@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+"""Include an external list of songs
+
+This plugin provides keyword 'include', used to include an external list of
+songs in JSON format.
+"""
+
+import json
+import os
+import sys
+import logging
+
+from patacrep.content import process_content, ContentError
+from patacrep import encoding
+
+LOGGER = logging.getLogger(__name__)
+
+def load_from_datadirs(path, config=None):
+    """Load 'path' from one of the datadirs.
+
+    Raise an exception if it was not found in any of the datadirs of 'config'.
+    """
+    for datadir in config.get("datadir", []):
+        filepath = os.path.join(datadir, path)
+        if os.path.exists(filepath):
+            return filepath
+    # File not found
+    raise ContentError("include", "The file '{0}' was not found in the "
+                        "datadirs.".format(path))
+
+#pylint: disable=unused-argument
+def parse(keyword, config, argument, contentlist):
+    """Include an external file content.
+
+    Arguments:
+        - keyword: the string 'include';
+        - config: the current songbook configuration dictionary;
+        - argument: None;
+        - contentlist: a list of file paths to be included.
+    """
+    new_contentlist = []
+
+    for path in contentlist:
+        filepath = load_from_datadirs(path, config)
+        content_file = None
+        try:
+            content_file = encoding.open_read(filepath, 'r')
+            new_content = json.load(content_file)
+        except Exception as error: # pylint: disable=broad-except
+            LOGGER.error(error)
+            LOGGER.error("Error while loading file '{}'.".format(filepath))
+            sys.exit(1)
+        finally:
+            if content_file:
+                content_file.close()
+
+        config["datadir"].append(os.path.abspath(os.path.dirname(filepath)))
+        new_contentlist += process_content(new_content, config)
+        config["datadir"].pop()
+
+    return new_contentlist
+
+CONTENT_PLUGINS = {'include': parse}
--- a/patacrep/content/section.py
+++ b/patacrep/content/section.py
@ -26,9 +26,9 @@ class Section(Content):

    def render(self, __context):
        if self.short is None:
-            return r'\{}{{{}}}'.format(self.keyword, self.name)
+            return ur'\{}{{{}}}'.format(self.keyword, self.name)
        else:
-            return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
+            return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)

 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/song.py
+++ b/patacrep/content/song.py
@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
        indexes = context.resolve("indexes")
        if isinstance(indexes, jinja2.runtime.Undefined):
            indexes = ""
-        return r'\begin{songs}{%s}' % indexes
+        return ur'\begin{songs}{%s}' % indexes

    def end_block(self, __context):
        """Return the string to end a block."""
-        return r'\end{songs}'
+        return ur'\end{songs}'

    def render(self, context):
        """Return the string that will render the song."""
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.fullpath,
            os.path.dirname(context['filename'])
            ))
--- a/patacrep/content/songsection.py
+++ b/patacrep/content/songsection.py
@ -19,7 +19,7 @@ class SongSection(Content):

    def render(self, __context):
        """Render this section or chapter."""
-        return r'\{}{{{}}}'.format(self.keyword, self.name)
+        return ur'\{}{{{}}}'.format(self.keyword, self.name)

 #pylint: disable=unused-argument
 def parse(keyword, argument, contentlist, config):
--- a/patacrep/content/sorted.py
+++ b/patacrep/content/sorted.py
@ -11,6 +11,7 @@ import locale
 import logging

 from patacrep import files
+from patacrep import encoding
 from patacrep.content import ContentError
 from patacrep.content.song import OnlySongsError, process_songs

@ -26,7 +27,7 @@ def normalize_string(string):
    - lower case;
    - passed through locale.strxfrm().
    """
-    return locale.strxfrm(string.lower().strip())
+    return locale.strxfrm(encoding.unidecode(string.lower().strip()))

 def normalize_field(field):
    """Return a normalized field, it being a string or a list of strings."""
@ -62,7 +63,7 @@ def key_generator(sort):
                                files.relpath(song.fullpath),
                                )
                            )
-                    field = ""
+                    field = u""
            songkey.append(normalize_field(field))
        return songkey
    return ordered_song_keys
--- a/patacrep/content/tex.py
+++ b/patacrep/content/tex.py
@ -18,7 +18,7 @@ class LaTeX(Content):
        self.filename = filename

    def render(self, context):
-        return r'\input{{{}}}'.format(files.relpath(
+        return ur'\input{{{}}}'.format(files.relpath(
            self.filename,
            os.path.dirname(context['filename']),
            ))
--- a/patacrep/data/examples/example_encoding.sb
+++ b/patacrep/data/examples/example_encoding.sb
@ -0,0 +1,24 @@
+{
+"bookoptions" : [
+    "importantdiagramonly",
+    "repeatchords",
+    "lilypond",
+    "pictures"
+  ],
+"booktype" : "chorded",
+"lang" : "french",
+"authwords" : {
+  "sep" : ["and", "et", "À"],
+  "ignore" : ["À"],
+  "after" : ["À"]
+  },
+  "titleprefixwords": ["À"],
+ "datadir" : ".",
+ "content" : [["section", "Traditional"],
+                "chevaliers_de_la_table_ronde.sg",
+                "greensleeves.sg",
+                "vent_frais.sg",
+                ["section", "Example"],
+                "example-fr.sg",
+                "example-en.sg"]
+}               
--- a/patacrep/data/templates/default.tex
+++ b/patacrep/data/templates/default.tex
@ -117,9 +117,11 @@
 (* block chords *)
   % list of chords
   \ifchorded
-   \phantomsection
-   \addcontentsline{toc}{section}{\chordlistname}
-   \chords
+      \ifdiagram
+         \phantomsection
+         \addcontentsline{toc}{section}{\chordlistname}
+         \chords
+      \fi
   \fi
 (* endblock *)

--- a/patacrep/encoding.py
+++ b/patacrep/encoding.py
@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""Dealing with encoding problems."""
+
+import codecs
+import chardet
+import logging
+from unidecode import unidecode as unidecode_orig
+
+LOGGER = logging.getLogger(__name__)
+
+def open_read(filename, mode='r'):
+    """Open a file for reading, guessing the right encoding.
+
+    Return a file object that yields unicode strings.
+    """
+    return codecs.open(
+            filename,
+            mode=mode,
+            encoding=chardet.detect(open(filename, "r").read())['encoding'],
+            errors='replace',
+            )
+
+def basestring2unicode(arg):
+    """Return the unicode version of the argument, guessing original encoding.
+    """
+    if isinstance(arg, unicode):
+        return arg
+    elif isinstance(arg, basestring):
+        return arg.decode(
+                encoding=chardet.detect(arg)['encoding'],
+                errors='replace',
+                )
+    else:
+        LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
+        return ""
+
+def list2unicode(arg):
+    """Return the unicode version of the argument, guessing original encoding.
+
+    Argument is a list of strings.  If an item is of another type, a warning
+    is logged and it is replaced by an empty string.
+    """
+    return [basestring2unicode(item) for item in arg]
+
+def unidecode(arg):
+    """Return a unicode version of a unidecoded string."""
+    return unicode(unidecode_orig(arg))
--- a/patacrep/index.py
+++ b/patacrep/index.py
@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
 from a file generated by the latex compilation of the songbook (.sxd).
 """

-from unidecode import unidecode
 import locale
 import re
-import codecs

 from patacrep import authors
+from patacrep import encoding
 from patacrep.plastex import simpleparse

 EOL = u"\n"

 # Pattern set to ignore latex command in title prefix
-KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE)
-FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
+KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
+FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)


 def sortkey(value):
@ -31,7 +30,7 @@ def sortkey(value):
    the sort with  latex escape sequences.
    """
    return locale.strxfrm(
-            unidecode(simpleparse(value).replace(' ', 'A')).lower()
+            encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
            )


@ -41,9 +40,14 @@ def process_sxd(filename):
    Return an Index object.
    """
    data = []
-    with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
+    index_file = None
+    try:
+        index_file = encoding.open_read(filename, 'r')
        for line in index_file:
-            data.append(line.strip().encode('utf-8'))
+            data.append(line.strip())
+    finally:
+        if index_file:
+            index_file.close()

    i = 1
    idx = Index(data[0])
@ -82,7 +86,7 @@ class Index(object):
    def get_first_letter(key):
        """Return the uppercase first letter of key."""
        letter = FIRST_LETTER_PATTERN.match(key).group(1)
-        if re.match(r'\d', letter):
+        if re.match(ur'\d', letter):
            letter = '0-9'
        return letter.upper()

@ -98,9 +102,9 @@ class Index(object):
            if 'prefix' in self.keywords:
                for prefix in self.keywords['prefix']:
                    self.prefix_patterns.append(re.compile(
-                            r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
-                            re.LOCALE
-                            ))
+                        ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
+                        re.LOCALE
+                        ))

        if self.indextype == "AUTHOR":
            self.authwords = authors.compile_authwords(self.keywords)
@ -126,10 +130,10 @@ class Index(object):
        if self.indextype == "TITLE":
            # Removing prefixes before titles
            for pattern in self.prefix_patterns:
-                match = pattern.match(key.encode('utf-8'))
+                match = pattern.match(key)
                if match:
                    self._raw_add(
-                            r"\indextitle{{{}}}{{{}}}".format(
+                            ur"\indextitle{{{}}}{{{}}}".format(
                                match.group(1).strip(),
                                (match.group(2) + match.group(3)).strip(),
                                ),
@ -149,12 +153,10 @@ class Index(object):
    @staticmethod
    def ref_to_str(ref):
        """Return the LaTeX code corresponding to the reference."""
-        return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
+        return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)

    def entry_to_str(self, key, entry):
        """Return the LaTeX code corresponding to the entry."""
-        if not isinstance(key, unicode):
-            key = unicode(key, "UTF-8")
        return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
                key,
                ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@ -166,10 +168,10 @@ class Index(object):
        Here, an index block is a letter, and all data beginning with this
        letter.
        """
-        string = r'\begin{idxblock}{' + letter + '}' + EOL
+        string = ur'\begin{idxblock}{' + letter + '}' + EOL
        for key in sorted(entries.keys(), key=sortkey):
            string += self.entry_to_str(key, entries[key])
-        string += r'\end{idxblock}' + EOL
+        string += ur'\end{idxblock}' + EOL
        return string

    def entries_to_str(self):
--- a/patacrep/plastex.py
+++ b/patacrep/plastex.py
@ -6,11 +6,11 @@
 from plasTeX.TeX import TeX
 from plasTeX.Base.LaTeX import Sentences

-import codecs
 import locale
 import os
 import sys

+from patacrep import encoding

 def process_unbr_spaces(node):
    #pylint: disable=line-too-long
@ -39,8 +39,6 @@ def simpleparse(text):
    """Parse a simple LaTeX string.
    """
    tex = TeX()
-    if not isinstance(text, unicode):
-        text = text.decode("utf-8")
    tex.input(text)
    doc = tex.parse()
    return process_unbr_spaces(doc.textContent)
@ -66,7 +64,7 @@ class SongParser(object):
    def parse(cls, filename):
        """Parse a TeX file, and return its plasTeX representation."""
        tex = cls.create_tex()
-        tex.input(codecs.open(filename, 'r', 'utf-8', 'replace'))
+        tex.input(encoding.open_read(filename, 'r'))
        return tex.parse()


--- a/patacrep/plastex_chord.py
+++ b/patacrep/plastex_chord.py
@ -74,7 +74,7 @@ class Chord(Command):
    @property
    def source(self):
        """Return chord LaTeX code."""
-        return r'\[{}]'.format(self.chord)
+        return ur'\[{}]'.format(self.chord)

 class BeginChordOrDisplayMath(BeginDisplayMath):
    r"""Wrapper to BeginDisplayMath
--- a/patacrep/plastex_songs.py
+++ b/patacrep/plastex_songs.py
@ -6,6 +6,7 @@

 import plasTeX

+from patacrep import encoding
 from patacrep.plastex import process_unbr_spaces


@ -28,8 +29,9 @@ def split_linebreak(texlist):
            return_list.append(current)
            current = []
        else:
-            current.append(
-                    process_unbr_spaces(token).textContent.encode('utf-8'))
+            current.append(encoding.basestring2unicode(
+                process_unbr_spaces(token).textContent
+                ))
    if current:
        return_list.append(current)
    return return_list
@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
        titles = []
        for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
            titles.append("".join(tokens))
-        self.attributes['titles'] = titles
+        self.attributes['titles'] = encoding.list2unicode(titles)

        # Parsing keyval arguments
        args = {}
        for (key, val) in self.attributes['args'].iteritems():
            if isinstance(val, plasTeX.DOM.Element):
-                args[key] = process_unbr_spaces(val).textContent.encode('utf-8')
+                args[key] = encoding.basestring2unicode(
+                        process_unbr_spaces(val).textContent
+                        )
            elif isinstance(val, basestring):
-                args[key] = val.encode('utf-8')
+                args[key] = encoding.basestring2unicode(val)
            else:
                args[key] = unicode(val)
        self.attributes['args'] = args
--- a/patacrep/songs.py
+++ b/patacrep/songs.py
@ -3,9 +3,9 @@

 """Song management."""

-from unidecode import unidecode
 import errno
 import hashlib
+import logging
 import os
 import re

@ -17,10 +17,11 @@ except ImportError:
 from patacrep.authors import processauthors
 from patacrep.plastex import parsetex

+LOGGER = logging.getLogger(__name__)

 def cached_name(datadir, filename):
    """Return the filename of the cache version of the file."""
-    fullpath = os.path.join(datadir, '.cache', filename)
+    fullpath = os.path.abspath(os.path.join(datadir, '.cache', filename))
    directory = os.path.dirname(fullpath)
    try:
        os.makedirs(directory)
@ -96,14 +97,22 @@ class Song(object):
                    open(self.fullpath, 'rb').read()
                    ).hexdigest()
            if os.path.exists(cached_name(datadir, subpath)):
-                cached = pickle.load(open(cached_name(datadir, subpath), 'rb'))
-                if (
-                        cached['_filehash'] == self._filehash
-                        and cached['_version'] == self.CACHE_VERSION
-                        ):
-                    for attribute in self.cached_attributes:
-                        setattr(self, attribute, cached[attribute])
-                    return
+                try:
+                    cached = pickle.load(open(
+                        cached_name(datadir, subpath),
+                        'rb',
+                        ))
+                    if (
+                            cached['_filehash'] == self._filehash
+                            and cached['_version'] == self.CACHE_VERSION
+                            ):
+                        for attribute in self.cached_attributes:
+                            setattr(self, attribute, cached[attribute])
+                        return
+                except: # pylint: disable=bare-except
+                    LOGGER.warning("Could not use cached version of {}.".format(
+                        self.fullpath
+                        ))

        # Data extraction from the song with plastex
        data = parsetex(self.fullpath)
@ -111,7 +120,7 @@ class Song(object):
        self.datadir = datadir
        self.unprefixed_titles = [
                unprefixed_title(
-                    unidecode(unicode(title, "utf-8")),
+                    title,
                    config['titleprefixwords']
                    )
                for title
@ -136,7 +145,14 @@ class Song(object):
        if self.datadir:
            cached = {}
            for attribute in self.cached_attributes:
-                cached[attribute] = getattr(self, attribute)
+                if attribute == "args":
+                    cached[attribute] = dict([
+                        (key, u"{}".format(value)) # Force conversion to unicode
+                        for (key, value)
+                        in self.args.iteritems()
+                        ])
+                else:
+                    cached[attribute] = getattr(self, attribute)
            pickle.dump(
                    cached,
                    open(cached_name(self.datadir, self.subpath), 'wb'),
@ -149,7 +165,7 @@ def unprefixed_title(title, prefixes):
    """Remove the first prefix of the list in the beginning of title (if any).
    """
    for prefix in prefixes:
-        match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
+        match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
        if match:
            return match.group(2)
    return title
--- a/patacrep/templates.py
+++ b/patacrep/templates.py
@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
        TemplateNotFound, nodes
 from jinja2.ext import Extension
 from jinja2.meta import find_referenced_templates as find_templates
-import codecs
 import os
 import re
 import json

+from patacrep import encoding
 from patacrep import errors

 _LATEX_SUBS = (
-    (re.compile(r'\\'), r'\\textbackslash'),
-    (re.compile(r'([{}_#%&$])'), r'\\\1'),
-    (re.compile(r'~'), r'\~{}'),
-    (re.compile(r'\^'), r'\^{}'),
-    (re.compile(r'"'), r"''"),
-    (re.compile(r'\.\.\.+'), r'\\ldots'),
+    (re.compile(ur'\\'), ur'\\textbackslash'),
+    (re.compile(ur'([{}_#%&$])'), ur'\\\1'),
+    (re.compile(ur'~'), ur'\~{}'),
+    (re.compile(ur'\^'), ur'\^{}'),
+    (re.compile(ur'"'), ur"''"),
+    (re.compile(ur'\.\.\.+'), ur'\\ldots'),
 )

-_VARIABLE_REGEXP = re.compile(r"""
+_VARIABLE_REGEXP = re.compile(ur"""
    \(\*\ *variables\ *\*\)    # Match (* variables *)
    (                          # Match and capture the following:
    (?:                        # Start of non-capturing group, used to match a single character
@ -177,12 +177,10 @@ class TexRenderer(object):
        """

        subvariables = {}
+        template_file = None
        templatename = self.texenv.get_template(template).filename
-        with codecs.open(
-                templatename,
-                'r',
-                'utf-8'
-                ) as template_file:
+        try:
+            template_file = encoding.open_read(templatename, 'r')
            content = template_file.read()
            subtemplates = list(find_templates(self.texenv.parse(content)))
            match = re.findall(_VARIABLE_REGEXP, content)
@ -202,6 +200,9 @@ class TexRenderer(object):
                                    jsonstring=var,
                                    )
                                )
+        finally:
+            if template_file:
+                template_file.close()

        return (subvariables, subtemplates)

--- a/15
+++ b/15
@ -15,6 +15,7 @@ import sys
 from patacrep.build import SongbookBuilder, DEFAULT_STEPS
 from patacrep import __STR_VERSION__
 from patacrep import errors
+from patacrep import encoding

 # Logging configuration
 logging.basicConfig(level=logging.INFO)
@ -100,13 +101,17 @@ def main():

    basename = os.path.basename(songbook_path)[:-3]

+    songbook_file = None
    try:
-        with open(songbook_path) as songbook_file:
-            songbook = json.load(songbook_file)
+        songbook_file = encoding.open_read(songbook_path)
+        songbook = json.load(songbook_file)
    except Exception as error: # pylint: disable=broad-except
        LOGGER.error(error)
        LOGGER.error("Error while loading file '{}'.".format(songbook_path))
        sys.exit(1)
+    finally:
+        if songbook_file:
+            songbook_file.close()

    # Gathering datadirs
    datadirs = []
@ -124,9 +129,9 @@ def main():
                    )
                for path in songbook['datadir']
                ]
-    if not datadirs:
-        # Default value
-        datadirs = [os.path.dirname(os.path.abspath(songbook_path))]
+    # Default value
+    datadirs.append(os.path.dirname(os.path.abspath(songbook_path)))
+
    songbook['datadir'] = datadirs

    try: