Browse Source

Sanitize encoding of manipulated strings

Every manipulated string is unicode.

* We guess the encoding of the files we read before opening them, and
  the strings read from them are converted to unicode.
* We guess the encoding of strings obtained from other modules (plasTeX),
  and they are converted to unicode.
pull/54/head
Louis 10 years ago
parent
commit
21d4f0c245
  1. 22
      patacrep/authors.py
  2. 2
      patacrep/content/__init__.py
  3. 4
      patacrep/content/section.py
  4. 6
      patacrep/content/song.py
  5. 2
      patacrep/content/songsection.py
  6. 5
      patacrep/content/sorted.py
  7. 2
      patacrep/content/tex.py
  8. 24
      patacrep/data/examples/example_encoding.sb
  9. 49
      patacrep/encoding.py
  10. 36
      patacrep/index.py
  11. 6
      patacrep/plastex.py
  12. 2
      patacrep/plastex_chord.py
  13. 14
      patacrep/plastex_songs.py
  14. 5
      patacrep/songs.py
  15. 25
      patacrep/templates.py
  16. 7
      songbook

22
patacrep/authors.py

@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = {
"sep": ["and"], "sep": ["and"],
} }
def to_utf8(string):
    """Convert a string (unicode, or bytes encoded in iso-8859-1) to utf-8.

    Return the utf-8 encoded byte string. An argument that is neither
    `unicode` nor `str` is encoded through its own `encode` method; if that
    fails, a warning is logged and an empty string is returned.
    """
    if isinstance(string, unicode):
        return string.encode('utf-8')
    if isinstance(string, str):
        # Byte strings are assumed to be iso-8859-1 encoded.
        return string.decode('iso-8859-1').encode('utf-8')
    try:
        return string.encode('utf-8')
    except Exception:  # pylint: disable=broad-except
        # Best effort: any failure to encode means the word is dropped, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        LOGGER.warning("Ignoring a word I can not decode...")
        return ""
def compile_authwords(authwords): def compile_authwords(authwords):
"""Convert strings of authwords to compiled regular expressions. """Convert strings of authwords to compiled regular expressions.
@ -46,10 +33,7 @@ def compile_authwords(authwords):
re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
for word in ([" %s" % word for word in authwords['sep']] + [',']) for word in ([" %s" % word for word in authwords['sep']] + [','])
] ]
authwords['ignore'] = [ authwords['ignore'] = [word for word in authwords['ignore'] if word]
to_utf8(word)
for word in authwords['ignore'] if to_utf8(word)
]
return authwords return authwords
@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore):
for author in authors_list: for author in authors_list:
ignored = False ignored = False
for ignoreword in ignore: for ignoreword in ignore:
if author.find(str(ignoreword)) != -1: if author.find(ignoreword) != -1:
ignored = True ignored = True
break break
if not ignored: if not ignored:
@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list):
for author in authors_list: for author in authors_list:
first, last = split_author_names(author) first, last = split_author_names(author)
if first: if first:
dest.append(r"\indexauthor{{{first}}}{{{last}}}".format( dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
first=first.strip(), first=first.strip(),
last=last.strip(), last=last.strip(),
)) ))

2
patacrep/content/__init__.py

@ -225,7 +225,7 @@ def process_content(content, config=None):
""" """
contentlist = [] contentlist = []
plugins = load_plugins(config) plugins = load_plugins(config)
keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$') keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
if not content: if not content:
content = [["song"]] content = [["song"]]
for elem in content: for elem in content:

4
patacrep/content/section.py

@ -26,9 +26,9 @@ class Section(Content):
def render(self, __context): def render(self, __context):
if self.short is None: if self.short is None:
return r'\{}{{{}}}'.format(self.keyword, self.name) return ur'\{}{{{}}}'.format(self.keyword, self.name)
else: else:
return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name) return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
#pylint: disable=unused-argument #pylint: disable=unused-argument
def parse(keyword, argument, contentlist, config): def parse(keyword, argument, contentlist, config):

6
patacrep/content/song.py

@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
indexes = context.resolve("indexes") indexes = context.resolve("indexes")
if isinstance(indexes, jinja2.runtime.Undefined): if isinstance(indexes, jinja2.runtime.Undefined):
indexes = "" indexes = ""
return r'\begin{songs}{%s}' % indexes return ur'\begin{songs}{%s}' % indexes
def end_block(self, __context): def end_block(self, __context):
"""Return the string to end a block.""" """Return the string to end a block."""
return r'\end{songs}' return ur'\end{songs}'
def render(self, context): def render(self, context):
"""Return the string that will render the song.""" """Return the string that will render the song."""
return r'\input{{{}}}'.format(files.relpath( return ur'\input{{{}}}'.format(files.relpath(
self.path, self.path,
os.path.dirname(context['filename']) os.path.dirname(context['filename'])
)) ))

2
patacrep/content/songsection.py

@ -19,7 +19,7 @@ class SongSection(Content):
def render(self, __context): def render(self, __context):
"""Render this section or chapter.""" """Render this section or chapter."""
return r'\{}{{{}}}'.format(self.keyword, self.name) return ur'\{}{{{}}}'.format(self.keyword, self.name)
#pylint: disable=unused-argument #pylint: disable=unused-argument
def parse(keyword, argument, contentlist, config): def parse(keyword, argument, contentlist, config):

5
patacrep/content/sorted.py

@ -11,6 +11,7 @@ import locale
import logging import logging
from patacrep import files from patacrep import files
from patacrep import encoding
from patacrep.content import ContentError from patacrep.content import ContentError
from patacrep.content.song import OnlySongsError, process_songs from patacrep.content.song import OnlySongsError, process_songs
@ -26,7 +27,7 @@ def normalize_string(string):
- lower case; - lower case;
- passed through locale.strxfrm(). - passed through locale.strxfrm().
""" """
return locale.strxfrm(string.lower().strip()) return locale.strxfrm(encoding.unidecode(string.lower().strip()))
def normalize_field(field): def normalize_field(field):
"""Return a normalized field, it being a string or a list of strings.""" """Return a normalized field, it being a string or a list of strings."""
@ -62,7 +63,7 @@ def key_generator(sort):
files.relpath(song.path), files.relpath(song.path),
) )
) )
field = "" field = u""
songkey.append(normalize_field(field)) songkey.append(normalize_field(field))
return songkey return songkey
return ordered_song_keys return ordered_song_keys

2
patacrep/content/tex.py

@ -18,7 +18,7 @@ class LaTeX(Content):
self.filename = filename self.filename = filename
def render(self, context): def render(self, context):
return r'\input{{{}}}'.format(files.relpath( return ur'\input{{{}}}'.format(files.relpath(
self.filename, self.filename,
os.path.dirname(context['filename']), os.path.dirname(context['filename']),
)) ))

24
patacrep/data/examples/example_encoding.sb

@ -0,0 +1,24 @@
{
"bookoptions" : [
"importantdiagramonly",
"repeatchords",
"lilypond",
"pictures"
],
"booktype" : "chorded",
"lang" : "french",
"authwords" : {
"sep" : ["and", "et", "À"],
"ignore" : ["À"],
"after" : ["À"]
},
"titleprefixwords": ["À"],
"datadir" : ".",
"content" : [["section", "Traditional"],
"chevaliers_de_la_table_ronde.sg",
"greensleeves.sg",
"vent_frais.sg",
["section", "Example"],
"example-fr.sg",
"example-en.sg"]
}

49
patacrep/encoding.py

@ -0,0 +1,49 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Dealing with encoding problems."""
import codecs
import chardet
import logging
from unidecode import unidecode as unidecode_orig
LOGGER = logging.getLogger(__name__)
def open_read(filename, mode='r'):
    """Open a file for reading, guessing the right encoding.

    The whole file is read once so that chardet can guess its encoding, then
    it is reopened through `codecs.open` with that encoding; undecodable
    bytes are replaced (errors='replace').

    Return a fileobject, reading unicode strings. The caller is responsible
    for closing it.
    """
    # Probe in binary mode so chardet sees the raw bytes, and close the probe
    # handle deterministically instead of leaving it to the garbage collector.
    with open(filename, "rb") as probe:
        guessed = chardet.detect(probe.read())['encoding']
    return codecs.open(
        filename,
        mode=mode,
        # chardet reports None when it cannot guess (e.g. an empty file).
        # With encoding=None, codecs.open would hand back a plain file that
        # yields byte strings, so fall back to utf-8 to keep the promise of
        # unicode output.
        encoding=guessed or 'utf-8',
        errors='replace',
    )
def basestring2unicode(arg):
    """Return the unicode version of the argument, guessing original encoding.

    - A `unicode` argument is returned unchanged.
    - A byte string is decoded with the encoding guessed by chardet
      (utf-8 if chardet cannot guess); undecodable bytes are replaced.
    - Anything else triggers a warning, and an empty unicode string is
      returned.
    """
    if isinstance(arg, unicode):
        return arg
    if isinstance(arg, basestring):
        # chardet yields None for empty or undetectable input, and
        # decode(encoding=None) raises TypeError: fall back to utf-8.
        guessed = chardet.detect(arg)['encoding'] or 'utf-8'
        return arg.decode(
            encoding=guessed,
            errors='replace',
        )
    LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
    # Return unicode here too, so that every code path yields the same type.
    return u""
def list2unicode(arg):
    """Convert every item of a list to unicode, guessing original encodings.

    Each item of `arg` is passed to `basestring2unicode`, and the results are
    returned as a new list, in the same order. Items that are not strings are
    handled by `basestring2unicode` (which logs a warning and yields an empty
    string for them).
    """
    converted = []
    for element in arg:
        converted.append(basestring2unicode(element))
    return converted
def unidecode(arg):
    """Return the result of the original `unidecode`, as a unicode string."""
    transliterated = unidecode_orig(arg)
    return unicode(transliterated)

36
patacrep/index.py

@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
from a file generated by the latex compilation of the songbook (.sxd). from a file generated by the latex compilation of the songbook (.sxd).
""" """
from unidecode import unidecode
import locale import locale
import re import re
import codecs
from patacrep import authors from patacrep import authors
from patacrep import encoding
from patacrep.plastex import simpleparse from patacrep.plastex import simpleparse
EOL = u"\n" EOL = u"\n"
# Pattern set to ignore latex command in title prefix # Pattern set to ignore latex command in title prefix
KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE) KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE) FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
def sortkey(value): def sortkey(value):
@ -31,7 +30,7 @@ def sortkey(value):
the sort with latex escape sequences. the sort with latex escape sequences.
""" """
return locale.strxfrm( return locale.strxfrm(
unidecode(simpleparse(value).replace(' ', 'A')).lower() encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
) )
@ -41,9 +40,12 @@ def process_sxd(filename):
Return an Index object. Return an Index object.
""" """
data = [] data = []
with codecs.open(filename, 'r', 'iso-8859-1') as index_file: try:
index_file = encoding.open_read(filename, 'r')
for line in index_file: for line in index_file:
data.append(line.strip().encode('utf-8')) data.append(line.strip())
finally:
index_file.close()
i = 1 i = 1
idx = Index(data[0]) idx = Index(data[0])
@ -82,7 +84,7 @@ class Index(object):
def get_first_letter(key): def get_first_letter(key):
"""Return the uppercase first letter of key.""" """Return the uppercase first letter of key."""
letter = FIRST_LETTER_PATTERN.match(key).group(1) letter = FIRST_LETTER_PATTERN.match(key).group(1)
if re.match(r'\d', letter): if re.match(ur'\d', letter):
letter = '0-9' letter = '0-9'
return letter.upper() return letter.upper()
@ -98,9 +100,9 @@ class Index(object):
if 'prefix' in self.keywords: if 'prefix' in self.keywords:
for prefix in self.keywords['prefix']: for prefix in self.keywords['prefix']:
self.prefix_patterns.append(re.compile( self.prefix_patterns.append(re.compile(
r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix), ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
re.LOCALE re.LOCALE
)) ))
if self.indextype == "AUTHOR": if self.indextype == "AUTHOR":
self.authwords = authors.compile_authwords(self.keywords) self.authwords = authors.compile_authwords(self.keywords)
@ -126,10 +128,10 @@ class Index(object):
if self.indextype == "TITLE": if self.indextype == "TITLE":
# Removing prefixes before titles # Removing prefixes before titles
for pattern in self.prefix_patterns: for pattern in self.prefix_patterns:
match = pattern.match(key.encode('utf-8')) match = pattern.match(key)
if match: if match:
self._raw_add( self._raw_add(
r"\indextitle{{{}}}{{{}}}".format( ur"\indextitle{{{}}}{{{}}}".format(
match.group(1).strip(), match.group(1).strip(),
(match.group(2) + match.group(3)).strip(), (match.group(2) + match.group(3)).strip(),
), ),
@ -149,12 +151,10 @@ class Index(object):
@staticmethod @staticmethod
def ref_to_str(ref): def ref_to_str(ref):
"""Return the LaTeX code corresponding to the reference.""" """Return the LaTeX code corresponding to the reference."""
return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref) return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
def entry_to_str(self, key, entry): def entry_to_str(self, key, entry):
"""Return the LaTeX code corresponding to the entry.""" """Return the LaTeX code corresponding to the entry."""
if not isinstance(key, unicode):
key = unicode(key, "UTF-8")
return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format( return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
key, key,
ur'\\'.join([self.ref_to_str(ref) for ref in entry]), ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@ -166,10 +166,10 @@ class Index(object):
Here, an index block is a letter, and all data beginning with this Here, an index block is a letter, and all data beginning with this
letter. letter.
""" """
string = r'\begin{idxblock}{' + letter + '}' + EOL string = ur'\begin{idxblock}{' + letter + '}' + EOL
for key in sorted(entries.keys(), key=sortkey): for key in sorted(entries.keys(), key=sortkey):
string += self.entry_to_str(key, entries[key]) string += self.entry_to_str(key, entries[key])
string += r'\end{idxblock}' + EOL string += ur'\end{idxblock}' + EOL
return string return string
def entries_to_str(self): def entries_to_str(self):

6
patacrep/plastex.py

@ -6,11 +6,11 @@
from plasTeX.TeX import TeX from plasTeX.TeX import TeX
from plasTeX.Base.LaTeX import Sentences from plasTeX.Base.LaTeX import Sentences
import codecs
import locale import locale
import os import os
import sys import sys
from patacrep import encoding
def process_unbr_spaces(node): def process_unbr_spaces(node):
#pylint: disable=line-too-long #pylint: disable=line-too-long
@ -39,8 +39,6 @@ def simpleparse(text):
"""Parse a simple LaTeX string. """Parse a simple LaTeX string.
""" """
tex = TeX() tex = TeX()
if not isinstance(text, unicode):
text = text.decode("utf-8")
tex.input(text) tex.input(text)
doc = tex.parse() doc = tex.parse()
return process_unbr_spaces(doc.textContent) return process_unbr_spaces(doc.textContent)
@ -66,7 +64,7 @@ class SongParser(object):
def parse(cls, filename): def parse(cls, filename):
"""Parse a TeX file, and return its plasTeX representation.""" """Parse a TeX file, and return its plasTeX representation."""
tex = cls.create_tex() tex = cls.create_tex()
tex.input(codecs.open(filename, 'r', 'utf-8', 'replace')) tex.input(encoding.open_read(filename, 'r'))
return tex.parse() return tex.parse()

2
patacrep/plastex_chord.py

@ -74,7 +74,7 @@ class Chord(Command):
@property @property
def source(self): def source(self):
"""Return chord LaTeX code.""" """Return chord LaTeX code."""
return r'\[{}]'.format(self.chord) return ur'\[{}]'.format(self.chord)
class BeginChordOrDisplayMath(BeginDisplayMath): class BeginChordOrDisplayMath(BeginDisplayMath):
r"""Wrapper to BeginDisplayMath r"""Wrapper to BeginDisplayMath

14
patacrep/plastex_songs.py

@ -6,6 +6,7 @@
import plasTeX import plasTeX
from patacrep import encoding
from patacrep.plastex import process_unbr_spaces from patacrep.plastex import process_unbr_spaces
@ -28,8 +29,9 @@ def split_linebreak(texlist):
return_list.append(current) return_list.append(current)
current = [] current = []
else: else:
current.append( current.append(encoding.basestring2unicode(
process_unbr_spaces(token).textContent.encode('utf-8')) process_unbr_spaces(token).textContent
))
if current: if current:
return_list.append(current) return_list.append(current)
return return_list return return_list
@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
titles = [] titles = []
for tokens in split_linebreak(self.attributes['titles'].allChildNodes): for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
titles.append("".join(tokens)) titles.append("".join(tokens))
self.attributes['titles'] = titles self.attributes['titles'] = encoding.list2unicode(titles)
# Parsing keyval arguments # Parsing keyval arguments
args = {} args = {}
for (key, val) in self.attributes['args'].iteritems(): for (key, val) in self.attributes['args'].iteritems():
if isinstance(val, plasTeX.DOM.Element): if isinstance(val, plasTeX.DOM.Element):
args[key] = process_unbr_spaces(val).textContent.encode('utf-8') args[key] = encoding.basestring2unicode(
process_unbr_spaces(val).textContent
)
elif isinstance(val, basestring): elif isinstance(val, basestring):
args[key] = val.encode('utf-8') args[key] = encoding.basestring2unicode(val)
else: else:
args[key] = unicode(val) args[key] = unicode(val)
self.attributes['args'] = args self.attributes['args'] = args

5
patacrep/songs.py

@ -3,7 +3,6 @@
"""Song management.""" """Song management."""
from unidecode import unidecode
import re import re
from patacrep.authors import processauthors from patacrep.authors import processauthors
@ -19,7 +18,7 @@ class Song(object):
self.titles = data['titles'] self.titles = data['titles']
self.unprefixed_titles = [ self.unprefixed_titles = [
unprefixed_title( unprefixed_title(
unidecode(unicode(title, "utf-8")), title,
config['titleprefixwords'] config['titleprefixwords']
) )
for title for title
@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes):
"""Remove the first prefix of the list in the beginning of title (if any). """Remove the first prefix of the list in the beginning of title (if any).
""" """
for prefix in prefixes: for prefix in prefixes:
match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title) match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
if match: if match:
return match.group(2) return match.group(2)
return title return title

25
patacrep/templates.py

@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
TemplateNotFound, nodes TemplateNotFound, nodes
from jinja2.ext import Extension from jinja2.ext import Extension
from jinja2.meta import find_referenced_templates as find_templates from jinja2.meta import find_referenced_templates as find_templates
import codecs
import os import os
import re import re
import json import json
from patacrep import encoding
from patacrep import errors from patacrep import errors
_LATEX_SUBS = ( _LATEX_SUBS = (
(re.compile(r'\\'), r'\\textbackslash'), (re.compile(ur'\\'), ur'\\textbackslash'),
(re.compile(r'([{}_#%&$])'), r'\\\1'), (re.compile(ur'([{}_#%&$])'), ur'\\\1'),
(re.compile(r'~'), r'\~{}'), (re.compile(ur'~'), ur'\~{}'),
(re.compile(r'\^'), r'\^{}'), (re.compile(ur'\^'), ur'\^{}'),
(re.compile(r'"'), r"''"), (re.compile(ur'"'), ur"''"),
(re.compile(r'\.\.\.+'), r'\\ldots'), (re.compile(ur'\.\.\.+'), ur'\\ldots'),
) )
_VARIABLE_REGEXP = re.compile(r""" _VARIABLE_REGEXP = re.compile(ur"""
\(\*\ *variables\ *\*\) # Match (* variables *) \(\*\ *variables\ *\*\) # Match (* variables *)
( # Match and capture the following: ( # Match and capture the following:
(?: # Start of non-capturing group, used to match a single character (?: # Start of non-capturing group, used to match a single character
@ -178,11 +178,8 @@ class TexRenderer(object):
subvariables = {} subvariables = {}
templatename = self.texenv.get_template(template).filename templatename = self.texenv.get_template(template).filename
with codecs.open( try:
templatename, template_file = encoding.open_read(templatename, 'r')
'r',
'utf-8'
) as template_file:
content = template_file.read() content = template_file.read()
subtemplates = list(find_templates(self.texenv.parse(content))) subtemplates = list(find_templates(self.texenv.parse(content)))
match = re.findall(_VARIABLE_REGEXP, content) match = re.findall(_VARIABLE_REGEXP, content)
@ -202,6 +199,8 @@ class TexRenderer(object):
jsonstring=var, jsonstring=var,
) )
) )
finally:
template_file.close()
return (subvariables, subtemplates) return (subvariables, subtemplates)

7
songbook

@ -15,6 +15,7 @@ import sys
from patacrep.build import SongbookBuilder, DEFAULT_STEPS from patacrep.build import SongbookBuilder, DEFAULT_STEPS
from patacrep import __STR_VERSION__ from patacrep import __STR_VERSION__
from patacrep import errors from patacrep import errors
from patacrep import encoding
# Logging configuration # Logging configuration
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
@ -101,12 +102,14 @@ def main():
basename = os.path.basename(songbook_path)[:-3] basename = os.path.basename(songbook_path)[:-3]
try: try:
with open(songbook_path) as songbook_file: songbook_file = encoding.open_read(songbook_path)
songbook = json.load(songbook_file) songbook = json.load(songbook_file)
except Exception as error: # pylint: disable=broad-except except Exception as error: # pylint: disable=broad-except
LOGGER.error(error) LOGGER.error(error)
LOGGER.error("Error while loading file '{}'.".format(songbook_path)) LOGGER.error("Error while loading file '{}'.".format(songbook_path))
sys.exit(1) sys.exit(1)
finally:
songbook_file.close()
# Gathering datadirs # Gathering datadirs
datadirs = [] datadirs = []

Loading…
Cancel
Save