Browse Source

Sanitize encoding of manipulated strings

Every manipulated string is unicode.

* We guess the encoding of the files we read before opening them, and
  the strings read from them are converted to unicode.
* We guess the encoding of strings obtained from other modules (plasTeX),
  and they are converted to unicode.
pull/54/head
Louis 10 years ago
parent
commit
21d4f0c245
  1. 22
      patacrep/authors.py
  2. 2
      patacrep/content/__init__.py
  3. 4
      patacrep/content/section.py
  4. 6
      patacrep/content/song.py
  5. 2
      patacrep/content/songsection.py
  6. 5
      patacrep/content/sorted.py
  7. 2
      patacrep/content/tex.py
  8. 24
      patacrep/data/examples/example_encoding.sb
  9. 49
      patacrep/encoding.py
  10. 36
      patacrep/index.py
  11. 6
      patacrep/plastex.py
  12. 2
      patacrep/plastex_chord.py
  13. 14
      patacrep/plastex_songs.py
  14. 5
      patacrep/songs.py
  15. 25
      patacrep/templates.py
  16. 7
      songbook

22
patacrep/authors.py

@ -14,19 +14,6 @@ DEFAULT_AUTHWORDS = {
"sep": ["and"],
}
def to_utf8(string):
"""Convert a string (encoded in unicode or iso-8859-1 to utf-8"""
if type(string) is unicode:
return string.encode('utf-8')
elif type(string) is str:
return string.decode('iso-8859-1').encode('utf-8')
else:
try:
return string.encode('utf-8')
except:
LOGGER.warning("Ignoring a word I can not decode...")
return ""
def compile_authwords(authwords):
"""Convert strings of authwords to compiled regular expressions.
@ -46,10 +33,7 @@ def compile_authwords(authwords):
re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
for word in ([" %s" % word for word in authwords['sep']] + [','])
]
authwords['ignore'] = [
to_utf8(word)
for word in authwords['ignore'] if to_utf8(word)
]
authwords['ignore'] = [word for word in authwords['ignore'] if word]
return authwords
@ -160,7 +144,7 @@ def processauthors_ignore_authors(authors_list, ignore):
for author in authors_list:
ignored = False
for ignoreword in ignore:
if author.find(str(ignoreword)) != -1:
if author.find(ignoreword) != -1:
ignored = True
break
if not ignored:
@ -188,7 +172,7 @@ def processauthors_invert_names(authors_list):
for author in authors_list:
first, last = split_author_names(author)
if first:
dest.append(r"\indexauthor{{{first}}}{{{last}}}".format(
dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format(
first=first.strip(),
last=last.strip(),
))

2
patacrep/content/__init__.py

@ -225,7 +225,7 @@ def process_content(content, config=None):
"""
contentlist = []
plugins = load_plugins(config)
keyword_re = re.compile(r'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
keyword_re = re.compile(ur'^ *(?P<keyword>\w*) *(\((?P<argument>.*)\))? *$')
if not content:
content = [["song"]]
for elem in content:

4
patacrep/content/section.py

@ -26,9 +26,9 @@ class Section(Content):
def render(self, __context):
if self.short is None:
return r'\{}{{{}}}'.format(self.keyword, self.name)
return ur'\{}{{{}}}'.format(self.keyword, self.name)
else:
return r'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
return ur'\{}[{}]{{{}}}'.format(self.keyword, self.short, self.name)
#pylint: disable=unused-argument
def parse(keyword, argument, contentlist, config):

6
patacrep/content/song.py

@ -26,15 +26,15 @@ class SongRenderer(Content, Song):
indexes = context.resolve("indexes")
if isinstance(indexes, jinja2.runtime.Undefined):
indexes = ""
return r'\begin{songs}{%s}' % indexes
return ur'\begin{songs}{%s}' % indexes
def end_block(self, __context):
"""Return the string to end a block."""
return r'\end{songs}'
return ur'\end{songs}'
def render(self, context):
"""Return the string that will render the song."""
return r'\input{{{}}}'.format(files.relpath(
return ur'\input{{{}}}'.format(files.relpath(
self.path,
os.path.dirname(context['filename'])
))

2
patacrep/content/songsection.py

@ -19,7 +19,7 @@ class SongSection(Content):
def render(self, __context):
"""Render this section or chapter."""
return r'\{}{{{}}}'.format(self.keyword, self.name)
return ur'\{}{{{}}}'.format(self.keyword, self.name)
#pylint: disable=unused-argument
def parse(keyword, argument, contentlist, config):

5
patacrep/content/sorted.py

@ -11,6 +11,7 @@ import locale
import logging
from patacrep import files
from patacrep import encoding
from patacrep.content import ContentError
from patacrep.content.song import OnlySongsError, process_songs
@ -26,7 +27,7 @@ def normalize_string(string):
- lower case;
- passed through locale.strxfrm().
"""
return locale.strxfrm(string.lower().strip())
return locale.strxfrm(encoding.unidecode(string.lower().strip()))
def normalize_field(field):
"""Return a normalized field, it being a string or a list of strings."""
@ -62,7 +63,7 @@ def key_generator(sort):
files.relpath(song.path),
)
)
field = ""
field = u""
songkey.append(normalize_field(field))
return songkey
return ordered_song_keys

2
patacrep/content/tex.py

@ -18,7 +18,7 @@ class LaTeX(Content):
self.filename = filename
def render(self, context):
return r'\input{{{}}}'.format(files.relpath(
return ur'\input{{{}}}'.format(files.relpath(
self.filename,
os.path.dirname(context['filename']),
))

24
patacrep/data/examples/example_encoding.sb

@ -0,0 +1,24 @@
{
"bookoptions" : [
"importantdiagramonly",
"repeatchords",
"lilypond",
"pictures"
],
"booktype" : "chorded",
"lang" : "french",
"authwords" : {
"sep" : ["and", "et", "À"],
"ignore" : ["À"],
"after" : ["À"]
},
"titleprefixwords": ["À"],
"datadir" : ".",
"content" : [["section", "Traditional"],
"chevaliers_de_la_table_ronde.sg",
"greensleeves.sg",
"vent_frais.sg",
["section", "Example"],
"example-fr.sg",
"example-en.sg"]
}

49
patacrep/encoding.py

@ -0,0 +1,49 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Dealing with encoding problems."""
import codecs
import chardet
import logging
from unidecode import unidecode as unidecode_orig
LOGGER = logging.getLogger(__name__)
def open_read(filename, mode='r'):
    """Open a file for reading, guessing its encoding.

    The raw content of the file is read once so that chardet can guess its
    encoding; the file is then reopened with codecs.open() using that
    encoding. Undecodable bytes are replaced instead of raising an error.

    Arguments:
    - filename: path of the file to open;
    - mode: mode passed to codecs.open() (default: 'r').

    Return a file object reading unicode strings. The caller is responsible
    for closing it.
    """
    # Read the raw bytes in binary mode, and close the handle right away so
    # that the detection pass does not leak a file descriptor.
    with open(filename, "rb") as raw_file:
        guess = chardet.detect(raw_file.read())
    # chardet reports None when it cannot guess (e.g. empty file). With
    # encoding=None, codecs.open() would return a plain byte-oriented file
    # instead of one reading unicode, so fall back to utf-8.
    return codecs.open(
        filename,
        mode=mode,
        encoding=guess['encoding'] or 'utf-8',
        errors='replace',
    )
def basestring2unicode(arg):
    """Return the unicode version of the argument, guessing its encoding.

    - A unicode string is returned unchanged.
    - A byte string is decoded using the encoding guessed by chardet
      (utf-8 if chardet cannot guess); undecodable bytes are replaced.
    - Anything else is not a string: a warning is logged, and an empty
      unicode string is returned.
    """
    if isinstance(arg, unicode):
        return arg
    if isinstance(arg, basestring):
        # chardet reports None when it cannot guess (e.g. empty string), and
        # decode() does not accept None as an encoding: fall back to utf-8.
        guessed = chardet.detect(arg)['encoding'] or 'utf-8'
        return arg.decode(guessed, 'replace')
    LOGGER.warning("Cannot decode string {}. Ignored.".format(str(arg)))
    # Return a unicode object, like every other branch of this function.
    return u""
def list2unicode(arg):
    """Convert every item of a list to unicode, guessing original encodings.

    Each item of `arg` is passed through basestring2unicode(), and the
    results are returned as a new list, in the same order.
    """
    converted = []
    for item in arg:
        converted.append(basestring2unicode(item))
    return converted
def unidecode(arg):
    """Apply the original unidecode() to `arg`; return the result as unicode.
    """
    transliterated = unidecode_orig(arg)
    return unicode(transliterated)

36
patacrep/index.py

@ -8,19 +8,18 @@ the original makeindex program written in C that produces an index file (.sbx)
from a file generated by the latex compilation of the songbook (.sxd).
"""
from unidecode import unidecode
import locale
import re
import codecs
from patacrep import authors
from patacrep import encoding
from patacrep.plastex import simpleparse
EOL = u"\n"
# Pattern set to ignore latex command in title prefix
KEYWORD_PATTERN = re.compile(r"^%(\w+)\s?(.*)$", re.LOCALE)
FIRST_LETTER_PATTERN = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE)
FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE)
def sortkey(value):
@ -31,7 +30,7 @@ def sortkey(value):
the sort with latex escape sequences.
"""
return locale.strxfrm(
unidecode(simpleparse(value).replace(' ', 'A')).lower()
encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower()
)
@ -41,9 +40,12 @@ def process_sxd(filename):
Return an Index object.
"""
data = []
with codecs.open(filename, 'r', 'iso-8859-1') as index_file:
try:
index_file = encoding.open_read(filename, 'r')
for line in index_file:
data.append(line.strip().encode('utf-8'))
data.append(line.strip())
finally:
index_file.close()
i = 1
idx = Index(data[0])
@ -82,7 +84,7 @@ class Index(object):
def get_first_letter(key):
"""Return the uppercase first letter of key."""
letter = FIRST_LETTER_PATTERN.match(key).group(1)
if re.match(r'\d', letter):
if re.match(ur'\d', letter):
letter = '0-9'
return letter.upper()
@ -98,9 +100,9 @@ class Index(object):
if 'prefix' in self.keywords:
for prefix in self.keywords['prefix']:
self.prefix_patterns.append(re.compile(
r"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
re.LOCALE
))
ur"^({prefix})(\b|\\)(\s*.*)$".format(prefix=prefix),
re.LOCALE
))
if self.indextype == "AUTHOR":
self.authwords = authors.compile_authwords(self.keywords)
@ -126,10 +128,10 @@ class Index(object):
if self.indextype == "TITLE":
# Removing prefixes before titles
for pattern in self.prefix_patterns:
match = pattern.match(key.encode('utf-8'))
match = pattern.match(key)
if match:
self._raw_add(
r"\indextitle{{{}}}{{{}}}".format(
ur"\indextitle{{{}}}{{{}}}".format(
match.group(1).strip(),
(match.group(2) + match.group(3)).strip(),
),
@ -149,12 +151,10 @@ class Index(object):
@staticmethod
def ref_to_str(ref):
"""Return the LaTeX code corresponding to the reference."""
return r'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref)
def entry_to_str(self, key, entry):
"""Return the LaTeX code corresponding to the entry."""
if not isinstance(key, unicode):
key = unicode(key, "UTF-8")
return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format(
key,
ur'\\'.join([self.ref_to_str(ref) for ref in entry]),
@ -166,10 +166,10 @@ class Index(object):
Here, an index block is a letter, and all data beginning with this
letter.
"""
string = r'\begin{idxblock}{' + letter + '}' + EOL
string = ur'\begin{idxblock}{' + letter + '}' + EOL
for key in sorted(entries.keys(), key=sortkey):
string += self.entry_to_str(key, entries[key])
string += r'\end{idxblock}' + EOL
string += ur'\end{idxblock}' + EOL
return string
def entries_to_str(self):

6
patacrep/plastex.py

@ -6,11 +6,11 @@
from plasTeX.TeX import TeX
from plasTeX.Base.LaTeX import Sentences
import codecs
import locale
import os
import sys
from patacrep import encoding
def process_unbr_spaces(node):
#pylint: disable=line-too-long
@ -39,8 +39,6 @@ def simpleparse(text):
"""Parse a simple LaTeX string.
"""
tex = TeX()
if not isinstance(text, unicode):
text = text.decode("utf-8")
tex.input(text)
doc = tex.parse()
return process_unbr_spaces(doc.textContent)
@ -66,7 +64,7 @@ class SongParser(object):
def parse(cls, filename):
"""Parse a TeX file, and return its plasTeX representation."""
tex = cls.create_tex()
tex.input(codecs.open(filename, 'r', 'utf-8', 'replace'))
tex.input(encoding.open_read(filename, 'r'))
return tex.parse()

2
patacrep/plastex_chord.py

@ -74,7 +74,7 @@ class Chord(Command):
@property
def source(self):
"""Return chord LaTeX code."""
return r'\[{}]'.format(self.chord)
return ur'\[{}]'.format(self.chord)
class BeginChordOrDisplayMath(BeginDisplayMath):
r"""Wrapper to BeginDisplayMath

14
patacrep/plastex_songs.py

@ -6,6 +6,7 @@
import plasTeX
from patacrep import encoding
from patacrep.plastex import process_unbr_spaces
@ -28,8 +29,9 @@ def split_linebreak(texlist):
return_list.append(current)
current = []
else:
current.append(
process_unbr_spaces(token).textContent.encode('utf-8'))
current.append(encoding.basestring2unicode(
process_unbr_spaces(token).textContent
))
if current:
return_list.append(current)
return return_list
@ -49,15 +51,17 @@ class beginsong(plasTeX.Command): # pylint: disable=invalid-name,too-many-public
titles = []
for tokens in split_linebreak(self.attributes['titles'].allChildNodes):
titles.append("".join(tokens))
self.attributes['titles'] = titles
self.attributes['titles'] = encoding.list2unicode(titles)
# Parsing keyval arguments
args = {}
for (key, val) in self.attributes['args'].iteritems():
if isinstance(val, plasTeX.DOM.Element):
args[key] = process_unbr_spaces(val).textContent.encode('utf-8')
args[key] = encoding.basestring2unicode(
process_unbr_spaces(val).textContent
)
elif isinstance(val, basestring):
args[key] = val.encode('utf-8')
args[key] = encoding.basestring2unicode(val)
else:
args[key] = unicode(val)
self.attributes['args'] = args

5
patacrep/songs.py

@ -3,7 +3,6 @@
"""Song management."""
from unidecode import unidecode
import re
from patacrep.authors import processauthors
@ -19,7 +18,7 @@ class Song(object):
self.titles = data['titles']
self.unprefixed_titles = [
unprefixed_title(
unidecode(unicode(title, "utf-8")),
title,
config['titleprefixwords']
)
for title
@ -43,7 +42,7 @@ def unprefixed_title(title, prefixes):
"""Remove the first prefix of the list in the beginning of title (if any).
"""
for prefix in prefixes:
match = re.compile(r"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
match = re.compile(ur"^(%s)\b\s*(.*)$" % prefix, re.LOCALE).match(title)
if match:
return match.group(2)
return title

25
patacrep/templates.py

@ -6,23 +6,23 @@ from jinja2 import Environment, FileSystemLoader, ChoiceLoader, \
TemplateNotFound, nodes
from jinja2.ext import Extension
from jinja2.meta import find_referenced_templates as find_templates
import codecs
import os
import re
import json
from patacrep import encoding
from patacrep import errors
_LATEX_SUBS = (
(re.compile(r'\\'), r'\\textbackslash'),
(re.compile(r'([{}_#%&$])'), r'\\\1'),
(re.compile(r'~'), r'\~{}'),
(re.compile(r'\^'), r'\^{}'),
(re.compile(r'"'), r"''"),
(re.compile(r'\.\.\.+'), r'\\ldots'),
(re.compile(ur'\\'), ur'\\textbackslash'),
(re.compile(ur'([{}_#%&$])'), ur'\\\1'),
(re.compile(ur'~'), ur'\~{}'),
(re.compile(ur'\^'), ur'\^{}'),
(re.compile(ur'"'), ur"''"),
(re.compile(ur'\.\.\.+'), ur'\\ldots'),
)
_VARIABLE_REGEXP = re.compile(r"""
_VARIABLE_REGEXP = re.compile(ur"""
\(\*\ *variables\ *\*\) # Match (* variables *)
( # Match and capture the following:
(?: # Start of non-capturing group, used to match a single character
@ -178,11 +178,8 @@ class TexRenderer(object):
subvariables = {}
templatename = self.texenv.get_template(template).filename
with codecs.open(
templatename,
'r',
'utf-8'
) as template_file:
try:
template_file = encoding.open_read(templatename, 'r')
content = template_file.read()
subtemplates = list(find_templates(self.texenv.parse(content)))
match = re.findall(_VARIABLE_REGEXP, content)
@ -202,6 +199,8 @@ class TexRenderer(object):
jsonstring=var,
)
)
finally:
template_file.close()
return (subvariables, subtemplates)

7
songbook

@ -15,6 +15,7 @@ import sys
from patacrep.build import SongbookBuilder, DEFAULT_STEPS
from patacrep import __STR_VERSION__
from patacrep import errors
from patacrep import encoding
# Logging configuration
logging.basicConfig(level=logging.INFO)
@ -101,12 +102,14 @@ def main():
basename = os.path.basename(songbook_path)[:-3]
try:
with open(songbook_path) as songbook_file:
songbook = json.load(songbook_file)
songbook_file = encoding.open_read(songbook_path)
songbook = json.load(songbook_file)
except Exception as error: # pylint: disable=broad-except
LOGGER.error(error)
LOGGER.error("Error while loading file '{}'.".format(songbook_path))
sys.exit(1)
finally:
songbook_file.close()
# Gathering datadirs
datadirs = []

Loading…
Cancel
Save