diff --git a/index.py b/index.py index 2c06e722..e6ee884b 100644 --- a/index.py +++ b/index.py @@ -9,13 +9,15 @@ # src is the .sxd file generated by latex # -from plasTeX.TeX import TeX from unidecode import unidecode import sys import re import locale import warnings +from tools import processauthors +from utils.plastex import simpleparse + # Pattern set to ignore latex command in title prefix keywordPattern = re.compile(r"^%(\w+)\s?(.*)$") firstLetterPattern = re.compile(r"^(?:\{?\\\w+\}?)*[^\w]*(\w)") @@ -26,9 +28,7 @@ def sortkey(value): don't forget to call locale.setlocale(locale.LC_ALL, '')). It also handles the sort with latex escape sequences. ''' - tex = TeX() - tex.input(value) - return locale.strxfrm(unidecode(tex.parse().textContent.replace(' ', 'A'))) + return locale.strxfrm(unidecode(simpleparse(value).replace(' ', 'A'))) def processSXDEntry(tab): return (tab[0], tab[1], tab[2]) @@ -40,9 +40,8 @@ def processSXD(filename): data.append(line.strip()) file.close() - type = data[0] i = 1 - idx = index() + idx = index(data[0]) if len(data) > 1: while data[i].startswith('%'): @@ -54,12 +53,21 @@ def processSXD(filename): for i in range(i,len(data),3): entry = processSXDEntry(data[i:i+3]) idx.add(entry[0],entry[1],entry[2]) + return idx class index: - def __init__(self): + def __init__(self, indextype): self.data = dict() self.keywords = dict() + if indextype == "TITLE INDEX DATA FILE": + self.indextype = "TITLE" + elif indextype == "SCRIPTURE INDEX DATA FILE": + self.indextype = "SCRIPTURE" + elif indextype == "AUTHOR INDEX DATA FILE": + self.indextype = "AUTHOR" + else: + self.indextype = "" def filter(self, key): letter = firstLetterPattern.match(key).group(1) @@ -74,16 +82,27 @@ class index: def compileKeywords(self): self.prefix_patterns = [] - if 'prefix' in self.keywords: - for prefix in self.keywords['prefix']: - self.prefix_patterns.append(re.compile(r"^(%s)(\b|\\)(\s*.*)$" % prefix)) - - def add(self, key, number, link): - for pattern in 
self.prefix_patterns: - match = pattern.match(key) - if match: - key = "%s (%s)" % (match.group(2) + match.group(3), match.group(1)) - break # Only one match per key + if self.indextype == "TITLE": + if 'prefix' in self.keywords: + for prefix in self.keywords['prefix']: + self.prefix_patterns.append(re.compile(r"^(%s)(\b|\\)(\s*.*)$" % prefix)) + + self.authwords = {"after": [], "ignore": [], "sep": []} + if self.indextype == "AUTHOR": + for key in self.keywords: + if key in self.authwords: + self.authwords[key] = self.keywords[key] + for word in self.authwords.keys(): + if word in self.keywords: + if word == "after": + self.authwords[word] = [re.compile(r"^.*%s\b(.*)" % after) for after in self.keywords[word]] + elif word == "sep": + self.authwords[word] = [" %s" % sep for sep in self.authwords[word]] + [","] + self.authwords[word] = [re.compile(r"^(.*)%s (.*)$" % sep) for sep in self.authwords[word] ] + else: + self.authwords[word] = self.keywords[word] + + def _raw_add(self, key, number, link): (first, key) = self.filter(key) if not self.data.has_key(first): self.data[first] = dict() @@ -91,6 +110,25 @@ class index: self.data[first][key] = [] self.data[first][key].append({'num':number, 'link':link}) + def add(self, key, number, link): + if self.indextype == "TITLE": + # Removing prefixes before titles + for pattern in self.prefix_patterns: + match = pattern.match(key) + if match: + self._raw_add( + "%s (%s)" % (match.group(2) + match.group(3), match.group(1)), + number, link) + return + self._raw_add(key, number, link) + + if self.indextype == "AUTHOR": + # Processing authors + for author in processauthors( + key, + **self.authwords): + self._raw_add(author, number, link) + def refToStr(self, ref): if sys.version_info >= (2,6): return '\\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref) @@ -99,9 +137,9 @@ class index: def entryToStr(self, key, entry): if sys.version_info >= (2,6): - return '\\idxentry{{{0}}}{{{1}}}\n'.format(key, '\\\\'.join(map(self.refToStr, 
entry))) + return unicode('\\idxentry{{{0}}}{{{1}}}\n').format(key, '\\\\'.join(map(self.refToStr, entry))) else: - return '\\idxentry{%s}{%s}\n' % (key, '\\\\'.join(map(self.refToStr, entry))) + return unicode('\\idxentry{%s}{%s}\n') % (key, '\\\\'.join(map(self.refToStr, entry))) def idxBlockToStr(self, letter, entries): str = '\\begin{idxblock}{'+letter+'}'+'\n' diff --git a/songbook.py b/songbook.py index 1ce076a4..f54ce992 100755 --- a/songbook.py +++ b/songbook.py @@ -11,7 +11,7 @@ import json import glob import re from subprocess import call -from tools import recursiveFind +from tools import recursiveFind, processauthors from index import * from unidecode import unidecode from utils.plastex import parsetex @@ -19,8 +19,10 @@ from utils.plastex import parsetex class Song: #: Ordre de tri sort = [] - #: Préfixes à ignorer pour le tri + #: Préfixes à ignorer pour le tri par titres prefixes = [] + #: Dictionnaire des options pour le traitement des auteurs + authwords = {"after": [], "ignore": [], "sep": []} def __init__(self, path, languages, titles, args): self.titles = titles @@ -28,6 +30,14 @@ class Song: self.args = args self.path = path self.languages = languages + if "by" in self.args.keys(): + self.normalized_authors = [ + locale.strxfrm(author) + for author + in processauthors(self.args["by"], **self.authwords) + ] + else: + self.normalized_authors = [] def __repr__(self): return repr((self.titles, self.args, self.path)) @@ -40,8 +50,11 @@ class Song: self_key = self.normalized_titles other_key = other.normalized_titles elif key == "@path": - self.key = locale.strxfrm(self.path) + self_key = locale.strxfrm(self.path) other_key = locale.strxfrm(other.path) + elif key == "by": + self_key = self.normalized_authors + other_key = other.normalized_authors else: self_key = locale.strxfrm(self.args.get(key, "")) other_key = locale.strxfrm(other.args.get(key, "")) @@ -165,9 +178,13 @@ def makeTexFile(sb, library, output): # default value template = 
"patacrep.tmpl" songs = [] - titleprefixwords = "" + + prefixes_tex = "" prefixes = [] + authwords_tex = "" + authwords = {"after": ["by"], "ignore": ["unknown"], "sep": ["and"]} + # parse the songbook data if "template" in sb: template = sb["template"] @@ -178,8 +195,28 @@ def makeTexFile(sb, library, output): if "titleprefixwords" in sb: prefixes = sb["titleprefixwords"] for prefix in sb["titleprefixwords"]: - titleprefixwords += "\\titleprefixword{%s}\n" % prefix - sb["titleprefixwords"] = titleprefixwords + prefixes_tex += "\\titleprefixword{%s}\n" % prefix + sb["titleprefixwords"] = prefixes_tex + if "authwords" in sb: + # Populating default value + for key in ["after", "sep", "ignore"]: + if key not in sb["authwords"]: + sb["authwords"][key] = authwords[key] + # Processing authwords values + authwords = sb["authwords"] + for key in ["after", "sep", "ignore"]: + for word in authwords[key]: + if key == "after": + authwords_tex += "\\auth%sword{%s}\n" % ("by", word) + else: + authwords_tex += "\\auth%sword{%s}\n" % (key, word) + sb["authwords"] = authwords_tex + if "after" in authwords: + authwords["after"] = [re.compile(r"^.*%s\b(.*)" % after) for after in authwords["after"]] + if "sep" in authwords: + authwords["sep"] = [" %s" % sep for sep in authwords["sep"]] + [","] + authwords["sep"] = [re.compile(r"^(.*)%s (.*)$" % sep) for sep in authwords["sep"] ] + if "lang" not in sb: sb["lang"] = "french" if "sort" in sb: @@ -189,6 +226,7 @@ def makeTexFile(sb, library, output): sort = [u"by", u"album", u"@title"] Song.sort = sort Song.prefixes = prefixes + Song.authwords = authwords parameters = parseTemplate("templates/"+template) @@ -327,7 +365,7 @@ def main(): print "processing " + sxdFile idx = processSXD(sxdFile) indexFile = open(sxdFile[:-3]+"sbx", "w") - indexFile.write(idx.entriesToStr()) + indexFile.write(idx.entriesToStr().encode('utf8')) indexFile.close() # Second pdflatex pass diff --git a/templates/minimal.tmpl b/templates/minimal.tmpl index 
c6157c5a..1af407df 100644 --- a/templates/minimal.tmpl +++ b/templates/minimal.tmpl @@ -31,6 +31,7 @@ %%: {"name":"bookoptions", "description":"Options", "type":"flag", "values":["diagram","importantdiagramonly","lilypond","pictures","tabs","repeatchords","onesongperpage"], "join":",", "mandatory":true, "default":["pictures"]}, %%: {"name":"mainfontsize", "description":"Font Size", "type":"font", "default":"10"}, %%: {"name":"titleprefixwords", "description":"Ignore some words in the beginning of song titles"}, +%%: {"name":"authwords", "description":"Set of options to process author string (LaTeX commands authsepword, authignoreword, authbyword)"}, %%: {"name":"languages", "description":"List of languages used by songs", "default":""} %%:] %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -54,6 +55,7 @@ } \gettitleprefixwords +\getauthwords \nosongnumbers \pagestyle{empty} diff --git a/templates/patacrep.tmpl b/templates/patacrep.tmpl index cc493b3f..0fc40565 100644 --- a/templates/patacrep.tmpl +++ b/templates/patacrep.tmpl @@ -44,6 +44,7 @@ %%: {"name":"notebgcolor", "description":"Note Shade", "type":"color", "default":"#D1E4AE"}, %%: {"name":"indexbgcolor", "description":"Index Shade", "type":"color", "default":"#D1E4AE"}, %%: {"name":"titleprefixwords", "description":"Ignore some words in the beginning of song titles"}, +%%: {"name":"authwords", "description":"Set of options to process author string (LaTeX commands authsepword, authignoreword, authbyword)"}, %%: {"name":"languages", "description":"List of languages used by songs", "default":""} %%:] %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -90,6 +91,7 @@ \renewcommand{\idxbgcolor}{IndexBgColor} \gettitleprefixwords +\getauthwords \pagestyle{empty} diff --git a/tools.py b/tools.py index 576c66b2..56cb280d 100644 --- a/tools.py +++ b/tools.py @@ -11,3 +11,132 @@ def recursiveFind(root_directory, pattern): for filename in 
fnmatch.filter(filenames, pattern): matches.append(os.path.join(root, filename)) return matches + +def split_author_names(string): + """Split author between first and last name. + + The last space separates first and last name, but spaces following a + backslash or a command are not separators. + Examples: + - Edgar Allan Poe => Poe, Edgar Allan + - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan + - The Rolling\ Stones => Rolling\ Stones, The + - The {Rolling Stones} => {Rolling Stones}, The + """ + ignore_space = False + last_space = index = 0 + brace_count = 0 + for char in string: + index += 1 + if brace_count == 0: + if char == "\\": + ignore_space = True + elif not char.isalnum() and ignore_space: + ignore_space = False + elif char == " ": + last_space = index + if char == "}": + brace_count += 1 + if char == "{": + brace_count -= 1 + return string[:last_space], string[last_space:] + +def split_sep_author(string, sep): + authors = [] + match = sep.match(string) + while match: + authors.append(match.group(2)) + string = match.group(1) + match = sep.match(string) + authors.append(string) + return authors + +def processauthors(authors_string, after = [], ignore = [], sep = []): + """Return a list of authors + + For example, we are processing: + # processauthors( + # "Lyrics by William Blake (from Milton, 1808), music by Hubert Parry (1916), and sung by The Royal\ Choir~of~Nowhere (just here to show you how processing is done)", + # after = ["by"], + # ignore = ["anonymous"], + # sep = ["and"] + # ) + + The "authors_string" string is processed as: + + 1) First, parenthesis (and its content) are removed. + # "Lyrics by William Blake, music by Hubert Parry, and sung by The Royal\ Choir~of~Nowhere" + + 2) String is split, separators being comma and words from "sep". + # ["Lyrics by William Blake", "music by Hubert Parry", "sung by The Royal\ Choir~of~Nowhere"] + + 3) Everything before words in "after" is removed. 
+ # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"] + + 4) Strings containing words of "ignore" are dropped. + # ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"] + + 5) First names are moved after last names + # ["Blake, William", "Parry, Hubert", Royal\ Choir~of~Nowhere, The"] + """ + + # Removing parentheses + opening = 0 + dest = "" + for char in authors_string: + if char == '(': + opening += 1 + elif char == ')' and opening > 0: + opening -= 1 + elif opening == 0: + dest += char + authors_string = dest + + # Splitting strings + authors_list = [authors_string] + for sepword in sep: + dest = [] + for author in authors_list: + dest.extend(split_sep_author(author, sepword)) + authors_list = dest + + # Removing stuff before "after" + dest = [] + for author in authors_list: + for afterword in after: + match = afterword.match(author) + if match: + author = match.group(1) + break + dest.append(author) + authors_list = dest + + # Ignoring ignored authors + dest = [] + for author in authors_list: + ignored = False + for ignoreword in ignore: + if author.find(str(ignoreword)) != -1: + ignored = True + break + if not ignored: + dest.append(author) + authors_list = dest + + # Cleaning: removing empty authors and unnecessary spaces + authors_list = [author.lstrip() for author in authors_list if author.lstrip()] + + # Moving first names after last names + dest = [] + for author in authors_list: + first, last = split_author_names(author) + if first: + dest.append("%(last)s, %(first)s" % { + 'first': first.lstrip(), + 'last': last.lstrip(), + }) + else: + dest.append(last.lstrip()) + authors_list = dest + + return authors_list diff --git a/utils/plastex.py b/utils/plastex.py index b316111d..664aa7a1 100755 --- a/utils/plastex.py +++ b/utils/plastex.py @@ -7,6 +7,14 @@ import copy import os import sys +def simpleparse(text): + """Parse a simple LaTeX string. 
+ """ + tex = TeX() + tex.input(text) + doc = tex.parse() + return doc.textContent + class SongParser: """Analyseur syntaxique de fichiers .sg"""