mirror of https://github.com/patacrep/patacrep.git
Engine for LaTeX songbooks
http://www.patacrep.com
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
256 lines
7.5 KiB
256 lines
7.5 KiB
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""Authors string management."""
|
|
|
|
import logging
|
|
import re
|
|
|
|
LOGGER = logging.getLogger(__name__)

# Fallback author keywords: compile_authwords() copies an entry from here for
# every key that the caller's dictionary does not define.
# - "after": words after which the author name starts;
# - "ignore": words marking an author string that must be dropped;
# - "sep": words separating several authors.
DEFAULT_AUTHWORDS = dict(
    after=["by"],
    ignore=["unknown"],
    sep=["and"],
)
|
|
|
|
def to_utf8(string):
    """Convert a string (unicode text or iso-8859-1 bytes) to utf-8.

    - Text strings (``unicode`` on Python 2) are encoded to utf-8.
    - Byte strings are assumed to be iso-8859-1 and are re-encoded to utf-8.
    - Any other object is encoded through its own ``encode`` method; if that
      fails, a warning is logged and an empty string is returned.

    Subclasses of the text and byte string types are handled like their base
    type.
    """
    # `type(u"")` is the text type of the running interpreter, so this check
    # does not depend on the Python 2-only `unicode` builtin.
    if isinstance(string, type(u"")):
        return string.encode('utf-8')
    if isinstance(string, bytes):
        return string.decode('iso-8859-1').encode('utf-8')
    try:
        return string.encode('utf-8')
    except Exception:  # pylint: disable=broad-except
        # Best effort: an unusable word is skipped rather than aborting, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        LOGGER.warning("Ignoring a word I can not decode...")
        return ""
|
|
|
|
def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

    This regexp will later be used to match these words in authors strings.

    Argument:
    - authwords: dictionary which may define the keys "after", "ignore" and
      "sep", each one being a list of words. Missing keys are filled from
      DEFAULT_AUTHWORDS.

    The dictionary is modified in place and returned:
    - "after": list of regexps whose first group is the text following the
      word;
    - "sep": list of regexps (one per word, plus one for the comma) whose
      two groups are the text before and after the separator;
    - "ignore": list of words converted by to_utf8(); words converted to an
      empty string are dropped.

    Words are matched literally: they are escaped before being inserted in
    the regular expressions.
    """
    # Fill missing values
    for (key, value) in DEFAULT_AUTHWORDS.items():
        if key not in authwords:
            authwords[key] = value

    # Compilation
    # NOTE(review): re.LOCALE is only accepted for bytes patterns on
    # Python 3 -- confirm the target interpreter before porting.
    authwords['after'] = [
        re.compile(r"^.*\b%s\b(.*)$" % re.escape(word), re.LOCALE)
        for word in authwords['after']
    ]
    authwords['sep'] = [
        re.compile(r"^(.*)%s +(.*)$" % re.escape(word), re.LOCALE)
        for word in ([" %s" % word for word in authwords['sep']] + [','])
    ]
    # Convert each word only once, then drop the ones that could not be
    # converted (to_utf8() returns an empty string for those).
    converted = [to_utf8(word) for word in authwords['ignore']]
    authwords['ignore'] = [word for word in converted if word]

    return authwords
|
|
|
|
|
|
def split_author_names(string):
    r"""Split author between first and last name.

    The last space separates first and last name, but spaces following a
    backslash or a command, or located inside braces, are not separators.
    Examples:
    - Edgar Allan Poe => Poe, Edgar Allan
    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan
    - The Rolling\ Stones => Rolling\ Stones, The
    - The {Rolling Stones} => {Rolling Stones}, The

    Return a tuple (first, last) of two slices of `string`, cut right after
    the separating space. Neither part is stripped. If no separating space
    is found, `first` is empty and `last` is the whole string.
    """
    # Surrounding whitespace is skipped while scanning, but the result is
    # sliced from the original string: indexes must be shifted by the number
    # of leading characters that were skipped.
    offset = len(string) - len(string.lstrip())
    ignore_space = False
    last_space = 0
    brace_depth = 0
    for index, char in enumerate(string.strip(), offset + 1):
        if brace_depth == 0:
            if char == "\\":
                # Beginning of a command or of an escaped space.
                ignore_space = True
            elif not char.isalnum() and ignore_space:
                # First non alphanumeric character after a backslash: it
                # ends the command, and is not a separator.
                ignore_space = False
            elif char == " ":
                last_space = index
        if char == "{":
            brace_depth += 1
        elif char == "}":
            brace_depth -= 1
    return string[:last_space], string[last_space:]
|
|
|
|
|
|
def split_sep_author(string, sep):
    """Split authors string according to separators.

    Arguments:
    - string: string containing authors names ;
    - sep: regexp matching a separator. Its first group must be the text
      before the separator, and its second group the text after it.

    Return the list of authors, in the order they appear in `string`.

    >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$'))
    ['Tintin', 'Milou']
    """
    authors = []
    match = sep.match(string)
    while match:
        # Each round peels one author off the right end of the string, and
        # goes on with what is left before the separator.
        authors.append(match.group(2))
        string = match.group(1)
        match = sep.match(string)
    authors.append(string)
    # Authors were collected from right to left: restore reading order.
    authors.reverse()
    return authors
|
|
|
|
################################################################################
|
|
### Process authors tools.
|
|
################################################################################
|
|
|
|
def processauthors_removeparen(authors_string):
    """Remove parentheses

    Return `authors_string` without its parentheses and their content.
    Parentheses may be nested. A closing parenthesis with no matching
    opening one is kept; an opening parenthesis which is never closed
    discards everything following it.

    See docstring of processauthors() for more information.
    """
    depth = 0
    kept = []
    for char in authors_string:
        if char == '(':
            depth += 1
        elif char == ')' and depth > 0:
            depth -= 1
        elif depth == 0:
            kept.append(char)
    # Characters are joined once at the end, instead of growing a string one
    # character at a time.
    return "".join(kept)
|
|
|
|
def processauthors_split_string(authors_string, sep):
    """Split strings

    Apply each separator regexp of `sep` in turn, splitting every piece
    produced by the previous separators with split_sep_author(). Return the
    resulting list of strings.

    See docstring of processauthors() for more information.
    """
    pieces = [authors_string]
    for pattern in sep:
        pieces = [
            part
            for piece in pieces
            for part in split_sep_author(piece, pattern)
        ]
    return pieces
|
|
|
|
def processauthors_remove_after(authors_list, after):
    """Remove stuff before "after"

    For each author, the regexps of `after` are tried in order. The author
    is replaced by the first group of the first regexp which matches; an
    author matching no regexp is kept unchanged.

    See docstring of processauthors() for more information.
    """
    trimmed = []
    for name in authors_list:
        # Lazy evaluation: no regexp is tried once one of them has matched.
        attempts = (pattern.match(name) for pattern in after)
        found = next((hit for hit in attempts if hit), None)
        if found is None:
            trimmed.append(name)
        else:
            trimmed.append(found.group(1))
    return trimmed
|
|
|
|
def processauthors_ignore_authors(authors_list, ignore):
    """Ignore ignored authors

    Return the authors of `authors_list` which contain none of the words of
    `ignore` (each word is converted with str() and looked up as a
    substring).

    See docstring of processauthors() for more information.
    """
    return [
        name
        for name in authors_list
        if not any(name.find(str(word)) != -1 for word in ignore)
    ]
|
|
|
|
def processauthors_clean_authors(authors_list):
    """Clean: remove empty authors and leading spaces

    Leading whitespace is removed from each author, and authors which are
    empty afterwards are dropped. Trailing whitespace is left untouched.

    See docstring of processauthors() for more information.
    """
    cleaned = []
    for raw in authors_list:
        name = raw.lstrip()
        if name:
            cleaned.append(name)
    return cleaned
|
|
|
|
def processauthors_invert_names(authors_list):
    r"""Move first names after last names

    Each author is split with split_author_names(). When a first name is
    found, the author is replaced by the LaTeX command
    \indexauthor{first}{last} (both parts being stripped); otherwise, it is
    only stripped of its leading whitespace.

    See docstring of processauthors() for more information.
    """
    def render(name):
        """Return the index entry for a single author."""
        first, last = split_author_names(name)
        if not first:
            return last.lstrip()
        return r"\indexauthor{{{first}}}{{{last}}}".format(
            first=first.strip(),
            last=last.strip(),
        )

    return [render(name) for name in authors_list]
|
|
|
|
def processauthors(authors_string, after=None, ignore=None, sep=None):
    r"""Return a list of authors

    Arguments:
    - authors_string: string to process ;
    - after: list of compiled regexps whose first group is the text to keep
      (see processauthors_remove_after()) ;
    - ignore: list of words; authors containing one of them are dropped ;
    - sep: list of compiled regexps matching a separator (see
      split_sep_author()).
    An argument which is omitted (or empty) is replaced by an empty list.

    For example, suppose `after` matches the word "by", `ignore` contains
    "anonymous", `sep` matches the comma and the word "and", and we are
    processing:
    # "Lyrics by William Blake (from Milton, 1808),
       music by Hubert Parry (1916),
       and sung by The Royal\ Choir~of~Nowhere
       (just here to show you how processing is done)"

    The "authors_string" string is processed as follows (the order of the
    items in the lists below is only illustrative).

    1) First, parentheses (and their content) are removed.
    # "Lyrics by William Blake, music by Hubert Parry,
       and sung by The Royal\ Choir~of~Nowhere"

    2) String is split, using the regexps of "sep".
    # ["Lyrics by William Blake", "music by Hubert Parry",
       "sung by The Royal\ Choir~of~Nowhere"]

    3) Everything before words in "after" is removed.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    4) Strings containing words of "ignore" are dropped.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    5) Empty strings are dropped, and leading spaces are removed.

    6) First and last names are processed through LaTeX command \indexauthor
       (which will, by default, invert first and last names).
    # ["\indexauthor{William}{Blake}", "\indexauthor{Hubert}{Parry}",
       "\indexauthor{The}{Royal\ Choir~of~Nowhere}"]
    """
    sep = sep or []
    after = after or []
    ignore = ignore or []

    # Each step consumes the result of the previous one.
    authors = processauthors_removeparen(authors_string)
    authors = processauthors_split_string(authors, sep)
    authors = processauthors_remove_after(authors, after)
    authors = processauthors_ignore_authors(authors, ignore)
    authors = processauthors_clean_authors(authors)
    return processauthors_invert_names(authors)
|
|
|
|
|