mirror of https://github.com/patacrep/patacrep.git
Engine for LaTeX songbooks
http://www.patacrep.com
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
236 lines
6.6 KiB
236 lines
6.6 KiB
"""Authors string management."""
|
|
|
|
import logging
|
|
import re
|
|
|
|
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
|
|
|
|
# Default authwords, used for every key the caller does not provide:
# - "after": words after which the actual author name starts;
# - "ignore": words marking an author string that must be dropped;
# - "sep": words separating several authors (besides "," and ";").
DEFAULT_AUTHWORDS = {
    "after": ["by"],
    "ignore": ["unknown"],
    "sep": ["and"],
}


def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

    These regexps will later be used to match these words in authors
    strings.

    Arguments:
    - authwords: dictionary which may contain the keys "after", "ignore"
      and "sep", each mapped to a list of words. Missing keys are filled
      with (a copy of) the corresponding DEFAULT_AUTHWORDS value.

    The dictionary is modified in place and returned:
    - "after" becomes a list of regexps whose group 1 is the text
      following the word;
    - "sep" becomes a list of regexps whose groups 1 and 2 are the texts
      before and after the separator; "," and ";" are always added;
    - "ignore" is left as a list of plain strings.

    NOTE: words are interpolated into the patterns without escaping, so
    regexp metacharacters in a word keep their special meaning.
    """
    # Fill missing values. The default list is copied so that later changes
    # to the caller's dictionary cannot leak into DEFAULT_AUTHWORDS.
    for key, value in DEFAULT_AUTHWORDS.items():
        if key not in authwords:
            authwords[key] = list(value)

    # Compilation. No re.LOCALE flag here: patterns are `str`, and Python
    # (3.6+) raises ValueError when this flag is used with a str pattern.
    authwords['after'] = [
        re.compile(r"^.*\b%s\b(.*)$" % word)
        for word in authwords['after']
    ]
    # Word separators must be preceded by a space; "," and ";" need not be.
    separators = [" %s" % word for word in authwords['sep']] + [',', ';']
    authwords['sep'] = [
        re.compile(r"^(.*)%s +(.*)$" % separator)
        for separator in separators
    ]

    return authwords
|
|
|
|
|
|
def split_author_names(string):
    r"""Split author between first and last name.

    The last space separates first and last name, but spaces following a
    backslash or a command, and spaces enclosed in braces, are not
    separators. Leading and trailing whitespace of `string` is ignored.

    Return a tuple `(last, first)`. The `first` part keeps the separating
    space at its end, and is empty if no separator was found.

    Examples (displayed as "last, first"):
    - Edgar Allan Poe => Poe, Edgar Allan
    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan
    - The Rolling\ Stones => Rolling\ Stones, The
    - The {Rolling Stones} => {Rolling Stones}, The
    """
    # Strip once, and use the stripped string both for scanning and for
    # slicing, so that indexes always refer to the string being sliced.
    string = string.strip()

    in_command = False  # True right after a backslash or inside a command name
    last_space = 0  # Index following the last separating space
    brace_depth = 0
    for index, char in enumerate(string, start=1):
        if brace_depth == 0:
            if char == "\\":
                in_command = True
            elif in_command and not char.isalnum():
                # First non alphanumeric character after a backslash: it ends
                # the command, and is not a separator even if it is a space.
                in_command = False
            elif char == " ":
                last_space = index
        if char == "{":
            brace_depth += 1
        elif char == "}":
            brace_depth -= 1
    return string[last_space:], string[:last_space]
|
|
|
|
|
|
def split_sep_author(string, sep):
    """Split authors string according to separators.

    Arguments:
    - string: string containing authors names ;
    - sep: regexp matching a separator. Its group 1 must be the text before
      the separator, and its group 2 the text after it.

    Return the list of authors, in the order they appear in `string`.

    >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$'))
    ['Tintin', 'Milou']
    """
    authors = []
    match = sep.match(string)
    while match:
        # The regexp is expected to split at the last separator, so authors
        # are collected from the last one to the first one.
        authors.append(match.group(2))
        string = match.group(1)
        match = sep.match(string)
    authors.append(string)
    # Restore the order of the original string.
    authors.reverse()
    return authors
|
|
|
|
################################################################################
### Process authors tools.
################################################################################
|
|
|
|
def processauthors_removeparen(authors_string):
    """Remove parentheses (and their content)

    Nested parentheses are supported. A closing parenthesis which does not
    match any opening one is kept; an opening parenthesis which is never
    closed removes everything up to the end of the string.

    See docstring of processauthors() for more information.
    """
    depth = 0  # Number of currently opened parentheses
    kept = []
    for char in authors_string:
        if char == '(':
            depth += 1
        elif char == ')' and depth > 0:
            depth -= 1
        elif depth == 0:
            kept.append(char)
    return "".join(kept)
|
|
|
|
def processauthors_split_string(authors_string, sep):
    """Split strings

    Arguments:
    - authors_string: string containing authors names ;
    - sep: list of regexps matching separators, as expected by
      split_sep_author(). They are applied one after the other, each one
      on every piece produced by the previous ones.

    Return the list of pieces.

    See docstring of processauthors() for more information.
    """
    authors_list = [authors_string]
    for sepword in sep:
        authors_list = [
            piece
            for author in authors_list
            for piece in split_sep_author(author, sepword)
        ]
    return authors_list
|
|
|
|
def processauthors_remove_after(authors_list, after):
    """Remove stuff before "after"

    Arguments:
    - authors_list: list of authors strings ;
    - after: list of compiled regexps. Each author is replaced by group 1
      of the first regexp matching it; authors matched by no regexp are
      kept unchanged.

    See docstring of processauthors() for more information.
    """
    dest = []
    for author in authors_list:
        for afterword in after:
            match = afterword.match(author)
            if match:
                dest.append(match.group(1))
                break
        else:
            # No regexp matched: keep the author as it is.
            dest.append(author)
    return dest
|
|
|
|
def processauthors_ignore_authors(authors_list, ignore):
    """Ignore ignored authors

    Arguments:
    - authors_list: list of authors strings ;
    - ignore: list of strings. Any author containing one of them (as a
      substring) is dropped.

    See docstring of processauthors() for more information.
    """
    return [
        author
        for author in authors_list
        if not any(ignoreword in author for ignoreword in ignore)
    ]
|
|
|
|
def processauthors_clean_authors(authors_list):
    """Clean: remove empty authors and unnecessary leading spaces

    Leading whitespace of each author is removed, and authors which are
    empty afterwards are dropped. Trailing whitespace is left untouched.

    See docstring of processauthors() for more information.
    """
    stripped = (author.lstrip() for author in authors_list)
    return [author for author in stripped if author]
|
|
|
|
def processauthors(authors_string, after=None, ignore=None, sep=None):
    r"""Return a list of authors

    Arguments:
    - authors_string: string containing the authors ;
    - after: list of compiled regexps, whose group 1 is the text following
      an "after" word ;
    - ignore: list of strings marking authors to be dropped ;
    - sep: list of compiled regexps matching separators (see
      split_sep_author()).
    A missing (or empty) `after`, `ignore` or `sep` disables the
    corresponding step.

    For example, we are processing:
    # processauthors(
    #   "
    #       Lyrics by William Blake (from Milton, 1808),
    #       music by Hubert Parry (1916),
    #       and sung by The Royal\ Choir~of~Nowhere
    #       (just here to show you how processing is done)
    #   ",
    #   after = [re.compile(r"^.*\bby\b(.*)$")],
    #   ignore = ["anonymous"],
    #   sep = [re.compile('^(.*) and +(.*)$'), re.compile('^(.*), +(.*)$')],
    #   )

    The "authors_string" is processed as:

    1) First, parentheses (and their content) are removed.
    # "Lyrics by William Blake, music by Hubert Parry,
       and sung by The Royal\ Choir~of~Nowhere"

    2) String is split, according to the regexps of "sep".
    # ["Lyrics by William Blake", "music by Hubert Parry",
       "sung by The Royal\ Choir~of~Nowhere"]

    3) Everything before words in "after" is removed.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    4) Strings containing words of "ignore" are dropped.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    5) Empty strings are dropped, leading spaces are removed, and first and
       last names are split into (last name, first name) tuples (see
       split_author_names()).
    # [
    #   ("Blake", "William"),
    #   ("Parry", "Hubert"),
    #   ("Royal\ Choir~of~Nowhere", "The"),
    # ]

    Whitespace is only partially normalised in this example: the actual
    strings may keep some surrounding spaces.
    """
    sep = sep or []
    after = after or []
    ignore = ignore or []

    # Each step consumes the result of the previous one.
    authors = processauthors_removeparen(authors_string)
    authors = processauthors_split_string(authors, sep)
    authors = processauthors_remove_after(authors, after)
    authors = processauthors_ignore_authors(authors, ignore)
    authors = processauthors_clean_authors(authors)
    return [split_author_names(author) for author in authors]
|
|
|
|
def process_listauthors(authors_list, after=None, ignore=None, sep=None):
    """Process a list of authors, and return the list of resulting authors.

    Each string of `authors_list` is processed by processauthors(), with the
    same `after`, `ignore` and `sep` arguments, and the results are
    concatenated into a single flat list.
    """
    return [
        author
        for string in authors_list
        for author in processauthors(string, after, ignore, sep)
    ]
|
|
|