mirror of https://github.com/patacrep/patacrep.git
Engine for LaTeX songbooks
http://www.patacrep.com
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
245 lines
7.2 KiB
245 lines
7.2 KiB
"""Authors string management."""
|
|
|
|
import logging
|
|
import re
|
|
|
|
LOGGER = logging.getLogger(__name__)

# Author words used for any key that the caller's configuration omits.
DEFAULT_AUTHWORDS = {
    "after": ["by"],
    "ignore": ["unknown"],
    "sep": ["and"],
}

# Regular expression templates; "{}" is replaced by one author word.
# RE_AFTER captures whatever follows the last occurrence of the word.
RE_AFTER = r"^.*\b{}\b(.*)$"
# RE_SEPARATOR captures the text before the last occurrence of the word
# (group 1) and, when present, the text after it (group 2).
RE_SEPARATOR = r"^(.*)\b *{} *(\b.*)?$"


def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

    These regexps will later be used to match these words in authors strings.

    Arguments:
    - authwords: dictionary which may contain the keys "after", "ignore" and
      "sep", each mapping to a list of words. Keys that are missing are
      filled from DEFAULT_AUTHWORDS.

    The dictionary is modified in place and also returned:
    - "after" becomes a list of regexps built from RE_AFTER;
    - "sep" becomes a list of regexps built from RE_SEPARATOR: one per word
      (each word prefixed with a space), followed by one for ',' and one
      for ';';
    - "ignore" is left as a list of plain strings.

    Words are inserted into the templates verbatim (they are not escaped),
    so regular expression metacharacters in a word keep their special
    meaning.
    """
    # Fill missing values. The default list is copied so that callers
    # altering the returned dictionary cannot alter DEFAULT_AUTHWORDS.
    for key, value in DEFAULT_AUTHWORDS.items():
        authwords.setdefault(key, list(value))

    # Compilation. No re.LOCALE flag: it is rejected for str patterns
    # (ValueError) since Python 3.6.
    authwords['after'] = [
        re.compile(RE_AFTER.format(word))
        for word in authwords['after']
    ]
    authwords['sep'] = [
        re.compile(RE_SEPARATOR.format(word))
        for word in [" %s" % word for word in authwords['sep']] + [',', ';']
    ]

    return authwords
|
|
|
|
|
|
def split_author_names(string):
    r"""Split an author string into a ``(last name, first names)`` tuple.

    The split happens on the last plain space of the stripped string:
    what follows it is the last name, what precedes it is the first
    name(s). LaTeX commands, braces and non-breaking spaces get no special
    treatment. When the string contains no space, the first name part is
    the empty string.

    >>> split_author_names("Edgar Allan Poe")
    ('Poe', 'Edgar Allan')
    >>> split_author_names(r"Edgar Allan \emph {Poe}")
    ('{Poe}', 'Edgar Allan \\emph')
    >>> split_author_names(r"The Rolling\ Stones")
    ('Stones', 'The Rolling\\')
    >>> split_author_names("The {Rolling Stones}")
    ('Stones}', 'The {Rolling')
    >>> split_author_names("The Rolling\xa0Stones")
    ('Rolling\xa0Stones', 'The')
    >>> split_author_names(" John Doe ")
    ('Doe', 'John')
    """
    first_names, _, last_name = string.strip().rpartition(" ")
    return (last_name.strip(), first_names.strip())
|
|
|
|
|
|
def split_sep_author(string, sep):
    """Split authors string according to separators.

    Arguments:
    - string: string containing authors names ;
    - sep: regexp matching a separator. It is expected to capture the text
      before the last separator as group 1 and the (optional) text after
      it as group 2, as patterns built from RE_SEPARATOR do.

    Return the list of stripped author names, in the order in which they
    appear in the string.

    >>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and")))
    ['Tintin', 'Milou']
    >>> split_sep_author("Tintin,", re.compile(RE_SEPARATOR.format(",")))
    ['Tintin']
    >>> split_sep_author(
    ...     "Tintin and Milou and Haddock",
    ...     re.compile(RE_SEPARATOR.format("and")),
    ...     )
    ['Tintin', 'Milou', 'Haddock']
    """
    # Each match peels off the text after the LAST separator, so names are
    # collected from right to left and put back in reading order at the end.
    reversed_authors = []
    match = sep.match(string)
    while match:
        if match.group(2) is not None:
            reversed_authors.append(match.group(2).strip())
        string = match.group(1)
        match = sep.match(string)
    # What remains contains no separator: it is the leftmost author.
    reversed_authors.append(string.strip())
    reversed_authors.reverse()
    return reversed_authors
|
|
|
|
################################################################################
|
|
### Process authors tools.
|
|
################################################################################
|
|
|
|
def processauthors_removeparen(authors_string):
    """Remove parentheses, together with the text they enclose

    See docstring of processauthors() for more information.

    Nested parentheses are handled. A closing parenthesis that has no
    matching opening one is kept; everything following an opening
    parenthesis that is never closed is dropped. Spaces surrounding a
    removed group are left untouched.

    >>> processauthors_removeparen("This (foo) string (bar) contains (baz) parenthesis")
    'This  string  contains  parenthesis'
    """
    kept = []
    depth = 0  # Number of parentheses currently open.
    for symbol in authors_string:
        if symbol == '(':
            depth += 1
            continue
        if symbol == ')' and depth > 0:
            depth -= 1
            continue
        if depth == 0:
            kept.append(symbol)
    return "".join(kept)
|
|
|
|
def processauthors_split_string(authors_string, sep):
    """Split the authors string on every separator, one after the other

    See docstring of processauthors() for more information.

    Arguments:
    - authors_string: string containing authors names ;
    - sep: iterable of regexps, each one matching a separator. The pieces
      produced by one separator are split again by the next one.

    >>> processauthors_split_string("Tintin and Milou", [re.compile(RE_SEPARATOR.format("and"))])
    ['Tintin', 'Milou']
    >>> processauthors_split_string("Tintin, Milou", [re.compile(RE_SEPARATOR.format(","))])
    ['Tintin', 'Milou']
    >>> processauthors_split_string(
    ...     "Tintin, and Milou",
    ...     [re.compile(RE_SEPARATOR.format(word)) for word in ['and', ',']]
    ...     )
    ['Tintin', 'Milou']
    """
    pieces = [authors_string]
    for separator in sep:
        pieces = [
            part
            for piece in pieces
            for part in split_sep_author(piece, separator)
        ]
    return pieces
|
|
|
|
def processauthors_remove_after(authors_list, after):
    """Remove, from each author, everything up to and including an "after" word

    See docstring of processauthors() for more information.

    Arguments:
    - authors_list: list of author strings ;
    - after: iterable of regexps whose first group captures the text to
      keep. For each author, only the first regexp that matches is
      applied; authors matched by none are returned unchanged.
    """
    def keep_tail(name):
        """Return group 1 of the first matching regexp, or the name itself."""
        for pattern in after:
            found = pattern.match(name)
            if found:
                return found.group(1)
        return name

    return [keep_tail(name) for name in authors_list]
|
|
|
|
def processauthors_ignore_authors(authors_list, ignore):
    """Drop the authors containing one of the ignored words

    See docstring of processauthors() for more information.

    Arguments:
    - authors_list: list of author strings ;
    - ignore: iterable of plain strings. An author is dropped as soon as
      one of them occurs anywhere inside it (case-sensitive substring
      test).
    """
    return [
        name
        for name in authors_list
        if not any(word in name for word in ignore)
    ]
|
|
|
|
def processauthors_clean_authors(authors_list):
    """Clean: strip leading whitespace, and drop authors left empty

    See docstring of processauthors() for more information.

    Only leading whitespace is removed here; trailing whitespace is kept.
    """
    cleaned = []
    for author in authors_list:
        trimmed = author.lstrip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
|
|
|
|
def processauthors(authors_string, after=None, ignore=None, sep=None):
    r"""Return an iterator of authors, as (last name, first names) tuples

    Arguments:
    - authors_string: string containing authors names ;
    - after: list of regexps matching the words after which the name
      starts ;
    - ignore: list of strings; authors containing one of them are dropped ;
    - sep: list of regexps matching separators.
    Each of the last three arguments defaults to an empty list.

    For example, in the following call:

    >>> set(processauthors(
    ...     (
    ...         "Lyrics by William Blake (from Milton, 1808), "
    ...         "music by Hubert Parry (1916), "
    ...         "and sung by The Royal~Choir~of~FooBar "
    ...         "(just here to show you how processing is done)"
    ...     ),
    ...     **compile_authwords({
    ...         'after': ["by"],
    ...         'ignore': ["anonymous"],
    ...         'sep': ["and", ","],
    ...         })
    ...     )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")}
    True

    The "authors_string" is processed as:

    1) First, parentheses (and their content) are removed.
    # "Lyrics by William Blake, music by Hubert Parry,
       and sung by The Royal~Choir~of~FooBar"

    2) The string is split on the separators of "sep".
    # ["Lyrics by William Blake", "music by Hubert Parry",
       "sung by The Royal~Choir~of~FooBar"]

    3) Everything before the words of "after" is removed.
    # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"]

    4) Strings containing words of "ignore" are dropped.
    # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"]

    5) Empty strings are dropped, then first and last names are split.
    # [
    #     ("Blake", "William"),
    #     ("Parry", "Hubert"),
    #     ("Royal~Choir~of~FooBar", "The"),
    # ]
    """
    sep = sep or []
    after = after or []
    ignore = ignore or []

    # Each stage feeds the next one, in the order listed in the docstring.
    without_paren = processauthors_removeparen(authors_string)
    candidates = processauthors_split_string(without_paren, sep)
    candidates = processauthors_remove_after(candidates, after)
    candidates = processauthors_ignore_authors(candidates, ignore)

    for author in processauthors_clean_authors(candidates):
        yield split_author_names(author)
|
|
|
|
def process_listauthors(authors_list, after=None, ignore=None, sep=None):
    """Process a list of authors strings, and return the resulting authors.

    Every string of "authors_list" goes through processauthors() with the
    same "after", "ignore" and "sep" arguments; the authors found are
    gathered into a single flat list, in the order of the input strings.
    """
    return [
        author
        for string in authors_list
        for author in processauthors(string, after, ignore, sep)
    ]
|
|
|