|
@ -5,11 +5,8 @@ import re |
|
|
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__)

# Keys that every authwords dictionary is expected to provide.
AUTHWORDS_KEYS = ["after", "ignore", "separators"]

# Regexp templates; the `{}` placeholder is filled with a word by
# str.format() before compilation.
# RE_AFTER: group 1 is the text following the last occurrence of the word.
RE_AFTER = r"^.*\b{}\b(.*)$"
# RE_SEPARATOR: group 1 is the text before the last occurrence of the
# separator, group 2 (optional) is the text after it.
RE_SEPARATOR = r"^(.*)\b *{} *(\b.*)?$"


def compile_authwords(authwords):
    """Convert the words of `authwords` into compiled regular expressions.

    This regexp will later be used to match these words in authors strings.

    Arguments:
    - authwords: dictionary mapping some or all of the keys listed in
      AUTHWORDS_KEYS to lists of strings. Missing keys are set to an empty
      list.

    Returns the same dictionary, modified in place:
    - 'after' becomes a list of patterns built from RE_AFTER;
    - 'separators' becomes a list of patterns built from RE_SEPARATOR, one
      per word (each prefixed with a space), followed by the patterns for
      ',' and ';' which are always added;
    - 'ignore' is left as a list of plain strings.

    Words are inserted verbatim into the templates, so regular expression
    metacharacters they contain are interpreted as such.
    """
    # Fill missing values
    for key in AUTHWORDS_KEYS:
        if key not in authwords:
            authwords[key] = []

    # Compilation
    # No re.LOCALE flag here: it is rejected for str patterns since
    # Python 3.6 (ValueError), and str patterns already match with Unicode
    # rules for \b, which is what author names need.
    authwords['after'] = [
        re.compile(RE_AFTER.format(word))
        for word in authwords['after']
    ]
    separator_words = [" %s" % word for word in authwords['separators']]
    authwords['separators'] = [
        re.compile(RE_SEPARATOR.format(word))
        for word in separator_words + [',', ';']
    ]

    return authwords
|
@ -60,12 +57,12 @@ def split_author_names(string): |
|
|
return (chunks[-1].strip(), " ".join(chunks[:-1]).strip()) |
|
|
return (chunks[-1].strip(), " ".join(chunks[:-1]).strip()) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_sep_author(string, sep): |
|
|
def split_sep_author(string, separators): |
|
|
"""Split authors string according to separators. |
|
|
"""Split authors string according to separators. |
|
|
|
|
|
|
|
|
Arguments: |
|
|
Arguments: |
|
|
- string: string containing authors names ; |
|
|
- string: string containing authors names ; |
|
|
- sep: regexp matching a separator. |
|
|
- separators: regexp matching a separator. |
|
|
|
|
|
|
|
|
>>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and"))) |
|
|
>>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and"))) |
|
|
['Tintin', 'Milou'] |
|
|
['Tintin', 'Milou'] |
|
@ -73,12 +70,12 @@ def split_sep_author(string, sep): |
|
|
['Tintin'] |
|
|
['Tintin'] |
|
|
""" |
|
|
""" |
|
|
authors = [] |
|
|
authors = [] |
|
|
match = sep.match(string) |
|
|
match = separators.match(string) |
|
|
while match: |
|
|
while match: |
|
|
if match.group(2) is not None: |
|
|
if match.group(2) is not None: |
|
|
authors.append(match.group(2).strip()) |
|
|
authors.append(match.group(2).strip()) |
|
|
string = match.group(1) |
|
|
string = match.group(1) |
|
|
match = sep.match(string) |
|
|
match = separators.match(string) |
|
|
authors.insert(0, string.strip()) |
|
|
authors.insert(0, string.strip()) |
|
|
return authors |
|
|
return authors |
|
|
|
|
|
|
|
@ -105,7 +102,7 @@ def processauthors_removeparen(authors_string): |
|
|
dest += char |
|
|
dest += char |
|
|
return dest |
|
|
return dest |
|
|
|
|
|
|
|
|
def processauthors_split_string(authors_string, sep): |
|
|
def processauthors_split_string(authors_string, separators): |
|
|
"""Split strings |
|
|
"""Split strings |
|
|
|
|
|
|
|
|
See docstring of processauthors() for more information. |
|
|
See docstring of processauthors() for more information. |
|
@ -121,7 +118,7 @@ def processauthors_split_string(authors_string, sep): |
|
|
['Tintin', 'Milou'] |
|
|
['Tintin', 'Milou'] |
|
|
""" |
|
|
""" |
|
|
authors_list = [authors_string] |
|
|
authors_list = [authors_string] |
|
|
for sepword in sep: |
|
|
for sepword in separators: |
|
|
dest = [] |
|
|
dest = [] |
|
|
for author in authors_list: |
|
|
for author in authors_list: |
|
|
dest.extend(split_sep_author(author, sepword)) |
|
|
dest.extend(split_sep_author(author, sepword)) |
|
@ -171,7 +168,7 @@ def processauthors_clean_authors(authors_list): |
|
|
if author.lstrip() |
|
|
if author.lstrip() |
|
|
] |
|
|
] |
|
|
|
|
|
|
|
|
def processauthors(authors_string, after=None, ignore=None, sep=None): |
|
|
def processauthors(authors_string, after=None, ignore=None, separators=None): |
|
|
r"""Return an iterator of authors |
|
|
r"""Return an iterator of authors |
|
|
|
|
|
|
|
|
For example, in the following call: |
|
|
For example, in the following call: |
|
@ -186,7 +183,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): |
|
|
... **compile_authwords({ |
|
|
... **compile_authwords({ |
|
|
... 'after': ["by"], |
|
|
... 'after': ["by"], |
|
|
... 'ignore': ["anonymous"], |
|
|
... 'ignore': ["anonymous"], |
|
|
... 'sep': ["and", ","], |
|
|
... 'separators': ["and", ","], |
|
|
... }) |
|
|
... }) |
|
|
... )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")} |
|
|
... )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")} |
|
|
True |
|
|
True |
|
@ -198,7 +195,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): |
|
|
# "Lyrics by William Blake, music by Hubert Parry, |
|
|
# "Lyrics by William Blake, music by Hubert Parry, |
|
|
and sung by The Royal~Choir~of~FooBar" |
|
|
and sung by The Royal~Choir~of~FooBar" |
|
|
|
|
|
|
|
|
2) String is split, separators being comma and words from "sep". |
|
|
2) String is split, separators being comma and words from "separators". |
|
|
# ["Lyrics by William Blake", "music by Hubert Parry", |
|
|
# ["Lyrics by William Blake", "music by Hubert Parry", |
|
|
"sung by The Royal~Choir~of~FooBar"] |
|
|
"sung by The Royal~Choir~of~FooBar"] |
|
|
|
|
|
|
|
@ -216,8 +213,8 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): |
|
|
# ] |
|
|
# ] |
|
|
""" |
|
|
""" |
|
|
|
|
|
|
|
|
if not sep: |
|
|
if not separators: |
|
|
sep = [] |
|
|
separators = [] |
|
|
if not after: |
|
|
if not after: |
|
|
after = [] |
|
|
after = [] |
|
|
if not ignore: |
|
|
if not ignore: |
|
@ -230,17 +227,17 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): |
|
|
processauthors_removeparen( |
|
|
processauthors_removeparen( |
|
|
authors_string |
|
|
authors_string |
|
|
), |
|
|
), |
|
|
sep), |
|
|
separators), |
|
|
after), |
|
|
after), |
|
|
ignore) |
|
|
ignore) |
|
|
): |
|
|
): |
|
|
yield split_author_names(author) |
|
|
yield split_author_names(author) |
|
|
|
|
|
|
|
|
def process_listauthors(authors_list, after=None, ignore=None, sep=None): |
|
|
def process_listauthors(authors_list, after=None, ignore=None, separators=None): |
|
|
"""Process a list of authors, and return the list of resulting authors.""" |
|
|
"""Process a list of authors, and return the list of resulting authors.""" |
|
|
authors = [] |
|
|
authors = [] |
|
|
for sublist in [ |
|
|
for sublist in [ |
|
|
processauthors(string, after, ignore, sep) |
|
|
processauthors(string, after, ignore, separators) |
|
|
for string in authors_list |
|
|
for string in authors_list |
|
|
]: |
|
|
]: |
|
|
authors.extend(sublist) |
|
|
authors.extend(sublist) |
|
|