#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Authors string management."""

import re

DEFAULT_AUTHWORDS = {
    "after": ["by"],
    "ignore": ["unknown"],
    "sep": ["and"],
}


def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

    These regexps are later used to match the words in authors strings.

    Arguments:
    - authwords: dictionary which may contain the keys "after", "ignore" and
      "sep", each mapping to a list of words. Missing keys are filled in from
      DEFAULT_AUTHWORDS.

    The dictionary is modified in place and also returned:
    - "after" becomes a list of regexps whose group 1 is the text following
      the word;
    - "sep" becomes a list of regexps (one per word, plus one for the comma)
      whose groups 1 and 2 are the text before and after the separator;
    - "ignore" is left as a list of plain strings.

    Words are inserted verbatim into the patterns (they are not escaped), so
    they may themselves contain regular expression syntax.
    """
    # Fill missing values. The default lists are copied so that the module
    # level defaults are never shared with (and modified through) the caller's
    # dictionary.
    for key, value in DEFAULT_AUTHWORDS.items():
        if key not in authwords:
            authwords[key] = list(value)

    # Compilation
    # The lookbehind keeps a word such as "by" from matching at the end of a
    # longer word (e.g. "Ruby").
    authwords['after'] = [
        re.compile(r"^.*(?<!\w)%s\b(.*)" % word)
        for word in authwords['after']
    ]
    separators = [" %s" % word for word in authwords['sep']] + [","]
    authwords['sep'] = [
        re.compile(r"^(.*)%s (.*)$" % separator)
        for separator in separators
    ]

    return authwords


def split_author_names(string):
    r"""Split author between first and last name.

    The last space separates first and last name, but spaces following a
    backslash or a command, or located inside braces, are not separators.

    Return a tuple (first, last). The separating space is kept at the end of
    "first"; "first" is the empty string when no separating space is found.

    Examples (shown as "last, first"):

    - Edgar Allan Poe => Poe, Edgar Allan
    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan
    - The Rolling\ Stones => Rolling\ Stones, The
    - The {Rolling Stones} => {Rolling Stones}, The
    """
    ignore_space = False
    last_space = 0
    brace_depth = 0
    for index, char in enumerate(string, start=1):
        if brace_depth == 0:
            if char == "\\":
                # Start of a command or of an escaped character: the next
                # non-alphanumeric character ends it and is not a separator.
                ignore_space = True
            elif ignore_space and not char.isalnum():
                ignore_space = False
            elif char == " ":
                last_space = index
        if char == "{":
            brace_depth += 1
        elif char == "}":
            brace_depth -= 1
    return string[:last_space], string[last_space:]


def split_sep_author(string, sep):
    """Split authors string according to separators.

    Arguments:
    - string: string containing authors names ;
    - sep: compiled regexp matching a separator. Its group 1 must be the text
      before the separator, and its group 2 the text after it.

    Return the list of authors, in the order they appear in the string.

    >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$'))
    ['Tintin', 'Milou']
    """
    authors = []
    match = sep.match(string)
    while match:
        # Group 1 is greedy, so each match peels off the *last* author.
        authors.append(match.group(2))
        string = match.group(1)
        match = sep.match(string)
    authors.append(string)
    # Authors were collected from last to first: restore reading order.
    authors.reverse()
    return authors


################################################################################
### Process authors tools.
################################################################################

def processauthors_removeparen(authors_string):
    """Remove parentheses

    Return "authors_string" without its parenthesised parts (nested
    parentheses included). A closing parenthesis with no matching opening one
    is kept as is.

    See docstring of processauthors() for more information.
    """
    depth = 0
    kept = []
    for char in authors_string:
        if char == '(':
            depth += 1
        elif char == ')' and depth > 0:
            depth -= 1
        elif depth == 0:
            kept.append(char)
    return "".join(kept)


def processauthors_split_string(authors_string, sep):
    """Split strings

    Arguments:
    - authors_string: string containing authors names ;
    - sep: list of compiled separator regexps, applied one after the other to
      every piece obtained so far.

    See docstring of processauthors() for more information.
    """
    authors_list = [authors_string]
    for sepword in sep:
        pieces = []
        for author in authors_list:
            pieces.extend(split_sep_author(author, sepword))
        authors_list = pieces
    return authors_list


def processauthors_remove_after(authors_list, after):
    """Remove stuff before "after"

    Arguments:
    - authors_list: list of authors strings ;
    - after: list of compiled regexps; when one matches an author, the author
      is replaced by group 1 of the first regexp that matches.

    See docstring of processauthors() for more information.
    """
    dest = []
    for author in authors_list:
        for afterword in after:
            match = afterword.match(author)
            if match:
                author = match.group(1)
                break
        dest.append(author)
    return dest


def processauthors_ignore_authors(authors_list, ignore):
    """Ignore ignored authors

    Drop every author whose string contains one of the words of "ignore".

    See docstring of processauthors() for more information.
    """
    return [
        author
        for author in authors_list
        if not any(str(ignoreword) in author for ignoreword in ignore)
    ]


def processauthors_clean_authors(authors_list):
    """Clean: remove empty authors and unnecessary spaces

    Leading and trailing spaces are removed; authors that are empty once
    stripped are dropped.

    See docstring of processauthors() for more information.
    """
    stripped_authors = (author.strip() for author in authors_list)
    return [author for author in stripped_authors if author]


def processauthors_invert_names(authors_list):
    """Move first names after last names

    Each author is split using split_author_names(), and rewritten as
    "last, first". Authors with no first name are left as the last name only.

    See docstring of processauthors() for more information.
    """
    dest = []
    for author in authors_list:
        first, last = split_author_names(author)
        # Strip before testing: "first" keeps the separating space, and a
        # first name made only of spaces must not produce "last, ".
        first = first.strip()
        last = last.strip()
        if first:
            dest.append("%(last)s, %(first)s" % {
                'first': first,
                'last': last,
            })
        else:
            dest.append(last)
    return dest


def processauthors(authors_string, after=None, ignore=None, sep=None):
    r"""Return a list of authors

    Arguments:
    - authors_string: string containing the authors ;
    - after: list of compiled regexps (group 1 being what is kept) ;
    - ignore: list of words; authors containing one of them are dropped ;
    - sep: list of compiled regexps matching separators (group 1 and 2 being
      the text before and after the separator).
    Each of "after", "ignore" and "sep" defaults to an empty list.

    For example (illustrative), we are processing:
    # processauthors(
    #   "Lyrics by William Blake (from Milton, 1808),
    #                   music by Hubert Parry (1916),
    #                   and sung by The Royal\ Choir~of~Nowhere
    #                   (just here to show you how processing is done)",
    #   after = [re.compile(r"^.*\bby\b(.*)")],
    #   ignore = ["anonymous"],
    #   sep = [re.compile('^(.*) and (.*)$'), re.compile('^(.*), (.*)$')],
    #   )

    The "authors_string" string is processed as:

    1) First, parenthesis (and its content) are removed.
    # "Lyrics by William Blake, music by Hubert Parry,
                and sung by The Royal\ Choir~of~Nowhere"

    2) String is split, separators being the regexps from "sep".
    # ["Lyrics by William Blake", "music by Hubert Parry",
                "sung by The Royal\ Choir~of~Nowhere"]

    3) Everything before words in "after" is removed.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    4) Strings containing words of "ignore" are dropped.
    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]

    5) Empty strings are dropped, and surrounding spaces are removed.

    6) First names are moved after last names
    # ["Blake, William", "Parry, Hubert", "Royal\ Choir~of~Nowhere, The"]
    """
    if not sep:
        sep = []
    if not after:
        after = []
    if not ignore:
        ignore = []

    authors = processauthors_removeparen(authors_string)
    authors = processauthors_split_string(authors, sep)
    authors = processauthors_remove_after(authors, after)
    authors = processauthors_ignore_authors(authors, ignore)
    authors = processauthors_clean_authors(authors)
    return processauthors_invert_names(authors)