patacrep/patacrep/authors.py


								"""Authors string management."""


								import logging

								import re


								LOGGER = logging.getLogger(__name__)


								DEFAULT_AUTHWORDS = {

								        "after": ["by"],

								        "ignore": ["unknown"],

								        "sep": ["and"],

								        }


								def compile_authwords(authwords):

								    """Convert strings of authwords to compiled regular expressions.


								    This regexp will later be used to match these words in authors strings.

								    """

								    # Fill missing values

								    for (key, value) in DEFAULT_AUTHWORDS.items():

								        if key not in authwords:

								            authwords[key] = value


								    # Compilation

								    authwords['after'] = [

								            re.compile(r"^.*\b%s\b(.*)$" % word, re.LOCALE)

								            for word in authwords['after']

								            ]

								    authwords['sep'] = [

								            re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)

								            for word in ([" %s" % word for word in authwords['sep']] + [','])

								            ]


								    return authwords


								def split_author_names(string):

								    r"""Split author between first and last name.


								    The last space separates first and last name, but spaces following a

								    backslash or a command are not separators.

								    Examples:

								    - Edgar Allan Poe => Poe, Edgar Allan

								    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan

								    - The Rolling\ Stones => Rolling\ Stones, The

								    - The {Rolling Stones} => {Rolling Stones}, The

								    """

								    ignore_space = False

								    last_space = index = 0

								    brace_count = 0

								    for char in string.strip():

								        index += 1

								        if brace_count == 0:

								            if char == "\\":

								                ignore_space = True

								            elif not char.isalnum() and ignore_space:

								                ignore_space = False

								            elif char == " ":

								                last_space = index

								        if char == "}":

								            brace_count += 1

								        if char == "{":

								            brace_count -= 1

								    return string[last_space:], string[:last_space]


								def split_sep_author(string, sep):

								    """Split authors string according to separators.


								    Arguments:

								    - string: string containing authors names ;

								    - sep: regexp matching a separator.


								    >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$'))

								    ["Tintin", "Milou"]

								    """

								    authors = []

								    match = sep.match(string)

								    while match:

								        authors.append(match.group(2))

								        string = match.group(1)

								        match = sep.match(string)

								    authors.append(string)

								    return authors


								################################################################################

								### Process authors tools.

								################################################################################


								def processauthors_removeparen(authors_string):

								    """Remove parentheses


								    See docstring of processauthors() for more information.

								    """

								    opening = 0

								    dest = ""

								    for char in authors_string:

								        if char == '(':

								            opening += 1

								        elif char == ')' and opening > 0:

								            opening -= 1

								        elif opening == 0:

								            dest += char

								    return dest


								def processauthors_split_string(authors_string, sep):

								    """Split strings


								    See docstring of processauthors() for more information.

								    """

								    authors_list = [authors_string]

								    for sepword in sep:

								        dest = []

								        for author in authors_list:

								            dest.extend(split_sep_author(author, sepword))

								        authors_list = dest

								    return authors_list


								def processauthors_remove_after(authors_list, after):

								    """Remove stuff before "after"


								    See docstring of processauthors() for more information.

								    """

								    dest = []

								    for author in authors_list:

								        for afterword in after:

								            match = afterword.match(author)

								            if match:

								                author = match.group(1)

								                break

								        dest.append(author)

								    return dest


								def processauthors_ignore_authors(authors_list, ignore):

								    """Ignore ignored authors


								    See docstring of processauthors() for more information.

								    """

								    dest = []

								    for author in authors_list:

								        ignored = False

								        for ignoreword in ignore:

								            if author.find(ignoreword) != -1:

								                ignored = True

								                break

								        if not ignored:

								            dest.append(author)

								    return dest


								def processauthors_clean_authors(authors_list):

								    """Clean: remove empty authors and unnecessary spaces


								    See docstring of processauthors() for more information.

								    """

								    return [

								            author.lstrip()

								            for author

								            in authors_list

								            if author.lstrip()

								            ]


								def processauthors(authors_string, after=None, ignore=None, sep=None):

								    r"""Return a list of authors


								    For example, we are processing:

								    # processauthors(

								    #   "Lyrics by William Blake (from Milton, 1808),

								                    music by Hubert Parry (1916),

								                    and sung by The Royal\ Choir~of~Nowhere

								                    (just here to show you how processing is done)",

								    #   after = ["by"],

								    #   ignore = ["anonymous"],

								    #   sep = [re.compile('^(.*) and (.*)$')],

								    #   )


								    The "authors_string" string is processed as:


								    1) First, parenthesis (and its content) are removed.

								    # "Lyrics by William Blake, music by Hubert Parry,

								                and sung by The Royal\ Choir~of~Nowhere"


								    2) String is split, separators being comma and words from "sep".

								    # ["Lyrics by William Blake", "music by Hubert Parry",

								                "sung by The Royal\ Choir~of~Nowhere"]


								    3) Everything before words in "after" is removed.

								    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]


								    4) Strings containing words of "ignore" are dropped.

								    # ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"]


								    5) First and last names are splitted

								    # [

								    #   ("Blake", "William"),

								    #   ("Parry", "Hubert"),

								    #   ("Royal\ Choir~of~Nowhere", "The"),

								    # ]

								    """


								    if not sep:

								        sep = []

								    if not after:

								        after = []

								    if not ignore:

								        ignore = []


								    return [

								            split_author_names(author)

								            for author

								            in processauthors_clean_authors(

								                processauthors_ignore_authors(

								                    processauthors_remove_after(

								                        processauthors_split_string(

								                            processauthors_removeparen(

								                                authors_string

								                                ),

								                            sep),

								                        after),

								                    ignore)

								                )

								            ]