patacrep/songbook_core/authors.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Authors string management."""

import re

# Fallback author words, used by compile_authwords() for every key that the
# caller's dictionary does not provide.
DEFAULT_AUTHWORDS = {
    # Words after which the actual author name starts ("Lyrics by X").
    "after": ["by"],
    # Authors containing one of these words are dropped.
    "ignore": ["unknown"],
    # Words separating two authors (the comma is always added on top).
    "sep": ["and"],
}

def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.

    These regexps will later be used to match these words in authors strings.

    Arguments:
    - authwords: dictionary which may contain the keys "after", "ignore" and
      "sep", each mapped to a list of words. Missing keys are filled in from
      DEFAULT_AUTHWORDS.

    The dictionary is modified in place and also returned:
    - "after" becomes a list of regexps whose only group captures what
      follows the word;
    - "sep" becomes a list of regexps with two groups (text before and after
      the separator); a comma separator is always appended;
    - "ignore" is left as a list of plain words.
    """
    # Fill missing values. The default lists are copied so that the
    # module-level defaults are never shared with the caller's dictionary.
    for (key, value) in DEFAULT_AUTHWORDS.items():
        if key not in authwords:
            authwords[key] = list(value)

    # Compilation
    # The word must be delimited on both sides: without the leading \b,
    # "by" would also match the end of a name such as "Darby".
    authwords['after'] = [
            re.compile(r"^.*\b%s\b(.*)" % word)
            for word in authwords['after']
            ]
    # Separator words must be preceded by a space; the comma needs none.
    separators = [" %s" % sepword for sepword in authwords['sep']] + [',']
    authwords['sep'] = [
            re.compile(r"^(.*)%s (.*)$" % separator)
            for separator in separators
            ]

    return authwords


def split_author_names(string):
    r"""Cut an author string in two at its last "real" space.

    A space is a separator unless it sits inside braces or directly ends a
    backslash sequence (a backslash optionally followed by alphanumeric
    characters, i.e. an escaped space or a command name).

    Returns a tuple (first, last): "first" is everything up to and including
    the separating space, "last" is the remainder. When no separator is
    found, "first" is the empty string.

    Examples (shown as the caller typically recombines them):
    - Edgar Allan Poe => Poe, Edgar Allan
    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan
    - The Rolling\ Stones => Rolling\ Stones, The
    - The {Rolling Stones} => {Rolling Stones}, The
    """
    split_at = 0
    depth = 0
    in_command = False
    for position, char in enumerate(string, 1):
        # Only characters met outside of any brace group are examined; the
        # depth is updated afterwards, so the braces themselves count as
        # "outside" when opening and "inside" when closing.
        if depth == 0:
            if char == "\\":
                in_command = True
            elif in_command and not char.isalnum():
                # First non-alphanumeric character ends the command; if it
                # is a space, it is deliberately not taken as a separator.
                in_command = False
            elif char == " ":
                split_at = position
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
    return string[:split_at], string[split_at:]


def split_sep_author(string, sep):
    """Split authors string according to separators.

    Arguments:
    - string: string containing authors names ;
    - sep: regexp matching a separator, either as a compiled pattern or as
      a pattern string. It must define two groups: the text before the
      separator and the text after it.

    Returns the list of authors, in the order they appear in the string.

    >>> split_sep_author("Tintin and Milou", '^(.*) and (.*)$')
    ['Tintin', 'Milou']
    """
    # re.compile() returns an already compiled pattern unchanged, so both
    # kinds of argument are accepted.
    pattern = re.compile(sep)
    authors = []
    match = pattern.match(string)
    while match:
        # With a greedy first group, each match peels off the LAST author;
        # the remaining head is matched again.
        authors.append(match.group(2))
        string = match.group(1)
        match = pattern.match(string)
    authors.append(string)
    # Authors were collected from last to first: restore the source order.
    authors.reverse()
    return authors

################################################################################
### Process authors tools.
################################################################################

def processauthors_removeparen(authors_string):
    """Strip parenthesised text (parentheses included) from the string.

    Nested parentheses are supported. A closing parenthesis that has no
    matching opening one is kept as an ordinary character; everything
    following an opening parenthesis that is never closed is dropped.

    See docstring of processauthors() for more information.
    """
    depth = 0
    kept = []
    for char in authors_string:
        if char == '(':
            depth += 1
            continue
        if char == ')' and depth > 0:
            depth -= 1
            continue
        if depth == 0:
            kept.append(char)
    return "".join(kept)

def processauthors_split_string(authors_string, sep):
    """Split the authors string on every separator, one after the other.

    Arguments:
    - authors_string: string containing authors names ;
    - sep: iterable of separator regexps, each handed to split_sep_author().

    Every separator is applied to all the pieces produced by the previous
    ones. With no separator, the result is a one-element list.

    See docstring of processauthors() for more information.
    """
    pieces = [authors_string]
    for separator in sep:
        pieces = [
                part
                for piece in pieces
                for part in split_sep_author(piece, separator)
                ]
    return pieces

def processauthors_remove_after(authors_list, after):
    """Keep only what follows an "after" word in each author string.

    Arguments:
    - authors_list: list of author strings ;
    - after: compiled regexps whose first group captures the text to keep.

    For each author, the first regexp that matches wins; authors matched by
    no regexp are returned unchanged.

    See docstring of processauthors() for more information.
    """
    trimmed = []
    for name in authors_list:
        attempts = (regexp.match(name) for regexp in after)
        # Lazily stop at the first successful match.
        hit = next((found for found in attempts if found), None)
        trimmed.append(hit.group(1) if hit else name)
    return trimmed

def processauthors_ignore_authors(authors_list, ignore):
    """Drop every author whose string contains one of the "ignore" words.

    The test is a plain, case-sensitive substring search; each ignore word
    is converted with str() before being searched for.

    See docstring of processauthors() for more information.
    """
    return [
            author
            for author in authors_list
            if not any(str(word) in author for word in ignore)
            ]

def processauthors_clean_authors(authors_list):
    """Clean: remove empty authors and unnecessary spaces

    Both leading and trailing whitespace are removed from each author, and
    authors that end up empty are dropped. Trailing whitespace matters: it
    is typically left behind when a parenthesis is removed ("Blake (1808)"
    becomes "Blake "), and a trailing space would later be mistaken for the
    separator between first and last name.

    See docstring of processauthors() for more information.
    """
    stripped = (author.strip() for author in authors_list)
    return [author for author in stripped if author]

def processauthors_invert_names(authors_list):
    """Move first names after last names

    Each author is cut in two by split_author_names() and rewritten as
    "Last, First". Both parts are stripped of surrounding whitespace, so
    the space that separated them does not end up trailing the result.
    When one of the parts is empty, the other one is returned alone
    instead of producing a dangling comma.

    See docstring of processauthors() for more information.
    """
    dest = []
    for author in authors_list:
        first, last = split_author_names(author)
        # split_author_names() leaves the separating space at the end of
        # the first part: strip both sides, not only the left one.
        first = first.strip()
        last = last.strip()
        if first and last:
            dest.append("%(last)s, %(first)s" % {
                'first': first,
                'last': last,
                })
        else:
            dest.append(last or first)
    return dest

def processauthors(authors_string, after=None, ignore=None, sep=None):
    r"""Return a list of authors

    Arguments:
    - authors_string: raw string describing the authors ;
    - after: compiled regexps, each capturing in its first group what
      follows an "after" word (e.g. "by") ;
    - ignore: words marking an author as one to be dropped ;
    - sep: compiled regexps with two groups, matching what is before and
      after a separator.
    Any of the last three arguments left out (or empty) is replaced by an
    empty list, which turns the corresponding step into a no-op.

    The string goes through the following steps, each one implemented by
    the helper function named between brackets:

    1) Parentheses and their content are removed
       [processauthors_removeparen].
       "Lyrics by William Blake (from Milton, 1808)"
       becomes "Lyrics by William Blake ".

    2) The string is split on each separator of "sep"
       [processauthors_split_string].

    3) Everything up to a word of "after" is removed
       [processauthors_remove_after].
       "Lyrics by William Blake" becomes " William Blake".

    4) Strings containing a word of "ignore" are dropped
       [processauthors_ignore_authors].

    5) Empty strings and unnecessary spaces are removed
       [processauthors_clean_authors].

    6) First names are moved after last names
       [processauthors_invert_names].
       "The Royal\ Choir~of~Nowhere" is meant to end up as
       "Royal\ Choir~of~Nowhere, The".
    """
    sep = sep or []
    after = after or []
    ignore = ignore or []

    authors = processauthors_removeparen(authors_string)
    authors = processauthors_split_string(authors, sep)
    authors = processauthors_remove_after(authors, after)
    authors = processauthors_ignore_authors(authors, ignore)
    authors = processauthors_clean_authors(authors)
    return processauthors_invert_names(authors)
Réorganisation ; Création d'un module digne de ce nom 11 years ago			`#!/usr/bin/env python`
use recursive find method to get song and cover files With this, the songs hierarchy in the library can be different from songs/author/song.sg 12 years ago			`# -- coding: utf-8 --`
Les auteurs sont désormais triés par nom de famille 12 years ago
Added docstrings 11 years ago			`"""Authors string management."""`
Début de la mise à niveau pep8 (#4) 11 years ago
Sorted out configuration defaults (between hard-coded defaults, templates, and configuration files) 11 years ago			`import re`

			`DEFAULT_AUTHWORDS = {`
			`"after": ["by"],`
			`"ignore": ["unknown"],`
			`"sep": ["and"],`
			`}`

			`def compile_authwords(authwords):`
The code is pylint compliant. 11 years ago			`"""Convert strings of authwords to compiled regular expressions.`

			`This regexp will later be used to match these words in authors strings.`
			`"""`
			`# Fill missing values`
Sorted out configuration defaults (between hard-coded defaults, templates, and configuration files) 11 years ago			`for (key, value) in DEFAULT_AUTHWORDS.items():`
			`if key not in authwords:`
			`authwords[key] = value`

			`# Compilation`
			`authwords['after'] = [`
			`re.compile(r"^.%s\b(.)" % word)`
			`for word in authwords['after']`
			`]`
			`authwords['sep'] = [`
			`re.compile(r"^(.)%s (.)$" % word)`
			`for word in ([" %s" % word for word in authwords['sep']] + [','])`
			`]`

			`return authwords`


Les auteurs sont désormais triés par nom de famille 12 years ago			`def split_author_names(string):`
Added docstrings 11 years ago			`r"""Split author between first and last name.`
Les auteurs sont désormais triés par nom de famille 12 years ago
			`The last space separates first and last name, but spaces following a`
			`backslash or a command are not separators.`
			`Examples:`
			`- Edgar Allan Poe => Poe, Edgar Allan`
			`- Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan`
			`- The Rolling\ Stones => Rolling\ Stones, The`
			`- The {Rolling Stones} => {Rolling Stones}, The`
			`"""`
			`ignore_space = False`
			`last_space = index = 0`
			`brace_count = 0`
			`for char in string:`
			`index += 1`
			`if brace_count == 0:`
			`if char == "\\":`
			`ignore_space = True`
			`elif not char.isalnum() and ignore_space:`
			`ignore_space = False`
			`elif char == " ":`
			`last_space = index`
			`if char == "}":`
			`brace_count += 1`
			`if char == "{":`
			`brace_count -= 1`
			`return string[:last_space], string[last_space:]`

Début de la mise à niveau pep8 (#4) 11 years ago
Les auteurs sont désormais triés par nom de famille 12 years ago			`def split_sep_author(string, sep):`
Added docstrings 11 years ago			`"""Split authors string according to separators.`

			`Arguments:`
			`- string: string containing authors names ;`
			`- sep: regexp matching a separator.`

			`>>> split_sep_author("Tintin and Milou", '^(.) and (.)$')`
			`["Tintin", "Milou"]`
			`"""`
Les auteurs sont désormais triés par nom de famille 12 years ago			`authors = []`
			`match = sep.match(string)`
			`while match:`
			`authors.append(match.group(2))`
			`string = match.group(1)`
			`match = sep.match(string)`
			`authors.append(string)`
			`return authors`

More Pylint formatting. Almost done! (#4) 11 years ago			`################################################################################`
			`### Process authors tools.`
			`################################################################################`
Début de la mise à niveau pep8 (#4) 11 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`def processauthors_removeparen(authors_string):`
			`"""Remove parentheses`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`See docstring of processauthors() for more information.`
Les auteurs sont désormais triés par nom de famille 12 years ago			`"""`
			`opening = 0`
			`dest = ""`
			`for char in authors_string:`
			`if char == '(':`
			`opening += 1`
			`elif char == ')' and opening > 0:`
			`opening -= 1`
			`elif opening == 0:`
			`dest += char`
More Pylint formatting. Almost done! (#4) 11 years ago			`return dest`

			`def processauthors_split_string(authors_string, sep):`
			`"""Split strings`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`See docstring of processauthors() for more information.`
			`"""`
Les auteurs sont désormais triés par nom de famille 12 years ago			`authors_list = [authors_string]`
			`for sepword in sep:`
			`dest = []`
			`for author in authors_list:`
			`dest.extend(split_sep_author(author, sepword))`
			`authors_list = dest`
More Pylint formatting. Almost done! (#4) 11 years ago			`return authors_list`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`def processauthors_remove_after(authors_list, after):`
			`"""Remove stuff before "after"`

			`See docstring of processauthors() for more information.`
			`"""`
Les auteurs sont désormais triés par nom de famille 12 years ago			`dest = []`
			`for author in authors_list:`
			`for afterword in after:`
			`match = afterword.match(author)`
			`if match:`
			`author = match.group(1)`
			`break`
			`dest.append(author)`
More Pylint formatting. Almost done! (#4) 11 years ago			`return dest`

			`def processauthors_ignore_authors(authors_list, ignore):`
			`"""Ignore ignored authors`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`See docstring of processauthors() for more information.`
			`"""`
Les auteurs sont désormais triés par nom de famille 12 years ago			`dest = []`
			`for author in authors_list:`
			`ignored = False`
			`for ignoreword in ignore:`
			`if author.find(str(ignoreword)) != -1:`
			`ignored = True`
			`break`
			`if not ignored:`
			`dest.append(author)`
More Pylint formatting. Almost done! (#4) 11 years ago			`return dest`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`def processauthors_clean_authors(authors_list):`
			`"""Clean: remove empty authors and unnecessary spaces`
Les auteurs sont désormais triés par nom de famille 12 years ago
More Pylint formatting. Almost done! (#4) 11 years ago			`See docstring of processauthors() for more information.`
			`"""`
			`return [`
			`author.lstrip()`
			`for author`
			`in authors_list`
			`if author.lstrip()`
			`]`

			`def processauthors_invert_names(authors_list):`
			`"""Move first names after last names`

			`See docstring of processauthors() for more information.`
			`"""`
Les auteurs sont désormais triés par nom de famille 12 years ago			`dest = []`
			`for author in authors_list:`
			`first, last = split_author_names(author)`
			`if first:`
			`dest.append("%(last)s, %(first)s" % {`
			`'first': first.lstrip(),`
			`'last': last.lstrip(),`
			`})`
			`else:`
			`dest.append(last.lstrip())`
More Pylint formatting. Almost done! (#4) 11 years ago			`return dest`

			`def processauthors(authors_string, after=None, ignore=None, sep=None):`
			`r"""Return a list of authors`

			`For example, we are processing:`
			`# processauthors(`
			`# "Lyrics by William Blake (from Milton, 1808),`
			`music by Hubert Parry (1916),`
			`and sung by The Royal\ Choir~of~Nowhere`
			`(just here to show you how processing is done)",`
			`# after = ["by"],`
			`# ignore = ["anonymous"],`
			`# sep = [re.compile('^(.) and (.)$')],`
			`# )`


			`The "authors_string" string is processed as:`

			`1) First, parenthesis (and its content) are removed.`
			`# "Lyrics by William Blake, music by Hubert Parry,`
			`and sung by The Royal\ Choir~of~Nowhere"`

			`2) String is split, separators being comma and words from "sep".`
			`# ["Lyrics by William Blake", "music by Hubert Parry",`
			`"sung by The Royal\ Choir~of~Nowhere"]`

			`3) Everything before words in "after" is removed.`
			`# ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]`

			`4) Strings containing words of "ignore" are dropped.`
			`# ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"]`

			`5) First names are moved after last names`
			`# ["Blake, William", "Parry, Hubert", Royal\ Choir~of~Nowhere, The"]`
			`"""`

			`if not sep:`
			`sep = []`
			`if not after:`
			`after = []`
			`if not ignore:`
			`ignore = []`

			`return processauthors_invert_names(`
			`processauthors_clean_authors(`
			`processauthors_ignore_authors(`
			`processauthors_remove_after(`
			`processauthors_split_string(`
			`processauthors_removeparen(`
			`authors_string`
			`),`
			`sep),`
			`after),`
			`ignore)`
			`)`
			`)`
Les auteurs sont désormais triés par nom de famille 12 years ago