patacrep/tools.py


								#!/usr/bin/python

								# -*- coding: utf-8 -*-

								#


								import fnmatch

								import os


								def recursiveFind(root_directory, pattern):
								    """Return the paths of files under a directory matching a pattern.

								    The tree rooted at `root_directory` is walked with os.walk(); in each
								    visited directory the file names are filtered with fnmatch.filter()
								    against `pattern`. Each hit is returned joined to the directory it was
								    found in, so the paths start with `root_directory` as it was given.
								    """
								    return [
								        os.path.join(current_dir, name)
								        for current_dir, _subdirs, names in os.walk(root_directory)
								        for name in fnmatch.filter(names, pattern)
								    ]


								def split_author_names(string):
								    r"""Split an author string into a (first, last) pair of substrings.

								    The string is cut right after its last separating space. A space is
								    not a separator when it is enclosed in braces, or when it is the first
								    non-alphanumeric character following a backslash (so it ends a command
								    name or is an escaped space).

								    The two returned pieces are plain slices of the input: the first one
								    keeps its trailing space, and it is empty when no separating space was
								    found.

								    Examples:
								    - "Edgar Allan Poe" => ("Edgar Allan ", "Poe")
								    - "Edgar Allan \emph {Poe}" => ("Edgar Allan ", "\emph {Poe}")
								    - "The Rolling\ Stones" => ("The ", "Rolling\ Stones")
								    - "The {Rolling Stones}" => ("The ", "{Rolling Stones}")
								    """
								    depth = 0  # current brace nesting level
								    after_backslash = False  # True while reading a command name
								    cut = 0  # position just after the last separating space
								    for position, char in enumerate(string, 1):
								        # The brace level tested here is the one *before* this character,
								        # so an opening brace is still examined at the outer level.
								        if depth == 0:
								            if char == "\\":
								                after_backslash = True
								            elif after_backslash and not char.isalnum():
								                # First non-alphanumeric character after a backslash: it
								                # ends the command and is never counted as a separator.
								                after_backslash = False
								            elif char == " ":
								                cut = position
								        if char == "{":
								            depth += 1
								        elif char == "}":
								            depth -= 1
								    return string[:cut], string[cut:]


								def split_sep_author(string, sep):
								    """Split `string` into pieces using the separator pattern `sep`.

								    `sep` must provide a match() method returning an object with at least
								    two groups (for instance a compiled regular expression). While the
								    pattern matches, group(2) is stored as a piece and the search goes on
								    with group(1); whatever is left once the pattern no longer matches is
								    stored last.

								    Pieces are returned in the order they were peeled off: group(2) of the
								    first match comes first, the unmatched remainder comes last.

								    NOTE: the loop only ends when group(1) stops matching `sep`.
								    """
								    pieces = []
								    remainder = string
								    while True:
								        found = sep.match(remainder)
								        if not found:
								            break
								        pieces.append(found.group(2))
								        remainder = found.group(1)
								    pieces.append(remainder)
								    return pieces


								def processauthors(authors_string, after=(), ignore=(), sep=()):
								    r"""Return the list of authors found in `authors_string`.

								    Arguments:
								    - authors_string: the raw credits string.
								    - after: patterns (objects with a match() method, e.g. compiled
								      regular expressions). For each author, the first pattern that
								      matches replaces the author by group(1) of the match.
								    - ignore: words; an author containing str(word) as a substring is
								      dropped.
								    - sep: separator patterns, each one handed to split_sep_author().

								    Processing steps, illustrated on
								      "Lyrics by William Blake (from Milton, 1808), music by Hubert Parry
								      (1916), and sung by The Royal\ Choir~of~Nowhere (just here to show
								      you how processing is done)"
								    (the example assumes the caller built `after` from the word "by" and
								    `sep` from "and" and a comma; the exact patterns are not defined
								    here):

								    1) Parentheses (and their content) are removed.
								       "Lyrics by William Blake , music by Hubert Parry , and sung by
								       The Royal\ Choir~of~Nowhere "

								    2) The string is split with every pattern of `sep`.
								       "Lyrics by William Blake", "music by Hubert Parry",
								       "sung by The Royal\ Choir~of~Nowhere"

								    3) Everything before the words of `after` is removed.
								       "William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"

								    4) Authors containing a word of `ignore` are dropped.

								    5) Surrounding whitespace is stripped and empty authors are dropped.

								    6) First names are moved after last names.
								       "Blake, William", "Parry, Hubert", "Royal\ Choir~of~Nowhere, The"

								    The order of the returned list is the order produced by
								    split_sep_author(), which emits the pieces it peels off first; it is
								    not necessarily the order of appearance in `authors_string`.
								    """
								    # 1) Removing parentheses. The counter copes with nesting; a ")" that
								    # closes nothing is kept as ordinary text.
								    depth = 0
								    kept = []
								    for char in authors_string:
								        if char == '(':
								            depth += 1
								        elif char == ')' and depth > 0:
								            depth -= 1
								        elif depth == 0:
								            kept.append(char)
								    authors_list = ["".join(kept)]

								    # 2) Splitting strings: every separator is applied to every piece
								    # obtained so far.
								    for sepword in sep:
								        pieces = []
								        for author in authors_list:
								            pieces.extend(split_sep_author(author, sepword))
								        authors_list = pieces

								    # 3) Removing stuff before "after": only the first matching pattern
								    # is applied to a given author.
								    trimmed = []
								    for author in authors_list:
								        for afterword in after:
								            match = afterword.match(author)
								            if match:
								                author = match.group(1)
								                break
								        trimmed.append(author)
								    authors_list = trimmed

								    # 4) Ignoring ignored authors (plain substring test).
								    authors_list = [
								        author for author in authors_list
								        if not any(str(ignoreword) in author for ignoreword in ignore)
								    ]

								    # 5) Cleaning: removing empty authors and unnecessary spaces.
								    # Both ends must be stripped: removing a parenthesis typically leaves
								    # a trailing space ("William Blake "), which split_author_names()
								    # would otherwise take for the first/last name separator, yielding
								    # an empty last name.
								    authors_list = [author.strip() for author in authors_list]
								    authors_list = [author for author in authors_list if author]

								    # 6) Moving first names after last names.
								    result = []
								    for author in authors_list:
								        first, last = split_author_names(author)
								        # The first-name slice keeps the separating space: strip it so
								        # that the formatted name carries no trailing whitespace.
								        first = first.strip()
								        last = last.strip()
								        if first:
								            result.append("%s, %s" % (last, first))
								        else:
								            result.append(last)
								    return result