@ -10,6 +10,8 @@ DEFAULT_AUTHWORDS = {
" ignore " : [ " unknown " ] ,
" sep " : [ " and " ] ,
}
RE_AFTER = r " ^.* \ b {} \ b(.*)$ "
RE_SEPARATOR = r " ^(.*) \ b * {} *( \ b.*)?$ "
def compile_authwords ( authwords ) :
""" Convert strings of authwords to compiled regular expressions.
@ -23,11 +25,11 @@ def compile_authwords(authwords):
# Compilation
authwords [ ' after ' ] = [
re . compile ( r " ^.* \ b %s \ b(.*)$ " % word , re . LOCALE )
re . compile ( RE_AFTER . format ( word ) , re . LOCALE )
for word in authwords [ ' after ' ]
]
authwords [ ' sep ' ] = [
re . compile ( r " ^(.*) %s +(.*)$ " % word , re . LOCALE )
re . compile ( RE_SEPARATOR . format ( word ) , re . LOCALE )
for word in ( [ " %s " % word for word in authwords [ ' sep ' ] ] + [ ' , ' , ' ; ' ] )
]
@ -37,31 +39,23 @@ def compile_authwords(authwords):
def split_author_names ( string ) :
r """ Split author between first and last name.
The last space separates first and last name , but spaces following a
backslash or a command are not separators .
Examples :
- Edgar Allan Poe = > Poe , Edgar Allan
- Edgar Allan \emph { Poe } = > \emph { Poe } , Edgar Allan
- The Rolling \ Stones = > Rolling \ Stones , The
- The { Rolling Stones } = > { Rolling Stones } , The
The last space separates first and last name . LaTeX commands are ignored .
>> > split_author_names ( " Edgar Allan Poe " )
( ' Poe ' , ' Edgar Allan ' )
>> > split_author_names ( " Edgar Allan \ emph {Poe} " )
( ' {Poe} ' , ' Edgar Allan \\ emph ' )
>> > split_author_names ( r " The Rolling \ Stones " )
( ' Stones ' , ' The Rolling \\ ' )
>> > split_author_names ( " The { Rolling Stones} " )
( ' Stones} ' , ' The { Rolling ' )
>> > split_author_names ( " The Rolling Stones " )
( ' Rolling \xa0 Stones ' , ' The ' )
>> > split_author_names ( " John Doe " )
( ' Doe ' , ' John ' )
"""
ignore_space = False
last_space = index = 0
brace_count = 0
for char in string . strip ( ) :
index + = 1
if brace_count == 0 :
if char == " \\ " :
ignore_space = True
elif not char . isalnum ( ) and ignore_space :
ignore_space = False
elif char == " " :
last_space = index
if char == " } " :
brace_count + = 1
if char == " { " :
brace_count - = 1
return string [ last_space : ] , string [ : last_space ]
chunks = string . strip ( ) . split ( " " )
return ( chunks [ - 1 ] . strip ( ) , " " . join ( chunks [ : - 1 ] ) . strip ( ) )
def split_sep_author ( string , sep ) :
@ -71,16 +65,19 @@ def split_sep_author(string, sep):
- string : string containing authors names ;
- sep : regexp matching a separator .
>> > split_sep_author ( " Tintin and Milou " , re . compile ( ' ^(.*) and (.*)$ ' ) )
>> > split_sep_author ( " Tintin and Milou " , re . compile ( RE_SEPARATOR . format ( " and " ) ) )
[ ' Tintin ' , ' Milou ' ]
>> > split_sep_author ( " Tintin, " , re . compile ( RE_SEPARATOR . format ( " , " ) ) )
[ ' Tintin ' ]
"""
authors = [ ]
match = sep . match ( string )
while match :
authors . append ( match . group ( 2 ) )
if match . group ( 2 ) is not None :
authors . append ( match . group ( 2 ) . strip ( ) )
string = match . group ( 1 )
match = sep . match ( string )
authors . insert ( 0 , string )
authors . insert ( 0 , string . strip ( ) )
return authors
################################################################################
@ -91,6 +88,9 @@ def processauthors_removeparen(authors_string):
""" Remove parentheses
See docstring of processauthors ( ) for more information .
>> > processauthors_removeparen ( " This (foo) string (bar) contains (baz) parenthesis " )
' This string contains parenthesis '
"""
opening = 0
dest = " "
@ -107,6 +107,16 @@ def processauthors_split_string(authors_string, sep):
""" Split strings
See docstring of processauthors ( ) for more information .
>> > processauthors_split_string ( " Tintin and Milou " , [ re . compile ( RE_SEPARATOR . format ( " and " ) ) ] )
[ ' Tintin ' , ' Milou ' ]
>> > processauthors_split_string ( " Tintin, Milou " , [ re . compile ( RE_SEPARATOR . format ( " , " ) ) ] )
[ ' Tintin ' , ' Milou ' ]
>> > processauthors_split_string (
. . . " Tintin, and Milou " ,
. . . [ re . compile ( RE_SEPARATOR . format ( word ) ) for word in [ ' and ' , ' , ' ] ]
. . . )
[ ' Tintin ' , ' Milou ' ]
"""
authors_list = [ authors_string ]
for sepword in sep :
@ -160,45 +170,47 @@ def processauthors_clean_authors(authors_list):
]
def processauthors ( authors_string , after = None , ignore = None , sep = None ) :
r """ Return a list of authors
For example , we are processing :
# processauthors(
# [
# "
# Lyrics by William Blake (from Milton, 1808),
# music by Hubert Parry (1916),
# and sung by The Royal\ Choir~of~Nowhere
# (just here to show you how processing is done)
# ",
# ],
# after = ["by"],
# ignore = ["anonymous"],
# sep = [re.compile('^(.*) and (.*)$')],
# )
r """ Return an iterator of authors
For example , in the following call :
>> > set ( processauthors (
. . . (
. . . " Lyrics by William Blake (from Milton, 1808), "
. . . " music by Hubert Parry (1916), "
. . . " and sung by The Royal~Choir~of~FooBar "
. . . " (just here to show you how processing is done) "
. . . ) ,
. . . * * compile_authwords ( {
. . . ' after ' : [ " by " ] ,
. . . ' ignore ' : [ " anonymous " ] ,
. . . ' sep ' : [ " and " , " , " ] ,
. . . } )
. . . ) ) == { ( " Blake " , " William " ) , ( " Parry " , " Hubert " ) , ( " Royal~Choir~of~FooBar " , " The " ) }
True
The " authors_string " is processed as :
1 ) First , parenthesis ( and its content ) are removed .
# "Lyrics by William Blake, music by Hubert Parry,
and sung by The Royal \ Choir ~ of ~ Nowhere "
and sung by The Royal ~ Choir ~ of ~ FooBar "
2 ) String is split , separators being comma and words from " sep " .
# ["Lyrics by William Blake", "music by Hubert Parry",
" sung by The Royal \ Choir~of~Nowhere " ]
" sung by The Royal~Choir~of~FooBar " ]
3 ) Everything before words in " after " is removed .
# ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere "]
# ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar "]
4 ) Strings containing words of " ignore " are dropped .
# ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere "]
# ["William Blake", "Hubert Parry", The Royal~Choir~of~FooBar "]
5 ) First and last names are splitted
# [
# ("Blake", "William"),
# ("Parry", "Hubert"),
# ("Royal\ Choir~of~Nowhere ", "The"),
# ("Royal~Choir~of~FooBar ", "The"),
# ]
"""
@ -209,10 +221,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
if not ignore :
ignore = [ ]
return [
split_author_names ( author )
for author
in processauthors_clean_authors (
for author in processauthors_clean_authors (
processauthors_ignore_authors (
processauthors_remove_after (
processauthors_split_string (
@ -222,8 +231,8 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
sep ) ,
after ) ,
ignore )
)
]
) :
yield split_author_names ( author )
def process_listauthors ( authors_list , after = None , ignore = None , sep = None ) :
""" Process a list of authors, and return the list of resulting authors. """