Browse Source

Improve authors processing

- LaTeX commands are no longer supported
- add tests
pull/97/head
Louis 9 years ago
parent
commit
3e3b689c0b
  1. 121
      patacrep/authors.py
  2. 2
      patacrep/songs/chordpro/data/chordpro/song
  3. 2
      patacrep/songs/chordpro/data/latex/song
  4. 45
      test/test_authors.py
  5. 2
      test/test_chordpro/author_names.sgc
  6. 4
      test/test_chordpro/author_names.tex
  7. 2
      test/test_chordpro/greensleeves.sgc
  8. 2
      test/test_chordpro/greensleeves.tex
  9. 4
      test/test_chordpro/metadata.sgc
  10. 4
      test/test_chordpro/metadata.tex

121
patacrep/authors.py

@ -10,6 +10,8 @@ DEFAULT_AUTHWORDS = {
"ignore": ["unknown"], "ignore": ["unknown"],
"sep": ["and"], "sep": ["and"],
} }
RE_AFTER = r"^.*\b{}\b(.*)$"
RE_SEPARATOR = r"^(.*)\b *{} *(\b.*)?$"
def compile_authwords(authwords): def compile_authwords(authwords):
"""Convert strings of authwords to compiled regular expressions. """Convert strings of authwords to compiled regular expressions.
@ -23,11 +25,11 @@ def compile_authwords(authwords):
# Compilation # Compilation
authwords['after'] = [ authwords['after'] = [
re.compile(r"^.*\b%s\b(.*)$" % word, re.LOCALE) re.compile(RE_AFTER.format(word), re.LOCALE)
for word in authwords['after'] for word in authwords['after']
] ]
authwords['sep'] = [ authwords['sep'] = [
re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) re.compile(RE_SEPARATOR.format(word), re.LOCALE)
for word in ([" %s" % word for word in authwords['sep']] + [',', ';']) for word in ([" %s" % word for word in authwords['sep']] + [',', ';'])
] ]
@ -37,31 +39,23 @@ def compile_authwords(authwords):
def split_author_names(string): def split_author_names(string):
r"""Split author between first and last name. r"""Split author between first and last name.
The last space separates first and last name, but spaces following a The last space separates first and last name. LaTeX commands are ignored.
backslash or a command are not separators.
Examples: >>> split_author_names("Edgar Allan Poe")
- Edgar Allan Poe => Poe, Edgar Allan ('Poe', 'Edgar Allan')
- Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan >>> split_author_names("Edgar Allan \emph {Poe}")
- The Rolling\ Stones => Rolling\ Stones, The ('{Poe}', 'Edgar Allan \\emph')
- The {Rolling Stones} => {Rolling Stones}, The >>> split_author_names(r"The Rolling\ Stones")
('Stones', 'The Rolling\\')
>>> split_author_names("The {Rolling Stones}")
('Stones}', 'The {Rolling')
>>> split_author_names("The Rolling Stones")
('Rolling\xa0Stones', 'The')
>>> split_author_names(" John Doe ")
('Doe', 'John')
""" """
ignore_space = False chunks = string.strip().split(" ")
last_space = index = 0 return (chunks[-1].strip(), " ".join(chunks[:-1]).strip())
brace_count = 0
for char in string.strip():
index += 1
if brace_count == 0:
if char == "\\":
ignore_space = True
elif not char.isalnum() and ignore_space:
ignore_space = False
elif char == " ":
last_space = index
if char == "}":
brace_count += 1
if char == "{":
brace_count -= 1
return string[last_space:], string[:last_space]
def split_sep_author(string, sep): def split_sep_author(string, sep):
@ -71,16 +65,19 @@ def split_sep_author(string, sep):
- string: string containing authors names ; - string: string containing authors names ;
- sep: regexp matching a separator. - sep: regexp matching a separator.
>>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$')) >>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and")))
['Tintin', 'Milou'] ['Tintin', 'Milou']
>>> split_sep_author("Tintin,", re.compile(RE_SEPARATOR.format(",")))
['Tintin']
""" """
authors = [] authors = []
match = sep.match(string) match = sep.match(string)
while match: while match:
authors.append(match.group(2)) if match.group(2) is not None:
authors.append(match.group(2).strip())
string = match.group(1) string = match.group(1)
match = sep.match(string) match = sep.match(string)
authors.insert(0, string) authors.insert(0, string.strip())
return authors return authors
################################################################################ ################################################################################
@ -91,6 +88,9 @@ def processauthors_removeparen(authors_string):
"""Remove parentheses """Remove parentheses
See docstring of processauthors() for more information. See docstring of processauthors() for more information.
>>> processauthors_removeparen("This (foo) string (bar) contains (baz) parenthesis")
'This string contains parenthesis'
""" """
opening = 0 opening = 0
dest = "" dest = ""
@ -107,6 +107,16 @@ def processauthors_split_string(authors_string, sep):
"""Split strings """Split strings
See docstring of processauthors() for more information. See docstring of processauthors() for more information.
>>> processauthors_split_string("Tintin and Milou", [re.compile(RE_SEPARATOR.format("and"))])
['Tintin', 'Milou']
>>> processauthors_split_string("Tintin, Milou", [re.compile(RE_SEPARATOR.format(","))])
['Tintin', 'Milou']
>>> processauthors_split_string(
... "Tintin, and Milou",
... [re.compile(RE_SEPARATOR.format(word)) for word in ['and', ',']]
... )
['Tintin', 'Milou']
""" """
authors_list = [authors_string] authors_list = [authors_string]
for sepword in sep: for sepword in sep:
@ -160,45 +170,47 @@ def processauthors_clean_authors(authors_list):
] ]
def processauthors(authors_string, after=None, ignore=None, sep=None): def processauthors(authors_string, after=None, ignore=None, sep=None):
r"""Return a list of authors r"""Return an iterator of authors
For example, we are processing: For example, in the following call:
# processauthors(
# [ >>> set(processauthors(
# " ... (
# Lyrics by William Blake (from Milton, 1808), ... "Lyrics by William Blake (from Milton, 1808), "
# music by Hubert Parry (1916), ... "music by Hubert Parry (1916), "
# and sung by The Royal\ Choir~of~Nowhere ... "and sung by The Royal~Choir~of~FooBar "
# (just here to show you how processing is done) ... "(just here to show you how processing is done)"
# ", ... ),
# ], ... **compile_authwords({
# after = ["by"], ... 'after': ["by"],
# ignore = ["anonymous"], ... 'ignore': ["anonymous"],
# sep = [re.compile('^(.*) and (.*)$')], ... 'sep': ["and", ","],
# ) ... })
... )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")}
True
The "authors_string" is processed as: The "authors_string" is processed as:
1) First, parentheses (and their content) are removed. 1) First, parentheses (and their content) are removed.
# "Lyrics by William Blake, music by Hubert Parry, # "Lyrics by William Blake, music by Hubert Parry,
and sung by The Royal\ Choir~of~Nowhere" and sung by The Royal~Choir~of~FooBar"
2) String is split, separators being comma and words from "sep". 2) String is split, separators being comma and words from "sep".
# ["Lyrics by William Blake", "music by Hubert Parry", # ["Lyrics by William Blake", "music by Hubert Parry",
"sung by The Royal\ Choir~of~Nowhere"] "sung by The Royal~Choir~of~FooBar"]
3) Everything before words in "after" is removed. 3) Everything before words in "after" is removed.
# ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"] # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"]
4) Strings containing words of "ignore" are dropped. 4) Strings containing words of "ignore" are dropped.
# ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"] # ["William Blake", "Hubert Parry", The Royal~Choir~of~FooBar"]
5) First and last names are split 5) First and last names are split
# [ # [
# ("Blake", "William"), # ("Blake", "William"),
# ("Parry", "Hubert"), # ("Parry", "Hubert"),
# ("Royal\ Choir~of~Nowhere", "The"), # ("Royal~Choir~of~FooBar", "The"),
# ] # ]
""" """
@ -209,10 +221,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
if not ignore: if not ignore:
ignore = [] ignore = []
return [ for author in processauthors_clean_authors(
split_author_names(author)
for author
in processauthors_clean_authors(
processauthors_ignore_authors( processauthors_ignore_authors(
processauthors_remove_after( processauthors_remove_after(
processauthors_split_string( processauthors_split_string(
@ -222,8 +231,8 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
sep), sep),
after), after),
ignore) ignore)
) ):
] yield split_author_names(author)
def process_listauthors(authors_list, after=None, ignore=None, sep=None): def process_listauthors(authors_list, after=None, ignore=None, sep=None):
"""Process a list of authors, and return the list of resulting authors.""" """Process a list of authors, and return the list of resulting authors."""

2
patacrep/songs/chordpro/data/chordpro/song

@ -13,7 +13,7 @@
(* endfor -*) (* endfor -*)
(*- for author in authors -*) (*- for author in authors -*)
{artist: (( author[1] ))(( author[0] ))} {artist: (( author[1] )) (( author[0] ))}
(* endfor *) (* endfor *)
(*- for key in ['album', 'copyright', 'cov', 'tag'] *) (*- for key in ['album', 'copyright', 'cov', 'tag'] *)

2
patacrep/songs/chordpro/data/latex/song

@ -16,7 +16,7 @@
}[ }[
by={ by={
(* for author in authors *) (* for author in authors *)
(( author[1] ))(( author[0] )) (( author[1] )) (( author[0] ))
(*- if not loop.last -*) (*- if not loop.last -*)
, ,
(* endif *) (* endif *)

45
test/test_authors.py

@ -2,7 +2,6 @@
# pylint: disable=too-few-public-methods # pylint: disable=too-few-public-methods
import re
import unittest import unittest
from patacrep import authors from patacrep import authors
@ -18,30 +17,34 @@ SPLIT_AUTHORS_DATA = [
("The mamas and the papas", ("mamas and the papas", "The")), # Unbreakable spaces ("The mamas and the papas", ("mamas and the papas", "The")), # Unbreakable spaces
(r"\LaTeX command", ("command", r"\LaTeX")), # LaTeX commands are ignored (r"\LaTeX command", ("command", r"\LaTeX")), # LaTeX commands are ignored
(r"\emph{Some braces}", ("braces}", r"\emph{Some")), # LaTeX commands are ignored (r"\emph{Some braces}", ("braces}", r"\emph{Some")), # LaTeX commands are ignored
(r"The Rolling\ Stones", ("Stones", r"The Rolling\\")), # LaTeX commands are ignored (r"The Rolling\ Stones", ("Stones", 'The Rolling\\')), # LaTeX commands are ignored
] ]
PROCESS_AUTHORS_DATA = [ PROCESS_AUTHORS_DATA = [
(
( (
"Lyrics by William Blake (from Milton, 1808), music by Hubert Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to show you how processing is done)", "Lyrics by William Blake (from Milton, 1808), music by Hubert "
[ "Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to "
("Blake", "William"), "show you how processing is done)"
("Parry", "Hubert"), ),
("Royal~Choir~of~FooBar", "The"), [
] ("Blake", "William"),
), ("Parry", "Hubert"),
( ("Royal~Choir~of~FooBar", "The"),
"Anonyme (1967)",
[],
),
(
"Lucky Luke et Jolly Jumper",
[
("Luke", "Lucky"),
("Jumper", "Jolly"),
],
),
] ]
),
(
"Anonyme (1967)",
[],
),
(
"Lucky Luke et Jolly Jumper",
[
("Luke", "Lucky"),
("Jumper", "Jolly"),
],
),
]
AUTHWORDS = authors.compile_authwords({ AUTHWORDS = authors.compile_authwords({
"after": ["by"], "after": ["by"],
@ -53,11 +56,13 @@ class TestAutors(unittest.TestCase):
"""Test of author parsing.""" """Test of author parsing."""
def test_split_author_names(self): def test_split_author_names(self):
"""Test of :func:`patacrep.authors.split_author_names` function."""
for argument, expected in SPLIT_AUTHORS_DATA: for argument, expected in SPLIT_AUTHORS_DATA:
with self.subTest(argument=argument, expected=expected): with self.subTest(argument=argument, expected=expected):
self.assertEqual(authors.split_author_names(argument), expected) self.assertEqual(authors.split_author_names(argument), expected)
def test_processauthors(self): def test_processauthors(self):
"""Test of :func:`patacrep.authors.processauthors` function."""
for argument, expected in PROCESS_AUTHORS_DATA: for argument, expected in PROCESS_AUTHORS_DATA:
with self.subTest(argument=argument, expected=expected): with self.subTest(argument=argument, expected=expected):
self.assertEqual( self.assertEqual(

2
test/test_chordpro/author_names.sgc

@ -1,6 +1,6 @@
{language: english} {language: english}
{title: Title} {title: Title}
{artist: The Beatles} {artist: The Beatles}
{artist: Oasis} {artist: Oasis}
{artist: The the beatles} {artist: The the beatles}

4
test/test_chordpro/author_names.tex

@ -3,10 +3,10 @@
\beginsong{Title}[ \beginsong{Title}[
by={ by={
The Beatles, The Beatles,
Oasis, Oasis,
The the beatles }, The the beatles },
] ]
\endsong \endsong

2
test/test_chordpro/greensleeves.sgc

@ -3,7 +3,7 @@
{title: Greensleeves} {title: Greensleeves}
{title: Un autre sous-titre} {title: Un autre sous-titre}
{title: Un sous titre} {title: Un sous titre}
{artist: Traditionnel} {artist: Traditionnel}
{album: Angleterre} {album: Angleterre}
{cov: traditionnel} {cov: traditionnel}

2
test/test_chordpro/greensleeves.tex

@ -5,7 +5,7 @@
Un autre sous-titre\\ Un autre sous-titre\\
Un sous titre}[ Un sous titre}[
by={ by={
Traditionnel }, Traditionnel },
album={Angleterre}, album={Angleterre},
cov={traditionnel}, cov={traditionnel},
] ]

4
test/test_chordpro/metadata.sgc

@ -6,8 +6,8 @@
{title: Subtitle3} {title: Subtitle3}
{title: Subtitle4} {title: Subtitle4}
{title: Subtitle5} {title: Subtitle5}
{artist: Author1} {artist: Author1}
{artist: Author2} {artist: Author2}
{album: Album} {album: Album}
{copyright: Copyright} {copyright: Copyright}
{cov: Cover} {cov: Cover}

4
test/test_chordpro/metadata.tex

@ -7,8 +7,8 @@ Subtitle3\\
Subtitle4\\ Subtitle4\\
Subtitle5}[ Subtitle5}[
by={ by={
Author1, Author1,
Author2 }, Author2 },
album={Album}, album={Album},
copyright={Copyright}, copyright={Copyright},
cov={Cover}, cov={Cover},

Loading…
Cancel
Save