From 3e3b689c0b771a4517c59d2e9c3999cd8ed90ce9 Mon Sep 17 00:00:00 2001 From: Louis Date: Sat, 3 Oct 2015 22:51:35 +0200 Subject: [PATCH] Improve authors processing - LaTeX commands are no longer supported - add tests --- patacrep/authors.py | 121 +++++++++++---------- patacrep/songs/chordpro/data/chordpro/song | 2 +- patacrep/songs/chordpro/data/latex/song | 2 +- test/test_authors.py | 45 ++++---- test/test_chordpro/author_names.sgc | 2 +- test/test_chordpro/author_names.tex | 4 +- test/test_chordpro/greensleeves.sgc | 2 +- test/test_chordpro/greensleeves.tex | 2 +- test/test_chordpro/metadata.sgc | 4 +- test/test_chordpro/metadata.tex | 4 +- 10 files changed, 101 insertions(+), 87 deletions(-) diff --git a/patacrep/authors.py b/patacrep/authors.py index 8cde76cf..83a072e3 100644 --- a/patacrep/authors.py +++ b/patacrep/authors.py @@ -10,6 +10,8 @@ DEFAULT_AUTHWORDS = { "ignore": ["unknown"], "sep": ["and"], } +RE_AFTER = r"^.*\b{}\b(.*)$" +RE_SEPARATOR = r"^(.*)\b *{} *(\b.*)?$" def compile_authwords(authwords): """Convert strings of authwords to compiled regular expressions. @@ -23,11 +25,11 @@ def compile_authwords(authwords): # Compilation authwords['after'] = [ - re.compile(r"^.*\b%s\b(.*)$" % word, re.LOCALE) + re.compile(RE_AFTER.format(word), re.LOCALE) for word in authwords['after'] ] authwords['sep'] = [ - re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE) + re.compile(RE_SEPARATOR.format(word), re.LOCALE) for word in ([" %s" % word for word in authwords['sep']] + [',', ';']) ] @@ -37,31 +39,23 @@ def compile_authwords(authwords): def split_author_names(string): r"""Split author between first and last name. - The last space separates first and last name, but spaces following a - backslash or a command are not separators. 
- Examples: - - Edgar Allan Poe => Poe, Edgar Allan - - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan - - The Rolling\ Stones => Rolling\ Stones, The - - The {Rolling Stones} => {Rolling Stones}, The + The last space separates first and last name. LaTeX commands are ignored. + + >>> split_author_names("Edgar Allan Poe") + ('Poe', 'Edgar Allan') + >>> split_author_names("Edgar Allan \emph {Poe}") + ('{Poe}', 'Edgar Allan \\emph') + >>> split_author_names(r"The Rolling\ Stones") + ('Stones', 'The Rolling\\') + >>> split_author_names("The {Rolling Stones}") + ('Stones}', 'The {Rolling') + >>> split_author_names("The Rolling Stones") + ('Rolling\xa0Stones', 'The') + >>> split_author_names(" John Doe ") + ('Doe', 'John') """ - ignore_space = False - last_space = index = 0 - brace_count = 0 - for char in string.strip(): - index += 1 - if brace_count == 0: - if char == "\\": - ignore_space = True - elif not char.isalnum() and ignore_space: - ignore_space = False - elif char == " ": - last_space = index - if char == "}": - brace_count += 1 - if char == "{": - brace_count -= 1 - return string[last_space:], string[:last_space] + chunks = string.strip().split(" ") + return (chunks[-1].strip(), " ".join(chunks[:-1]).strip()) def split_sep_author(string, sep): @@ -71,16 +65,19 @@ def split_sep_author(string, sep): - string: string containing authors names ; - sep: regexp matching a separator. 
- >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$')) + >>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and"))) ['Tintin', 'Milou'] + >>> split_sep_author("Tintin,", re.compile(RE_SEPARATOR.format(","))) + ['Tintin'] """ authors = [] match = sep.match(string) while match: - authors.append(match.group(2)) + if match.group(2) is not None: + authors.append(match.group(2).strip()) string = match.group(1) match = sep.match(string) - authors.insert(0, string) + authors.insert(0, string.strip()) return authors ################################################################################ @@ -91,6 +88,9 @@ def processauthors_removeparen(authors_string): """Remove parentheses See docstring of processauthors() for more information. + + >>> processauthors_removeparen("This (foo) string (bar) contains (baz) parenthesis") + 'This string contains parenthesis' """ opening = 0 dest = "" @@ -107,6 +107,16 @@ def processauthors_split_string(authors_string, sep): """Split strings See docstring of processauthors() for more information. + + >>> processauthors_split_string("Tintin and Milou", [re.compile(RE_SEPARATOR.format("and"))]) + ['Tintin', 'Milou'] + >>> processauthors_split_string("Tintin, Milou", [re.compile(RE_SEPARATOR.format(","))]) + ['Tintin', 'Milou'] + >>> processauthors_split_string( + ... "Tintin, and Milou", + ... [re.compile(RE_SEPARATOR.format(word)) for word in ['and', ',']] + ... 
) + ['Tintin', 'Milou'] """ authors_list = [authors_string] for sepword in sep: @@ -160,45 +170,47 @@ def processauthors_clean_authors(authors_list): ] def processauthors(authors_string, after=None, ignore=None, sep=None): - r"""Return a list of authors - - For example, we are processing: - # processauthors( - # [ - # " - # Lyrics by William Blake (from Milton, 1808), - # music by Hubert Parry (1916), - # and sung by The Royal\ Choir~of~Nowhere - # (just here to show you how processing is done) - # ", - # ], - # after = ["by"], - # ignore = ["anonymous"], - # sep = [re.compile('^(.*) and (.*)$')], - # ) + r"""Return an iterator of authors + + For example, in the following call: + + >>> set(processauthors( + ... ( + ... "Lyrics by William Blake (from Milton, 1808), " + ... "music by Hubert Parry (1916), " + ... "and sung by The Royal~Choir~of~FooBar " + ... "(just here to show you how processing is done)" + ... ), + ... **compile_authwords({ + ... 'after': ["by"], + ... 'ignore': ["anonymous"], + ... 'sep': ["and", ","], + ... }) + ... )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")} + True The "authors_string" is processed as: 1) First, parenthesis (and its content) are removed. # "Lyrics by William Blake, music by Hubert Parry, - and sung by The Royal\ Choir~of~Nowhere" + and sung by The Royal~Choir~of~FooBar" 2) String is split, separators being comma and words from "sep". # ["Lyrics by William Blake", "music by Hubert Parry", - "sung by The Royal\ Choir~of~Nowhere"] + "sung by The Royal~Choir~of~FooBar"] 3) Everything before words in "after" is removed. - # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"] + # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"] 4) Strings containing words of "ignore" are dropped. 
- # ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"] + # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"] 5) First and last names are splitted # [ # ("Blake", "William"), # ("Parry", "Hubert"), - # ("Royal\ Choir~of~Nowhere", "The"), + # ("Royal~Choir~of~FooBar", "The"), # ] """ @@ -209,10 +221,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): if not ignore: ignore = [] - return [ - split_author_names(author) - for author - in processauthors_clean_authors( + for author in processauthors_clean_authors( processauthors_ignore_authors( processauthors_remove_after( processauthors_split_string( @@ -222,8 +231,8 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): sep), after), ignore) - ) - ] + ): + yield split_author_names(author) def process_listauthors(authors_list, after=None, ignore=None, sep=None): """Process a list of authors, and return the list of resulting authors.""" diff --git a/patacrep/songs/chordpro/data/chordpro/song b/patacrep/songs/chordpro/data/chordpro/song index d721df33..92f474ee 100644 --- a/patacrep/songs/chordpro/data/chordpro/song +++ b/patacrep/songs/chordpro/data/chordpro/song @@ -13,7 +13,7 @@ (* endfor -*) (*- for author in authors -*) - {artist: (( author[1] ))(( author[0] ))} + {artist: (( author[1] )) (( author[0] ))} (* endfor *) (*- for key in ['album', 'copyright', 'cov', 'tag'] *) diff --git a/patacrep/songs/chordpro/data/latex/song b/patacrep/songs/chordpro/data/latex/song index e3be0f83..4cb04412 100644 --- a/patacrep/songs/chordpro/data/latex/song +++ b/patacrep/songs/chordpro/data/latex/song @@ -16,7 +16,7 @@ }[ by={ (* for author in authors *) - (( author[1] ))(( author[0] )) + (( author[1] )) (( author[0] )) (*- if not loop.last -*) , (* endif *) diff --git a/test/test_authors.py b/test/test_authors.py index 98e0d3bf..f9dfc053 100644 --- a/test/test_authors.py +++ b/test/test_authors.py @@ -2,7 +2,6 @@ # pylint: disable=too-few-public-methods -import re 
import unittest from patacrep import authors @@ -18,30 +17,34 @@ SPLIT_AUTHORS_DATA = [ ("The mamas and the papas", ("mamas and the papas", "The")), # Unbreakable spaces (r"\LaTeX command", ("command", r"\LaTeX")), # LaTeX commands are ignored (r"\emph{Some braces}", ("braces}", r"\emph{Some")), # LaTeX commands are ignored - (r"The Rolling\ Stones", ("Stones", r"The Rolling\\")), # LaTeX commands are ignored + (r"The Rolling\ Stones", ("Stones", 'The Rolling\\')), # LaTeX commands are ignored ] PROCESS_AUTHORS_DATA = [ + ( ( - "Lyrics by William Blake (from Milton, 1808), music by Hubert Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to show you how processing is done)", - [ - ("Blake", "William"), - ("Parry", "Hubert"), - ("Royal~Choir~of~FooBar", "The"), - ] - ), - ( - "Anonyme (1967)", - [], - ), - ( - "Lucky Luke et Jolly Jumper", - [ - ("Luke", "Lucky"), - ("Jumper", "Jolly"), - ], - ), + "Lyrics by William Blake (from Milton, 1808), music by Hubert " + "Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to " + "show you how processing is done)" + ), + [ + ("Blake", "William"), + ("Parry", "Hubert"), + ("Royal~Choir~of~FooBar", "The"), ] + ), + ( + "Anonyme (1967)", + [], + ), + ( + "Lucky Luke et Jolly Jumper", + [ + ("Luke", "Lucky"), + ("Jumper", "Jolly"), + ], + ), +] AUTHWORDS = authors.compile_authwords({ "after": ["by"], @@ -53,11 +56,13 @@ class TestAutors(unittest.TestCase): """Test of author parsing.""" def test_split_author_names(self): + """Test of :func:`patacrep.authors.split_author_names` function.""" for argument, expected in SPLIT_AUTHORS_DATA: with self.subTest(argument=argument, expected=expected): self.assertEqual(authors.split_author_names(argument), expected) def test_processauthors(self): + """Test of :func:`patacrep.authors.processauthors` function.""" for argument, expected in PROCESS_AUTHORS_DATA: with self.subTest(argument=argument, expected=expected): self.assertEqual( diff --git 
a/test/test_chordpro/author_names.sgc b/test/test_chordpro/author_names.sgc index a6818f7c..36a39230 100644 --- a/test/test_chordpro/author_names.sgc +++ b/test/test_chordpro/author_names.sgc @@ -1,6 +1,6 @@ {language: english} {title: Title} {artist: The Beatles} -{artist: Oasis} +{artist: Oasis} {artist: The the beatles} diff --git a/test/test_chordpro/author_names.tex b/test/test_chordpro/author_names.tex index 123a58d3..8bf4fcfc 100644 --- a/test/test_chordpro/author_names.tex +++ b/test/test_chordpro/author_names.tex @@ -3,10 +3,10 @@ \beginsong{Title}[ by={ The Beatles, - Oasis, + Oasis, The the beatles }, ] -\endsong \ No newline at end of file +\endsong diff --git a/test/test_chordpro/greensleeves.sgc b/test/test_chordpro/greensleeves.sgc index ba6081a2..bd0ddd71 100644 --- a/test/test_chordpro/greensleeves.sgc +++ b/test/test_chordpro/greensleeves.sgc @@ -3,7 +3,7 @@ {title: Greensleeves} {title: Un autre sous-titre} {title: Un sous titre} -{artist: Traditionnel} +{artist: Traditionnel} {album: Angleterre} {cov: traditionnel} diff --git a/test/test_chordpro/greensleeves.tex b/test/test_chordpro/greensleeves.tex index 905e1664..6a745a81 100644 --- a/test/test_chordpro/greensleeves.tex +++ b/test/test_chordpro/greensleeves.tex @@ -5,7 +5,7 @@ Un autre sous-titre\\ Un sous titre}[ by={ - Traditionnel }, + Traditionnel }, album={Angleterre}, cov={traditionnel}, ] diff --git a/test/test_chordpro/metadata.sgc b/test/test_chordpro/metadata.sgc index e7b35506..e4bb1b21 100644 --- a/test/test_chordpro/metadata.sgc +++ b/test/test_chordpro/metadata.sgc @@ -6,8 +6,8 @@ {title: Subtitle3} {title: Subtitle4} {title: Subtitle5} -{artist: Author1} -{artist: Author2} +{artist: Author1} +{artist: Author2} {album: Album} {copyright: Copyright} {cov: Cover} diff --git a/test/test_chordpro/metadata.tex b/test/test_chordpro/metadata.tex index 555f59d4..11004ef6 100644 --- a/test/test_chordpro/metadata.tex +++ b/test/test_chordpro/metadata.tex @@ -7,8 +7,8 @@ Subtitle3\\ 
Subtitle4\\ Subtitle5}[ by={ - Author1, - Author2 }, + Author1, + Author2 }, album={Album}, copyright={Copyright}, cov={Cover},