Improve authors processing

- LaTeX commands are no longer supported
- Add tests
10 years ago · 3e3b689c0b
10 changed files with 101 additions and 87 deletions
--- a/patacrep/authors.py
+++ b/patacrep/authors.py
@ -10,6 +10,8 @@ DEFAULT_AUTHWORDS = {
    "ignore": ["unknown"],
    "sep": ["and"],
    }
+RE_AFTER = r"^.*\b{}\b(.*)$"
+RE_SEPARATOR = r"^(.*)\b *{} *(\b.*)?$"

 def compile_authwords(authwords):
    """Convert strings of authwords to compiled regular expressions.
@ -23,11 +25,11 @@ def compile_authwords(authwords):

    # Compilation
    authwords['after'] = [
-        re.compile(r"^.*\b%s\b(.*)$" % word, re.LOCALE)
+        re.compile(RE_AFTER.format(word), re.LOCALE)
        for word in authwords['after']
        ]
    authwords['sep'] = [
-        re.compile(r"^(.*)%s +(.*)$" % word, re.LOCALE)
+        re.compile(RE_SEPARATOR.format(word), re.LOCALE)
        for word in ([" %s" % word for word in authwords['sep']] + [',', ';'])
        ]

@ -37,31 +39,23 @@ def compile_authwords(authwords):
 def split_author_names(string):
    r"""Split author between first and last name.

-    The last space separates first and last name, but spaces following a
-    backslash or a command are not separators.
-    Examples:
-    - Edgar Allan Poe => Poe, Edgar Allan
-    - Edgar Allan \emph {Poe} => \emph {Poe}, Edgar Allan
-    - The Rolling\ Stones => Rolling\ Stones, The
-    - The {Rolling Stones} => {Rolling Stones}, The
+    The last space separates first and last name. LaTeX commands get no
+    special treatment: they are split on spaces like any other text.
+
+    >>> split_author_names("Edgar Allan Poe")
+    ('Poe', 'Edgar Allan')
+    >>> split_author_names("Edgar Allan \emph {Poe}")
+    ('{Poe}', 'Edgar Allan \\emph')
+    >>> split_author_names(r"The Rolling\ Stones")
+    ('Stones', 'The Rolling\\')
+    >>> split_author_names("The {Rolling Stones}")
+    ('Stones}', 'The {Rolling')
+    >>> split_author_names("The Rolling\xa0Stones")
+    ('Rolling\xa0Stones', 'The')
+    >>> split_author_names("   John   Doe  ")
+    ('Doe', 'John')
    """
-    ignore_space = False
-    last_space = index = 0
-    brace_count = 0
-    for char in string.strip():
-        index += 1
-        if brace_count == 0:
-            if char == "\\":
-                ignore_space = True
-            elif not char.isalnum() and ignore_space:
-                ignore_space = False
-            elif char == " ":
-                last_space = index
-        if char == "}":
-            brace_count += 1
-        if char == "{":
-            brace_count -= 1
-    return string[last_space:], string[:last_space]
+    chunks = string.strip().split(" ")
+    return (chunks[-1].strip(), " ".join(chunks[:-1]).strip())


 def split_sep_author(string, sep):
@ -71,16 +65,19 @@ def split_sep_author(string, sep):
    - string: string containing authors names ;
    - sep: regexp matching a separator.

-    >>> split_sep_author("Tintin and Milou", re.compile('^(.*) and (.*)$'))
+    >>> split_sep_author("Tintin and Milou", re.compile(RE_SEPARATOR.format("and")))
    ['Tintin', 'Milou']
+    >>> split_sep_author("Tintin,", re.compile(RE_SEPARATOR.format(",")))
+    ['Tintin']
    """
    authors = []
    match = sep.match(string)
    while match:
-        authors.append(match.group(2))
+        if match.group(2) is not None:
+            authors.append(match.group(2).strip())
        string = match.group(1)
        match = sep.match(string)
-    authors.insert(0, string)
+    authors.insert(0, string.strip())
    return authors

 ################################################################################
@ -91,6 +88,9 @@ def processauthors_removeparen(authors_string):
    """Remove parentheses

    See docstring of processauthors() for more information.
+
+    >>> processauthors_removeparen("This (foo) string (bar) contains (baz) parenthesis")
+    'This  string  contains  parenthesis'
    """
    opening = 0
    dest = ""
@ -107,6 +107,16 @@ def processauthors_split_string(authors_string, sep):
    """Split strings

    See docstring of processauthors() for more information.
+
+    >>> processauthors_split_string("Tintin and Milou", [re.compile(RE_SEPARATOR.format("and"))])
+    ['Tintin', 'Milou']
+    >>> processauthors_split_string("Tintin, Milou", [re.compile(RE_SEPARATOR.format(","))])
+    ['Tintin', 'Milou']
+    >>> processauthors_split_string(
+    ...     "Tintin, and Milou",
+    ...     [re.compile(RE_SEPARATOR.format(word)) for word in ['and', ',']]
+    ... )
+    ['Tintin', 'Milou']
    """
    authors_list = [authors_string]
    for sepword in sep:
@ -160,45 +170,47 @@ def processauthors_clean_authors(authors_list):
        ]

 def processauthors(authors_string, after=None, ignore=None, sep=None):
-    r"""Return a list of authors
-
-    For example, we are processing:
-    # processauthors(
-    #    [
-    #        "
-    #            Lyrics by William Blake (from Milton, 1808),
-    #            music by Hubert Parry (1916),
-    #            and sung by The Royal\ Choir~of~Nowhere
-    #            (just here to show you how processing is done)
-    #        ",
-    #    ],
-    #   after = ["by"],
-    #   ignore = ["anonymous"],
-    #   sep = [re.compile('^(.*) and (.*)$')],
-    #   )
+    r"""Return an iterator of authors
+
+    For example, in the following call:
+
+    >>> set(processauthors(
+    ...   (
+    ...       "Lyrics by William Blake (from Milton, 1808), "
+    ...       "music by Hubert Parry (1916), "
+    ...       "and sung by The Royal~Choir~of~FooBar "
+    ...       "(just here to show you how processing is done)"
+    ...   ),
+    ...   **compile_authwords({
+    ...         'after': ["by"],
+    ...         'ignore': ["anonymous"],
+    ...         'sep': ["and", ","],
+    ...         })
+    ...   )) == {("Blake", "William"), ("Parry", "Hubert"), ("Royal~Choir~of~FooBar", "The")}
+    True


    The "authors_string" is processed as:

    1) First, parentheses (and their content) are removed.
    # "Lyrics by William Blake, music by Hubert Parry,
-                and sung by The Royal\ Choir~of~Nowhere"
+                and sung by The Royal~Choir~of~FooBar"

    2) String is split, separators being comma and words from "sep".
    # ["Lyrics by William Blake", "music by Hubert Parry",
-                "sung by The Royal\ Choir~of~Nowhere"]
+                "sung by The Royal~Choir~of~FooBar"]

    3) Everything before words in "after" is removed.
-    # ["William Blake", "Hubert Parry", "The Royal\ Choir~of~Nowhere"]
+    # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"]

    4) Strings containing words of "ignore" are dropped.
-    # ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"]
+    # ["William Blake", "Hubert Parry", "The Royal~Choir~of~FooBar"]

    5) First and last names are split.
    # [
    #   ("Blake", "William"),
    #   ("Parry", "Hubert"),
-    #   ("Royal\ Choir~of~Nowhere", "The"),
+    #   ("Royal~Choir~of~FooBar", "The"),
    # ]
    """

@ -209,10 +221,7 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
    if not ignore:
        ignore = []

-    return [
-        split_author_names(author)
-        for author
-        in processauthors_clean_authors(
+    for author in processauthors_clean_authors(
            processauthors_ignore_authors(
                processauthors_remove_after(
                    processauthors_split_string(
@ -222,8 +231,8 @@ def processauthors(authors_string, after=None, ignore=None, sep=None):
                        sep),
                    after),
                ignore)
-            )
-        ]
+        ):
+        yield split_author_names(author)

 def process_listauthors(authors_list, after=None, ignore=None, sep=None):
    """Process a list of authors, and return the list of resulting authors."""
--- a/patacrep/songs/chordpro/data/chordpro/song
+++ b/patacrep/songs/chordpro/data/chordpro/song
@ -13,7 +13,7 @@
 (* endfor -*)

 (*- for author in authors -*)
-  {artist: (( author[1] ))(( author[0] ))}
+  {artist: (( author[1] )) (( author[0] ))}
 (* endfor *)

 (*- for key in ['album', 'copyright', 'cov', 'tag'] *)
--- a/patacrep/songs/chordpro/data/latex/song
+++ b/patacrep/songs/chordpro/data/latex/song
@ -16,7 +16,7 @@
 }[
  by={
      (* for author in authors *)
-        (( author[1] ))(( author[0] ))
+        (( author[1] )) (( author[0] ))
        (*- if not loop.last -*)
        ,
        (* endif *)
--- a/test/test_authors.py
+++ b/test/test_authors.py
@ -2,7 +2,6 @@

 # pylint: disable=too-few-public-methods

-import re
 import unittest

 from patacrep import authors
@ -18,30 +17,34 @@ SPLIT_AUTHORS_DATA = [
    ("The mamas\xa0and\xa0the\xa0papas", ("mamas\xa0and\xa0the\xa0papas", "The")), # Unbreakable spaces
    (r"\LaTeX command", ("command", r"\LaTeX")), # LaTeX commands are ignored
    (r"\emph{Some braces}", ("braces}", r"\emph{Some")), # LaTeX commands are ignored
-    (r"The Rolling\ Stones", ("Stones", r"The Rolling\\")), # LaTeX commands are ignored
+    (r"The Rolling\ Stones", ("Stones", 'The Rolling\\')), # LaTeX commands are ignored
    ]

 PROCESS_AUTHORS_DATA = [
+    (
        (
-            "Lyrics by William Blake (from Milton, 1808), music by Hubert Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to show you how processing is done)",
-            [
-                ("Blake", "William"),
-                ("Parry", "Hubert"),
-                ("Royal~Choir~of~FooBar", "The"),
-            ]
-        ),
-        (
-            "Anonyme (1967)",
-            [],
-        ),
-        (
-            "Lucky Luke et Jolly Jumper",
-            [
-                ("Luke", "Lucky"),
-                ("Jumper", "Jolly"),
-            ],
-        ),
+            "Lyrics by William Blake (from Milton, 1808), music by Hubert "
+            "Parry (1916), and sung by The Royal~Choir~of~FooBar (just here to "
+            "show you how processing is done)"
+            ),
+        [
+            ("Blake", "William"),
+            ("Parry", "Hubert"),
+            ("Royal~Choir~of~FooBar", "The"),
        ]
+    ),
+    (
+        "Anonyme (1967)",
+        [],
+    ),
+    (
+        "Lucky Luke et Jolly Jumper",
+        [
+            ("Luke", "Lucky"),
+            ("Jumper", "Jolly"),
+        ],
+    ),
+]

 AUTHWORDS = authors.compile_authwords({
    "after": ["by"],
@ -53,11 +56,13 @@ class TestAutors(unittest.TestCase):
    """Test of author parsing."""

    def test_split_author_names(self):
+        """Test of :func:`patacrep.authors.split_author_names` function."""
        for argument, expected in SPLIT_AUTHORS_DATA:
            with self.subTest(argument=argument, expected=expected):
                self.assertEqual(authors.split_author_names(argument), expected)

    def test_processauthors(self):
+        """Test of :func:`patacrep.authors.processauthors` function."""
        for argument, expected in PROCESS_AUTHORS_DATA:
            with self.subTest(argument=argument, expected=expected):
                self.assertEqual(
--- a/test/test_chordpro/author_names.sgc
+++ b/test/test_chordpro/author_names.sgc
@ -1,6 +1,6 @@
 {language: english}
 {title: Title}
 {artist: The Beatles}
-{artist: Oasis}
+{artist:  Oasis}
 {artist: The the beatles}

--- a/test/test_chordpro/author_names.tex
+++ b/test/test_chordpro/author_names.tex
@ -3,10 +3,10 @@
 \beginsong{Title}[
  by={
        The Beatles,
-        Oasis,
+         Oasis,
        The the beatles  },
 ]



-\endsong
+\endsong
--- a/test/test_chordpro/greensleeves.sgc
+++ b/test/test_chordpro/greensleeves.sgc
@ -3,7 +3,7 @@
 {title: Greensleeves}
 {title: Un autre sous-titre}
 {title: Un sous titre}
-{artist: Traditionnel}
+{artist:  Traditionnel}
 {album: Angleterre}
 {cov: traditionnel}

--- a/test/test_chordpro/greensleeves.tex
+++ b/test/test_chordpro/greensleeves.tex
@ -5,7 +5,7 @@
 Un autre sous-titre\\
 Un sous titre}[
  by={
-        Traditionnel  },
+         Traditionnel  },
  album={Angleterre},
  cov={traditionnel},
 ]
--- a/test/test_chordpro/metadata.sgc
+++ b/test/test_chordpro/metadata.sgc
@ -6,8 +6,8 @@
 {title: Subtitle3}
 {title: Subtitle4}
 {title: Subtitle5}
-{artist: Author1}
-{artist: Author2}
+{artist:  Author1}
+{artist:  Author2}
 {album: Album}
 {copyright: Copyright}
 {cov: Cover}
--- a/test/test_chordpro/metadata.tex
+++ b/test/test_chordpro/metadata.tex
@ -7,8 +7,8 @@ Subtitle3\\
 Subtitle4\\
 Subtitle5}[
  by={
-        Author1,
-        Author2  },
+         Author1,
+         Author2  },
  album={Album},
  copyright={Copyright},
  cov={Cover},