From 752ff203243a42290f8d6f1a01bc184635760dd1 Mon Sep 17 00:00:00 2001 From: Louis Date: Wed, 2 Jul 2014 17:31:30 +0200 Subject: [PATCH 1/8] Implement cache for songs --- patacrep/songs.py | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/patacrep/songs.py b/patacrep/songs.py index 2cc49846..0c15a498 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -4,16 +4,50 @@ """Song management.""" from unidecode import unidecode +import cPickle +import hashlib +import os import re from patacrep.authors import processauthors from patacrep.plastex import parsetex -# pylint: disable=too-few-public-methods + +def cached_name(filename): + """Return the filename of the cache version of the file.""" + return filename + ".cache" + +# pylint: disable=too-few-public-methods, too-many-instance-attributes class Song(object): """Song management""" + # Version format of cached song. + CACHE_VERSION = 0 + + # List of attributes to cache + cached_attributes = [ + "titles", + "unprefixed_titles", + "args", + "path", + "languages", + "authors", + "_filehash", + "_version", + ] + def __init__(self, filename, config): + self._filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest() + if os.path.exists(cached_name(filename)): + cached = cPickle.load(open(cached_name(filename), 'rb')) + if ( + cached['_filehash'] == self._filehash + and cached['_version'] == self.CACHE_VERSION + ): + for attribute in self.cached_attributes: + setattr(self, attribute, cached[attribute]) + return + # Data extraction from the song with plastex data = parsetex(filename) self.titles = data['titles'] @@ -36,6 +70,16 @@ class Song(object): else: self.authors = [] + self._version = self.CACHE_VERSION + self._write_cache() + + def _write_cache(self): + """Write a dumbed down version of self to the cache.""" + cached = {} + for attribute in self.cached_attributes: + cached[attribute] = getattr(self, attribute) + cPickle.dump(cached, open(cached_name(self.path), 'wb')) + def __repr__(self): return repr((self.titles, self.args, self.path)) From ae39389d11a4c1a89725cc8a0b27cb75c8d801e2 Mon Sep 17 00:00:00 2001 From: Louis Date: Wed, 2 Jul 2014 19:11:24 +0200 Subject: [PATCH 2/8] Safer import of pickle --- patacrep/songs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/patacrep/songs.py b/patacrep/songs.py index 0c15a498..ebaa06e7 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -4,11 +4,15 @@ """Song management.""" from unidecode import unidecode -import cPickle import hashlib import os import re +try: + import cPickle as pickle +except ImportError: + import pickle + from patacrep.authors import processauthors from patacrep.plastex import parsetex @@ -39,7 +43,7 @@ class Song(object): def __init__(self, filename, config): self._filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest() if os.path.exists(cached_name(filename)): - cached = cPickle.load(open(cached_name(filename), 'rb')) + cached = pickle.load(open(cached_name(filename), 'rb')) if ( cached['_filehash'] == self._filehash and cached['_version'] == self.CACHE_VERSION @@ -78,7 +82,7 @@ class Song(object): cached = {} for attribute in self.cached_attributes: cached[attribute] = getattr(self, attribute) - cPickle.dump(cached, open(cached_name(self.path), 'wb')) + pickle.dump(cached, open(cached_name(self.path), 'wb')) def __repr__(self): return repr((self.titles, self.args, self.path)) From 6209f3e9abcc52fc21cff15f631097a4dc876dda Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 4 Jul 2014 21:12:18 +0200 Subject: [PATCH 3/8] comment --- patacrep/songs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/patacrep/songs.py b/patacrep/songs.py index ebaa06e7..a84ef253 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -25,7 +25,8 @@ def cached_name(filename): class Song(object): """Song management""" - # Version format of cached song. + # Version format of cached song. Increment this number if we update + # information stored in cache. CACHE_VERSION = 0 # List of attributes to cache From 52c3007684d3a0b24a7d57c37e8a24eb4511ffec Mon Sep 17 00:00:00 2001 From: Louis Date: Sat, 5 Jul 2014 10:36:50 +0200 Subject: [PATCH 4/8] Songs are cached only if they are in a datadir --- patacrep/build.py | 3 +- patacrep/content/cwd.py | 7 ++- patacrep/content/song.py | 25 +++++---- patacrep/content/sorted.py | 4 +- patacrep/content/tex.py | 7 ++- patacrep/files.py | 30 +++++++++-- patacrep/songs.py | 105 ++++++++++++++++++++++++++++--------- 7 files changed, 136 insertions(+), 45 deletions(-) diff --git a/patacrep/build.py b/patacrep/build.py index dc5cdda0..13a173ab 100755 --- a/patacrep/build.py +++ b/patacrep/build.py @@ -13,6 +13,7 @@ from subprocess import Popen, PIPE, call from patacrep import __DATADIR__, authors, content, errors from patacrep.index import process_sxd from patacrep.templates import TexRenderer +from patacrep.songs import DataSubpath LOGGER = logging.getLogger(__name__) EOL = "\n" @@ -75,7 +76,7 @@ class Songbook(object): self.config['datadir'] = abs_datadir self.config['_songdir'] = [ - os.path.join(path, 'songs') + DataSubpath(path, 'songs') for path in self.config['datadir'] ] diff --git a/patacrep/content/cwd.py b/patacrep/content/cwd.py index fff4c10e..338adb76 100755 --- a/patacrep/content/cwd.py +++ b/patacrep/content/cwd.py @@ -3,9 +3,8 @@ """Change base directory before importing songs.""" -import os - from patacrep.content import process_content +from patacrep.songs import DataSubpath #pylint: disable=unused-argument def parse(keyword, config, argument, contentlist): @@ -28,8 +27,8 @@ def parse(keyword, config, argument, contentlist): """ old_songdir = config['_songdir'] config['_songdir'] = ( - [argument] + - [os.path.join(path, argument) for path in config['_songdir']] + + [DataSubpath("", argument)] + + [path.clone().join(argument) for path in config['_songdir']] + config['_songdir'] ) processed_content = process_content(contentlist, config) diff --git a/patacrep/content/song.py b/patacrep/content/song.py index d6dc4886..170f8c2c 100755 --- a/patacrep/content/song.py +++ b/patacrep/content/song.py @@ -35,7 +35,7 @@ class SongRenderer(Content, Song): def render(self, context): """Return the string that will render the song.""" return r'\input{{{}}}'.format(files.relpath( - self.path, + self.fullpath, os.path.dirname(context['filename']) )) @@ -59,21 +59,28 @@ def parse(keyword, argument, contentlist, config): if contentlist: break contentlist = [ - files.relpath(filename, songdir) + filename for filename in ( - files.recursive_find(songdir, "*.sg") - + files.recursive_find(songdir, "*.is") + files.recursive_find(songdir.fullpath, "*.sg") + + files.recursive_find(songdir.fullpath, "*.is") ) ] for elem in contentlist: before = len(songlist) for songdir in config['_songdir']: - for filename in glob.iglob(os.path.join(songdir, elem)): - LOGGER.debug('Parsing file "{}"…'.format(filename)) - song = SongRenderer(filename, config) - songlist.append(song) - config["_languages"].update(song.languages) + if songdir.datadir and not os.path.isdir(songdir.datadir): + continue + with files.chdir(songdir.datadir): + for filename in glob.iglob(os.path.join(songdir.subpath, elem)): + LOGGER.debug('Parsing file "{}"…'.format(filename)) + song = SongRenderer( + songdir.datadir, + filename, + config, + ) + songlist.append(song) + config["_languages"].update(song.languages) if len(songlist) > before: break if len(songlist) == before: diff --git a/patacrep/content/sorted.py b/patacrep/content/sorted.py index 96ec7af4..49d2e284 100755 --- a/patacrep/content/sorted.py +++ b/patacrep/content/sorted.py @@ -49,7 +49,7 @@ def key_generator(sort): if key == "@title": field = song.unprefixed_titles elif key == "@path": - field = song.path + field = song.fullpath elif key == "by": field = song.authors else: @@ -59,7 +59,7 @@ def key_generator(sort): LOGGER.debug( "Ignoring unknown key '{}' for song {}.".format( key, - files.relpath(song.path), + files.relpath(song.fullpath), ) ) field = "" diff --git a/patacrep/content/tex.py b/patacrep/content/tex.py index a934de48..629208f1 100755 --- a/patacrep/content/tex.py +++ b/patacrep/content/tex.py @@ -41,8 +41,11 @@ def parse(keyword, argument, contentlist, config): for filename in contentlist: checked_file = None for path in config['_songdir']: - if os.path.exists(os.path.join(path, filename)): - checked_file = os.path.relpath(os.path.join(path, filename)) + if os.path.exists(os.path.join(path.fullpath, filename)): + checked_file = os.path.relpath(os.path.join( + path.fullpath, + filename, + )) break if not checked_file: LOGGER.warning( diff --git a/patacrep/files.py b/patacrep/files.py index 31cc4e2e..5ba23df2 100644 --- a/patacrep/files.py +++ b/patacrep/files.py @@ -4,6 +4,7 @@ """File system utilities.""" +from contextlib import contextmanager import fnmatch import os @@ -12,10 +13,14 @@ def recursive_find(root_directory, pattern): Return a list of files matching the pattern. """ + if not os.path.isdir(root_directory): + return [] + matches = [] - for root, _, filenames in os.walk(root_directory): - for filename in fnmatch.filter(filenames, pattern): - matches.append(os.path.join(root, filename)) + with chdir(root_directory): + for root, _, filenames in os.walk(os.curdir): + for filename in fnmatch.filter(filenames, pattern): + matches.append(os.path.join(root, filename)) return matches def relpath(path, start=None): @@ -26,3 +31,22 @@ def relpath(path, start=None): return os.path.relpath(path, start) else: return os.path.abspath(path) + +@contextmanager +def chdir(path): + """Locally change dir + + Can be used as: + + with chdir("some/directory"): + do_stuff() + + """ + olddir = os.getcwd() + if path: + os.chdir(path) + yield + os.chdir(olddir) + else: + yield + diff --git a/patacrep/songs.py b/patacrep/songs.py index a84ef253..2e00c2e0 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -4,22 +4,67 @@ """Song management.""" from unidecode import unidecode +import errno import hashlib import os import re try: - import cPickle as pickle + import cPickle as pickle except ImportError: - import pickle + import pickle from patacrep.authors import processauthors from patacrep.plastex import parsetex -def cached_name(filename): +def cached_name(datadir, filename): """Return the filename of the cache version of the file.""" - return filename + ".cache" + fullpath = os.path.join(datadir, '.cache', filename) + directory = os.path.dirname(fullpath) + try: + os.makedirs(directory) + except OSError as error: + if error.errno == errno.EEXIST and os.path.isdir(directory): + pass + else: + raise + return fullpath + +class DataSubpath(object): + """A path divided in two path: a datadir, and its subpath. + + - This object can represent either a file or directory. + - If the datadir part is the empty string, it means that the represented + path does not belong to a datadir. + """ + + def __init__(self, datadir, subpath): + if os.path.isabs(subpath): + self.datadir = "" + else: + self.datadir = datadir + self.subpath = subpath + + def __str__(self): + return os.path.join(self.datadir, self.subpath) + + @property + def fullpath(self): + """Return the full path represented by self.""" + return os.path.join(self.datadir, self.subpath) + + def clone(self): + """Return a cloned object.""" + return DataSubpath(self.datadir, self.subpath) + + def join(self, path): + """Join "path" argument to self path. + + Return self for commodity. + """ + self.subpath = os.path.join(self.subpath, path) + return self # pylint: disable=too-few-public-methods, too-many-instance-attributes class Song(object): @@ -34,28 +79,36 @@ class Song(object): "titles", "unprefixed_titles", "args", - "path", + "datadir", + "fullpath", + "subpath", "languages", "authors", "_filehash", "_version", ] - def __init__(self, filename, config): - self._filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest() - if os.path.exists(cached_name(filename)): - cached = pickle.load(open(cached_name(filename), 'rb')) - if ( - cached['_filehash'] == self._filehash - and cached['_version'] == self.CACHE_VERSION - ): - for attribute in self.cached_attributes: - setattr(self, attribute, cached[attribute]) - return + def __init__(self, datadir, subpath, config): + self.fullpath = os.path.join(datadir, subpath) + if datadir: + # Only songs in datadirs are cached + self._filehash = hashlib.md5( + open(self.fullpath, 'rb').read() + ).hexdigest() + if os.path.exists(cached_name(datadir, subpath)): + cached = pickle.load(open(cached_name(datadir, subpath), 'rb')) + if ( + cached['_filehash'] == self._filehash + and cached['_version'] == self.CACHE_VERSION + ): + for attribute in self.cached_attributes: + setattr(self, attribute, cached[attribute]) + return # Data extraction from the song with plastex - data = parsetex(filename) + data = parsetex(self.fullpath) self.titles = data['titles'] + self.datadir = datadir self.unprefixed_titles = [ unprefixed_title( unidecode(unicode(title, "utf-8")), @@ -65,7 +118,7 @@ class Song(object): in self.titles ] self.args = data['args'] - self.path = filename + self.subpath = subpath self.languages = data['languages'] if "by" in self.args.keys(): self.authors = processauthors( @@ -79,14 +132,18 @@ class Song(object): self._write_cache() def _write_cache(self): - """Write a dumbed down version of self to the cache.""" - cached = {} - for attribute in self.cached_attributes: - cached[attribute] = getattr(self, attribute) - pickle.dump(cached, open(cached_name(self.path), 'wb')) + """If relevant, write a dumbed down version of self to the cache.""" + if self.datadir: + cached = {} + for attribute in self.cached_attributes: + cached[attribute] = getattr(self, attribute) + pickle.dump( + cached, + open(cached_name(self.datadir, self.subpath), 'wb'), + ) def __repr__(self): - return repr((self.titles, self.args, self.path)) + return repr((self.titles, self.args, self.fullpath)) def unprefixed_title(title, prefixes): """Remove the first prefix of the list in the beginning of title (if any). From 5cc88974f2662e346968f527fad2389d36c8bfa9 Mon Sep 17 00:00:00 2001 From: Louis Date: Sat, 5 Jul 2014 11:19:02 +0200 Subject: [PATCH 5/8] Using the best pickle protocol version --- patacrep/songs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/patacrep/songs.py b/patacrep/songs.py index 00c1d1b9..dd69e03b 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -156,6 +156,7 @@ class Song(object): pickle.dump( cached, open(cached_name(self.datadir, self.subpath), 'wb'), + protocol = -1 ) def __repr__(self): From 5a37a0fc46dcaab6f21d0090f8aba08e30f8d5c0 Mon Sep 17 00:00:00 2001 From: Louis Date: Sun, 6 Jul 2014 11:53:31 +0200 Subject: [PATCH 6/8] Ignoring cache --- patacrep/data/examples/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 patacrep/data/examples/.gitignore diff --git a/patacrep/data/examples/.gitignore b/patacrep/data/examples/.gitignore new file mode 100644 index 00000000..8c36c429 --- /dev/null +++ b/patacrep/data/examples/.gitignore @@ -0,0 +1 @@ +/.cache From 23418d7ceb482d126489595c66b2e5e1038f9563 Mon Sep 17 00:00:00 2001 From: Louis Date: Sun, 6 Jul 2014 12:00:31 +0200 Subject: [PATCH 7/8] pylint --- patacrep/songs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patacrep/songs.py b/patacrep/songs.py index dd69e03b..5f3a7d8c 100755 --- a/patacrep/songs.py +++ b/patacrep/songs.py @@ -156,7 +156,7 @@ class Song(object): pickle.dump( cached, open(cached_name(self.datadir, self.subpath), 'wb'), - protocol = -1 + protocol=-1 ) def __repr__(self): From 443783be2c284bed8f2ec686493fe357d3c7b06a Mon Sep 17 00:00:00 2001 From: Louis Date: Sun, 6 Jul 2014 12:00:45 +0200 Subject: [PATCH 8/8] Solved sorting problem --- patacrep/authors.py | 38 ++++++++---------------- patacrep/content/sorted.py | 4 +-- patacrep/index.py | 61 ++++++++++++++++++++++++-------------- patacrep/plastex.py | 1 + 4 files changed, 54 insertions(+), 50 deletions(-) diff --git a/patacrep/authors.py b/patacrep/authors.py index 73f3fc7a..a1feda2e 100644 --- a/patacrep/authors.py +++ b/patacrep/authors.py @@ -64,7 +64,7 @@ def split_author_names(string): brace_count += 1 if char == "{": brace_count -= 1 - return string[:last_space], string[last_space:] + return string[last_space:], string[:last_space] def split_sep_author(string, sep): @@ -162,23 +162,6 @@ def processauthors_clean_authors(authors_list): if author.lstrip() ] -def processauthors_invert_names(authors_list): - """Move first names after last names - - See docstring of processauthors() for more information. - """ - dest = [] - for author in authors_list: - first, last = split_author_names(author) - if first: - dest.append(ur"\indexauthor{{{first}}}{{{last}}}".format( - first=first.strip(), - last=last.strip(), - )) - else: - dest.append(last.lstrip()) - return dest - def processauthors(authors_string, after=None, ignore=None, sep=None): r"""Return a list of authors @@ -210,10 +193,12 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): 4) Strings containing words of "ignore" are dropped. # ["William Blake", "Hubert Parry", The Royal\ Choir~of~Nowhere"] - 5) First and last names are processed through LaTeX command \indexauthor - (which will, by default, invert first and last names). - # ["\indexauthor{William}{Blake}", "\indexauthor{Hubert}{Parry}", - # \indexthaor{The}{Royal\ Choir~of~Nowhere}"] + 5) First and last names are splitted + # [ + # ("Blake", "William"), + # ("Parry", "Hubert"), + # ("Royal\ Choir~of~Nowhere", "The"), + # ] """ if not sep: @@ -223,8 +208,10 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): if not ignore: ignore = [] - return processauthors_invert_names( - processauthors_clean_authors( + return [ + split_author_names(author) + for author + in processauthors_clean_authors( processauthors_ignore_authors( processauthors_remove_after( processauthors_split_string( @@ -235,5 +222,4 @@ def processauthors(authors_string, after=None, ignore=None, sep=None): after), ignore) ) - ) - + ] diff --git a/patacrep/content/sorted.py b/patacrep/content/sorted.py index 0a72614a..e9a5e677 100755 --- a/patacrep/content/sorted.py +++ b/patacrep/content/sorted.py @@ -33,8 +33,8 @@ def normalize_field(field): """Return a normalized field, it being a string or a list of strings.""" if isinstance(field, basestring): return normalize_string(field) - elif isinstance(field, list): - return [normalize_string(string) for string in field] + elif isinstance(field, list) or isinstance(field, tuple): + return [normalize_field(string) for string in field] def key_generator(sort): """Return a function that returns the list of values used to sort the song. diff --git a/patacrep/index.py b/patacrep/index.py index 27fa1eb1..63f3058d 100755 --- a/patacrep/index.py +++ b/patacrep/index.py @@ -22,18 +22,6 @@ KEYWORD_PATTERN = re.compile(ur"^%(\w+)\s?(.*)$", re.LOCALE) FIRST_LETTER_PATTERN = re.compile(ur"^(?:\{?\\\w+\}?)*[^\w]*(\w)", re.LOCALE) -def sortkey(value): - """From a title, return something usable for sorting. - - It handles locale (but - don't forget to call locale.setlocale(locale.LC_ALL, '')). It also handles - the sort with latex escape sequences. - """ - return locale.strxfrm( - encoding.unidecode(simpleparse(value).replace(' ', 'A')).lower() - ) - - def process_sxd(filename): """Parse sxd file. @@ -115,12 +103,18 @@ class Index(object): No processing is done on data. It is added raw. See add() for a similar method with processing. """ - first = self.get_first_letter(key) + first = self.get_first_letter(key[0]) if not first in self.data.keys(): self.data[first] = dict() if not key in self.data[first].keys(): - self.data[first][key] = [] - self.data[first][key].append({'num': number, 'link': link}) + self.data[first][key] = { + 'sortingkey': [ + encoding.unidecode(simpleparse(item)).lower() + for item in key + ], + 'entries': [], + } + self.data[first][key]['entries'].append({'num': number, 'link': link}) def add(self, key, number, link): """Add a song to the list. @@ -133,15 +127,15 @@ class Index(object): match = pattern.match(key) if match: self._raw_add( - ur"\indextitle{{{}}}{{{}}}".format( + ( match.group(1).strip(), - (match.group(2) + match.group(3)).strip(), - ), + (match.group(2) + match.group(3)).strip() + ), number, link ) return - self._raw_add(key, number, link) + self._raw_add((key, ""), number, link) if self.indextype == "AUTHOR": # Processing authors @@ -155,10 +149,26 @@ class Index(object): """Return the LaTeX code corresponding to the reference.""" return ur'\hyperlink{{{0[link]}}}{{{0[num]}}}'.format(ref) + def key_to_str(self, key): + """Convert the key (title or author) to the LaTeX command rendering it. + + """ + if self.indextype == "AUTHOR": + if key[1]: + return ur"\indexauthor{{{first}}}{{{last}}}".format( + first=key[1], + last=key[0], + ) + else: + return key[0] + + if self.indextype == "TITLE": + return ur"\indextitle{{{0[0]}}}{{{0[1]}}}".format(key) + def entry_to_str(self, key, entry): """Return the LaTeX code corresponding to the entry.""" return unicode(ur'\idxentry{{{0}}}{{{1}}}' + EOL).format( - key, + self.key_to_str(key), ur'\\'.join([self.ref_to_str(ref) for ref in entry]), ) @@ -168,9 +178,16 @@ class Index(object): Here, an index block is a letter, and all data beginning with this letter. """ + def sortkey(key): + """Return something sortable for `entries[key]`.""" + return [ + locale.strxfrm(item) + for item + in entries[key]['sortingkey'] + ] string = ur'\begin{idxblock}{' + letter + '}' + EOL - for key in sorted(entries.keys(), key=sortkey): - string += self.entry_to_str(key, entries[key]) + for key in sorted(entries, key=sortkey): + string += self.entry_to_str(key, entries[key]['entries']) string += ur'\end{idxblock}' + EOL return string diff --git a/patacrep/plastex.py b/patacrep/plastex.py index b1c906b2..ecfa2d00 100644 --- a/patacrep/plastex.py +++ b/patacrep/plastex.py @@ -39,6 +39,7 @@ def simpleparse(text): """Parse a simple LaTeX string. """ tex = TeX() + tex.disableLogging() tex.input(text) doc = tex.parse() return process_unbr_spaces(doc.textContent)