# -*- coding: utf-8 -*-
# Spelling and typographical clean-up for songbook ``.sg`` files.
#
# NOTE: ``word_dic`` (target_word -> replacement_word pairs) is defined in the
# part of rules.py that precedes this section; it is not repeated here.

import getopt
import glob
import locale
import logging
import re
import sys

_USAGE = '''
Usage: rules.py [OPTION]

OPTIONS
    -h, --help
        display this help and exit

    -f, --files=FILES
        apply the set of rules on FILES
        default is songs/*/*.sg

    -l, --log=LEVEL
        set the logging level to LEVEL
        possible LEVEL values are : debug, info, warning, error and critical
'''

# A symbol glued to the previous character (French wants a space before it).
_GLUED_SYMBOL = re.compile(r"(\S)([!?;:])")
# A colon preceded by whitespace inside a \gtab macro; ``.*`` is greedy, so
# each pass only repairs the LAST such colon of a line.
_GTAB_COLON = re.compile(r"(gtab.*)\s:")
# One whitespace character between a word and a closing/punctuation symbol.
_SPACE_BEFORE_SYMBOL = re.compile(r"(\S)\s([!?;:)])")
# One whitespace character right after an opening symbol.
_SPACE_AFTER_PAREN = re.compile(r"([(])\s(\S)")
_SPACE_AFTER_OPENING_ES = re.compile(r"([¿¡(])\s(\S)")
# Two or more whitespace characters following a non-space character.
_MULTI_SPACE = re.compile(r"(\S)\s{2,}")


def usage():
    """Print the command-line help text on standard output."""
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(_USAGE)


def replace_words(string, rules=None):
    """Replace every dictionary word found in ``string``.

    This avoids usual spelling mistakes and typos when writing a song.

    :param string: the song data.
    :param rules: mapping of ``search -> replacement`` pairs; defaults to the
        module-level ``word_dic``.
    :return: the song data with every occurrence of each key replaced.
    """
    if rules is None:
        rules = word_dic
    logging.info("replace_words: search and replace words from dictionary into song data")
    for search, replacement in rules.items():
        string = string.replace(search, replacement)
    return string


def _convert_inverted_commas(string):
    """Turn TeX-style `` and '' quotes into the {\\og} / {\\fg} macros."""
    # Plain str.replace: a regex replacement template would choke on the
    # ``\o`` escape with recent versions of the ``re`` module.
    return string.replace("``", "{\\og}").replace("''", "{\\fg}")


def _french_rules(string):
    """Apply the French typographical rules."""
    # ensure a space before symbols ? ! ; :
    # NOTE(review): the original comment says "non-breaking" spaces, but the
    # visible replacement inserts a regular space -- confirm.
    string = _GLUED_SYMBOL.sub(r"\1 \2", string)
    # ... except for gtab macros with capos; two passes handle cases with two
    # colons such as \gtab{Gm}{10:X02210:}
    for _ in range(2):
        string = _GTAB_COLON.sub(r"\1:", string)
    # ensure no spaces after symbols (
    string = _SPACE_AFTER_PAREN.sub(r"\1\2", string)
    return _convert_inverted_commas(string)


def _english_rules(string):
    """Apply the English typographical rules."""
    # ensure no spaces before symbols ? ! ; : )
    string = _SPACE_BEFORE_SYMBOL.sub(r"\1\2", string)
    # ensure no spaces after symbols (
    return _SPACE_AFTER_PAREN.sub(r"\1\2", string)


def _spanish_rules(string):
    """Apply the Spanish typographical rules."""
    # ensure no spaces before symbols ? ! ; : )
    string = _SPACE_BEFORE_SYMBOL.sub(r"\1\2", string)
    # ensure no spaces after symbols ¿ ¡ (
    return _SPACE_AFTER_OPENING_ES.sub(r"\1\2", string)


# Checked in this order; the first language macro found in the song wins.
_LANGUAGE_RULES = (
    ("french", _french_rules),
    ("english", _english_rules),
    ("spanish", _spanish_rules),
    ("portuguese", _convert_inverted_commas),
)


def language_rules(string, filename=None):
    """Fix common typographical mistakes according to the song language.

    The language is the one selected through the babel macro
    ``\\selectlanguage{}`` found in the song data.

    :param string: the song data.
    :param filename: optional name of the song file, only used in the warning
        printed when no supported language is declared.
    :return: the corrected song data (unchanged if no language is found).
    """
    logging.info("language_rules: looking for common typographical mistakes")
    for language, apply_rules in _LANGUAGE_RULES:
        if ("selectlanguage{%s}" % language) in string:
            logging.info(" song language is set to : %s", language)
            return apply_rules(string)
    print("Warning: language is not defined for song : " + (filename or "(unknown)"))
    return string


def process_lines(lines):
    """Remove trailing punctuation and multi-spaces from ``lines``.

    Note that whitespace at the beginning of a line is preserved, since it
    corresponds to indentation.

    :param lines: list of lines; it is updated in place.
    :return: the same list object, for convenience.
    """
    logging.info("process_lines: handling song data line by line")
    for index, line in enumerate(lines):
        # remove trailing spaces and punctuation
        line = line.rstrip().rstrip(',.;').rstrip()
        # collapse multi-spaces within lines (never the leading indentation,
        # because a non-space character must come first)
        lines[index] = _MULTI_SPACE.sub(r"\1 ", line)
    return lines


def _apply_rules(data, filename=None):
    """Run the whole rule pipeline on the text of one song."""
    # NOTE: the "no dots for acronyms" rule is commented out in the original
    # script and is therefore not applied here.
    data = replace_words(data)
    data = language_rules(data, filename)
    return "\n".join(process_lines(data.split('\n')))


def main(argv=None):
    """Command-line entry point: rewrite the selected song files in place.

    :param argv: list of command-line arguments; defaults to ``sys.argv[1:]``.

    Exits with status 2 on an unknown option or an invalid log level.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, _ = getopt.getopt(argv, "hf:l:", ["help", "files=", "log="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    songfiles = glob.glob('songs/*/*.sg')

    for option, arg in opts:
        if option in ("-h", "--help"):
            usage()
            sys.exit()
        elif option in ("-f", "--files"):
            songfiles = glob.glob(arg)
        elif option in ("-l", "--log"):
            numeric_level = getattr(logging, arg.upper(), None)
            if not isinstance(numeric_level, int):
                sys.stderr.write('Invalid log level: %s\n' % arg)
                usage()
                sys.exit(2)
            logging.basicConfig(level=numeric_level, filename='rules.log', filemode='w')

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error as err:
        # An unsupported environment locale must not prevent the clean-up.
        logging.warning("could not set locale: %s", err)

    for filename in songfiles:
        with open(filename, 'r+') as songfile:
            data = _apply_rules(songfile.read(), filename)
            # Overwrite in place; truncate in case the text got shorter.
            songfile.seek(0)
            songfile.write(data)
            songfile.truncate()


if __name__ == '__main__':
    main()