Browse Source

Utils: add comments, options, and methods to rules.py

remotes/origin/split-songs
Romain Goffe 13 years ago
parent
commit
4ff591865d
  1. 187
      utils/rules.py

187
utils/rules.py

@ -2,9 +2,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import glob import glob
import sys import getopt, sys
import fileinput
import re import re
import logging
import locale
re.LOCALE re.LOCALE
# the dictionary has target_word:replacement_word pairs # the dictionary has target_word:replacement_word pairs
@ -109,62 +110,136 @@ word_dic = {
"021202": "X21202", "021202": "X21202",
### end of rules ### end of rules
} }
# Process song files
songfiles = glob.glob('songs/*/*.sg')
for filename in songfiles:
with open(filename, 'r+') as songfile:
data = songfile.read()
#replace words
for search, replace in word_dic.items():
data = data.replace(search, replace)
def usage():
    '''
    Print the command-line help of rules.py on standard output.

    The text lists the three supported options: -h/--help,
    -f/--files=FILES and -l/--log=LEVEL.
    '''
    # A parenthesised single string prints identically with the Python 2
    # print statement and with the Python 3 print function.
    print('''
Usage: rules.py [OPTION]
OPTIONS
-h, --help
display this help and exit
-f, --files=FILES
apply the set of rules on FILES
default is songs/*/*.sg
-l, --log=LEVEL
set the logging level to LEVEL
possible LEVEL values are : debug, info, warning, error and critical
''')
def replace_words(string):
    '''
    Apply the word dictionary to the song data.

    Every key of the module-level word_dic found in the data string is
    replaced by its associated value, which corrects usual spelling
    mistakes and typos made when writing a song. The corrected string
    is returned.
    '''
    logging.info("replace_words: search and replace words from dictionary into song data")
    corrected = string
    for target in word_dic:
        #plain substring substitution, no regular expression involved
        corrected = corrected.replace(target, word_dic[target])
    return corrected
#language based typographical rules #language based typographical rules
def language_rules(string, filename=None):
    r'''
    Search the data string for common typographical mistakes and
    return the corrected string.

    Implemented rules depend on the current song language, which is
    detected by looking for the babel macro \selectlanguage{<lang>}
    in the data. Handled languages are french, english, spanish and
    portuguese; for any other content the data is returned unchanged
    and a warning is printed.

    string   -- the whole content of a song file
    filename -- optional name of the song file, only used to make the
                "language is not defined" warning more helpful
    '''
    logging.info("language_rules: looking for common typographical mistakes")
    if "selectlanguage{french}" in string:
        logging.info(" song language is set to : french")
        #ensure a space before symbols ? ! ; :
        #NOTE(review): the rule was described as inserting a non-breaking
        #space; the replacement reviewed here contains a plain space -
        #confirm which one is intended
        string = re.sub(r"(?P<last_char>\S)(?P<symbol>[!?;:])",
                        r"\g<last_char> \g<symbol>", string)
        #... except for gtabs macros with capos. The greedy pattern only
        #repairs the last " :" of a line per pass, so it is applied twice
        #to handle cases like \gtab{Gm}{10:X02210:}
        for _ in range(2):
            string = re.sub(r"(?P<gtab>gtab.*)\s:", r"\g<gtab>:", string)
        #ensure no spaces after symbols (
        string = re.sub(r"(?P<symbol>[\(])\s(?P<next_char>\S)",
                        r"\g<symbol>\g<next_char>", string)
        #convert inverted commas; literal replacement avoids feeding the
        #backslashes of {\og} and {\fg} to the regex template parser
        string = string.replace("``", "{\\og}").replace("''", "{\\fg}")
    elif "selectlanguage{english}" in string:
        logging.info(" song language is set to : english")
        #ensure no spaces before symbols ? ! ; : )
        string = re.sub(r"(?P<last_char>\S)\s(?P<symbol>[!?;:\)])",
                        r"\g<last_char>\g<symbol>", string)
        #ensure no spaces after symbols (
        string = re.sub(r"(?P<symbol>[\(])\s(?P<next_char>\S)",
                        r"\g<symbol>\g<next_char>", string)
    elif "selectlanguage{spanish}" in string:
        logging.info(" song language is set to : spanish")
        #ensure no spaces before symbols ? ! ; : )
        string = re.sub(r"(?P<last_char>\S)\s(?P<symbol>[!?;:\)])",
                        r"\g<last_char>\g<symbol>", string)
        #ensure no spaces after symbols ¿ ¡ (
        string = re.sub(r"(?P<symbol>[¿¡\(])\s(?P<next_char>\S)",
                        r"\g<symbol>\g<next_char>", string)
    elif "selectlanguage{portuguese}" in string:
        logging.info(" song language is set to : portuguese")
        #convert inverted commas
        string = string.replace("``", "{\\og}").replace("''", "{\\fg}")
    else:
        #the file name is not known inside this function unless the
        #caller provides it
        if filename is None:
            print("Warning: language is not defined for song")
        else:
            print("Warning: language is not defined for song : " + filename)
    return string
songfile.write(data)
def process_lines(lines):
    '''
    Removes trailing punctuation and multi-spaces from lines. Note
    that it preserves whitespaces at the beginning of lines that
    correspond to indentation.

    lines -- list of strings, one per line of the song

    The list is cleaned in place and the same list object is returned.
    '''
    logging.info("process_lines: handling song data line by line")
    #a run of 2+ whitespaces is only collapsed when it follows a
    #non-space character, which leaves leading indentation untouched
    multi_spaces = re.compile(r"(?P<last_char>\S)\s{2,}")
    for index, line in enumerate(lines):
        #remove trailing spaces and punctuation
        line = line.rstrip().rstrip(',.;').rstrip()
        #remove multi-spaces within lines
        lines[index] = multi_spaces.sub(r"\g<last_char> ", line)
    return lines
def main():
    '''
    Command-line entry point: apply the set of rules to song files.

    Options (see usage()):
      -h, --help         print the help and exit
      -f, --files=FILES  glob pattern of the files to process,
                         default is songs/*/*.sg
      -l, --log=LEVEL    log to rules.log with the given level

    Every selected file is read, passed through replace_words,
    language_rules and process_lines, then rewritten in place.
    Exits with status 2 on an invalid command line and raises
    ValueError on an unknown log level.
    '''
    locale.setlocale(locale.LC_ALL, '')
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "hf:l:",
                                   ["help", "files=", "log="])
    except getopt.GetoptError as err:
        #tell the user what was wrong before showing the help
        sys.stderr.write(str(err) + "\n")
        usage()
        sys.exit(2)

    songfiles = None
    for option, arg in opts:
        if option in ("-h", "--help"):
            usage()
            sys.exit()
        elif option in ("-f", "--files"):
            songfiles = glob.glob(arg)
        elif option in ("-l", "--log"):
            #NOTE(review): logging is only configured when this option
            #is given - confirm that no default log file is wanted
            numeric_level = getattr(logging, arg.upper(), None)
            if not isinstance(numeric_level, int):
                raise ValueError('Invalid log level: %s' % arg)
            logging.basicConfig(level=numeric_level, filename='rules.log', filemode='w')
        else:
            #explicit raise: an assert statement would vanish under -O
            raise AssertionError("unhandled option")

    #only scan the default location when no -f option was given
    if songfiles is None:
        songfiles = glob.glob('songs/*/*.sg')

    for filename in songfiles:
        with open(filename, 'r+') as songfile:
            data = songfile.read()
            #the "no dots for acronyms" rule is intentionally disabled:
            #data = re.sub("(?P<capital_letter>[A-Z])\.","\g<capital_letter>", data)
            data = replace_words(data)
            data = language_rules(data)
            lines = process_lines(data.split('\n'))
            data = "\n".join(lines)
            #overwrite the file from its start and drop any leftover
            #bytes when the new content is shorter than the old one
            songfile.seek(0)
            songfile.write(data)
            songfile.truncate()


if __name__ == '__main__':
    main()

Loading…
Cancel
Save