154 lines
3.9 KiB
Python
154 lines
3.9 KiB
Python
import re
|
||
from num2words import num2words
|
||
|
||
|
||
alphabet_map = {
|
||
"A": " Ei ",
|
||
"B": " Bee ",
|
||
"C": " See ",
|
||
"D": " Dee ",
|
||
"E": " Ii ",
|
||
"F": " Eff ",
|
||
"G": " Jee ",
|
||
"H": " Eich ",
|
||
"I": " Eye ",
|
||
"J": " Jay ",
|
||
"K": " Kay ",
|
||
"L": " El ",
|
||
"M": " Emm ",
|
||
"N": " Enn ",
|
||
"O": " Ohh ",
|
||
"P": " Pii ",
|
||
"Q": " Queue ",
|
||
"R": " Are ",
|
||
"S": " Ess ",
|
||
"T": " Tee ",
|
||
"U": " You ",
|
||
"V": " Vii ",
|
||
"W": " Double You ",
|
||
"X": " Ex ",
|
||
"Y": " Why ",
|
||
"Z": "Zed" # Zed is weird, as I (da3dsoul) am American, but most of the voice models sound British, so it matches
|
||
}
|
||
|
||
|
||
def preprocess(string):
|
||
string = remove_surrounded_chars(string)
|
||
string = string.replace('"', '')
|
||
string = string.replace('“', '')
|
||
string = string.replace('\n', ' ')
|
||
string = remove_commas(string)
|
||
string = replace_negative(string)
|
||
string = replace_roman(string)
|
||
string = hyphen_range_to(string)
|
||
string = num_to_words(string)
|
||
|
||
# TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
|
||
# try to say the abbreviation or spell it out as I've done below is not agreed upon
|
||
|
||
# For now, expand abbreviations to pronunciations
|
||
string = replace_abbreviations(string)
|
||
|
||
# cleanup whitespaces
|
||
string = re.sub(r'\s+([,.?!\'])', r'\1', string)
|
||
string = string.strip()
|
||
string = ' '.join(string.split())
|
||
|
||
return string
|
||
|
||
|
||
def remove_surrounded_chars(string):
|
||
# this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
||
# 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
||
return re.sub(r'\*[^*]*?(\*|$)', '', string)
|
||
|
||
|
||
def replace_negative(string):
|
||
return re.sub(r'(\s)(-)(\d+)([\s,.?!)"\'\]>])', r'\1negative \3\4', string)
|
||
|
||
|
||
def replace_roman(string):
|
||
# find a string of roman numerals. Only 2 or more, to avoid capturing I
|
||
pattern = re.compile(r'\s[IVXLCDM]{2,}[\s,.?!)"\'\]>]')
|
||
result = string
|
||
while True:
|
||
match = pattern.search(result)
|
||
if match is None:
|
||
break
|
||
|
||
start = match.start()
|
||
end = match.end()
|
||
result = result[0:start+1] + str(roman_to_int(result[start+1:end-1])) + result[end-1:len(result)]
|
||
|
||
return result
|
||
|
||
|
||
def roman_to_int(s):
|
||
rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
||
int_val = 0
|
||
for i in range(len(s)):
|
||
if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]:
|
||
int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]]
|
||
else:
|
||
int_val += rom_val[s[i]]
|
||
return int_val
|
||
|
||
|
||
def hyphen_range_to(text):
|
||
pattern = re.compile(r'(\d+)[-–](\d+)')
|
||
result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text)
|
||
return result
|
||
|
||
|
||
def num_to_words(text):
|
||
pattern = re.compile(r'\d+')
|
||
result = pattern.sub(lambda x: num2words(int(x.group())), text)
|
||
return result
|
||
|
||
|
||
def replace_abbreviations(string):
|
||
pattern = re.compile(r'(^|[\s("\'\[<])([A-Z]{2,4})([\s,.?!)"\'\]>]|$)')
|
||
result = string
|
||
while True:
|
||
match = pattern.search(result)
|
||
if match is None:
|
||
break
|
||
|
||
start = match.start()
|
||
end = match.end()
|
||
result = result[0:start] + replace_abbreviation(result[start:end]) + result[end:len(result)]
|
||
|
||
return result
|
||
|
||
|
||
def replace_abbreviation(string):
|
||
result = ""
|
||
for char in string:
|
||
result = match_mapping(char, result)
|
||
|
||
return result
|
||
|
||
|
||
def match_mapping(char, result):
|
||
for mapping in alphabet_map.keys():
|
||
if char == mapping:
|
||
return result + alphabet_map[char]
|
||
|
||
return result + char
|
||
|
||
|
||
def remove_commas(text):
|
||
import re
|
||
pattern = re.compile(r'(\d),(\d)')
|
||
result = pattern.sub(r'\1\2', text)
|
||
return result
|
||
|
||
|
||
def __main__(args):
|
||
print(preprocess(args[1]))
|
||
|
||
|
||
if __name__ == "__main__":
|
||
import sys
|
||
__main__(sys.argv)
|