|
@@ -7,7 +7,7 @@ alphabet_map = {
|
|
|
"B": " Bee ",
|
|
"B": " Bee ",
|
|
|
"C": " See ",
|
|
"C": " See ",
|
|
|
"D": " Dee ",
|
|
"D": " Dee ",
|
|
|
- "E": " II ",
|
|
|
|
|
|
|
+ "E": " Ii ",
|
|
|
"F": " Eff ",
|
|
"F": " Eff ",
|
|
|
"G": " Jee ",
|
|
"G": " Jee ",
|
|
|
"H": " Eich ",
|
|
"H": " Eich ",
|
|
@@ -38,18 +38,64 @@ def preprocess(string):
|
|
|
string = string.replace('“', '')
|
|
string = string.replace('“', '')
|
|
|
string = string.replace('\n', ' ')
|
|
string = string.replace('\n', ' ')
|
|
|
string = remove_commas(string)
|
|
string = remove_commas(string)
|
|
|
|
|
+ string = replace_roman(string)
|
|
|
string = hyphen_range_to(string)
|
|
string = hyphen_range_to(string)
|
|
|
string = num_to_words(string)
|
|
string = num_to_words(string)
|
|
|
- string = string.strip()
|
|
|
|
|
|
|
+
|
|
|
# TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
|
|
# TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
|
|
|
# try to say the abbreviation or spell it out as I've done below is not agreed upon
|
|
# try to say the abbreviation or spell it out as I've done below is not agreed upon
|
|
|
|
|
|
|
|
# For now, expand abbreviations to pronunciations
|
|
# For now, expand abbreviations to pronunciations
|
|
|
string = replace_abbreviations(string)
|
|
string = replace_abbreviations(string)
|
|
|
|
|
|
|
|
|
|
+ string = string.strip()
|
|
|
return string
|
|
return string
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_surrounded_chars(string):
    """Delete every asterisk-delimited span from ``string``.

    A span starts at an asterisk and runs through the next asterisk; both
    asterisks are removed together with the text between them. An opening
    asterisk that has no closing partner removes everything from that
    asterisk up to the point where ``$`` matches (the end of the string, or
    just before a single trailing newline).
    """
    # \*       the opening asterisk
    # [^*]*?   as few non-asterisk characters as possible (zero upwards)
    # (\*|$)   the closing asterisk, or the end of the string
    asterisk_span = re.compile(r'\*[^*]*?(\*|$)')
    return asterisk_span.sub('', string)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def replace_roman(string):
    """Replace standalone Roman numerals in ``string`` with decimal digits.

    A candidate is a run of two or more letters from ``IVXLCDM`` that is
    preceded by whitespace and followed by whitespace or one of
    ``, . ? ! ) " ' ] >``. Single letters are deliberately left alone so the
    pronoun "I" and one-letter initials are not turned into numbers.

    A candidate is converted only when it is a well-formed numeral (standard
    additive/subtractive notation); other all-caps words that merely happen
    to be spelled with numeral letters, such as "CIVIL" or "DID", are
    returned unchanged. Short words that are also valid numerals (for
    example "MIX") cannot be told apart without context and are still
    converted.

    Args:
        string: the text to scan.

    Returns:
        A copy of ``string`` with every accepted numeral replaced by the
        decimal digits produced by ``roman_to_int``.
    """
    # Lookarounds keep the delimiters out of the match, so two numerals that
    # share a single separating space are both found in one left-to-right
    # pass instead of rescanning the whole string after every replacement.
    candidate = re.compile(r'(?<=\s)[IVXLCDM]{2,}(?=[\s,.?!)"\'\]>])')
    # Thousands, hundreds, tens, ones - each in canonical form. The candidate
    # pattern already guarantees the text is non-empty.
    well_formed = re.compile(
        r'M*(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
    )

    def _to_digits(match):
        numeral = match.group()
        if well_formed.fullmatch(numeral) is None:
            # Not a real numeral: leave the word exactly as written.
            return numeral
        return str(roman_to_int(numeral))

    return candidate.sub(_to_digits, string)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def roman_to_int(s):
    """Convert a string of Roman-numeral letters to its integer value.

    Letters are summed from left to right. When a letter is larger than the
    one immediately before it, the pair is treated as subtractive (``IV``,
    ``XC``, ...): the earlier letter, which was already added, is taken back
    out and subtracted. The numeral's form is not validated, so non-standard
    spellings such as ``'IIII'`` are simply summed.

    Args:
        s: a string made of the uppercase letters ``I V X L C D M``.

    Returns:
        The integer value of ``s``; 0 for an empty string.

    Raises:
        KeyError: if ``s`` contains any other character.
    """
    rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    int_val = 0
    # Value of the previous letter; 0 before the first letter, so the first
    # letter is always just added.
    previous = 0
    for letter in s:
        value = rom_val[letter]
        if value > previous:
            # `previous` was added on the last step; remove it once to undo
            # that and once more to apply the subtraction.
            int_val += value - 2 * previous
        else:
            int_val += value
        previous = value
    return int_val
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def hyphen_range_to(text):
    """Rewrite numeric ranges such as ``10-20`` as ``10 to 20``.

    Any run of digits joined to another run of digits by a hyphen or an en
    dash has that joining character replaced with the word " to ". Matches
    are non-overlapping, so in a chain like ``1-2-3`` only the first
    separator is rewritten.
    """
    # \g<1> / \g<2> are the digit runs on either side of the dash.
    return re.sub(r'(\d+)[-–](\d+)', r'\g<1> to \g<2>', text)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def num_to_words(text):
    """Replace each run of digits in ``text`` with ``num2words`` output.

    Every maximal run of digits is parsed with ``int`` and the resulting
    integer is passed to ``num2words``; the value it returns is substituted
    for the digits. All other characters are left untouched.
    """
    def _spell_out(digits):
        return num2words(int(digits.group()))

    return re.sub(r'\d+', _spell_out, text)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def replace_abbreviations(string):
|
|
def replace_abbreviations(string):
|
|
|
pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]')
|
|
pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]')
|
|
|
result = string
|
|
result = string
|
|
@@ -81,24 +127,6 @@ def match_mapping(char, result):
|
|
|
return result + char
|
|
return result + char
|
|
|
|
|
|
|
|
|
|
|
|
|
-def remove_surrounded_chars(string):
|
|
|
|
|
- # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
|
|
|
|
- # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
|
|
|
|
- return re.sub(r'\*[^*]*?(\*|$)', '', string)
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def hyphen_range_to(text):
|
|
|
|
|
- pattern = re.compile(r'(\d+)[-–](\d+)')
|
|
|
|
|
- result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text)
|
|
|
|
|
- return result
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def num_to_words(text):
|
|
|
|
|
- pattern = re.compile(r'\d+')
|
|
|
|
|
- result = pattern.sub(lambda x: num2words(int(x.group())), text)
|
|
|
|
|
- return result
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def remove_commas(text):
|
|
def remove_commas(text):
|
|
|
import re
|
|
import re
|
|
|
pattern = re.compile(r'(\d),(\d)')
|
|
pattern = re.compile(r'(\d),(\d)')
|