3 yıl önce · 7d4e419dbe
--- a/extensions/silero_tts/tts_preprocessor.py
+++ b/extensions/silero_tts/tts_preprocessor.py
@@ -7,7 +7,7 @@ alphabet_map = {
 
				     "B": " Bee ",
			
 
				     "C": " See ",
			
 
				     "D": " Dee ",
			
 
				-    "E": " II ",
			
 
				+    "E": " Ii ",
			
 
				     "F": " Eff ",
			
 
				     "G": " Jee ",
			
 
				     "H": " Eich ",
			
@@ -38,18 +38,64 @@ def preprocess(string):
 
				     string = string.replace('“', '')
			
 
				     string = string.replace('\n', ' ')
			
 
				     string = remove_commas(string)
			
 
				+    string = replace_roman(string)
			
 
				     string = hyphen_range_to(string)
			
 
				     string = num_to_words(string)
			
 
				-    string = string.strip()
			
 
				+
			
 
				     # TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
			
 
				     # try to say the abbreviation or spell it out as I've done below is not agreed upon
			
 
				 
			
 
				     # For now, expand abbreviations to pronunciations
			
 
				     string = replace_abbreviations(string)
			
 
				 
			
 
				+    string = string.strip()
			
 
				     return string
			
 
				 
			
 
				 
			
 
				def remove_surrounded_chars(string):
				    """Delete every asterisk-delimited span from ``string``.

				    A span starts at an asterisk and extends up to and including the next
				    asterisk. An asterisk with no closing partner removes everything up to
				    the end of the string (as matched by ``$``, so a single trailing
				    newline is left in place).
				    """
				    starred_span = re.compile(r'\*[^*]*?(\*|$)')
				    return starred_span.sub('', string)
			
 
				+
			
 
				+
			
 
				def replace_roman(string):
				    """Replace standalone Roman numerals in ``string`` with decimal digits.

				    A numeral is a run of two or more of the letters ``IVXLCDM`` that is
				    preceded by whitespace and followed by whitespace or closing
				    punctuation. The delimiters on both sides are kept unchanged.

				    Single letters are deliberately NOT converted: otherwise the pronoun
				    "I" and one-letter initials would be read out as numbers.

				    Returns the rewritten string; input without numerals is returned as is.
				    """
				    # Lookarounds leave the delimiters out of the match, so two numerals
				    # separated by a single space are both converted in one pass.
				    pattern = re.compile(r'(?<=\s)[IVXLCDM]{2,}(?=[\s,.?!)"\'\]>])')
				    return pattern.sub(lambda m: str(roman_to_int(m.group())), string)
			
 
				+
			
 
				+
			
 
				def roman_to_int(s):
				    """Convert the Roman numeral ``s`` to its integer value.

				    Every symbol contributes its own value; whenever a symbol is larger
				    than the one directly before it, the smaller predecessor is treated as
				    subtractive (e.g. ``IV`` -> 4). An empty string yields 0.

				    Raises:
				        KeyError: if ``s`` contains a character other than ``IVXLCDM``
				            (lower-case letters included).
				    """
				    rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
				    values = [rom_val[symbol] for symbol in s]

				    int_val = sum(values)
				    # A smaller symbol before a larger one was added above but must count
				    # negatively, so take it off twice.
				    for previous, current in zip(values, values[1:]):
				        if current > previous:
				            int_val -= 2 * previous
				    return int_val
			
 
				+
			
 
				+
			
 
				def hyphen_range_to(text):
				    """Rewrite numeric ranges such as ``10-20`` as ``10 to 20``.

				    Two digit runs joined by a hyphen or an en dash are re-joined with the
				    word "to"; all other text is returned unchanged.
				    """
				    def join_bounds(match):
				        # groups() holds exactly the lower and upper bound, in that order.
				        return ' to '.join(match.groups())

				    return re.sub(r'(\d+)[-–](\d+)', join_bounds, text)
			
 
				+
			
 
				+
			
 
				def num_to_words(text):
				    """Spell out every run of digits in ``text``.

				    Each maximal digit run is parsed with ``int`` and replaced by whatever
				    ``num2words`` returns for it; the rest of the text is left as is.
				    """
				    def spell(match):
				        return num2words(int(match.group()))

				    return re.sub(r'\d+', spell, text)
			
 
				+
			
 
				+
			
 
				 def replace_abbreviations(string):
			
 
				     pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]')
			
 
				     result = string
			
@@ -81,24 +127,6 @@ def match_mapping(char, result):
 
				     return result + char
			
 
				 
			
 
				 
			
 
				-def remove_surrounded_chars(string):
			
 
				-    # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
			
 
				-    # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
			
 
				-    return re.sub(r'\*[^*]*?(\*|$)', '', string)
			
 
				-
			
 
				-
			
 
				-def hyphen_range_to(text):
			
 
				-    pattern = re.compile(r'(\d+)[-–](\d+)')
			
 
				-    result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text)
			
 
				-    return result
			
 
				-
			
 
				-
			
 
				-def num_to_words(text):
			
 
				-    pattern = re.compile(r'\d+')
			
 
				-    result = pattern.sub(lambda x: num2words(int(x.group())), text)
			
 
				-    return result
			
 
				-
			
 
				-
			
 
				 def remove_commas(text):
			
 
				     import re
			
 
				     pattern = re.compile(r'(\d),(\d)')