Improve Silero Preprocessing to Handle European Numbers and Decimals

Also add a test script to generate audio clips from CLI
2023-04-04 16:24:25 -04:00
parent d5f3036687
commit 1848938f7f
2 changed files with 100 additions and 7 deletions
--- a/extensions/silero_tts/tts_preprocessor.py
+++ b/extensions/silero_tts/tts_preprocessor.py
@@ -1,4 +1,5 @@
 import re
+import locale
 from num2words import num2words


@@ -39,7 +40,7 @@ def preprocess(string):
    string = string.replace('"', '')
    string = string.replace('“', '')
    string = string.replace('\n', ' ')
-    string = remove_commas(string)
+    string = convert_num_locale(string)
    string = replace_negative(string)
    string = replace_roman(string)
    string = hyphen_range_to(string)
@@ -108,8 +109,9 @@ def hyphen_range_to(text):


 def num_to_words(text):
-    pattern = re.compile(r'\d+')
-    result = pattern.sub(lambda x: num2words(int(x.group())), text)
+    # Spells out 1000 or 10.23; whole numbers use int() because float() rounds values above 2**53
+    pattern = re.compile(r'\d+\.\d+|\d+')
+    result = pattern.sub(lambda x: num2words(float(x.group()) if '.' in x.group() else int(x.group())), text)
    return result


@@ -145,11 +147,23 @@ def match_mapping(char, result):
    return result + char


-def remove_commas(text):
-    # This handles American locale numbers
-    # TODO This should probably be adapted to detect locale
+def convert_num_locale(text):
+    # Rewrites European-format numbers (1.234,56) as American ones without separators (1234.56)
+    # Lookarounds keep the delimiting whitespace unconsumed so adjacent numbers all match
+    pattern = re.compile(r'(?<!\S)\d{1,3}(?:\.\d{3})*(?:,\d+)?(?!\S)')
+    result = text
+    while True:
+        # A rewritten token may match again ("1,500" -> "1.500" -> "1500"), so repeat until a
+        # pass changes nothing; plain integers like "25" match but stay unchanged, so this ends
+        converted = pattern.sub(lambda m: m.group().replace('.', '').replace(',', '.'), result)
+        if converted == result:
+            break
+        result = converted

+    # removes comma separators from existing American numbers
    pattern = re.compile(r'(\d),(\d)')
-    result = pattern.sub(r'\1\2', text)
+    result = pattern.sub(r'\1\2', result)

    return result