Просмотр исходного кода

Improve Silero's Preprocessor to Handle Punctuation and Whitespace Better

da3dsoul 2 года назад
Родитель
Коммит
39063c48bb
1 изменённый файл: 5 добавлено и 1 удалено
  1. 5 1
      extensions/silero_tts/tts_preprocessor.py

+ 5 - 1
extensions/silero_tts/tts_preprocessor.py

@@ -48,7 +48,11 @@ def preprocess(string):
     # For now, expand abbreviations to pronunciations
     # For now, expand abbreviations to pronunciations
     string = replace_abbreviations(string)
     string = replace_abbreviations(string)
 
 
+    # cleanup whitespaces
+    string = re.sub(r'\s+([,.?!\'])', r'\1', string)
     string = string.strip()
     string = string.strip()
+    string = ' '.join(string.split())
+
     return string
     return string
 
 
 
 
@@ -97,7 +101,7 @@ def num_to_words(text):
 
 
 
 
 def replace_abbreviations(string):
 def replace_abbreviations(string):
-    pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]')
+    pattern = re.compile(r'(^|[\s("\'\[<])([A-Z]{2,4})([\s,.?!)"\'\]>]|$)')
     result = string
     result = string
     while True:
     while True:
         match = pattern.search(result)
         match = pattern.search(result)