Improve Silero's Preprocessor to Handle Punctuation and Whitespace Better
This commit is contained in:
@@ -48,7 +48,11 @@ def preprocess(string):
|
|||||||
# For now, expand abbreviations to pronunciations
|
# For now, expand abbreviations to pronunciations
|
||||||
string = replace_abbreviations(string)
|
string = replace_abbreviations(string)
|
||||||
|
|
||||||
|
# clean up whitespace
|
||||||
|
string = re.sub(r'\s+([,.?!\'])', r'\1', string)
|
||||||
string = string.strip()
|
string = string.strip()
|
||||||
|
string = ' '.join(string.split())
|
||||||
|
|
||||||
return string
|
return string
|
||||||
|
|
||||||
|
|
||||||
@@ -97,7 +101,7 @@ def num_to_words(text):
|
|||||||
|
|
||||||
|
|
||||||
def replace_abbreviations(string):
|
def replace_abbreviations(string):
|
||||||
pattern = re.compile(r'[\s("\'\[<][A-Z]{2,4}[\s,.?!)"\'\]>]')
|
pattern = re.compile(r'(^|[\s("\'\[<])([A-Z]{2,4})([\s,.?!)"\'\]>]|$)')
|
||||||
result = string
|
result = string
|
||||||
while True:
|
while True:
|
||||||
match = pattern.search(result)
|
match = pattern.search(result)
|
||||||
|
|||||||
Reference in New Issue
Block a user