# tts_preprocessor.py (5.6 KB)
  1. import re
  2. from num2words import num2words
  # Regex character class for what may directly follow a token:
  # whitespace or one of , . ? ! / ) ' ] >
  punctuation = r'[\s,.?!/)\'\]>]'

  # Spoken spelling of each capital letter. Every value is padded with a space
  # on both sides so that neighbouring letters stay separated once substituted.
  alphabet_map = {
      "A": " Ei ",
      "B": " Bee ",
      "C": " See ",
      "D": " Dee ",
      "E": " Eee ",
      "F": " Eff ",
      "G": " Jee ",
      "H": " Eich ",
      "I": " Eye ",
      "J": " Jay ",
      "K": " Kay ",
      "L": " El ",
      "M": " Emm ",
      "N": " Enn ",
      "O": " Ohh ",
      "P": " Pee ",
      "Q": " Queue ",
      "R": " Are ",
      "S": " Ess ",
      "T": " Tee ",
      "U": " You ",
      "V": " Vee ",
      "W": " Double You ",
      "X": " Ex ",
      "Y": " Why ",
      # "Zed" rather than "Zee": the original author (da3dsoul) is American,
      # but most of the voice models sound British, so this matches them.
      "Z": " Zed ",
  }
  def preprocess(string):
      """Normalise *string* into plain words for a text-to-speech engine.

      The stages run in a fixed order because later ones rely on earlier ones;
      for example thousands separators must be removed from numbers before the
      numbers are expanded into words.

      Returns the processed text with runs of whitespace collapsed to single
      spaces and no leading or trailing whitespace.
      """
      text = remove_surrounded_chars(string)

      # Drop straight double quotes, right/left curly quotes and the
      # italic-looking quote (U+201F).
      for quote in ('"', '\u201D', '\u201C', '\u201F'):
          text = text.replace(quote, '')
      text = text.replace('\n', ' ')

      # TODO Try to use a ML predictor to expand abbreviations. It's hard,
      # dependent on context, and whether to actually try to say the
      # abbreviation or spell it out is not agreed upon.
      # For now, abbreviations are expanded to letter-by-letter pronunciations
      # by the last two stages below.
      stages = (
          convert_num_locale,
          replace_negative,
          replace_roman,
          hyphen_range_to,
          num_to_words,
          replace_abbreviations,  # adds a lot of extra whitespace for separation
          replace_lowercase_abbreviations,
      )
      for stage in stages:
          text = stage(text)

      # Whitespace cleanup: first remove whitespace sitting in front of
      # punctuation, then trim the ends and squeeze the remaining runs.
      text = re.sub(rf'\s+({punctuation})', r'\1', text)
      return ' '.join(text.strip().split())
  def remove_surrounded_chars(string):
      """Strip asterisk-delimited spans such as ``*waves*`` from *string*.

      A span starts at an asterisk and runs up to and including the next
      asterisk; if there is no closing asterisk, it runs to the end of the
      string. The asterisks themselves are removed together with the text
      between them.
      """
      # as few non-asterisk characters as possible (0 upwards) between an
      # asterisk and either the next asterisk or the end of the string
      starred_span = re.compile(r'\*[^*]*?(\*|$)')
      return starred_span.sub('', string)
  def convert_num_locale(text):
      """Rewrite numbers so they use a dot decimal point and no separators.

      Two passes are made:

      1. Numbers written with dots as thousands separators and a comma as the
         decimal mark (``1.234,56``), delimited by whitespace or the string
         boundaries, have the dots removed and the comma turned into a dot.
      2. Any comma standing directly between two digits is removed.
      """
      european = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)')

      converted = text
      found = european.search(converted)
      # The pattern consumes the whitespace on both sides of a number, so the
      # search is restarted after every replacement; that way two numbers
      # separated by a single space are both converted.
      while found is not None:
          before = converted[:found.start()]
          after = converted[found.end():]
          number = found.group().replace('.', '').replace(',', '.')
          converted = before + number + after
          found = european.search(converted)

      # removes comma separators from existing American numbers
      return re.sub(r'(\d),(\d)', r'\1\2', converted)
  def replace_negative(string):
      """Replace the minus sign of a negative number with the word "negative".

      ``-5`` becomes ``negative 5``, which a later stage can expand to
      ``negative five``. A minus sign is only treated as a sign when it is at
      the start of the string or preceded by whitespace, and the digits after
      it are followed by a punctuation character or the end of the string.

      Returns *string* with every such minus sign replaced.
      """
      # The trailing delimiter is checked with a lookahead instead of being
      # consumed. That lets "-5" at the very end of the string match, and lets
      # the space in "-5 -6" serve as the leading whitespace of the second
      # number as well.
      signed_number = rf'(^|\s)-(\d+)(?={punctuation}|$)'
      return re.sub(signed_number, r'\1negative \2', string)
  def replace_roman(string):
      """Replace Roman numerals in *string* with their decimal value.

      A candidate is a run of two or more of the letters ``IVXLCDM`` that is
      preceded by whitespace or the start of the string and followed by a
      punctuation character or the end of the string. Single letters are
      skipped to avoid capturing "I" and one-letter abbreviations such as
      initials in names.

      Only well-formed numerals are converted. Letter runs that merely use the
      same alphabet (for example ``ID`` or ``DVD``) are left untouched so the
      abbreviation stage can spell them out instead of reading a bogus number.

      Returns the text with each accepted numeral replaced by its digits.
      """
      # Standard subtractive notation: thousands, hundreds, tens, units.
      well_formed = re.compile(
          r'M*(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
      )
      # Lookarounds leave the delimiters unconsumed, so numerals at either end
      # of the string and numerals sharing one space are all found in one pass.
      candidate = re.compile(rf'(?<!\S)[IVXLCDM]{{2,}}(?={punctuation}|$)')

      def to_digits(match):
          numeral = match.group()
          if well_formed.fullmatch(numeral) is None:
              return numeral
          return str(roman_to_int(numeral))

      return candidate.sub(to_digits, string)
  def roman_to_int(s):
      """Return the integer value of the Roman numeral *s*.

      Symbols are added left to right. When a symbol is larger than the one
      before it, the pair is read subtractively (``IV`` is 4): the previous
      symbol has already been added once, so it is taken away twice.

      An empty string yields 0. A character outside ``IVXLCDM`` raises
      ``KeyError``.
      """
      rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
      total = 0
      previous = None
      for symbol in s:
          current = rom_val[symbol]
          if previous is not None and current > previous:
              total += current - 2 * previous
          else:
              total += current
          previous = current
      return total
  def hyphen_range_to(text):
      """Turn numeric ranges such as ``10-20`` into ``10 to 20``.

      Both the ASCII hyphen and the en dash are recognised, and only when they
      sit directly between two runs of digits.
      """
      return re.sub(r'(\d+)[-–](\d+)', r'\1 to \2', text)
  def num_to_words(text):
      """Spell out every number in *text* using ``num2words``.

      Matches either a decimal such as ``10.23`` or a plain run of digits such
      as ``1000`` and replaces it with the words returned by ``num2words``.
      """
      # 1000 or 10.23
      pattern = re.compile(r'\d+\.\d+|\d+')

      def spell_out(match):
          token = match.group()
          # Integers are parsed as int, not float: a float cannot hold every
          # integer above 2**53, so long digit strings would be silently
          # rounded to a different number before being spelled out.
          number = float(token) if '.' in token else int(token)
          return num2words(number)

      return pattern.sub(spell_out, text)
  def replace_abbreviations(string):
      """Spell out upper-case abbreviations letter by letter.

      An abbreviation is 1 to 4 capital letters that start the string or
      follow whitespace or one of ``( . ' [ <``, and that are followed by a
      punctuation character or the end of the string. This also catches "A"
      and "I", but those are pronounced like their letter anyway.
      """
      acronym = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)')

      expanded = string
      hit = acronym.search(expanded)
      # Search again from the start after each substitution: a match includes
      # its delimiters, and one delimiter may belong to two abbreviations.
      while hit is not None:
          begin, stop = hit.span()
          spoken = replace_abbreviation(hit.group())
          expanded = expanded[:begin] + spoken + expanded[stop:]
          hit = acronym.search(expanded)
      return expanded
  def replace_lowercase_abbreviations(string):
      """Spell out dotted lower-case abbreviations such as ``i.e.`` or ``e.g.``.

      An abbreviation is 1 to 4 lower-case letters, each followed by a dot.
      It must start the string or follow whitespace or one of ``( . ' [ <``,
      and be followed by a punctuation character or the end of the string.
      The matched text is upper-cased before its letters are replaced.
      """
      dotted = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)')

      expanded = string
      hit = dotted.search(expanded)
      # Search again from the start after each substitution: a match includes
      # its delimiters, and one delimiter may belong to two abbreviations.
      while hit is not None:
          begin, stop = hit.span()
          spoken = replace_abbreviation(hit.group().upper())
          expanded = expanded[:begin] + spoken + expanded[stop:]
          hit = dotted.search(expanded)
      return expanded
  def replace_abbreviation(string):
      """Return *string* with each character passed through ``match_mapping``."""
      return "".join(match_mapping(char) for char in string)
  def match_mapping(char):
      """Return the spoken form of *char*.

      If *char* is a key of ``alphabet_map`` its mapped value is returned;
      any other character is returned unchanged.
      """
      # Direct dictionary lookup instead of scanning every key for equality.
      return alphabet_map.get(char, char)
  def __main__(args):
      """Preprocess ``args[1]`` and print the result to standard output."""
      text = args[1]
      print(preprocess(text))


  # Command-line entry point: the first argument is the text to preprocess.
  if __name__ == "__main__":
      import sys

      __main__(sys.argv)