# tts_preprocessor.py

import re

from num2words import num2words
  3. alphabet_map = {
  4. "A": " Ei ",
  5. "B": " Bee ",
  6. "C": " See ",
  7. "D": " Dee ",
  8. "E": " Ii ",
  9. "F": " Eff ",
  10. "G": " Jee ",
  11. "H": " Eich ",
  12. "I": " Eye ",
  13. "J": " Jay ",
  14. "K": " Kay ",
  15. "L": " El ",
  16. "M": " Emm ",
  17. "N": " Enn ",
  18. "O": " Ohh ",
  19. "P": " Pii ",
  20. "Q": " Queue ",
  21. "R": " Are ",
  22. "S": " Ess ",
  23. "T": " Tee ",
  24. "U": " You ",
  25. "V": " Vii ",
  26. "W": " Double You ",
  27. "X": " Ex ",
  28. "Y": " Why ",
  29. "Z": "Zed" # Zed is weird, as I (da3dsoul) am American, but most of the voice models sound British, so it matches
  30. }
  31. def preprocess(string):
  32. string = remove_surrounded_chars(string)
  33. string = string.replace('"', '')
  34. string = string.replace('“', '')
  35. string = string.replace('\n', ' ')
  36. string = remove_commas(string)
  37. string = replace_negative(string)
  38. string = replace_roman(string)
  39. string = hyphen_range_to(string)
  40. string = num_to_words(string)
  41. # TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
  42. # try to say the abbreviation or spell it out as I've done below is not agreed upon
  43. # For now, expand abbreviations to pronunciations
  44. string = replace_abbreviations(string)
  45. # cleanup whitespaces
  46. string = re.sub(r'\s+([,.?!\'])', r'\1', string)
  47. string = string.strip()
  48. string = ' '.join(string.split())
  49. return string
  50. def remove_surrounded_chars(string):
  51. # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
  52. # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
  53. return re.sub(r'\*[^*]*?(\*|$)', '', string)
  54. def replace_negative(string):
  55. return re.sub(r'(\s)(-)(\d+)([\s,.?!)"\'\]>])', r'\1negative \3\4', string)
  56. def replace_roman(string):
  57. # find a string of roman numerals. Only 2 or more, to avoid capturing I
  58. pattern = re.compile(r'\s[IVXLCDM]{2,}[\s,.?!)"\'\]>]')
  59. result = string
  60. while True:
  61. match = pattern.search(result)
  62. if match is None:
  63. break
  64. start = match.start()
  65. end = match.end()
  66. result = result[0:start+1] + str(roman_to_int(result[start+1:end-1])) + result[end-1:len(result)]
  67. return result
  68. def roman_to_int(s):
  69. rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
  70. int_val = 0
  71. for i in range(len(s)):
  72. if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]:
  73. int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]]
  74. else:
  75. int_val += rom_val[s[i]]
  76. return int_val
  77. def hyphen_range_to(text):
  78. pattern = re.compile(r'(\d+)[-–](\d+)')
  79. result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text)
  80. return result
  81. def num_to_words(text):
  82. pattern = re.compile(r'\d+')
  83. result = pattern.sub(lambda x: num2words(int(x.group())), text)
  84. return result
  85. def replace_abbreviations(string):
  86. pattern = re.compile(r'(^|[\s("\'\[<])([A-Z]{2,4})([\s,.?!)"\'\]>]|$)')
  87. result = string
  88. while True:
  89. match = pattern.search(result)
  90. if match is None:
  91. break
  92. start = match.start()
  93. end = match.end()
  94. result = result[0:start] + replace_abbreviation(result[start:end]) + result[end:len(result)]
  95. return result
  96. def replace_abbreviation(string):
  97. result = ""
  98. for char in string:
  99. result = match_mapping(char, result)
  100. return result
  101. def match_mapping(char, result):
  102. for mapping in alphabet_map.keys():
  103. if char == mapping:
  104. return result + alphabet_map[char]
  105. return result + char
  106. def remove_commas(text):
  107. import re
  108. pattern = re.compile(r'(\d),(\d)')
  109. result = pattern.sub(r'\1\2', text)
  110. return result
  111. def __main__(args):
  112. print(preprocess(args[1]))
  113. if __name__ == "__main__":
  114. import sys
  115. __main__(sys.argv)