Skip to content

Commit 37dbf29

Browse files
authored Jul 26, 2021
Add text preprocessing utilities for TTS pipeline (#1639)
1 parent c49db73 commit 37dbf29

File tree

4 files changed

+202
-0
lines changed

4 files changed

+202
-0
lines changed
 

‎examples/pipeline_tacotron2/text/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2017 Keith Ito
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining a copy
5+
# of this software and associated documentation files (the "Software"), to deal
6+
# in the Software without restriction, including without limitation the rights
7+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
# copies of the Software, and to permit persons to whom the Software is
9+
# furnished to do so, subject to the following conditions:
10+
#
11+
# The above copyright notice and this permission notice shall be included in
12+
# all copies or substantial portions of the Software.
13+
#
14+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
# THE SOFTWARE.
21+
#
22+
# *****************************************************************************
23+
"""
24+
Modified from https://github.com/keithito/tacotron
25+
"""
26+
27+
import inflect
28+
import re
29+
30+
31+
# Module-wide inflect engine; the number helpers in this file call its
# ``number_to_words`` method to turn digits into English words.
_inflect = inflect.engine()
32+
# --- Plain numbers ----------------------------------------------------------
# Digits with embedded commas such as "1,000"; group 1 is the whole number.
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits on both sides of a decimal point such as "3.14"; group 1 is the number.
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# Digits followed by an ordinal suffix such as "1st" or "4th".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any remaining run of digits.
_number_re = re.compile(r'[0-9]+')

# --- Currency amounts -------------------------------------------------------
# "£" followed by an amount; group 1 is the amount without the sign.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# "$" followed by an amount; group 1 is the amount without the sign.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
38+
39+
40+
def _remove_commas(m: re.Match) -> str:
    """Return group 1 of the match with every comma dropped.

    Args:
        m (re.Match): Match whose first group holds the text to clean.

    Returns:
        str: The captured text without ``,`` characters.
    """
    pieces = m.group(1).split(',')
    return ''.join(pieces)
42+
43+
44+
def _expand_decimal_point(m: re.Match) -> str:
    """Return group 1 of the match with each ``.`` spelled as `` point ``.

    Args:
        m (re.Match): Match whose first group holds the decimal number text.

    Returns:
        str: The captured text with every dot replaced by ``' point '``.
    """
    pieces = m.group(1).split('.')
    return ' point '.join(pieces)
46+
47+
48+
def _expand_dollars(m: re.Match) -> str:
    """Rewrite a matched dollar amount as ``<n> dollar(s), <n> cent(s)``.

    The numbers are left as digits in the returned text.

    Args:
        m (re.Match): Match whose first group is the amount text that
            followed the ``$`` sign (digits, optionally with ``.`` and ``,``).

    Returns:
        str: The amount phrased with dollar/cent units, ``'zero dollars'`` when
        both parts are zero, or the raw text plus ``' dollars'`` when it
        contains more than one decimal point.
    """
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format

    # A comma can still be present here (e.g. "$1,.5" or "$,5"); strip it so
    # int() does not raise ValueError on otherwise readable amounts.
    dollar_text = parts[0].replace(',', '')
    cent_text = parts[1].replace(',', '') if len(parts) > 1 else ''
    # A single fractional digit denotes tenths of a dollar: "$1.5" is fifty
    # cents, not five. Longer fractions are left untouched.
    if cent_text:
        cent_text = cent_text.ljust(2, '0')

    dollars = int(dollar_text) if dollar_text else 0
    cents = int(cent_text) if cent_text else 0
    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
    cent_unit = 'cent' if cents == 1 else 'cents'

    if dollars and cents:
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    if dollars:
        return '%s %s' % (dollars, dollar_unit)
    if cents:
        return '%s %s' % (cents, cent_unit)
    return 'zero dollars'
67+
68+
69+
def _expand_ordinal(m: re.Match) -> str:
    """Convert the whole matched ordinal (digits plus suffix) to words.

    Args:
        m (re.Match): Match covering text such as ``1st`` or ``22nd``.

    Returns:
        str: Whatever ``inflect``'s ``number_to_words`` produces for that text.
    """
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
71+
72+
73+
def _expand_number(m: re.Match) -> str:
    """Convert a matched run of digits to English words.

    Values strictly between 1000 and 3000 get special treatment (presumably so
    they read like years); everything else is passed to ``inflect`` with the
    word "and" suppressed.

    Args:
        m (re.Match): Match covering a run of decimal digits.

    Returns:
        str: The spoken form of the number.
    """
    value = int(m.group(0))

    # Outside the special range: plain cardinal reading.
    if not 1000 < value < 3000:
        return _inflect.number_to_words(value, andword='')

    if value == 2000:
        return 'two thousand'
    if 2000 < value < 2010:
        # 2001-2009: "two thousand" followed by the last two digits in words.
        return 'two thousand ' + _inflect.number_to_words(value % 100)

    hundreds, remainder = divmod(value, 100)
    if remainder == 0:
        # Whole hundreds are read as "<n> hundred".
        return _inflect.number_to_words(hundreds) + ' hundred'

    # Otherwise ask inflect for two-digit groups, saying "oh" for zero, and
    # drop the ", " separator it puts between groups.
    grouped = _inflect.number_to_words(value, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
86+
87+
88+
def normalize_numbers(text: str) -> str:
    """Apply this module's number rewrites to ``text``, one pass after another.

    Args:
        text (str): Input text that may contain numbers and currency amounts.

    Returns:
        str: The text after all substitution passes have run.
    """
    # Each pass sees the output of the previous one, so the order is part of
    # the behaviour: commas are removed first and bare digit runs go last.
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import unittest
2+
3+
from parameterized import parameterized
4+
5+
from .text_preprocessing import text_to_sequence
6+
7+
8+
class TestTextPreprocessor(unittest.TestCase):
    """Checks ``text_to_sequence`` against fixed sentence / ID-sequence pairs."""

    @parameterized.expand(
        [
            ["dr. Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
            ["ML, is fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
            # 'one thousand dollars, twenty cents'
            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23,
                           12, 29, 30, 6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
        ]
    )
    def test_text_to_sequence(self, sent, seq):
        """Each sentence must convert to exactly the listed ID sequence."""
        # assertEqual keeps working under ``python -O`` (a bare assert is
        # stripped) and reports both lists when they differ.
        self.assertEqual(text_to_sequence(sent), seq)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2017 Keith Ito
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining a copy
5+
# of this software and associated documentation files (the "Software"), to deal
6+
# in the Software without restriction, including without limitation the rights
7+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
# copies of the Software, and to permit persons to whom the Software is
9+
# furnished to do so, subject to the following conditions:
10+
#
11+
# The above copyright notice and this permission notice shall be included in
12+
# all copies or substantial portions of the Software.
13+
#
14+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
# THE SOFTWARE.
21+
#
22+
# *****************************************************************************
23+
"""
24+
Modified from https://github.com/keithito/tacotron
25+
"""
26+
27+
from typing import List
28+
import re
29+
30+
from unidecode import unidecode
31+
32+
from .numbers import normalize_numbers
33+
34+
35+
# Regular expression matching whitespace:
36+
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations.
# Each pattern matches the abbreviation at a word boundary followed by a
# literal dot, ignoring case.
_abbreviations = [
    (re.compile(r'\b%s\.' % short_form, re.IGNORECASE), spoken_form)
    for short_form, spoken_form in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
59+
60+
# Character inventory, in the order that determines each symbol's integer ID.
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'

# Pad symbol first, then the special characters, punctuation and letters.
symbols = [_pad, *_special, *_punctuation, *_letters]
# Reverse lookup: symbol -> its position in ``symbols``.
_symbol_to_id = dict(zip(symbols, range(len(symbols))))
67+
68+
69+
def text_to_sequence(sent: str) -> List[int]:
    r'''Turn a sentence into the list of symbol IDs used by this module.

    The text is transliterated with ``unidecode``, lower-cased, passed through
    ``normalize_numbers``, has its known abbreviations expanded and its
    whitespace runs collapsed to one space. Characters that are not in the
    symbol table are then dropped.

    Args:
        sent (str): The input sentence to convert to a sequence.

    Returns:
        List of integers corresponding to the symbols in the sentence.
    '''
    # ASCII transliteration, lower-casing and number expansion, in that order.
    cleaned = normalize_numbers(unidecode(sent).lower())

    # Expand abbreviations one pattern at a time.
    for pattern, expansion in _abbreviations:
        cleaned = pattern.sub(expansion, cleaned)

    # Collapse every whitespace run to a single space.
    cleaned = _whitespace_re.sub(' ', cleaned)

    # Symbols missing from the table are skipped silently.
    return [_symbol_to_id[char] for char in cleaned if char in _symbol_to_id]

0 commit comments

Comments
 (0)
Please sign in to comment.