diff --git a/examples/pipeline_tacotron2/text/__init__.py b/examples/pipeline_tacotron2/text/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/pipeline_tacotron2/text/numbers.py b/examples/pipeline_tacotron2/text/numbers.py new file mode 100644 index 0000000000..3a0b141dd0 --- /dev/null +++ b/examples/pipeline_tacotron2/text/numbers.py @@ -0,0 +1,95 @@ +# ***************************************************************************** +# Copyright (c) 2017 Keith Ito +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""

import inflect
import re


_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m: re.Match) -> str:
    """Return the number captured by ``_comma_number_re`` with its commas removed."""
    return m.group(1).replace(',', '')


def _expand_decimal_point(m: re.Match) -> str:
    """Replace the decimal point of the captured number with the word ``point``.

    The digits on either side are left in place; ``normalize_numbers`` spells
    them out in its final pass.
    """
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m: re.Match) -> str:
    """Rewrite a dollar amount as ``<n> dollar(s), <n> cent(s)``.

    Args:
        m (re.Match): Match of ``_dollars_re``; group 1 is the amount without
            the leading ``$``.

    Returns:
        The amount with unit words attached. The figures stay as digits and
        are spelled out by the later number pass of ``normalize_numbers``.
        Amounts containing more than one ``.`` are returned unchanged with
        ``' dollars'`` appended.
    """
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format

    # ``_comma_number_re`` only strips commas that sit between digits, so a
    # capture such as "1,.5" still carries one; int() would raise on it.
    dollar_digits = parts[0].replace(',', '')
    cent_digits = parts[1].replace(',', '') if len(parts) > 1 else ''
    if len(cent_digits) == 1:
        # A single fractional digit is tenths of a dollar: "$1.5" is 50 cents.
        cent_digits += '0'

    dollars = int(dollar_digits) if dollar_digits else 0
    cents = int(cent_digits) if cent_digits else 0

    spoken = []
    if dollars:
        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    return ', '.join(spoken) if spoken else 'zero dollars'


def _expand_ordinal(m: re.Match) -> str:
    """Spell out an ordinal such as ``1st`` by passing the whole match to inflect."""
    return _inflect.number_to_words(m.group(0))


def _expand_number(m: re.Match) -> str:
    """Spell out a run of digits.

    Values strictly between 1000 and 3000 get special handling (presumably so
    that they read like years - confirm against the upstream project):
    2000 is ``two thousand``, 2001-2009 are ``two thousand <n>``, exact
    multiples of 100 are ``<n> hundred``, and everything else is delegated to
    inflect with ``group=2`` and ``zero='oh'``. All other values use inflect's
    default wording without the word "and".
    """
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    # inflect separates the groups with ", "; keep only the space.
    return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')


def normalize_numbers(text: str) -> str:
    """Expand the numeric expressions in ``text`` into words.

    The passes run in a fixed order: thousands separators are removed first,
    currency amounts are handled before the generic decimal rule, and
    ordinals are handled before the final pass that spells out every
    remaining run of digits.

    Args:
        text (str): Input text.

    Returns:
        The text with numbers, ordinals, decimals and pound/dollar amounts
        written out.
    """
    text = _comma_number_re.sub(_remove_commas, text)
    text = _pounds_re.sub(r'\1 pounds', text)
    text = _dollars_re.sub(_expand_dollars, text)
    text = _decimal_number_re.sub(_expand_decimal_point, text)
    text = _ordinal_re.sub(_expand_ordinal, text)
    text = _number_re.sub(_expand_number, text)
    return text


# -----------------------------------------------------------------------------
# examples/pipeline_tacotron2/text/test_text.py (new file in this diff)
# -----------------------------------------------------------------------------
import unittest

from parameterized import parameterized

from .text_preprocessing import text_to_sequence


class TestTextPreprocessor(unittest.TestCase):
    """Checks ``text_to_sequence`` against hand-computed symbol ID sequences."""

    @parameterized.expand(
        [
            ["dr. Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
            ["ML, is fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
            # 'one thousand dollars, twenty cents'
            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23,
                           12, 29, 30, 6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
        ]
    )
    def test_text_to_sequence(self, sent, seq):
        # assertEqual survives ``python -O`` and reports the differing elements.
        self.assertEqual(text_to_sequence(sent), seq)


# -----------------------------------------------------------------------------
# examples/pipeline_tacotron2/text/text_preprocessing.py (new file in this diff)
# -----------------------------------------------------------------------------
# *****************************************************************************
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""

from typing import List
import re

from unidecode import unidecode

from .numbers import normalize_numbers


# Matches any run of whitespace so it can be collapsed to a single space:
_whitespace_re = re.compile(r'\s+')

# (compiled pattern, expansion) pairs; each pattern matches the abbreviation
# at a word boundary followed by a literal period, ignoring case:
_abbreviations = [
    (re.compile(rf'\b{short}\.', re.IGNORECASE), expansion)
    for short, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]

_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'

# Symbol table: a symbol's position in this list is its integer ID.
symbols = [_pad, *_special, *_punctuation, *_letters]
_symbol_to_id = {symbol: index for index, symbol in enumerate(symbols)}


def text_to_sequence(sent: str) -> List[int]:
    r'''Converts a string of text to a sequence of symbol IDs.

    The sentence is transliterated to ASCII, lower-cased, has its numbers and
    abbreviations expanded and its whitespace collapsed. Each remaining
    character is then looked up in the symbol table; characters that are not
    in the table are dropped.

    Args:
        sent (str): The input sentence to convert to a sequence.

    Returns:
        List of integers corresponding to the symbols in the sentence.
    '''
    # NOTE(review): unidecode runs before normalize_numbers, so a non-ASCII
    # currency sign such as the pound sign may already be transliterated by
    # the time the number patterns run - confirm this is intended.
    normalized = normalize_numbers(unidecode(sent).lower())
    for pattern, expansion in _abbreviations:
        normalized = pattern.sub(expansion, normalized)
    normalized = _whitespace_re.sub(' ', normalized)

    ids = []
    for char in normalized:
        symbol_id = _symbol_to_id.get(char)
        if symbol_id is not None:  # skip characters outside the symbol table
            ids.append(symbol_id)
    return ids