This commit is contained in:
qinzy
2024-02-19 23:15:47 +00:00
parent 734228934f
commit db237ce6a5
62 changed files with 14 additions and 17 deletions

View File

View File

@@ -0,0 +1,140 @@
import abc
from typing import List, Tuple
from .punctuation import Punctuation
class BasePhonemizer(abc.ABC):
    """Abstract base class for phonemizer backends.

    A call to :meth:`phonemize` runs three stages:

    1. Preprocessing: the text is stripped of surrounding whitespace and its
       punctuation is either removed or split out so it can be put back later.
    2. Phonemization: every text chunk is handed to the backend-specific
       :meth:`_phonemize`.
    3. Postprocessing: the phonemized chunks are joined again, with the
       punctuation marks restored when ``keep_puncs`` is set.

    Args:
        language (str):
            Language used by the phonemizer.
        punctuations (str):
            Punctuation characters handed to ``Punctuation``.
        keep_puncs (bool):
            Whether to preserve punctuation marks or not.
    """

    def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
        # Refuse to build a phonemizer whose backend is missing.
        if not self.is_available():
            raise RuntimeError("{} not installed on your system".format(self.name()))  # pragma: nocover

        # Validate the requested language (subclasses may override the hook).
        self._language = self._init_language(language)

        # Punctuation handling configuration.
        self._keep_puncs = keep_puncs
        self._punctuator = Punctuation(punctuations)

    def _init_language(self, language):
        """Validate ``language`` and return the value stored on the instance.

        Child classes may override this hook.

        Raises:
            RuntimeError: if the backend does not support ``language``.
        """
        if self.is_supported_language(language):
            return language
        raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        ...

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        ...

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""
        ...

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        ...

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method"""

    def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
        """Prepare ``text`` for phonemization.

        Surrounding whitespace is stripped first. With ``keep_puncs`` enabled
        the punctuator splits the text and returns the punctuation marks
        alongside it; otherwise punctuation is removed and an empty list of
        marks is returned.

        Override this if you need a different behaviour.
        """
        stripped = text.strip()
        if not self._keep_puncs:
            return [self._punctuator.strip(stripped)], []
        # a tuple (text chunks, punctuation marks)
        return self._punctuator.strip_to_restore(stripped)

    def _phonemize_postprocess(self, phonemized, punctuations) -> str:
        """Turn the list of phonemized chunks back into a single string.

        Override this if you need a different behaviour.
        """
        if not self._keep_puncs:
            return phonemized[0]
        return self._punctuator.restore(phonemized, punctuations)[0]

    def phonemize(self, text: str, separator="|", language: str = None) -> str:  # pylint: disable=unused-argument
        """Returns the `text` phonemized for the configured language

        Args:
            text (str):
                Text to be phonemized.
            separator (str):
                String separator used between phonemes. Defaults to '|'.
            language (str):
                Not used by this implementation.

        Returns:
            (str): Phonemized text
        """
        chunks, punctuations = self._phonemize_preprocess(text)
        phonemized = [self._phonemize(chunk, separator) for chunk in chunks]
        return self._phonemize_postprocess(phonemized, punctuations)

    def print_logs(self, level: int = 0):
        """Print the configured language and backend name, indented by ``level`` tabs."""
        indent = "\t" * level
        print(f"{indent}| > phoneme language: {self.language}")
        print(f"{indent}| > phoneme backend: {self.name()}")

View File

@@ -0,0 +1,122 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
from .french_abbreviations import abbreviations_fr
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
# Punctuation normalisation table: each key is a literal string to look for and
# the value is the text that replaces it (an empty value deletes the key).
# NOTE(review): many keys below are empty strings. In a dict literal repeated
# keys collapse into a single entry (the last value wins), and an empty key
# becomes an empty alternative in any regex alternation built from these keys.
# They look like characters that were lost in transit (presumably full-width /
# typographic punctuation) -- TODO confirm against the upstream file and
# restore the intended characters.
rep_map = {
"": ",",
"": ",",
"": ",",
"": ".",
"": "!",
"": "?",
"\n": ".",
"·": ",",
"": ",",
"...": ".",
"": ".",
"$": ".",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"(": "",
")": "",
"": "",
"": "",
"": "",
"": "",
"[": "",
"]": "",
"": "",
"": "-",
"~": "-",
"": "",
"": "",
"¿" : "",
"¡" : ""
}
def replace_punctuation(text, mapping=None):
    """Replace every occurrence of a mapped punctuation string in ``text``.

    Args:
        text (str): Input text.
        mapping (dict, optional): ``{string_to_find: replacement}``. Defaults
            to the module-level ``rep_map``.

    Returns:
        str: ``text`` with every key of the mapping replaced by its value.
    """
    table = rep_map if mapping is None else mapping
    # An empty key would become an empty regex alternative, which matches at
    # every position and shadows all the other keys, so it is skipped.
    # Longest keys go first so a multi-character key (e.g. "...") is never
    # pre-empted by a shorter key that is one of its prefixes.
    keys = sorted((key for key in table if key), key=len, reverse=True)
    if not keys:
        return text
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: table[match.group()], text)
def expand_abbreviations(text, lang="fr"):
    """Expand known abbreviations in ``text``.

    Args:
        text (str): Input text.
        lang (str): Language tag. Only ``"fr"`` has an abbreviation table.

    Returns:
        str: The text with French abbreviations expanded when ``lang`` is
        ``"fr"``; the text unchanged for any other language.
    """
    # Without this guard any language other than "fr" reached the loop with no
    # abbreviation table bound and raised UnboundLocalError.
    if lang != "fr":
        return text
    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze every whitespace run into one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
def remove_punctuation_at_begin(text):
    """Drop any run of ``,``, ``.``, ``!`` or ``?`` found at the very start of ``text``."""
    leading_marks = re.compile(r'^[,.!?]+')
    return leading_marks.sub('', text)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes and guillemets."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»]+")
    return aux_symbols.sub("", text)
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, except for Catalan
    where it is removed. ``&`` is replaced by the conjunction of the language,
    surrounded by spaces. Apostrophes are removed for Catalan and Spanish.
    Languages without an entry keep ``&`` and apostrophes untouched.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text

    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    # Spoken form of "&" per language. Spanish used to be "y" with no spaces
    # around it, which glued the neighbouring words together ("a&b" -> "ayb").
    conjunctions = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    if lang in conjunctions:
        text = text.replace("&", conjunctions[lang])
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def french_cleaners(text):
    """Cleaning pipeline for French text.

    Numbers are not expanded here (the original note says the phonemizer
    already does that), and the text is not lower-cased because a cased BERT
    is used.
    """
    steps = (
        lambda value: expand_abbreviations(value, lang="fr"),
        replace_punctuation,
        lambda value: replace_symbols(value, lang="fr"),
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    )
    for step in steps:
        text = step(text)
    # Append a final period when the text does not already end with one of
    # the listed punctuation marks.
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', text)

View File

@@ -0,0 +1,78 @@
{"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"ɣ",
"ɡ",
"r",
"ɲ",
"ʝ",
"ʎ",
"ː"
]
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,89 @@
{
"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"\u0263",
"\u0261",
"r",
"\u0272",
"\u029d",
"\u028e",
"\u02d0",
"\u0303",
"\u0153",
"\u00f8",
"\u0281",
"\u0252",
"\u028c",
"\u2014",
"\u025c",
"\u0250"
]
}

View File

@@ -0,0 +1,30 @@
from .cleaner import french_cleaners
from .gruut_wrapper import Gruut
def remove_consecutive_t(input_str):
    """Remove every run of three or more consecutive ``t`` characters.

    Runs of one or two ``t`` are kept as they are; all other characters are
    copied through unchanged.
    """
    pieces = []
    total = len(input_str)
    pos = 0
    while pos < total:
        if input_str[pos] != 't':
            pieces.append(input_str[pos])
            pos += 1
            continue
        # Measure the whole run of 't' starting at pos.
        run_end = pos
        while run_end < total and input_str[run_end] == 't':
            run_end += 1
        run_length = run_end - pos
        if run_length < 3:
            pieces.append('t' * run_length)
        pos = run_end
    return ''.join(pieces)
def fr2ipa(text):
    """Phonemize French ``text`` with Gruut and drop runs of three or more ``t``.

    The phonemizer is created for ``fr-fr`` with punctuation and stress kept
    and espeak phonemes enabled; phonemes are joined with an empty separator.
    The text is not passed through ``french_cleaners`` here.
    """
    phonemizer = Gruut(
        language="fr-fr",
        use_espeak_phonemes=True,
        keep_stress=True,
        keep_puncs=True,
    )
    return remove_consecutive_t(phonemizer.phonemize(text, separator=""))

View File

@@ -0,0 +1,48 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in french.

# Abbreviations expanded only when followed by a period (case-insensitive).
_DOTTED_ABBREVIATIONS_FR = [
    ("M", "monsieur"),
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
    ("N.B", "nota bene"),
    ("p.c.q", "parce que"),
    ("Pr", "professeur"),
    ("qqch", "quelque chose"),
    ("rdv", "rendez-vous"),
    ("max", "maximum"),
    ("min", "minimum"),
    ("no", "numéro"),
    ("adr", "adresse"),
    ("dr", "docteur"),
    ("st", "saint"),
    ("co", "compagnie"),
    ("jr", "junior"),
    ("sgt", "sergent"),
    ("capt", "capitaine"),
    ("col", "colonel"),
    ("av", "avenue"),
    ("av. J.-C", "avant Jésus-Christ"),
    ("apr. J.-C", "après Jésus-Christ"),
    ("art", "article"),
    ("boul", "boulevard"),
    ("c.-à-d", "c'est-à-dire"),
    ("etc", "et cetera"),
    ("ex", "exemple"),
    ("excl", "exclusivement"),
]

# Titles that are also expanded without a trailing period (case-sensitive).
_UNDOTTED_ABBREVIATIONS_FR = [
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
]


def _longest_first(pairs):
    """Drop repeated abbreviations (first entry wins) and order them longest-first."""
    unique = {}
    for abbreviation, expansion in pairs:
        unique.setdefault(abbreviation, expansion)
    # Longest first so "av. J.-C." is expanded before the shorter "av." can
    # consume its prefix.
    return sorted(unique.items(), key=lambda item: len(item[0]), reverse=True)


abbreviations_fr = [
    # re.escape keeps the periods and hyphens inside an abbreviation literal
    # (unescaped, "N.B" would also match e.g. "NXB").
    (re.compile(r"\b%s\." % re.escape(abbreviation), re.IGNORECASE), expansion)
    for abbreviation, expansion in _longest_first(_DOTTED_ABBREVIATIONS_FR)
] + [
    # The trailing \b stops "Mlle"/"Mme" from matching the first letters of
    # "Mlles"/"Mmes".
    (re.compile(r"\b%s\b" % re.escape(abbreviation)), expansion)
    for abbreviation, expansion in _longest_first(_UNDOTTED_ABBREVIATIONS_FR)
]

View File

@@ -0,0 +1 @@
_,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ

View File

@@ -0,0 +1,258 @@
import importlib
from typing import List
import gruut
from gruut_ipa import IPA # pip install gruut_ipa
from .base import BasePhonemizer
from .punctuation import Punctuation
# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")


class Gruut(BasePhonemizer):
    """Gruut wrapper for G2P

    Args:
        language (str):
            Valid language code for the used backend.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.

        keep_puncs (bool):
            If true, keep the punctuations after phonemization. Defaults to True.

        use_espeak_phonemes (bool):
            If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.

        keep_stress (bool):
            If true, keep the stress characters after phonemization. Defaults to False.

    Example:

        >>> phonemizer = Gruut('en-us')
        >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
        'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
    """

    def __init__(
        self,
        language: str,
        punctuations=Punctuation.default_puncs(),
        keep_puncs=True,
        use_espeak_phonemes=False,
        keep_stress=False,
    ):
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
        self.use_espeak_phonemes = use_espeak_phonemes
        self.keep_stress = keep_stress

    @staticmethod
    def name():
        """Return the backend name."""
        return "gruut"

    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:  # pylint: disable=unused-argument
        """Convert input text to phonemes.

        Every phoneme string returned by gruut is flattened into single
        characters, so ``separator`` ends up between each character, even for
        characters that together constitute a single sound. Words are joined
        with ``separator`` followed by a space.

        Examples::
            "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`

        Args:
            text (str):
                Text to be converted to phonemes.

            separator (str):
                String placed between phoneme characters. Defaults to "|".

            tie (bool, optional):
                Not used by this implementation.

        Returns:
            str: The phonemized text.
        """
        ph_list = []
        for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
            for word in sentence:
                if word.is_break:
                    # Use actual character for break phoneme (e.g., comma)
                    if ph_list:
                        # Join with previous word
                        ph_list[-1].append(word.text)
                    else:
                        # First word is punctuation
                        ph_list.append([word.text])
                elif word.phonemes:
                    # Add phonemes for word
                    word_phonemes = []

                    for word_phoneme in word.phonemes:
                        if not self.keep_stress:
                            # Remove primary/secondary stress
                            word_phoneme = IPA.without_stress(word_phoneme)

                        word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)

                        if word_phoneme:
                            # Flatten phonemes
                            word_phonemes.extend(word_phoneme)

                    if word_phonemes:
                        ph_list.append(word_phonemes)

        ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
        return f"{separator} ".join(ph_words)

    def _phonemize(self, text, separator):
        """Backend hook called by the base class for every text chunk."""
        return self.phonemize_gruut(text, separator, tie=False)

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return gruut.is_language_supported(language)

    @staticmethod
    def supported_languages() -> List:
        """Get the supported languages.

        Returns:
            List: List of language codes.
        """
        return list(gruut.get_supported_languages())

    @classmethod
    def version(cls):
        """Get the version of the used backend.

        Declared as a classmethod to match the abstract declaration in the
        base class; calling it on an instance keeps working.

        Returns:
            str: Version of the used backend.
        """
        return gruut.__version__

    @classmethod
    def is_available(cls):
        """Return True if the gruut package can be found, else False."""
        # `import importlib` alone does not guarantee that the `util`
        # submodule is loaded, so import it explicitly before using it.
        import importlib.util

        return importlib.util.find_spec("gruut") is not None
if __name__ == "__main__":
    # Developer utility: phonemize every line of a metadata file, report the
    # characters that are missing from the base symbol list, and write the
    # extended symbol set plus the phonemized lines to disk.
    # NOTE(review): the input path is machine-specific and `cleaner` is
    # imported as a top-level module -- confirm how this script is meant to
    # be launched.
    import json

    from cleaner import french_cleaners

    e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
    symbols = [  # en + sp
        "_", ",", ".", "!", "?", "-", "~", "\u2026",
        "N", "Q",
        "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "s", "t", "u", "v", "w", "x", "y", "z",
        "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a",
        "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265",
        "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b",
        "\u0266", "\u207c", "\u02b0",
        "`", "^", "#", "*", "=",
        "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191",
        " ",
        "ɣ", "ɡ", "r", "ɲ", "ʝ", "ʎ", "ː",
    ]
    # Set copy for O(1) membership tests; `symbols` keeps the output order.
    known_symbols = set(symbols)

    # The files hold accented text and IPA characters, so the encoding is
    # fixed instead of depending on the platform default.
    with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r', encoding="utf-8") as f:
        lines = f.readlines()

    used_sym = []
    not_existed_sym = []
    phonemes = []
    for line in lines:
        # The text to phonemize is the last '|'-separated field of the line.
        text = line.split('|')[-1].strip()
        text = french_cleaners(text)
        ipa = e.phonemize(text, separator="")
        phonemes.append(ipa)
        for s in ipa:
            if s not in known_symbols:
                if s not in not_existed_sym:
                    print(f'not_existed char: {s}')
                    not_existed_sym.append(s)
            elif s not in used_sym:
                used_sym.append(s)

    print(used_sym)
    print(not_existed_sym)

    # The symbol file is one undelimited string of characters.
    with open('./text/fr_phonemizer/french_symbols.txt', 'w', encoding="utf-8") as g:
        g.writelines(symbols + not_existed_sym)

    # One phonemized utterance per line; without the separators all
    # utterances ran together in the file.
    with open('./text/fr_phonemizer/example_ipa.txt', 'w', encoding="utf-8") as g:
        g.writelines(f"{phoneme_line}\n" for phoneme_line in phonemes)

    data = {'symbols': symbols + not_existed_sym}
    with open('./text/fr_phonemizer/fr_symbols.json', 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4)

View File

@@ -0,0 +1,172 @@
import collections
import re
from enum import Enum
import six
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'

# A punctuation match (`punc`, including its surrounding whitespace) together
# with its `position` in the text (a PuncPosition member).
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])


class PuncPosition(Enum):
    """Enum for the punctuations positions"""

    BEGIN = 0
    END = 1
    MIDDLE = 2
    ALONE = 3


class Punctuation:
    """Handle punctuations in text.

    Just strip punctuations from text or strip and restore them later.

    Args:
        puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.

    Example:
        >>> punc = Punctuation()
        >>> punc.strip("This is. example !")
        'This is example'

        >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
        >>> ' '.join(text_striped)
        'This is example'

        >>> text_restored = punc.restore(text_striped, punc_map)
        >>> text_restored[0]
        'This is. example !'
    """

    def __init__(self, puncs: str = _DEF_PUNCS):
        self.puncs = puncs

    @staticmethod
    def default_puncs():
        """Return default set of punctuations."""
        return _DEF_PUNCS

    @property
    def puncs(self):
        """The punctuation characters handled by this instance, without duplicates."""
        return self._puncs

    @puncs.setter
    def puncs(self, value):
        """Store the punctuation set and rebuild the matching regular expression.

        Raises:
            ValueError: if ``value`` is not a ``str``.
        """
        if not isinstance(value, str):
            raise ValueError("[!] Punctuations must be of type str.")
        # remove duplicates without changing the order
        self._puncs = "".join(dict.fromkeys(value))
        # one or more punctuation characters, with any surrounding whitespace
        self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")

    def strip(self, text):
        """Remove all the punctuations by replacing them with a space.

        Leading and trailing whitespace is removed from the result.

        Args:
            text (str): The text to be processed.

        Example::

            "This is. example !" -> "This is example"
        """
        return self.puncs_regular_exp.sub(" ", text).strip()

    def strip_to_restore(self, text):
        """Remove punctuations from text to restore them later.

        Args:
            text (str): The text to be processed.

        Returns:
            Tuple[List[str], List]: the text chunks found between punctuation
            marks and the list of ``(punc, position)`` marks.

        Examples ::

            "This is. example !" -> [["This is", "example"], [". ", " !"]]

        """
        text, puncs = self._strip_to_restore(text)
        return text, puncs

    def _strip_to_restore(self, text):
        """Auxiliary method for Punctuation.strip_to_restore()"""
        matches = list(re.finditer(self.puncs_regular_exp, text))
        if not matches:
            return [text], []
        # the text is only punctuations
        if len(matches) == 1 and matches[0].group() == text:
            return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
        # build a punctuation map to be used later to restore punctuations
        puncs = []
        for match in matches:
            position = PuncPosition.MIDDLE
            if match == matches[0] and text.startswith(match.group()):
                position = PuncPosition.BEGIN
            elif match == matches[-1] and text.endswith(match.group()):
                position = PuncPosition.END
            puncs.append(_PUNC_IDX(match.group(), position))
        # convert str text to a List[str], each item is separated by a punctuation
        splitted_text = []
        for idx, punc in enumerate(puncs):
            # cut at the first occurrence of the mark; the remainder is
            # processed with the next mark
            prefix, _, suffix = text.partition(punc.punc)
            splitted_text.append(prefix)
            # if the text does not end with a punctuation, add it to the last item
            if idx == len(puncs) - 1 and len(suffix) > 0:
                splitted_text.append(suffix)
            text = suffix
        return splitted_text, puncs

    @classmethod
    def restore(cls, text, puncs):
        """Restore punctuation in a text.

        Args:
            text (List[str]): The text chunks to be processed.
            puncs (List): The list of ``(punc, position)`` marks to be used for restoring.

        Returns:
            List[str]: the chunks merged back together with their punctuation.

        Examples ::

            ['This is', 'example'], ['. ', ' !'] -> ["This is. example !"]

        """
        return cls._restore(text, puncs, 0)

    @classmethod
    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
        """Auxiliary method for Punctuation.restore()"""
        if not puncs:
            return text

        # nothing have been phonemized, returns the puncs alone
        if not text:
            return ["".join(m.punc for m in puncs)]

        current = puncs[0]

        if current.position == PuncPosition.BEGIN:
            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)

        if current.position == PuncPosition.END:
            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)

        if current.position == PuncPosition.ALONE:
            # the namedtuple field is `punc`; the former `current.mark`
            # raised AttributeError
            return [current.punc] + cls._restore(text, puncs[1:], num + 1)

        # POSITION == MIDDLE
        if len(text) == 1:  # pragma: nocover
            # a corner case where the final part of an intermediate
            # mark (I) has not been phonemized
            return cls._restore([text[0] + current.punc], puncs[1:], num)

        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
# if __name__ == "__main__":
# punc = Punctuation()
# text = "This is. This is, example!"
# print(punc.strip(text))
# split_text, puncs = punc.strip_to_restore(text)
# print(split_text, " ---- ", puncs)
# restored_text = punc.restore(split_text, puncs)
# print(restored_text)