rename
This commit is contained in:
0
melo/text/fr_phonemizer/__init__.py
Normal file
0
melo/text/fr_phonemizer/__init__.py
Normal file
140
melo/text/fr_phonemizer/base.py
Normal file
140
melo/text/fr_phonemizer/base.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import abc
|
||||
from typing import List, Tuple
|
||||
|
||||
from .punctuation import Punctuation
|
||||
|
||||
|
||||
class BasePhonemizer(abc.ABC):
    """Abstract base class for phonemizer backends.

    Phonemization proceeds in three stages:

    1. Preprocessing: strip the input and split out punctuation runs,
       optionally remembering them so they can be restored later.
    2. Phonemization: convert each text chunk to phonemes (backend-specific).
    3. Postprocessing: join the phonemized chunks and, when requested,
       re-insert the original punctuation marks.

    Args:
        language (str):
            Language code used by the phonemizer backend.

        punctuations (List[str]):
            Punctuation marks to be preserved.

        keep_puncs (bool):
            Whether punctuation marks are restored in the output.
    """

    def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
        # The backend must be installed before anything else can work.
        if not self.is_available():
            raise RuntimeError("{} not installed on your system".format(self.name()))  # pragma: nocover

        # Validate that the backend supports the requested language.
        self._language = self._init_language(language)

        # Punctuation handling configuration.
        self._keep_puncs = keep_puncs
        self._punctuator = Punctuation(punctuations)

    def _init_language(self, language):
        """Validate `language` against the backend.

        Child classes may override this (see the Segments backend).
        """
        if not self.is_supported_language(language):
            raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
        return language

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        ...

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        ...

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""
        ...

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        ...

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method"""

    def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
        """Preprocess the text before phonemization

        1. remove spaces
        2. remove punctuation

        Override this if you need a different behaviour
        """
        text = text.strip()
        if self._keep_puncs:
            # a tuple (text chunks, punctuation marks) for later restoration
            return self._punctuator.strip_to_restore(text)
        return [self._punctuator.strip(text)], []

    def _phonemize_postprocess(self, phonemized, punctuations) -> str:
        """Postprocess the raw phonemized output

        Override this if you need a different behaviour
        """
        if self._keep_puncs:
            return self._punctuator.restore(phonemized, punctuations)[0]
        return phonemized[0]

    def phonemize(self, text: str, separator="|", language: str = None) -> str:  # pylint: disable=unused-argument
        """Returns the `text` phonemized for the given language

        Args:
            text (str):
                Text to be phonemized.

            separator (str):
                string separator used between phonemes. Default to '_'.

        Returns:
            (str): Phonemized text
        """
        chunks, marks = self._phonemize_preprocess(text)
        phonemized = [self._phonemize(chunk, separator) for chunk in chunks]
        return self._phonemize_postprocess(phonemized, marks)

    def print_logs(self, level: int = 0):
        indent = "\t" * level
        print(f"{indent}| > phoneme language: {self.language}")
        print(f"{indent}| > phoneme backend: {self.name()}")
|
||||
122
melo/text/fr_phonemizer/cleaner.py
Normal file
122
melo/text/fr_phonemizer/cleaner.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Set of default text cleaners"""
|
||||
# TODO: pick the cleaner for languages dynamically
|
||||
|
||||
import re
|
||||
from .french_abbreviations import abbreviations_fr
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r"\s+")
|
||||
|
||||
|
||||
rep_map = {
|
||||
":": ",",
|
||||
";": ",",
|
||||
",": ",",
|
||||
"。": ".",
|
||||
"!": "!",
|
||||
"?": "?",
|
||||
"\n": ".",
|
||||
"·": ",",
|
||||
"、": ",",
|
||||
"...": ".",
|
||||
"…": ".",
|
||||
"$": ".",
|
||||
"“": "",
|
||||
"”": "",
|
||||
"‘": "",
|
||||
"’": "",
|
||||
"(": "",
|
||||
")": "",
|
||||
"(": "",
|
||||
")": "",
|
||||
"《": "",
|
||||
"》": "",
|
||||
"【": "",
|
||||
"】": "",
|
||||
"[": "",
|
||||
"]": "",
|
||||
"—": "",
|
||||
"~": "-",
|
||||
"~": "-",
|
||||
"「": "",
|
||||
"」": "",
|
||||
"¿" : "",
|
||||
"¡" : ""
|
||||
}
|
||||
|
||||
|
||||
def replace_punctuation(text):
    """Normalize punctuation in `text` according to the `rep_map` table."""
    punct_re = re.compile("|".join(map(re.escape, rep_map)))
    return punct_re.sub(lambda m: rep_map[m.group()], text)
|
||||
|
||||
def expand_abbreviations(text, lang="fr"):
    """Expand known abbreviations (e.g. "Mme." -> "Madame") for `lang`.

    Only French is supported. Text in any other language is returned
    unchanged — the original raised a NameError (`_abbreviations` was
    unbound) for any lang other than "fr".

    Args:
        text (str): Input text.
        lang (str): Language identifier; only "fr" triggers expansion.

    Returns:
        str: Text with the abbreviations expanded.
    """
    if lang != "fr":
        return text
    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
|
||||
|
||||
|
||||
def lowercase(text):
    """Return `text` with every cased character lower-cased."""
    return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
    """Collapse runs of whitespace to single spaces and trim both ends."""
    return " ".join(text.split())
|
||||
|
||||
def remove_punctuation_at_begin(text):
    """Strip any leading run of ",", ".", "!" or "?" characters."""
    return text.lstrip(",.!?")
|
||||
|
||||
def remove_aux_symbols(text):
    """Delete auxiliary symbols: angle/round/square brackets, double quotes and guillemets."""
    return re.sub(r'[<>()\[\]"«»]+', "", text)
|
||||
|
||||
|
||||
def replace_symbols(text, lang="en"):
    """Replace language-dependent symbols in `text`.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text
    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    # Catalan joins hyphenated words; other languages replace "-" with a space.
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")
    # Per-language replacement for the ampersand.
    ampersand = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i ", "es": "y"}
    if lang in ampersand:
        text = text.replace("&", ampersand[lang])
    # Catalan and Spanish also drop apostrophes.
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
|
||||
|
||||
def french_cleaners(text):
    """Cleaning pipeline for French text.

    Numbers are not expanded here — the phonemizer already does that.
    """
    text = expand_abbreviations(text, lang="fr")
    # text = lowercase(text)  # skipped: the cased BERT needs original casing
    text = replace_punctuation(text)
    text = replace_symbols(text, lang="fr")
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    # Ensure the sentence ends with a terminal punctuation mark.
    text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
    return text
|
||||
|
||||
78
melo/text/fr_phonemizer/en_symbols.json
Normal file
78
melo/text/fr_phonemizer/en_symbols.json
Normal file
@@ -0,0 +1,78 @@
|
||||
{"symbols": [
|
||||
"_",
|
||||
",",
|
||||
".",
|
||||
"!",
|
||||
"?",
|
||||
"-",
|
||||
"~",
|
||||
"\u2026",
|
||||
"N",
|
||||
"Q",
|
||||
"a",
|
||||
"b",
|
||||
"d",
|
||||
"e",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"i",
|
||||
"j",
|
||||
"k",
|
||||
"l",
|
||||
"m",
|
||||
"n",
|
||||
"o",
|
||||
"p",
|
||||
"s",
|
||||
"t",
|
||||
"u",
|
||||
"v",
|
||||
"w",
|
||||
"x",
|
||||
"y",
|
||||
"z",
|
||||
"\u0251",
|
||||
"\u00e6",
|
||||
"\u0283",
|
||||
"\u0291",
|
||||
"\u00e7",
|
||||
"\u026f",
|
||||
"\u026a",
|
||||
"\u0254",
|
||||
"\u025b",
|
||||
"\u0279",
|
||||
"\u00f0",
|
||||
"\u0259",
|
||||
"\u026b",
|
||||
"\u0265",
|
||||
"\u0278",
|
||||
"\u028a",
|
||||
"\u027e",
|
||||
"\u0292",
|
||||
"\u03b8",
|
||||
"\u03b2",
|
||||
"\u014b",
|
||||
"\u0266",
|
||||
"\u207c",
|
||||
"\u02b0",
|
||||
"`",
|
||||
"^",
|
||||
"#",
|
||||
"*",
|
||||
"=",
|
||||
"\u02c8",
|
||||
"\u02cc",
|
||||
"\u2192",
|
||||
"\u2193",
|
||||
"\u2191",
|
||||
" ",
|
||||
"ɣ",
|
||||
"ɡ",
|
||||
"r",
|
||||
"ɲ",
|
||||
"ʝ",
|
||||
"ʎ",
|
||||
"ː"
|
||||
]
|
||||
}
|
||||
1
melo/text/fr_phonemizer/example_ipa.txt
Normal file
1
melo/text/fr_phonemizer/example_ipa.txt
Normal file
File diff suppressed because one or more lines are too long
89
melo/text/fr_phonemizer/fr_symbols.json
Normal file
89
melo/text/fr_phonemizer/fr_symbols.json
Normal file
@@ -0,0 +1,89 @@
|
||||
{
|
||||
"symbols": [
|
||||
"_",
|
||||
",",
|
||||
".",
|
||||
"!",
|
||||
"?",
|
||||
"-",
|
||||
"~",
|
||||
"\u2026",
|
||||
"N",
|
||||
"Q",
|
||||
"a",
|
||||
"b",
|
||||
"d",
|
||||
"e",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"i",
|
||||
"j",
|
||||
"k",
|
||||
"l",
|
||||
"m",
|
||||
"n",
|
||||
"o",
|
||||
"p",
|
||||
"s",
|
||||
"t",
|
||||
"u",
|
||||
"v",
|
||||
"w",
|
||||
"x",
|
||||
"y",
|
||||
"z",
|
||||
"\u0251",
|
||||
"\u00e6",
|
||||
"\u0283",
|
||||
"\u0291",
|
||||
"\u00e7",
|
||||
"\u026f",
|
||||
"\u026a",
|
||||
"\u0254",
|
||||
"\u025b",
|
||||
"\u0279",
|
||||
"\u00f0",
|
||||
"\u0259",
|
||||
"\u026b",
|
||||
"\u0265",
|
||||
"\u0278",
|
||||
"\u028a",
|
||||
"\u027e",
|
||||
"\u0292",
|
||||
"\u03b8",
|
||||
"\u03b2",
|
||||
"\u014b",
|
||||
"\u0266",
|
||||
"\u207c",
|
||||
"\u02b0",
|
||||
"`",
|
||||
"^",
|
||||
"#",
|
||||
"*",
|
||||
"=",
|
||||
"\u02c8",
|
||||
"\u02cc",
|
||||
"\u2192",
|
||||
"\u2193",
|
||||
"\u2191",
|
||||
" ",
|
||||
"\u0263",
|
||||
"\u0261",
|
||||
"r",
|
||||
"\u0272",
|
||||
"\u029d",
|
||||
"\u028e",
|
||||
"\u02d0",
|
||||
|
||||
"\u0303",
|
||||
"\u0153",
|
||||
"\u00f8",
|
||||
"\u0281",
|
||||
"\u0252",
|
||||
"\u028c",
|
||||
"\u2014",
|
||||
"\u025c",
|
||||
"\u0250"
|
||||
]
|
||||
}
|
||||
30
melo/text/fr_phonemizer/fr_to_ipa.py
Normal file
30
melo/text/fr_phonemizer/fr_to_ipa.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from .cleaner import french_cleaners
|
||||
from .gruut_wrapper import Gruut
|
||||
|
||||
|
||||
def remove_consecutive_t(input_str):
    """Drop runs of three or more consecutive 't' characters.

    Runs of one or two 't's are kept untouched; longer runs are deleted
    entirely (a gruut/espeak artifact in the phoneme stream).
    """
    pieces = []
    run = 0
    for ch in input_str:
        if ch == "t":
            run += 1
            continue
        if run < 3:
            pieces.append("t" * run)
        run = 0
        pieces.append(ch)
    # Flush a trailing run of 't's.
    if run < 3:
        pieces.append("t" * run)
    return "".join(pieces)
|
||||
|
||||
def fr2ipa(text):
    """Phonemize French `text` to an IPA string via the gruut backend."""
    phonemizer = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
    # text = french_cleaners(text)
    ipa = phonemizer.phonemize(text, separator="")
    # Collapse espeak artifacts: runs of three or more "t" phonemes.
    return remove_consecutive_t(ipa)
|
||||
48
melo/text/fr_phonemizer/french_abbreviations.py
Normal file
48
melo/text/fr_phonemizer/french_abbreviations.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import re

# List of (regular expression, replacement) pairs for abbreviations in French.
# The first group matches the dotted form ("Mme."), case-insensitively; the
# second group matches the bare (dotless) form, case-sensitively.
# Fixes vs. the original: duplicate ("M", "monsieur") and ("boul", "boulevard")
# entries removed; the dotless patterns now end with \b so that e.g. "\bMme"
# no longer rewrites the prefix of "Mmes" (which produced "Madames" instead
# of "Mesdames").
abbreviations_fr = [
    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
    for x in [
        ("M", "monsieur"),
        ("Mlle", "mademoiselle"),
        ("Mlles", "mesdemoiselles"),
        ("Mme", "Madame"),
        ("Mmes", "Mesdames"),
        ("N.B", "nota bene"),
        ("p.c.q", "parce que"),
        ("Pr", "professeur"),
        ("qqch", "quelque chose"),
        ("rdv", "rendez-vous"),
        ("max", "maximum"),
        ("min", "minimum"),
        ("no", "numéro"),
        ("adr", "adresse"),
        ("dr", "docteur"),
        ("st", "saint"),
        ("co", "companie"),
        ("jr", "junior"),
        ("sgt", "sergent"),
        ("capt", "capitain"),
        ("col", "colonel"),
        ("av", "avenue"),
        ("av. J.-C", "avant Jésus-Christ"),
        ("apr. J.-C", "après Jésus-Christ"),
        ("art", "article"),
        ("boul", "boulevard"),
        ("c.-à-d", "c’est-à-dire"),
        ("etc", "et cetera"),
        ("ex", "exemple"),
        ("excl", "exclusivement"),
    ]
] + [
    (re.compile("\\b%s\\b" % x[0]), x[1])
    for x in [
        ("Mlle", "mademoiselle"),
        ("Mlles", "mesdemoiselles"),
        ("Mme", "Madame"),
        ("Mmes", "Mesdames"),
    ]
]
|
||||
1
melo/text/fr_phonemizer/french_symbols.txt
Normal file
1
melo/text/fr_phonemizer/french_symbols.txt
Normal file
@@ -0,0 +1 @@
|
||||
_,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ
|
||||
258
melo/text/fr_phonemizer/gruut_wrapper.py
Normal file
258
melo/text/fr_phonemizer/gruut_wrapper.py
Normal file
@@ -0,0 +1,258 @@
|
||||
import importlib
|
||||
from typing import List
|
||||
|
||||
import gruut
|
||||
from gruut_ipa import IPA # pip install gruut_ipa
|
||||
|
||||
from .base import BasePhonemizer
|
||||
from .punctuation import Punctuation
|
||||
|
||||
# Table for str.translate to fix gruut/TTS phoneme mismatch
|
||||
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
|
||||
|
||||
|
||||
class Gruut(BasePhonemizer):
    """Gruut wrapper for G2P

    Args:
        language (str):
            Valid language code for the used backend.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.

        keep_puncs (bool):
            If true, keep the punctuations after phonemization. Defaults to True.

        use_espeak_phonemes (bool):
            If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.

        keep_stress (bool):
            If true, keep the stress characters after phonemization. Defaults to False.

    Example:

        >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
        >>> phonemizer = Gruut('en-us')
        >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
        'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
    """

    def __init__(
        self,
        language: str,
        punctuations=Punctuation.default_puncs(),
        keep_puncs=True,
        use_espeak_phonemes=False,
        keep_stress=False,
    ):
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
        self.use_espeak_phonemes = use_espeak_phonemes
        self.keep_stress = keep_stress

    @staticmethod
    def name():
        return "gruut"

    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:  # pylint: disable=unused-argument
        """Convert input text to phonemes.

        Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters
        that constitude a single sound.

        It doesn't affect 🐸TTS since it individually converts each character to token IDs.

        Examples::
            "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`

        Args:
            text (str):
                Text to be converted to phonemes.

            tie (bool, optional) : When True use a '͡' character between
                consecutive characters of a single phoneme. Else separate phoneme
                with '_'. This option requires espeak>=1.49. Default to False.
        """
        word_lists = []
        for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
            for word in sentence:
                if word.is_break:
                    # Breaks are represented by their actual character (e.g. a
                    # comma), appended to the previous word when one exists.
                    if word_lists:
                        word_lists[-1].append(word.text)
                    else:
                        # First token is punctuation
                        word_lists.append([word.text])
                elif word.phonemes:
                    current = []
                    for raw_phoneme in word.phonemes:
                        phoneme = raw_phoneme
                        if not self.keep_stress:
                            # Remove primary/secondary stress markers
                            phoneme = IPA.without_stress(phoneme)
                        # Map gruut's "g" to the IPA "ɡ" expected by TTS
                        phoneme = phoneme.translate(GRUUT_TRANS_TABLE)
                        if phoneme:
                            # Flatten multi-character phonemes into characters
                            current.extend(phoneme)
                    if current:
                        word_lists.append(current)

        joined_words = [separator.join(chars) for chars in word_lists]
        return f"{separator} ".join(joined_words)

    def _phonemize(self, text, separator):
        return self.phonemize_gruut(text, separator, tie=False)

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return gruut.is_language_supported(language)

    @staticmethod
    def supported_languages() -> List:
        """Get a dictionary of supported languages.

        Returns:
            List: List of language codes.
        """
        return list(gruut.get_supported_languages())

    def version(self):
        """Get the version of the used backend.

        Returns:
            str: Version of the used backend.
        """
        return gruut.__version__

    @classmethod
    def is_available(cls):
        """Return True if the gruut package is importable."""
        return importlib.util.find_spec("gruut") is not None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Offline utility: phonemize a metadata file and collect the symbol
    # inventory (baseline English set plus any new French IPA characters).
    from cleaner import french_cleaners
    import json

    e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)

    # Baseline symbol inventory (en + sp); every entry is a single character,
    # so it can be expressed as one string expanded to a list.
    symbols = list(
        "_,.!?-~\u2026NQabdefghijklmnopstuvwxyz"
        "\u0251\u00e6\u0283\u0291\u00e7\u026f\u026a\u0254\u025b\u0279"
        "\u00f0\u0259\u026b\u0265\u0278\u028a\u027e\u0292\u03b8\u03b2"
        "\u014b\u0266\u207c\u02b0`^#*=\u02c8\u02cc\u2192\u2193\u2191 "
        "\u0263\u0261r\u0272\u029d\u028e\u02d0"
    )

    with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r') as f:
        lines = f.readlines()

    used_sym = []
    not_existed_sym = []
    phonemes = []

    for line in lines:
        # metadata format: "...|...|text" — phonemize the last field.
        text = line.split('|')[-1].strip()
        text = french_cleaners(text)
        ipa = e.phonemize(text, separator="")
        phonemes.append(ipa)
        for s in ipa:
            if s not in symbols:
                if s not in not_existed_sym:
                    print(f'not_existed char: {s}')
                    not_existed_sym.append(s)
            else:
                if s not in used_sym:
                    used_sym.append(s)

    print(used_sym)
    print(not_existed_sym)

    # Persist the merged inventory (one-char strings, no separators/newlines).
    with open('./text/fr_phonemizer/french_symbols.txt', 'w') as g:
        g.writelines(symbols + not_existed_sym)

    with open('./text/fr_phonemizer/example_ipa.txt', 'w') as g:
        g.writelines(phonemes)

    data = {'symbols': symbols + not_existed_sym}

    with open('./text/fr_phonemizer/fr_symbols.json', 'w') as f:
        json.dump(data, f, indent=4)
|
||||
|
||||
172
melo/text/fr_phonemizer/punctuation.py
Normal file
172
melo/text/fr_phonemizer/punctuation.py
Normal file
@@ -0,0 +1,172 @@
|
||||
import collections
|
||||
import re
|
||||
from enum import Enum
|
||||
|
||||
import six
|
||||
|
||||
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
|
||||
|
||||
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
|
||||
|
||||
|
||||
class PuncPosition(Enum):
|
||||
"""Enum for the punctuations positions"""
|
||||
|
||||
BEGIN = 0
|
||||
END = 1
|
||||
MIDDLE = 2
|
||||
ALONE = 3
|
||||
|
||||
|
||||
class Punctuation:
|
||||
"""Handle punctuations in text.
|
||||
|
||||
Just strip punctuations from text or strip and restore them later.
|
||||
|
||||
Args:
|
||||
puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
|
||||
|
||||
Example:
|
||||
>>> punc = Punctuation()
|
||||
>>> punc.strip("This is. example !")
|
||||
'This is example'
|
||||
|
||||
>>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
|
||||
>>> ' '.join(text_striped)
|
||||
'This is example'
|
||||
|
||||
>>> text_restored = punc.restore(text_striped, punc_map)
|
||||
>>> text_restored[0]
|
||||
'This is. example !'
|
||||
"""
|
||||
|
||||
def __init__(self, puncs: str = _DEF_PUNCS):
|
||||
self.puncs = puncs
|
||||
|
||||
@staticmethod
|
||||
def default_puncs():
|
||||
"""Return default set of punctuations."""
|
||||
return _DEF_PUNCS
|
||||
|
||||
@property
|
||||
def puncs(self):
|
||||
return self._puncs
|
||||
|
||||
@puncs.setter
|
||||
def puncs(self, value):
|
||||
if not isinstance(value, six.string_types):
|
||||
raise ValueError("[!] Punctuations must be of type str.")
|
||||
self._puncs = "".join(list(dict.fromkeys(list(value)))) # remove duplicates without changing the oreder
|
||||
self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")
|
||||
|
||||
def strip(self, text):
|
||||
"""Remove all the punctuations by replacing with `space`.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
|
||||
Example::
|
||||
|
||||
"This is. example !" -> "This is example "
|
||||
"""
|
||||
return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip()
|
||||
|
||||
def strip_to_restore(self, text):
|
||||
"""Remove punctuations from text to restore them later.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
|
||||
Examples ::
|
||||
|
||||
"This is. example !" -> [["This is", "example"], [".", "!"]]
|
||||
|
||||
"""
|
||||
text, puncs = self._strip_to_restore(text)
|
||||
return text, puncs
|
||||
|
||||
def _strip_to_restore(self, text):
|
||||
"""Auxiliary method for Punctuation.preserve()"""
|
||||
matches = list(re.finditer(self.puncs_regular_exp, text))
|
||||
if not matches:
|
||||
return [text], []
|
||||
# the text is only punctuations
|
||||
if len(matches) == 1 and matches[0].group() == text:
|
||||
return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
|
||||
# build a punctuation map to be used later to restore punctuations
|
||||
puncs = []
|
||||
for match in matches:
|
||||
position = PuncPosition.MIDDLE
|
||||
if match == matches[0] and text.startswith(match.group()):
|
||||
position = PuncPosition.BEGIN
|
||||
elif match == matches[-1] and text.endswith(match.group()):
|
||||
position = PuncPosition.END
|
||||
puncs.append(_PUNC_IDX(match.group(), position))
|
||||
# convert str text to a List[str], each item is separated by a punctuation
|
||||
splitted_text = []
|
||||
for idx, punc in enumerate(puncs):
|
||||
split = text.split(punc.punc)
|
||||
prefix, suffix = split[0], punc.punc.join(split[1:])
|
||||
splitted_text.append(prefix)
|
||||
# if the text does not end with a punctuation, add it to the last item
|
||||
if idx == len(puncs) - 1 and len(suffix) > 0:
|
||||
splitted_text.append(suffix)
|
||||
text = suffix
|
||||
return splitted_text, puncs
|
||||
|
||||
@classmethod
|
||||
def restore(cls, text, puncs):
|
||||
"""Restore punctuation in a text.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
puncs (List[str]): The list of punctuations map to be used for restoring.
|
||||
|
||||
Examples ::
|
||||
|
||||
['This is', 'example'], ['.', '!'] -> "This is. example!"
|
||||
|
||||
"""
|
||||
return cls._restore(text, puncs, 0)
|
||||
|
||||
@classmethod
|
||||
def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
|
||||
"""Auxiliary method for Punctuation.restore()"""
|
||||
if not puncs:
|
||||
return text
|
||||
|
||||
# nothing have been phonemized, returns the puncs alone
|
||||
if not text:
|
||||
return ["".join(m.punc for m in puncs)]
|
||||
|
||||
current = puncs[0]
|
||||
|
||||
if current.position == PuncPosition.BEGIN:
|
||||
return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
|
||||
|
||||
if current.position == PuncPosition.END:
|
||||
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
|
||||
|
||||
if current.position == PuncPosition.ALONE:
|
||||
return [current.mark] + cls._restore(text, puncs[1:], num + 1)
|
||||
|
||||
# POSITION == MIDDLE
|
||||
if len(text) == 1: # pragma: nocover
|
||||
# a corner case where the final part of an intermediate
|
||||
# mark (I) has not been phonemized
|
||||
return cls._restore([text[0] + current.punc], puncs[1:], num)
|
||||
|
||||
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
|
||||
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# punc = Punctuation()
|
||||
# text = "This is. This is, example!"
|
||||
|
||||
# print(punc.strip(text))
|
||||
|
||||
# split_text, puncs = punc.strip_to_restore(text)
|
||||
# print(split_text, " ---- ", puncs)
|
||||
|
||||
# restored_text = punc.restore(split_text, puncs)
|
||||
# print(restored_text)
|
||||
Reference in New Issue
Block a user