rename

2024-02-19 23:15:47 +00:00
parent 734228934f
commit db237ce6a5
62 changed files with 14 additions and 17 deletions
--- a/melo/text/fr_phonemizer/cleaner.py
+++ b/melo/text/fr_phonemizer/cleaner.py
@@ -0,0 +1,122 @@
+"""Set of default text cleaners"""
+# TODO: pick the cleaner for languages dynamically
+
+import re
+from .french_abbreviations import abbreviations_fr
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r"\s+")
+
+
+rep_map = {
+    "：": ",",
+    "；": ",",
+    "，": ",",
+    "。": ".",
+    "！": "!",
+    "？": "?",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "...": ".",
+    "…": ".",
+    "$": ".",
+    "“": "",
+    "”": "",
+    "‘": "",
+    "’": "",
+    "（": "",
+    "）": "",
+    "(": "",
+    ")": "",
+    "《": "",
+    "》": "",
+    "【": "",
+    "】": "",
+    "[": "",
+    "]": "",
+    "—": "",
+    "～": "-",
+    "~": "-",
+    "「": "",
+    "」": "",
+    "¿" : "",
+    "¡" : ""
+}
+
+
+def replace_punctuation(text):
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+    return replaced_text
+
+def expand_abbreviations(text, lang="fr"):
+    if lang == "fr":
+        _abbreviations = abbreviations_fr
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+
+def remove_punctuation_at_begin(text):
+    return re.sub(r'^[,.!?]+', '', text)
+
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
+    return text
+
+
+def replace_symbols(text, lang="en"):
+    """Replace symbols based on the lenguage tag.
+
+    Args:
+      text:
+       Input text.
+      lang:
+        Lenguage identifier. ex: "en", "fr", "pt", "ca".
+
+    Returns:
+      The modified text
+      example:
+        input args:
+            text: "si l'avi cau, diguem-ho"
+            lang: "ca"
+        Output:
+            text: "si lavi cau, diguemho"
+    """
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    elif lang== "es":
+        text=text.replace("&","y")
+        text = text.replace("'", "")
+    return text
+
+def french_cleaners(text):
+    """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
+    text = expand_abbreviations(text, lang="fr")
+    # text = lowercase(text) # as we use the cased bert
+    text = replace_punctuation(text)
+    text = replace_symbols(text, lang="fr")
+    text = remove_aux_symbols(text)
+    text = remove_punctuation_at_begin(text)
+    text = collapse_whitespace(text)
+    text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
+    return text
+