This commit is contained in:
qinzy
2024-02-19 23:15:47 +00:00
parent 734228934f
commit db237ce6a5
62 changed files with 14 additions and 17 deletions

View File

@@ -0,0 +1,122 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
from .french_abbreviations import abbreviations_fr
# Precompiled pattern matching a run of one or more whitespace characters;
# collapse_whitespace() uses it to squeeze each run into a single space.
_whitespace_re = re.compile(r"\s+")
# Punctuation normalisation table consumed by replace_punctuation(): each key
# is a literal substring to search for, each value is what replaces it (an
# empty value deletes the match).
#
# NOTE(review): in the copy of this file under review every key outside
# Latin-1 had been stripped to "", collapsing 22 entries into one empty-string
# key.  Those keys are restored below to the best of the reviewer's knowledge
# and written as \u escapes so they survive re-encoding -- confirm them
# against the repository original.
rep_map = {
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\n": ".",
    "\u00b7": ",",  # MIDDLE DOT
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "...": ".",
    "\u2026": ".",  # HORIZONTAL ELLIPSIS
    "$": ".",
    "\u201c": "",  # LEFT DOUBLE QUOTATION MARK
    "\u201d": "",  # RIGHT DOUBLE QUOTATION MARK
    "\u2018": "",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "",  # RIGHT SINGLE QUOTATION MARK
    "\uff08": "",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "",  # FULLWIDTH RIGHT PARENTHESIS
    "(": "",
    ")": "",
    "\u300a": "",  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": "",  # RIGHT DOUBLE ANGLE BRACKET
    "\u3010": "",  # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "",  # RIGHT BLACK LENTICULAR BRACKET
    "[": "",
    "]": "",
    "\u2014": "",  # EM DASH
    "\uff5e": "-",  # FULLWIDTH TILDE
    "~": "-",
    "\u300c": "",  # LEFT CORNER BRACKET
    "\u300d": "",  # RIGHT CORNER BRACKET
    "\u00bf": "",  # INVERTED QUESTION MARK
    "\u00a1": "",  # INVERTED EXCLAMATION MARK
}
def replace_punctuation(text, mapping=None):
    """Replace every substring listed in a punctuation table by its mapped value.

    Args:
        text:
            Input text.
        mapping:
            Optional dict of ``{substring: replacement}``. When omitted, the
            module-level ``rep_map`` table is used.

    Returns:
        The text with every occurrence of a table key replaced by the
        corresponding value. Text is returned unchanged when the table holds
        no usable (non-empty) key.
    """
    if mapping is None:
        mapping = rep_map
    # An empty key adds an empty alternative that matches at every position;
    # placed ahead of the other keys it prevents them from ever matching.
    keys = [key for key in mapping if key]
    if not keys:
        return text
    # Regex alternation takes the first alternative that matches, so longer
    # keys (e.g. "...") must be tried before any shorter key they start with.
    keys.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: mapping[match.group()], text)
def expand_abbreviations(text, lang="fr"):
    """Expand known abbreviations in ``text`` for the given language.

    Args:
        text:
            Input text.
        lang:
            Language identifier. Only "fr" has an abbreviation table here.

    Returns:
        The text after applying every ``(regex, replacement)`` pair of the
        French abbreviation table; the text unchanged for any other language.
    """
    # Languages without an abbreviation table are passed through untouched
    # instead of failing on an unbound table variable.
    if lang != "fr":
        return text
    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze every run of whitespace into one space and trim both ends."""
    single_spaced = _whitespace_re.sub(" ", text)
    return single_spaced.strip()
def remove_punctuation_at_begin(text):
    """Drop any run of ``,`` ``.`` ``!`` ``?`` found at the very start of ``text``."""
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes and guillemets."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»]+")
    cleaned = aux_symbols.sub("", text)
    return cleaned
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ";" and ":" become ","; "-" becomes a space (or is removed for "ca");
    "&" is spelled out as the language's word for "and"; apostrophes are
    removed for "ca" and "es". For any other language tag "&" is left as is.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text

    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    # Every conjunction is space-padded so that "R&D" becomes "R and D"
    # rather than gluing the word to its neighbours.
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    # NOTE(review): apostrophe removal for "es" is assumed to belong to the
    # Spanish branch only, mirroring "ca" -- confirm against the original.
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def french_cleaners(text):
    """Pipeline for French text.

    Numbers are not expanded here because the phonemizer already does that.
    """
    expanded = expand_abbreviations(text, lang="fr")
    # No lowercasing step: the cased BERT is used.
    normalised = replace_symbols(replace_punctuation(expanded), lang="fr")
    stripped = remove_punctuation_at_begin(remove_aux_symbols(normalised))
    compact = collapse_whitespace(stripped)
    # Append "." when the last character is not one of . , ! ? - …
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', compact)