From 5daa5c978d8e09cddfc8216867be11a6b166e5f3 Mon Sep 17 00:00:00 2001 From: mrfakename Date: Mon, 26 Feb 2024 15:51:05 -0800 Subject: [PATCH] Enhance text splitting --- melo/split_utils.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/melo/split_utils.py b/melo/split_utils.py index 4bba978..9158e2a 100644 --- a/melo/split_utils.py +++ b/melo/split_utils.py @@ -4,7 +4,7 @@ import glob import numpy as np import soundfile as sf import torchaudio - +from txtsplit import txtsplit def split_sentence(text, min_len=10, language_str='EN'): if language_str in ['EN', 'FR', 'ES', 'SP', 'DE', 'RU']: sentences = split_sentences_latin(text, min_len=min_len) @@ -18,26 +18,27 @@ def split_sentences_latin(text, min_len=10): text = re.sub('[“”]', '"', text) text = re.sub('[‘’]', "'", text) text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) + return [item.strip() for item in txtsplit(text, 512, 512) if item.strip()] # 将文本中的换行符、空格和制表符替换为空格 - text = re.sub('[\n\t ]+', ' ', text) - # 在标点符号后添加一个空格 - text = re.sub('([,.!?;])', r'\1 $#!', text) - # 分隔句子并去除前后空格 - sentences = [s.strip() for s in text.split('$#!')] - if len(sentences[-1]) == 0: del sentences[-1] + # text = re.sub('[\n\t ]+', ' ', text) + # # 在标点符号后添加一个空格 + # text = re.sub('([,.!?;])', r'\1 $#!', text) + # # 分隔句子并去除前后空格 + # sentences = [s.strip() for s in text.split('$#!')] + # if len(sentences[-1]) == 0: del sentences[-1] - new_sentences = [] - new_sent = [] - count_len = 0 - for ind, sent in enumerate(sentences): - # print(sent) - new_sent.append(sent) - count_len += len(sent.split(" ")) - if count_len > min_len or ind == len(sentences) - 1: - count_len = 0 - new_sentences.append(' '.join(new_sent)) - new_sent = [] - return merge_short_sentences_en(new_sentences) + # new_sentences = [] + # new_sent = [] + # count_len = 0 + # for ind, sent in enumerate(sentences): + # # print(sent) + # new_sent.append(sent) + # count_len += len(sent.split(" ")) + # if count_len > min_len or ind == len(sentences) - 1: + # count_len = 0 + # new_sentences.append(' '.join(new_sent)) + # new_sent = [] + # return merge_short_sentences_en(new_sentences) def split_sentences_zh(text, min_len=10): text = re.sub('[。!?;]', '.', text) @@ -127,4 +128,4 @@ if __name__ == '__main__': print(split_sentence(sp_text, language_str='SP')) print(split_sentence(fr_text, language_str='FR')) print(split_sentence(de_text, language_str='DE')) - print(split_sentence(ru_text, language_str='RU')) \ No newline at end of file + print(split_sentence(ru_text, language_str='RU'))