Enhance text splitting

This commit is contained in:
mrfakename
2024-02-26 15:51:05 -08:00
committed by GitHub
parent 60d18a08fe
commit 5daa5c978d

View File

@@ -4,7 +4,7 @@ import glob
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf
import torchaudio import torchaudio
from txtsplit import txtsplit
def split_sentence(text, min_len=10, language_str='EN'): def split_sentence(text, min_len=10, language_str='EN'):
if language_str in ['EN', 'FR', 'ES', 'SP', 'DE', 'RU']: if language_str in ['EN', 'FR', 'ES', 'SP', 'DE', 'RU']:
sentences = split_sentences_latin(text, min_len=min_len) sentences = split_sentences_latin(text, min_len=min_len)
@@ -18,26 +18,27 @@ def split_sentences_latin(text, min_len=10):
text = re.sub('[“”]', '"', text) text = re.sub('[“”]', '"', text)
text = re.sub('[‘’]', "'", text) text = re.sub('[‘’]', "'", text)
text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text) text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
return [item.strip() for item in txtsplit(text, 512, 512) if item.strip()]
# 将文本中的换行符、空格和制表符替换为空格 # 将文本中的换行符、空格和制表符替换为空格
text = re.sub('[\n\t ]+', ' ', text) # text = re.sub('[\n\t ]+', ' ', text)
# 在标点符号后添加一个空格 # # 在标点符号后添加一个空格
text = re.sub('([,.!?;])', r'\1 $#!', text) # text = re.sub('([,.!?;])', r'\1 $#!', text)
# 分隔句子并去除前后空格 # # 分隔句子并去除前后空格
sentences = [s.strip() for s in text.split('$#!')] # sentences = [s.strip() for s in text.split('$#!')]
if len(sentences[-1]) == 0: del sentences[-1] # if len(sentences[-1]) == 0: del sentences[-1]
new_sentences = [] # new_sentences = []
new_sent = [] # new_sent = []
count_len = 0 # count_len = 0
for ind, sent in enumerate(sentences): # for ind, sent in enumerate(sentences):
# print(sent) # # print(sent)
new_sent.append(sent) # new_sent.append(sent)
count_len += len(sent.split(" ")) # count_len += len(sent.split(" "))
if count_len > min_len or ind == len(sentences) - 1: # if count_len > min_len or ind == len(sentences) - 1:
count_len = 0 # count_len = 0
new_sentences.append(' '.join(new_sent)) # new_sentences.append(' '.join(new_sent))
new_sent = [] # new_sent = []
return merge_short_sentences_en(new_sentences) # return merge_short_sentences_en(new_sentences)
def split_sentences_zh(text, min_len=10): def split_sentences_zh(text, min_len=10):
text = re.sub('[。!?;]', '.', text) text = re.sub('[。!?;]', '.', text)