From 5daa5c978d8e09cddfc8216867be11a6b166e5f3 Mon Sep 17 00:00:00 2001
From: mrfakename <me@mrfake.name>
Date: Mon, 26 Feb 2024 15:51:05 -0800
Subject: [PATCH] Enhance text splitting: delegate split_sentences_latin to txtsplit

---
 melo/split_utils.py | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/melo/split_utils.py b/melo/split_utils.py
index 4bba978..9158e2a 100644
--- a/melo/split_utils.py
+++ b/melo/split_utils.py
@@ -4,7 +4,7 @@ import glob
 import numpy as np
 import soundfile as sf
 import torchaudio
-
+from txtsplit import txtsplit
 def split_sentence(text, min_len=10, language_str='EN'):
     if language_str in ['EN', 'FR', 'ES', 'SP', 'DE', 'RU']:
         sentences = split_sentences_latin(text, min_len=min_len)
@@ -18,26 +18,27 @@ def split_sentences_latin(text, min_len=10):
     text = re.sub('[“”]', '"', text)
     text = re.sub('[‘’]', "'", text)
     text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
+    return [item.strip() for item in txtsplit(text, 512, 512) if item.strip()]
     # 将文本中的换行符、空格和制表符替换为空格
-    text = re.sub('[\n\t ]+', ' ', text)
-    # 在标点符号后添加一个空格
-    text = re.sub('([,.!?;])', r'\1 $#!', text)
-    # 分隔句子并去除前后空格
-    sentences = [s.strip() for s in text.split('$#!')]
-    if len(sentences[-1]) == 0: del sentences[-1]
+    # text = re.sub('[\n\t ]+', ' ', text)
+    # # 在标点符号后添加一个空格
+    # text = re.sub('([,.!?;])', r'\1 $#!', text)
+    # # 分隔句子并去除前后空格
+    # sentences = [s.strip() for s in text.split('$#!')]
+    # if len(sentences[-1]) == 0: del sentences[-1]
 
-    new_sentences = []
-    new_sent = []
-    count_len = 0
-    for ind, sent in enumerate(sentences):
-        # print(sent)
-        new_sent.append(sent)
-        count_len += len(sent.split(" "))
-        if count_len > min_len or ind == len(sentences) - 1:
-            count_len = 0
-            new_sentences.append(' '.join(new_sent))
-            new_sent = []
-    return merge_short_sentences_en(new_sentences)
+    # new_sentences = []
+    # new_sent = []
+    # count_len = 0
+    # for ind, sent in enumerate(sentences):
+    #     # print(sent)
+    #     new_sent.append(sent)
+    #     count_len += len(sent.split(" "))
+    #     if count_len > min_len or ind == len(sentences) - 1:
+    #         count_len = 0
+    #         new_sentences.append(' '.join(new_sent))
+    #         new_sent = []
+    # return merge_short_sentences_en(new_sentences)
 
 def split_sentences_zh(text, min_len=10):
     text = re.sub('[。！？；]', '.', text)
@@ -127,4 +128,4 @@ if __name__ == '__main__':
     print(split_sentence(sp_text, language_str='SP'))
     print(split_sentence(fr_text, language_str='FR'))
     print(split_sentence(de_text, language_str='DE'))
-    print(split_sentence(ru_text, language_str='RU'))
\ No newline at end of file
+    print(split_sentence(ru_text, language_str='RU'))