first commit

2024-02-19 17:49:56 +00:00
parent 736366e546
commit aeb2fa60e4
69 changed files with 139400 additions and 0 deletions
--- a/MyShellTTSBase/text/korean.py
+++ b/MyShellTTSBase/text/korean.py
@@ -0,0 +1,192 @@
+# Convert Korean text to jamo-level phonemes (g2pkk + jamo) and align them
+# with BERT word-piece tokens.
+import re
+import unicodedata
+
+from transformers import AutoTokenizer
+
+from . import punctuation, symbols
+
+
+from num2words import num2words
+from MyShellTTSBase.text.ko_dictionary import english_dictionary, etc_dictionary
+from anyascii import anyascii
+from jamo import hangul_to_jamo
+
+def normalize(text):
+    """Clean raw input text before grapheme-to-phoneme conversion.
+
+    Steps, in order: trim surrounding whitespace, delete every character
+    matched by the Han-character class below, apply the ``etc_dictionary``
+    substitutions, replace English words listed in ``english_dictionary``
+    and finally lower-case the result.
+    """
+    trimmed = text.strip()
+    without_han = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", trimmed)
+    substituted = normalize_with_dictionary(without_han, etc_dictionary)
+    return normalize_english(substituted).lower()
+
+
+def normalize_with_dictionary(text, dic):
+    """Replace every occurrence of a key of ``dic`` in ``text`` by its value.
+
+    Keys are matched literally (they are regex-escaped).  Longer keys are
+    tried first so that a key which is a prefix of another key (for example
+    ``"a"`` and ``"ab"``) cannot shadow the longer one: regex alternation
+    picks the first alternative that matches, not the longest.
+
+    Args:
+        text: Input string.
+        dic: Mapping from literal substrings to their replacement strings.
+
+    Returns:
+        ``text`` with all substitutions applied, or ``text`` unchanged when
+        none of the keys occurs in it.
+    """
+    # An empty key would match at every position, so it is ignored.
+    keys = sorted((key for key in dic if key), key=len, reverse=True)
+    if not any(key in text for key in keys):
+        return text
+    pattern = re.compile("|".join(re.escape(key) for key in keys))
+    return pattern.sub(lambda match: dic[match.group()], text)
+
+
+def normalize_english(text):
+    """Replace ASCII-letter words that have an entry in ``english_dictionary``.
+
+    Every maximal run of ``A-Z``/``a-z`` letters is looked up exactly as
+    written; runs without an entry are left untouched.
+    """
+    def lookup(match):
+        token = match.group()
+        return english_dictionary.get(token, token)
+
+    return re.sub("([A-Za-z]+)", lookup, text)
+
+
+g2p_kr = None
+def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
+    """Normalize ``text``, run it through g2pkk and return the phoneme string.
+
+    With ``character == "english"`` the g2pkk output is transliterated with
+    ``anyascii``.  For any other value it is decomposed into jamo, so the
+    result renders like the input but is made of different code points:
+
+        input  '하늘' : U+D558 U+B298                       (하 + 늘)
+        output        : U+1112 U+1161 U+1102 U+1173 U+11AF  (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
+
+    The ``G2p`` instance is created on first use and cached in the
+    module-level ``g2p_kr``.
+    """
+    global g2p_kr  # pylint: disable=global-statement
+    if g2p_kr is None:
+        from g2pkk import G2p
+
+        g2p_kr = G2p()
+
+    pronounced = g2p_kr(normalize(text))
+    if character == "english":
+        return anyascii(pronounced)
+    # Split each Hangul syllable block into its individual jamo.
+    return "".join(hangul_to_jamo(pronounced))
+
+def text_normalize(text):
+    """Return ``text`` cleaned by ``normalize``.
+
+    Thin entry point; all of the work is delegated to ``normalize``.
+    """
+    return normalize(text)
+
+
+def distribute_phone(n_phone, n_word):
+    """Split ``n_phone`` phones across ``n_word`` slots as evenly as possible.
+
+    Every slot receives ``n_phone // n_word`` phones and the first
+    ``n_phone % n_word`` slots receive one extra.  This is the result of
+    repeatedly handing the next phone to the first least-loaded slot, but it
+    is computed directly in O(n_word).
+
+    Args:
+        n_phone: Number of phones to place.
+        n_word: Number of slots (tokens) to place them in.
+
+    Returns:
+        A list of ``n_word`` counts whose sum is ``n_phone``.
+
+    Raises:
+        ValueError: If there are phones to place but no slot to hold them.
+    """
+    if n_word <= 0:
+        if n_phone > 0:
+            raise ValueError("cannot distribute phones over zero words")
+        return []
+    if n_phone <= 0:
+        return [0] * n_word
+    base, extra = divmod(n_phone, n_word)
+    return [base + 1] * extra + [base] * (n_word - extra)
+
+
+
+# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
+
+# Model id handed to AutoTokenizer below; get_bert_feature() forwards the
+# same id so tokenization and feature extraction refer to one checkpoint.
+model_id = 'kykim/bert-kor-base'
+# Created once when this module is imported and used by g2p().
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+def g2p(norm_text):
+    """Convert normalized text to phones, tones and a token-to-phone alignment.
+
+    ``norm_text`` is split with the module-level ``tokenizer``.  A token that
+    starts with ``"#"`` is merged (with every ``"#"`` removed) into the group
+    opened by the preceding token; each group is then treated as one word:
+
+    * ``[UNK]`` becomes the single phone ``"_"``;
+    * a word found in ``punctuation`` is emitted unchanged as a single phone;
+    * anything else goes through ``korean_text_to_phonemes`` and the
+      characters of the result are spread over the group's tokens with
+      ``distribute_phone``.
+
+    Returns:
+        ``(phones, tones, word2ph)``.  ``phones`` is wrapped in ``"_"`` at
+        both ends, ``tones`` is all zeros with the same length, and
+        ``word2ph`` holds one count per token plus a leading and trailing
+        ``1`` for the two padding phones.
+
+    NOTE(review): if the very first token starts with ``"#"`` there is no
+    group to merge into and an ``IndexError`` is raised - confirm the
+    tokenizer cannot produce that.
+    """
+    tokenized = tokenizer.tokenize(norm_text)
+
+    # Gather word pieces into per-word groups.
+    groups = []
+    for token in tokenized:
+        if token.startswith("#"):
+            groups[-1].append(token.replace("#", ""))
+        else:
+            groups.append([token])
+
+    phs = []
+    word2ph = []
+    for pieces in groups:
+        word = "".join(pieces)
+        if word == '[UNK]':
+            phs.append('_')
+            word2ph.append(1)
+            continue
+        if word in punctuation:
+            phs.append(word)
+            word2ph.append(1)
+            continue
+
+        phonemes = korean_text_to_phonemes(word)
+        per_piece = distribute_phone(len(phonemes), len(pieces))
+        assert len(per_piece) == len(pieces)
+        word2ph.extend(per_piece)
+        # ``phonemes`` is a string, so this adds one entry per character.
+        phs.extend(phonemes)
+
+    phones = ["_", *phs, "_"]
+    tones = [0] * len(phones)
+    word2ph = [1, *word2ph, 1]
+    assert len(word2ph) == len(tokenized) + 2
+    return phones, tones, word2ph
+
+def get_bert_feature(text, word2ph, device='cuda'):
+    """Compute BERT features for ``text`` using this module's ``model_id``.
+
+    Delegates to ``japanese_bert.get_bert_feature`` (imported on call),
+    forwarding ``text``, ``word2ph`` and ``device`` unchanged.
+    """
+    from .japanese_bert import get_bert_feature as _bert_feature
+
+    return _bert_feature(text, word2ph, device=device, model_id=model_id)
+
+
+if __name__ == "__main__":
+    # Ad-hoc development script: run every voice line of a local dataset
+    # through the pipeline and record phones that are missing from
+    # ``symbols``.
+    import json
+
+    from tqdm import tqdm
+
+    from text.symbols import symbols
+
+    dataset_path = '/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'
+    with open(dataset_path, encoding='utf-8') as f:
+        genshin_data = json.load(f)
+
+    new_symbols = []
+    for item in tqdm(genshin_data.values()):
+        texts = item.get('voiceContent', '')
+        if isinstance(texts, list):
+            texts = ','.join(texts)
+        # Skip entries without any voice text (None or empty).
+        if not texts:
+            continue
+
+        # Normalize this entry's text, not a fixed sample sentence.
+        text = text_normalize(texts)
+        phones, tones, word2ph = g2p(text)
+        # The result is not used; the call only checks that feature
+        # extraction runs for this entry.
+        get_bert_feature(text, word2ph)
+        for ph in phones:
+            if ph not in symbols and ph not in new_symbols:
+                new_symbols.append(ph)
+                print('update!, now symbols:')
+                print(new_symbols)
+                # Rewrite the file on every discovery so progress survives
+                # an interrupted run.
+                with open('korean_symbol.txt', 'w', encoding='utf-8') as f:
+                    f.write(f'{new_symbols}')
+
+        
+
+# if __name__ == '__main__':
+#     from pykakasi import kakasi
+#     # Initialize kakasi object
+#     kakasi = kakasi()
+
+#     # Set options for converting kanji and katakana to hiragana
+#     kakasi.setMode("J", "H")  # Kanji to Hiragana
+#     kakasi.setMode("K", "H")  # Katakana to Hiragana
+
+#     # Convert the Japanese text to hiragana
+#     conv = kakasi.getConverter()
+#     katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?')  # Replace with your Japanese text
+
+#     print(katakana_text)  # Output: ニーハオセカイ