first commit

This commit is contained in:
qinzy
2024-02-19 17:49:56 +00:00
parent 736366e546
commit aeb2fa60e4
69 changed files with 139400 additions and 0 deletions

10
.gitignore vendored Normal file
View File

@@ -0,0 +1,10 @@
__pycache__/
.ipynb_checkpoints/
basetts_outputs_use_bert/
basetts_outputs/
multilingual_ckpts
basetts_outputs_package/
*.egg-info/
*.zip
*.wav

View File

113
MyShellTTSBase/api.py Normal file
View File

@@ -0,0 +1,113 @@
import os
import re
import json
import torch
import librosa
import soundfile
import torchaudio
import numpy as np
import torch.nn as nn
from . import utils
from . import commons
from .models import SynthesizerTrn
from .split_utils import split_sentence
from .mel_processing import spectrogram_torch, spectrogram_torch_conv
from .download_utils import load_or_download_config, load_or_download_model
class TTS(nn.Module):
    """Text-to-speech front end around a ``SynthesizerTrn`` model.

    The constructor obtains the hyper-parameters and checkpoint for
    ``language`` through ``load_or_download_config`` and
    ``load_or_download_model``, builds the synthesizer on ``device`` and puts
    it in eval mode.
    """

    def __init__(self,
                 language,
                 device='cuda:0'):
        """Build the model for ``language`` and load its weights.

        Args:
            language: language key understood by the download helpers
                (for example ``'EN'`` or ``'EN_V2'``).
            device: torch device string; any value containing ``'cuda'``
                requires CUDA to be available.
        """
        super().__init__()
        if 'cuda' in device:
            assert torch.cuda.is_available()

        hps = load_or_download_config(language)

        num_languages = hps.num_languages
        num_tones = hps.num_tones
        symbols = hps.symbols

        model = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            num_tones=num_tones,
            num_languages=num_languages,
            **hps.model,
        ).to(device)

        model.eval()
        self.model = model
        self.symbol_to_id = {s: i for i, s in enumerate(symbols)}
        self.hps = hps
        self.device = device

        # load state_dict
        checkpoint_dict = load_or_download_model(language, device)
        self.model.load_state_dict(checkpoint_dict['model'], strict=True)

        # Drop any version suffix ('EN_V2' -> 'EN') before picking the text front end.
        language = language.split('_')[0]
        self.language = 'ZH_MIX_EN' if language == 'ZH' else language  # we support a ZH_MIX_EN model

    @staticmethod
    def audio_numpy_concat(segment_data_list, sr, speed=1.):
        """Join audio segments, appending a short silence after each one.

        Args:
            segment_data_list: iterable of array-likes; each is flattened.
            sr: sampling rate, used to size the silence (0.05 s / ``speed``).
            speed: playback speed factor that shortens the silence.

        Returns:
            A 1-D ``float32`` numpy array. An empty input yields an empty array.
        """
        segments = [
            np.asarray(segment_data).reshape(-1).astype(np.float32)
            for segment_data in segment_data_list
        ]
        if not segments:
            return np.zeros(0, dtype=np.float32)

        # Concatenate arrays directly instead of round-tripping every sample
        # through a Python list of floats.
        gap = np.zeros(max(int((sr * 0.05) / speed), 0), dtype=np.float32)
        pieces = []
        for segment in segments:
            pieces.append(segment)
            pieces.append(gap)
        return np.concatenate(pieces)

    @staticmethod
    def split_sentences_into_pieces(text, language):
        """Split ``text`` with ``split_sentence`` and print the resulting pieces."""
        texts = split_sentence(text, language_str=language)
        print(" > Text splitted to sentences.")
        print('\n'.join(texts))
        print(" > ===========================")
        return texts

    def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text: input text; it is split into pieces before synthesis.
            speaker_id: integer speaker index passed to the model.
            output_path: when given, the audio is written there with
                ``soundfile``; when ``None`` the audio array is returned.
            sdp_ratio, noise_scale, noise_scale_w: forwarded to ``model.infer``.
            speed: speaking rate; the model receives ``length_scale=1/speed``.

        Returns:
            The concatenated ``float32`` audio when ``output_path`` is ``None``,
            otherwise ``None``.
        """
        language = self.language
        device = self.device
        texts = self.split_sentences_into_pieces(text, language)
        audio_list = []
        for t in texts:
            if language in ['EN', 'ZH_MIX_EN']:
                # Insert a space at lower->upper case boundaries ("fooBar" -> "foo Bar").
                t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
            bert, ja_bert, phones, tones, lang_ids = utils.get_text_for_tts_infer(t, language, self.hps, device, self.symbol_to_id)
            with torch.no_grad():
                x_tst = phones.to(device).unsqueeze(0)
                tones = tones.to(device).unsqueeze(0)
                lang_ids = lang_ids.to(device).unsqueeze(0)
                bert = bert.to(device).unsqueeze(0)
                ja_bert = ja_bert.to(device).unsqueeze(0)
                x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
                del phones
                speakers = torch.LongTensor([speaker_id]).to(device)
                audio = self.model.infer(
                    x_tst,
                    x_tst_lengths,
                    speakers,
                    tones,
                    lang_ids,
                    bert,
                    ja_bert,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=1. / speed,
                )[0][0, 0].data.cpu().float().numpy()
                # Release the per-sentence tensors before the next iteration.
                del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers
            audio_list.append(audio)
        torch.cuda.empty_cache()
        audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)

        if output_path is None:
            return audio
        else:
            soundfile.write(output_path, audio, self.hps.data.sampling_rate)

View File

@@ -0,0 +1,459 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from . import commons
import logging
logger = logging.getLogger(__name__)
class LayerNorm(nn.Module):
    """Layer normalization over dimension 1 of the input.

    ``gamma`` (initialised to ones) and ``beta`` (initialised to zeros) are
    learnable vectors of length ``channels``.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # F.layer_norm works on the trailing dimension, so swap dim 1 to the
        # end, normalize, and swap it back.
        channels_last = torch.transpose(x, 1, -1)
        normalized = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return torch.transpose(normalized, 1, -1)
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: add the two inputs, then multiply tanh of the first
    # n_channels[0] channels (dim 1) by sigmoid of the remaining channels.
    split_at = n_channels[0]
    summed = input_a + input_b
    signal = torch.tanh(summed[:, :split_at, :])
    gate = torch.sigmoid(summed[:, split_at:, :])
    return signal * gate
class Encoder(nn.Module):
    """Stack of self-attention + feed-forward layers with optional conditioning.

    When ``gin_channels`` is passed as a non-zero keyword argument, a linear
    projection of the conditioning input ``g`` is added to the hidden state
    just before layer ``cond_layer_idx`` (keyword argument, default 2).
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=4,
        isflow=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        # An index equal to n_layers is never reached in forward(), which
        # leaves conditioning switched off unless configured below.
        self.cond_layer_idx = self.n_layers
        if "gin_channels" in kwargs:
            self.gin_channels = kwargs["gin_channels"]
            if self.gin_channels != 0:
                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
                self.cond_layer_idx = kwargs.get("cond_layer_idx", 2)
                assert (
                    self.cond_layer_idx < self.n_layers
                ), "cond_layer_idx should be less than n_layers"

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        """Run all layers on ``x``, re-applying ``x_mask`` on the way out."""
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        )
        for idx, (attention, norm_1, feed_forward, norm_2) in enumerate(layers):
            if g is not None and idx == self.cond_layer_idx:
                # Project the conditioning input and add it to the hidden state.
                projected = self.spk_emb_linear(g.transpose(1, 2)).transpose(1, 2)
                x = (x + projected) * x_mask
            x = norm_1(x + self.drop(attention(x, x, attn_mask)))
            x = norm_2(x + self.drop(feed_forward(x, x_mask)))
        return x * x_mask
class Decoder(nn.Module):
    """Stack of masked self-attention, encoder-decoder attention and FFN layers."""

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask built by commons.subsequent_mask.
        causal_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attention, norm_0, cross_attention, norm_1, feed_forward, norm_2 in layers:
            x = norm_0(x + self.drop(self_attention(x, x, causal_mask)))
            x = norm_1(x + self.drop(cross_attention(x, h, cross_mask)))
            x = norm_2(x + self.drop(feed_forward(x, x_mask)))
        return x * x_mask
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from 1x1 ``Conv1d`` projections.

    Optional extras, each enabled through a constructor argument:
      * ``window_size``: learned relative-position embeddings for keys and
        values (self-attention only; asserted in ``attention``).
      * ``proximal_bias``: adds ``-log(1 + |i - j|)`` to the scores.
      * ``block_length``: masks scores outside a band of that width (only
        evaluated when an attention mask is supplied).
      * ``proximal_init``: copies the query projection into the key projection.

    The attention weights of the most recent call are kept in ``self.attn``.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention over already-projected inputs.

        Returns ``(output, p_attn)`` where ``output`` is ``[b, d, t_t]`` and
        ``p_attn`` is ``[b, n_h, t_t, t_s]``.
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            # A large negative fill drives masked positions to ~0 after softmax.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embedding rows used for ``length`` positions.

        The table is zero-padded along dim 1 when ``length`` exceeds
        ``window_size + 1`` and sliced when it is shorter.
        """
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
    """Two-convolution feed-forward block with masking around each convolution.

    ``causal=True`` pads only on the left before each convolution; otherwise
    the padding is split between both sides. ``activation="gelu"`` uses the
    ``x * sigmoid(1.702 * x)`` form; any other value uses ReLU.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Choose the padding strategy once, at construction time.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Left-pad the last dimension by ``kernel_size - 1``."""
        if self.kernel_size == 1:
            return x
        pad_spec = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))

    def _same_padding(self, x):
        """Pad both ends of the last dimension so the convolution keeps its length."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        pad_spec = [[0, 0], [0, 0], [left, right]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))

160
MyShellTTSBase/commons.py Normal file
View File

@@ -0,0 +1,160 @@
import math
import torch
from torch.nn import functional as F
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name of ``m`` contains "Conv".

    Modules of any other class are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``.

    For odd kernel sizes this is the symmetric padding that keeps a dilated
    1-D convolution's output as long as its input.
    """
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.

    The result is the flat list layout that ``torch.nn.functional.pad`` takes.
    """
    flattened = []
    for pair in reversed(pad_shape):
        flattened.extend(pair)
    return flattened
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after the elements of ``lst``."""
    woven = [item]
    for element in lst:
        woven.append(element)
        woven.append(item)
    return woven
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    # Element-wise KL between Gaussians given means (m_*) and log standard
    # deviations (logs_*).
    squared_mean_gap = (m_p - m_q) ** 2
    variance_p = torch.exp(2.0 * logs_p)
    inverse_variance_q = torch.exp(-2.0 * logs_q)

    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (variance_p + squared_mean_gap) * inverse_variance_q
    return kl
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform draws into [0.00001, 0.99999) so neither log hits 0.
    uniform = torch.rand(shape) * 0.99998 + 0.00001
    inner = torch.log(uniform)
    return -torch.log(-inner)
def rand_gumbel_like(x):
    """Draw Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` along the last dim from each batch item.

    ``ids_str[i]`` is the start index of the window for batch item ``i``.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for batch_idx in range(x.size(0)):
        start = ids_str[batch_idx]
        segments[batch_idx] = x[batch_idx, :, start : start + segment_size]
    return segments
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a randomly positioned window of ``segment_size`` from each batch item.

    Start indices are drawn uniformly so the window ends within ``x_lengths``
    (the full last-dimension length when ``x_lengths`` is None).

    Returns ``(segments, ids_str)`` where ``ids_str`` holds the start indices.
    """
    batch_size, _, total_length = x.size()
    if x_lengths is None:
        x_lengths = total_length
    max_start = x_lengths - segment_size + 1
    ids_str = (torch.rand([batch_size]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines, over geometrically spaced timescales from
    ``min_timescale`` to ``max_timescale``. An odd ``channels`` gets one
    trailing all-zero row.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # Clamp the denominator: with channels of 2 or 3 there is a single
    # timescale and ``num_timescales - 1`` would divide by zero.
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(
        num_timescales - 1, 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Add one zero row when channels is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the timing signal from ``get_timing_signal_1d`` to a 3-D tensor ``x``."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the timing signal from ``get_timing_signal_1d`` to ``x`` along ``axis``."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
def subsequent_mask(length):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
    lower_triangle = torch.ones(length, length).tril()
    return lower_triangle[None, None]
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: add the two inputs, then multiply tanh of the first
    # n_channels[0] channels (dim 1) by sigmoid of the remaining channels.
    split_at = n_channels[0]
    summed = input_a + input_b
    signal = torch.tanh(summed[:, :split_at, :])
    gate = torch.sigmoid(summed[:, split_at:, :])
    return signal * gate
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.

    The result is the flat list layout that ``torch.nn.functional.pad`` takes.
    """
    # NOTE(review): this module defines convert_pad_shape twice with the same
    # behaviour; this later definition is the one bound at import time.
    flattened = []
    for pair in reversed(pad_shape):
        flattened.extend(pair)
    return flattened
def shift_1d(x):
    """Shift a 3-D tensor one step right along the last dim, zero-filling the front."""
    left_padded = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))
    # Drop the final step so the length is unchanged.
    return left_padded[:, :, :-1]
def sequence_mask(length, max_length=None):
    """Return a boolean mask with row ``i`` true for positions below ``length[i]``.

    ``length`` is a 1-D tensor; the mask width is ``max_length``, or
    ``length.max()`` when ``max_length`` is None.
    """
    width = length.max() if max_length is None else max_length
    positions = torch.arange(width, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    batch, _, t_y, t_x = mask.shape
    # Cumulative durations give the exclusive end position of each input step.
    ends = torch.cumsum(duration, -1).view(batch * t_x)
    covered = sequence_mask(ends, t_y).to(mask.dtype).view(batch, t_x, t_y)
    # Subtract the previous step's coverage so each step keeps only its own span.
    previous = F.pad(covered, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    spans = covered - previous
    return spans.unsqueeze(1).transpose(2, 3) * mask
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients to ``[-clip_value, clip_value]`` in place and return their norm.

    Args:
        parameters: a tensor or an iterable of tensors; entries without a
            gradient are ignored.
        clip_value: clamp bound, or None to only measure the norm.
        norm_type: order of the norm.

    Returns:
        The total gradient norm, each parameter's share being measured before
        that parameter's gradient is clamped.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    should_clip = clip_value is not None
    if should_clip:
        clip_value = float(clip_value)

    accumulated = 0
    for param in with_grad:
        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
        if should_clip:
            param.grad.data.clamp_(min=-clip_value, max=clip_value)
    return accumulated ** (1.0 / norm_type)

View File

@@ -0,0 +1,47 @@
import torch
import os
from . import utils
# Checkpoint download URL per supported language key.
DOWNLOAD_CKPT_URLS = {
'EN': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN/checkpoint.pth',
'EN_V2': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN_V2/checkpoint.pth',
'FR': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/FR/checkpoint.pth',
'JP': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/JP/checkpoint.pth',
'ES': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/ES/checkpoint.pth',
'ZH': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/ZH/checkpoint.pth',
'KR': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/KR/checkpoint.pth',
}
# Config download URL per supported language key; keys mirror DOWNLOAD_CKPT_URLS.
DOWNLOAD_CONFIG_URLS = {
'EN': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN/config.json',
'EN_V2': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN_V2/config.json',
'FR': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/FR/config.json',
'JP': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/JP/config.json',
'ES': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/ES/config.json',
'ZH': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/ZH/config.json',
'KR': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/KR/config.json',
}
def load_or_download_config(locale):
    """Load the hyper-parameter file for ``locale``, downloading it if needed.

    The language key is the part of ``locale`` before the first ``-``,
    upper-cased. The config is cached under
    ``~/.local/share/openvoice/basespeakers/<LANGUAGE>/config.json``. If the
    cached copy cannot be loaded for any reason it is fetched again with
    ``wget`` and loaded a second time.

    Raises:
        AssertionError: the language has no entry in ``DOWNLOAD_CONFIG_URLS``.
        subprocess.CalledProcessError: ``wget`` exited with a non-zero status.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    language = locale.split('-')[0].upper()
    assert language in DOWNLOAD_CONFIG_URLS
    config_path = os.path.expanduser(f'~/.local/share/openvoice/basespeakers/{language}/config.json')
    try:
        return utils.get_hparams_from_file(config_path)
    except Exception:
        # Missing or unreadable cache entry: download, then load again.
        # ``except Exception`` lets KeyboardInterrupt / SystemExit propagate.
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        # Argument list (no shell) and check=True so a failed download is
        # reported instead of being silently ignored.
        subprocess.run(
            ['wget', DOWNLOAD_CONFIG_URLS[language], '-O', config_path],
            check=True,
        )
    return utils.get_hparams_from_file(config_path)
def load_or_download_model(locale, device):
    """Load the checkpoint for ``locale`` onto ``device``, downloading it if needed.

    The language key is the part of ``locale`` before the first ``-``,
    upper-cased. The checkpoint is cached under
    ``~/.local/share/openvoice/basespeakers/<LANGUAGE>/checkpoint.pth``. If the
    cached copy cannot be loaded for any reason it is fetched again with
    ``wget`` and loaded a second time.

    Raises:
        AssertionError: the language has no entry in ``DOWNLOAD_CKPT_URLS``.
        subprocess.CalledProcessError: ``wget`` exited with a non-zero status.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    language = locale.split('-')[0].upper()
    assert language in DOWNLOAD_CKPT_URLS
    ckpt_path = os.path.expanduser(f'~/.local/share/openvoice/basespeakers/{language}/checkpoint.pth')
    # NOTE(review): torch.load unpickles the file, so the download source has
    # to be trusted; consider ``weights_only=True`` where the installed torch
    # version supports it.
    try:
        return torch.load(ckpt_path, map_location=device)
    except Exception:
        # Missing or corrupt cache entry: download, then load again.
        # ``except Exception`` lets KeyboardInterrupt / SystemExit propagate.
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        # Argument list (no shell) and check=True so a failed download is
        # reported instead of being silently ignored.
        subprocess.run(
            ['wget', DOWNLOAD_CKPT_URLS[language], '-O', ckpt_path],
            check=True,
        )
    return torch.load(ckpt_path, map_location=device)

View File

@@ -0,0 +1,174 @@
import torch
import torch.utils.data
import librosa
from librosa.filters import mel as librosa_mel_fn
MAX_WAV_VALUE = 32768.0
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after flooring it at ``clip_val``.

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied to ``x`` before the log
    """
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
def dynamic_range_decompression_torch(x, C=1):
    """
    Undo the log compression: ``exp(x) / C``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
mel_basis = {}
hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram of ``y`` with ``torch.stft``.

    Args:
        y: waveform tensor; a channel dim is inserted for reflect padding and
            removed again, so a ``[batch, time]`` layout is assumed — TODO confirm.
        n_fft: FFT size.
        sampling_rate: not used by this function.
        hop_size: hop length between frames.
        win_size: Hann window length.
        center: forwarded to ``torch.stft``.

    Returns:
        ``sqrt(re^2 + im^2 + 1e-6)`` of the STFT.

    A message is printed when samples fall outside ``[-1.1, 1.1]``. Hann
    windows are cached in the module-level ``hann_window`` dict, keyed by
    window size, dtype and device.
    """
    # Compute each reduction once and reuse it for the check and the message.
    y_min = torch.min(y)
    if y_min < -1.1:
        print("min value is ", y_min)
    y_max = torch.max(y)
    if y_max > 1.1:
        print("max value is ", y_max)

    global hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (..., 2) real/imag pairs, which is the same layout.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    # The small epsilon keeps the square root away from zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram using a convolution-based STFT.

    The STFT is a strided ``conv1d`` with a windowed Fourier basis. On every
    call the result is compared with ``torch.stft`` and an ``AssertionError``
    is raised when they differ by more than ``atol=1e-4``. Only
    ``center=False`` is supported. ``sampling_rate`` is not used.

    Returns:
        ``sqrt(re^2 + im^2 + 1e-6)`` of the convolutional STFT.
    """
    import torch.nn.functional as F

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')

    # ******************** ConvSTFT ************************#
    freq_cutoff = n_fft // 2 + 1
    # The DFT of the identity matrix gives the Fourier basis as (real, imag) pairs.
    fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
    # Stack the real rows and then the imaginary rows as conv filters of width n_fft.
    forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
    forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()

    assert center is False

    forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size)
    spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1)

    # ******************** Verification ************************#
    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (..., 2) real/imag pairs, which is the same layout.
    spec1 = torch.view_as_real(
        torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                   center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    )
    assert torch.allclose(spec1, spec2, atol=1e-4)

    spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
    return spec
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filter bank and log-compress it.

    The filter bank comes from ``librosa_mel_fn`` and is cached in the
    module-level ``mel_basis`` dict, keyed by ``fmax``, dtype and device.
    """
    global mel_basis
    cache_key = str(fmax) + "_" + (str(spec.dtype) + "_" + str(spec.device))
    if cache_key not in mel_basis:
        filter_bank = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[cache_key] = torch.from_numpy(filter_bank).to(
            dtype=spec.dtype, device=spec.device
        )
    mel = torch.matmul(mel_basis[cache_key], spec)
    return spectral_normalize_torch(mel)
def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    """Compute a log-compressed mel spectrogram of ``y``.

    Steps: reflect-pad by ``(n_fft - hop_size) / 2`` on each side, take the
    STFT magnitude ``sqrt(re^2 + im^2 + 1e-6)``, project it onto the
    ``librosa_mel_fn`` filter bank, then apply ``spectral_normalize_torch``.

    The mel filter bank and the Hann window are cached in the module-level
    ``mel_basis`` / ``hann_window`` dicts, keyed by ``fmax`` / ``win_size``
    together with dtype and device.
    """
    global mel_basis, hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    fmax_dtype_device = str(fmax) + "_" + dtype_device
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
            dtype=y.dtype, device=y.device
        )
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (..., 2) real/imag pairs, which is the same layout.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec

1038
MyShellTTSBase/models.py Normal file

File diff suppressed because it is too large Load Diff

598
MyShellTTSBase/modules.py Normal file
View File

@@ -0,0 +1,598 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm
from . import commons
from .commons import init_weights, get_padding
from .transforms import piecewise_rational_quadratic_transform
from .attentions import Encoder
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
    """Layer normalization over dimension 1 of the input.

    ``gamma`` (initialised to ones) and ``beta`` (initialised to zeros) are
    learnable vectors of length ``channels``.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # Swap dim 1 to the end for F.layer_norm, then swap it back.
        swapped = torch.transpose(x, 1, -1)
        normed = F.layer_norm(swapped, (self.channels,), self.gamma, self.beta, self.eps)
        return torch.transpose(normed, 1, -1)
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a residual output.

    The output is ``(x + proj(stack(x))) * x_mask``. ``proj`` is a 1x1
    convolution whose weight and bias start at zero, so the module initially
    returns ``x * x_mask``.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        out_channels,
        kernel_size,
        n_layers,
        p_dropout,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message now states the bound the condition actually enforces.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-initialise the projection so the residual branch starts at zero.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Apply the stack to ``x`` and add the projected result back onto the input."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index.
            dilation = kernel_size**layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Run the residual layers; ``g``, when given, is added to ``x`` first."""
        if g is not None:
            x = x + g
        layers = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for depthwise, norm_1, pointwise, norm_2 in layers:
            y = F.gelu(norm_1(depthwise(x * x_mask)))
            y = F.gelu(norm_2(pointwise(y)))
            x = x + self.drop(y)
        return x * x_mask
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Layer ``i`` uses dilation ``dilation_rate ** i``. Every layer except the
    last produces ``2 * hidden_channels`` outputs, split into a residual half
    that updates ``x`` and a skip half that is summed into the output; the
    last layer produces only the skip part. When ``gin_channels`` is non-zero,
    a 1x1 convolution turns the conditioning input ``g`` into one slice per
    layer.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # Store the plain integer; the previous one-element tuple looked accidental.
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")

        for i in range(n_layers):
            dilation = dilation_rate**i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of the skip outputs of all layers."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Take this layer's slice of the conditioning projection.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half feeds the residual path, second half the skip sum.
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the module."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs.

    ``dilation`` supplies the dilation of the first convolution of each pair;
    the second convolution of each pair always uses dilation 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # Weight-normalised, length-preserving convolution.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs1 = nn.ModuleList([make_conv(dilation[k]) for k in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; ``x_mask`` (if given) masks every stage."""
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            h = F.leaky_relu(x, LRELU_SLOPE)
            h = dilated_conv(h * x_mask if use_mask else h)
            h = F.leaky_relu(h, LRELU_SLOPE)
            h = plain_conv(h * x_mask if use_mask else h)
            x = h + x
        return x * x_mask if use_mask else x

    def remove_weight_norm(self):
        """Strip weight normalisation from all six convolutions."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
class ResBlock2(torch.nn.Module):
    """Residual block of two dilated convolutions, each with its own skip."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()

        def make_conv(rate):
            # Weight-normalised, length-preserving convolution.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs = nn.ModuleList([make_conv(dilation[k]) for k in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; ``x_mask`` (if given) masks every stage."""
        use_mask = x_mask is not None
        for conv in self.convs:
            h = F.leaky_relu(x, LRELU_SLOPE)
            h = conv(h * x_mask if use_mask else h)
            x = h + x
        return x * x_mask if use_mask else x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
class Log(nn.Module):
    """Element-wise log transform (forward) / exp transform (reverse)."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` going forward, or just ``x`` in reverse."""
        if reverse:
            return torch.exp(x) * x_mask
        # Inputs are clamped to at least 1e-5 before taking the log.
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return y, torch.sum(-y, [1, 2])
class Flip(nn.Module):
    """Reverse the order of dimension 1 of the input."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Return ``(flipped, zeros)`` going forward, or just ``flipped`` in reverse."""
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        # One zero log-determinant per batch item, matching the input dtype/device.
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
class ElementwiseAffine(nn.Module):
    """Learned per-channel shift ``m`` and log-scale ``logs``."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Both parameters start at zero, i.e. the identity transform.
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` going forward, or the inverse of ``x`` in reverse."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer whose shift/log-scale are predicted by a ``WN`` stack.

    The input is split in two along dimension 1; the first half is passed
    through unchanged and drives the transform applied to the second half.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # Zero-initialised output projection: the layer starts as the identity.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` going forward, or the inverted ``x`` in reverse."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask
        if self.mean_only:
            m, logs = stats, torch.zeros_like(stats)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        return torch.cat([x0, x1], 1), torch.sum(logs, [1, 2])
class ConvFlow(nn.Module):
    """Coupling layer that transforms half the channels with a rational-quadratic spline.

    The spline parameters are predicted from the other half of the channels
    by a ``DDSConv`` stack followed by a zero-initialised projection.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        n_layers,
        num_bins=10,
        tail_bound=5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        # num_bins widths + num_bins heights + (num_bins - 1) derivatives per channel.
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
        )
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` going forward, or the inverted ``x`` in reverse."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        params = self.convs(self.pre(x0), x_mask, g=g)
        params = self.proj(params) * x_mask

        b, c, t = x0.shape
        # [b, c * k, t] -> [b, c, t, k]
        params = params.reshape(b, c, -1, t).permute(0, 1, 3, 2)

        scale = math.sqrt(self.filter_channels)
        bins = self.num_bins
        unnormalized_widths = params[..., :bins] / scale
        unnormalized_heights = params[..., bins : 2 * bins] / scale
        unnormalized_derivatives = params[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        out = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return out
        return out, logdet
class TransformerCouplingLayer(nn.Module):
    """Affine coupling layer whose shift/log-scale come from an ``Encoder``.

    The input is split in two along dimension 1; the first half is passed
    through unchanged and drives the transform applied to the second half.
    A pre-built encoder can be shared between layers through
    ``wn_sharing_parameter``; otherwise a new ``Encoder`` is created.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        n_layers,
        n_heads,
        p_dropout=0,
        filter_channels=0,
        mean_only=False,
        wn_sharing_parameter=None,
        gin_channels=0,
    ):
        assert n_layers == 3, n_layers
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = (
            Encoder(
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
                isflow=True,
                gin_channels=gin_channels,
            )
            if wn_sharing_parameter is None
            else wn_sharing_parameter
        )
        # Zero-initialised output projection: the layer starts as the identity.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply (or invert) the coupling transform.

        Args:
            x: input tensor, split into two halves along dimension 1.
            x_mask: mask multiplied into the intermediate and output tensors.
            g: optional conditioning forwarded to the encoder.
            reverse: when true, invert the transform.

        Returns:
            ``(x, logdet)`` going forward, or only ``x`` in reverse.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet

        x1 = (x1 - m) * torch.exp(-logs) * x_mask
        return torch.cat([x0, x1], 1)

View File

@@ -0,0 +1,130 @@
import re
import os
import glob
import numpy as np
import soundfile as sf
import torchaudio
def split_sentence(text, min_len=10, language_str='EN'):
    """Split *text* into chunks using the splitter that matches the language.

    Languages listed below use the word-counting splitter; every other
    language tag falls back to the character-counting splitter.
    """
    word_based = ('EN', 'FR', 'ES', 'SP', 'DE', 'RU')
    splitter = split_sentences_latin if language_str in word_based else split_sentences_zh
    return splitter(text, min_len=min_len)
def split_sentences_latin(text, min_len=10):
    """Split space-delimited text into chunks of more than ``min_len`` words.

    Punctuation is first normalised to ASCII, bracket/quote characters are
    removed, and the text is cut after every ``, . ! ? ;``.  Consecutive
    pieces are then joined until the running word count exceeds ``min_len``
    (the final piece is always flushed), and very short chunks are merged by
    ``merge_short_sentences_en``.

    Args:
        text: input string.
        min_len: word-count threshold a chunk must exceed before it is emitted.

    Returns:
        List of chunk strings.
    """
    # Map CJK / typographic punctuation onto ASCII equivalents.
    text = re.sub('[\u3002!?;]', '.', text)
    text = re.sub('[\uff0c]', ',', text)
    text = re.sub('[\u201c\u201d]', '"', text)
    text = re.sub('[\u2018\u2019]', "'", text)
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    # Collapse newlines, tabs and spaces into a single space.
    text = re.sub('[\n\t ]+', ' ', text)
    # Append a split marker after every punctuation mark.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Cut at the markers and trim surrounding whitespace.
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0:
        del sentences[-1]

    chunks = []
    pending = []
    word_count = 0
    last_index = len(sentences) - 1
    for ind, sent in enumerate(sentences):
        pending.append(sent)
        word_count += len(sent.split(" "))
        if word_count > min_len or ind == last_index:
            chunks.append(' '.join(pending))
            pending = []
            word_count = 0
    return merge_short_sentences_en(chunks)
def split_sentences_zh(text, min_len=10):
    """Split text into chunks of more than ``min_len`` characters.

    Punctuation is first normalised to ASCII and the text is cut after every
    ``, . ! ? ;``.  Consecutive pieces are then joined until the running
    character count exceeds ``min_len`` (the final piece is always flushed),
    and very short chunks are merged by ``merge_short_sentences_zh``.

    Args:
        text: input string.
        min_len: character-count threshold a chunk must exceed before it is emitted.

    Returns:
        List of chunk strings.
    """
    # Map CJK punctuation onto ASCII equivalents.
    text = re.sub('[\u3002!?;]', '.', text)
    text = re.sub('[\uff0c]', ',', text)
    # Collapse newlines, tabs and spaces into a single space.
    text = re.sub('[\n\t ]+', ' ', text)
    # Append a split marker after every punctuation mark.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Cut at the markers and trim surrounding whitespace.
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0:
        del sentences[-1]

    chunks = []
    pending = []
    char_count = 0
    last_index = len(sentences) - 1
    for ind, sent in enumerate(sentences):
        pending.append(sent)
        char_count += len(sent)
        if char_count > min_len or ind == last_index:
            chunks.append(' '.join(pending))
            pending = []
            char_count = 0
    return merge_short_sentences_zh(chunks)
def merge_short_sentences_en(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence counts as short when it has at most two space-separated
    words.  A short sentence absorbs the sentence that follows it; if the
    final sentence is still short it is appended to the one before it.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge the current one into it.
        if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # A short trailing sentence has no successor: fold it into its predecessor.
    if len(sens_out) >= 2 and len(sens_out[-1].split(" ")) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence counts as short when it has at most two characters.  A short
    sentence absorbs the sentence that follows it; if the final sentence is
    still short it is appended to the one before it.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge the current one into it.
        if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # A short trailing sentence has no successor: fold it into its predecessor.
    if len(sens_out) >= 2 and len(sens_out[-1]) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
if __name__ == '__main__':
    # Manual smoke test: print the chunks produced for one sample text per language tag.
    zh_text = "好的,我来给你讲一个故事吧。从前有一个小姑娘,她叫做小红。小红非常喜欢在森林里玩耍,她经常会和她的小伙伴们一起去探险。有一天,小红和她的小伙伴们走到了森林深处,突然遇到了一只凶猛的野兽。小红的小伙伴们都吓得不敢动弹,但是小红并没有被吓倒,她勇敢地走向野兽,用她的智慧和勇气成功地制服了野兽,保护了她的小伙伴们。从那以后,小红变得更加勇敢和自信,成为了她小伙伴们心中的英雄。"
    en_text = "I didnt know what to do. I said please kill her because it would be better than being kidnapped,” Ben, whose surname CNN is not using for security concerns, said on Wednesday. “Its a nightmare. I said please kill her, dont take her there."
    sp_text = "¡Claro! ¿En qué tema te gustaría que te hable en español? Puedo proporcionarte información o conversar contigo sobre una amplia variedad de temas, desde cultura y comida hasta viajes y tecnología. ¿Tienes alguna preferencia en particular?"
    fr_text = "Bien sûr ! En quelle matière voudriez-vous que je vous parle en français ? Je peux vous fournir des informations ou discuter avec vous sur une grande variété de sujets, que ce soit la culture, la nourriture, les voyages ou la technologie. Avez-vous une préférence particulière ?"
    de_text = 'Es war das Wichtigste was wir sichern wollten da es keine Möglichkeit gab eine 20 Megatonnen- H- Bombe ab zu werfen von einem 30, C124.'
    ru_text = 'Но он был во многом, как-бы, всё равно что сын плантатора, так как являлся сыном человека, у которого было в собственности много чего.'
    # 'ZH' is not in split_sentence's word-based list, so it takes the character-counting splitter.
    print(split_sentence(zh_text, language_str='ZH'))
    print(split_sentence(en_text, language_str='EN'))
    print(split_sentence(sp_text, language_str='SP'))
    print(split_sentence(fr_text, language_str='FR'))
    print(split_sentence(de_text, language_str='DE'))
    print(split_sentence(ru_text, language_str='RU'))

View File

@@ -0,0 +1,35 @@
from .symbols import *
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
def cleaned_text_to_sequence(cleaned_text, tones, language, symbol_to_id=None):
    """Convert cleaned symbols and tones into integer id sequences.

    Args:
        cleaned_text: iterable of symbols to look up.
        tones: iterable of integer tone values, offset by the language's tone start.
        language: key into ``language_tone_start_map`` and ``language_id_map``.
        symbol_to_id: optional symbol-to-id mapping; when falsy the
            module-level default built from ``symbols`` is used.

    Returns:
        Tuple ``(phones, tones, lang_ids)`` of integer lists; ``lang_ids``
        repeats the language id once per phone.
    """
    lookup = symbol_to_id if symbol_to_id else _symbol_to_id
    phones = [lookup[symbol] for symbol in cleaned_text]
    tone_offset = language_tone_start_map[language]
    shifted_tones = [tone + tone_offset for tone in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, shifted_tones, lang_ids
def get_bert(norm_text, word2ph, language, device):
    """Compute BERT features with the backend registered for *language*.

    Only the backend module for the requested language is imported, so
    calling this for one language does not import the others.

    Args:
        norm_text: normalised text passed to the backend.
        word2ph: per-word phone counts passed to the backend.
        language: one of the language tags listed below.
        device: device argument forwarded to the backend.

    Returns:
        Whatever the backend's ``get_bert_feature`` returns.

    Raises:
        KeyError: if *language* has no registered backend.
    """
    from importlib import import_module

    # language tag -> sibling module exposing ``get_bert_feature``
    backend_modules = {
        "ZH": "chinese_bert",
        "EN": "english_bert",
        "JP": "japanese_bert",
        "ZH_MIX_EN": "chinese_mix",
        "FR": "french_bert",
        "SP": "spanish_bert",
        "ES": "spanish_bert",
        "KR": "korean",
    }
    module_name = backend_modules[language]
    backend = import_module(f".{module_name}", package=__package__)
    bert = backend.get_bert_feature(norm_text, word2ph, device)
    return bert

View File

@@ -0,0 +1,199 @@
import os
import re
import cn2an
from pypinyin import lazy_pinyin, Style
from .symbols import punctuation
from .tone_sandhi import ToneSandhi
current_file_path = os.path.dirname(__file__)
# Pinyin syllable -> space-separated phone symbols, read from the
# tab-separated table stored next to this module.  The ``with`` block closes
# the file handle once the table is loaded.
with open(os.path.join(current_file_path, "opencpop-strict.txt")) as _pinyin_table:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1] for line in _pinyin_table
    }
import jieba.posseg as psg
# Punctuation normalisation table: every key found in the text is replaced by
# its value (see replace_punctuation).
# NOTE(review): many keys below are empty strings and therefore collide with
# one another -- presumably full-width/CJK punctuation that was lost somewhere
# along the way; verify against the intended table.
rep_map = {
    "": ",",
    "": ",",
    "": ",",
    "": ".",
    "": "!",
    "": "?",
    "\n": ".",
    "·": ",",
    "": ",",
    "...": "",
    "$": ".",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "(": "'",
    ")": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "[": "'",
    "]": "'",
    "": "-",
    "": "-",
    "~": "-",
    "": "'",
    "": "'",
}
tone_modifier = ToneSandhi()
def replace_punctuation(text):
    """Normalise punctuation via ``rep_map`` and drop unsupported characters.

    Args:
        text: input string.

    Returns:
        The text with ``rep_map`` substitutions applied and every character
        outside U+4E00-U+9FA5 and ``punctuation`` removed.
    """
    # NOTE(review): both replace() calls have empty search and replacement
    # strings as written, which makes them no-ops -- presumably the original
    # characters were lost; verify.
    text = text.replace("", "").replace("", "")
    # One alternation over all rep_map keys, each escaped literally.
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    # Remove runs of characters that are neither in the CJK range nor punctuation.
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
    )
    return replaced_text
def g2p(text):
    """Convert normalised text to phones, tones and a word-to-phone count list.

    The text is split after each punctuation mark, converted by ``_g2p``,
    and the three result lists are padded with one boundary entry on each side.
    """
    boundary = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [piece for piece in re.split(boundary, text) if piece.strip() != ""]
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes it will crash,you can add a try-catch.
    return ["_"] + phones + ["_"], [0] + tones + [0], [1] + word2ph + [1]
def _get_initials_finals(word):
    """Return the pinyin initials and tone-numbered finals of *word* as two lists."""
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    # Pair them up so both lists end at the shorter input's length.
    paired = list(zip(orig_initials, orig_finals))
    return [c for c, _ in paired], [v for _, v in paired]
def _g2p(segments):
    """Convert text segments into phones, tones and per-character phone counts.

    Latin letters are stripped from each segment, the remainder is segmented
    with part-of-speech tags, tone sandhi is applied, and every
    (initial, final) pair is mapped to phone symbols through
    ``pinyin_to_symbol_map``.

    Args:
        segments: iterable of text segments.

    Returns:
        Tuple ``(phones_list, tones_list, word2ph)``.
    """
    # Finals rewritten when they follow an initial.
    v_rep_map = {
        "uei": "ui",
        "iou": "iu",
        "uen": "un",
    }
    # Whole syllables without an initial that get a different spelling.
    pinyin_rep_map = {
        "ing": "ying",
        "i": "yi",
        "in": "yin",
        "u": "wu",
    }
    # First-letter rewrites for the remaining syllables without an initial.
    single_rep_map = {
        "v": "yu",
        "e": "e",
        "i": "y",
        "u": "w",
    }

    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        # Replace all English words in the sentence
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                # Tokens tagged as English are skipped; no debugger breakpoint
                # belongs in library code.
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # Initial and final are identical for punctuation marks.
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # Syllable with an initial.
                    if v_without_tone in v_rep_map:
                        pinyin = c + v_rep_map[v_without_tone]
                elif pinyin in pinyin_rep_map:
                    # Syllable without an initial.
                    pinyin = pinyin_rep_map[pinyin]
                elif pinyin[0] in single_rep_map:
                    pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map, (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph
def text_normalize(text):
    """Spell out numbers with ``cn2an.an2cn`` and normalise punctuation."""
    # Numbers are collected up front; each one replaces its first occurrence.
    for number in re.findall(r"\d+(?:\.?\d+)?", text):
        text = text.replace(number, cn2an.an2cn(number), 1)
    return replace_punctuation(text)
def get_bert_feature(text, word2ph, device=None):
    """Delegate to ``chinese_bert.get_bert_feature`` in this package.

    The import is package-relative, like the other imports in this module,
    so it does not depend on a top-level ``text`` package being importable.
    """
    from . import chinese_bert

    return chinese_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
    # Manual smoke test: normalise a sample sentence, run g2p and print the
    # results together with the BERT feature shape.
    # NOTE(review): this uses an absolute ``text.`` import, unlike the
    # package-relative imports at the top of the module -- confirm how this
    # script is meant to be launched.
    from text.chinese_bert import get_bert_feature
    text = "chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)
    print(phones, tones, word2ph, bert.shape)
# # Example usage
# text = "这是一个示例文本:,你好!这是一个测试...."
# print(g2p_paddle(text)) # output: 这是一个示例文本你好这是一个测试

View File

@@ -0,0 +1,107 @@
import torch
import sys
from transformers import AutoTokenizer, AutoModelForMaskedLM
# model_id = 'hfl/chinese-roberta-wwm-ext-large'
local_path = "./bert/chinese-roberta-wwm-ext-large"
tokenizers = {}
models = {}
def get_bert_feature(text, word2ph, device=None, model_id='hfl/chinese-roberta-wwm-ext-large'):
    """Return phone-level BERT features for *text*.

    The model and tokenizer for ``model_id`` are loaded once and cached in
    the module-level ``models`` / ``tokenizers`` dicts.

    Args:
        text: input string to tokenize.
        word2ph: number of phones per token position; position ``i`` of the
            hidden state is repeated ``word2ph[i]`` times.
        device: target device.  ``"cpu"`` is upgraded to ``"mps"`` on macOS
            when MPS is available; a falsy value defaults to ``"cuda"``.
        model_id: Hugging Face model identifier.

    Returns:
        Tensor with one column per phone (features are transposed).
    """
    # Resolve the device before loading, so the model and its inputs are
    # placed on the same device.
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"

    if model_id not in models:
        models[model_id] = AutoModelForMaskedLM.from_pretrained(
            model_id
        ).to(device)
        tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id)
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        # Third-from-last hidden state of the single batch item, moved to CPU.
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()

    # Repeat each position's feature once per phone.
    phone_level_feature = [
        res[i].repeat(count, 1) for i, count in enumerate(word2ph)
    ]
    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    return phone_level_feature.T
if __name__ == "__main__":
    import torch

    # Random stand-in for word-level features: 38 vectors of 1024 dimensions.
    word_level_feature = torch.rand(38, 1024)
    # Number of phones each of the 38 positions expands to.
    word2phone = [
        1, 2, 1, 2, 2, 1, 2, 2, 1, 2,
        2, 1, 2, 2, 2, 2, 2, 1, 1, 2,
        2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
        2, 2, 1, 2, 2, 2, 2, 1,
    ]

    # Total number of frames after expansion.
    total_frames = sum(word2phone)
    print(word_level_feature.shape)
    print(word2phone)
    expanded = []
    for word_feature, repeats in zip(word_level_feature, word2phone):
        print(word_feature.shape)
        # Repeat each word's feature once per phone.
        expanded.append(word_feature.repeat(repeats, 1))

    phone_level_feature = torch.cat(expanded, dim=0)
    print(phone_level_feature.shape)  # (sum(word2phone), 1024)

View File

@@ -0,0 +1,253 @@
import os
import re
import cn2an
from pypinyin import lazy_pinyin, Style
# from text.symbols import punctuation
from .symbols import language_tone_start_map
from .tone_sandhi import ToneSandhi
from .english import g2p as g2p_en
from transformers import AutoTokenizer
punctuation = ["!", "?", "", ",", ".", "'", "-"]
current_file_path = os.path.dirname(__file__)
# Pinyin syllable -> space-separated phone symbols, read from the
# tab-separated table stored next to this module.  The ``with`` block closes
# the file handle once the table is loaded.
with open(os.path.join(current_file_path, "opencpop-strict.txt")) as _pinyin_table:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1] for line in _pinyin_table
    }
import jieba.posseg as psg
# Punctuation normalisation table: every key found in the text is replaced by
# its value (see replace_punctuation).
# NOTE(review): many keys below are empty strings and therefore collide with
# one another -- presumably full-width/CJK punctuation that was lost somewhere
# along the way; verify against the intended table.
rep_map = {
    "": ",",
    "": ",",
    "": ",",
    "": ".",
    "": "!",
    "": "?",
    "\n": ".",
    "·": ",",
    "": ",",
    "...": "",
    "$": ".",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "(": "'",
    ")": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "[": "'",
    "]": "'",
    "": "-",
    "": "-",
    "~": "-",
    "": "'",
    "": "'",
}
tone_modifier = ToneSandhi()
def replace_punctuation(text):
    """Normalise punctuation via ``rep_map`` and drop unsupported characters.

    Args:
        text: input string.

    Returns:
        The text with ``rep_map`` substitutions applied, every character
        outside U+4E00-U+9FA5, ``_``, ASCII letters, whitespace and
        ``punctuation`` removed, and whitespace runs collapsed to one space.
    """
    # NOTE(review): both replace() calls have empty search and replacement
    # strings as written, which makes them no-ops -- presumably the original
    # characters were lost; verify.
    text = text.replace("", "").replace("", "")
    # One alternation over all rep_map keys, each escaped literally.
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    # Remove runs of characters outside the allowed set.
    replaced_text = re.sub(r"[^\u4e00-\u9fa5_a-zA-Z\s" + "".join(punctuation) + r"]+", "", replaced_text)
    # Collapse whitespace runs.
    replaced_text = re.sub(r"[\s]+", " ", replaced_text)
    return replaced_text
def g2p(text, impl='v2'):
    """Convert mixed Chinese/English text to phones, tones and word-to-phone counts.

    Args:
        text: normalised input text.
        impl: ``'v1'`` selects ``_g2p``, ``'v2'`` selects ``_g2p_v2``.

    Raises:
        NotImplementedError: for any other ``impl`` value.
    """
    boundary = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [piece for piece in re.split(boundary, text) if piece.strip() != ""]
    if impl == 'v1':
        convert = _g2p
    elif impl == 'v2':
        convert = _g2p_v2
    else:
        raise NotImplementedError()
    phones, tones, word2ph = convert(sentences)
    assert sum(word2ph) == len(phones)
    # assert len(word2ph) == len(text)  # Sometimes it will crash,you can add a try-catch.
    # Pad all three sequences with one boundary entry on each side.
    return ["_"] + phones + ["_"], [0] + tones + [0], [1] + word2ph + [1]
def _get_initials_finals(word):
    """Return the pinyin initials and tone-numbered finals of *word* as two lists."""
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    # Pair them up so both lists end at the shorter input's length.
    paired = list(zip(orig_initials, orig_finals))
    return [c for c, _ in paired], [v for _, v in paired]
model_id = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
def _g2p(segments):
    """Convert mixed-language segments into phones, tones and phone counts.

    Words tagged ``"eng"`` by the segmenter are tokenized and sent through
    the English g2p (their tones are shifted by the English tone offset);
    all other words go through the pinyin path.

    Returns:
        Tuple ``(phones_list, tones_list, word2ph)``.
    """
    # Finals rewritten when they follow an initial.
    v_rep_map = {
        "uei": "ui",
        "iou": "iu",
        "uen": "un",
    }
    # Whole syllables without an initial that get a different spelling.
    pinyin_rep_map = {
        "ing": "ying",
        "i": "yi",
        "in": "yin",
        "u": "wu",
    }
    # First-letter rewrites for the remaining syllables without an initial.
    single_rep_map = {
        "v": "yu",
        "e": "e",
        "i": "y",
        "u": "w",
    }

    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        seg_cut = tone_modifier.pre_merge_for_modify(psg.lcut(seg))
        initials = []
        finals = []
        for word, pos in seg_cut:
            if pos == "eng":
                # Marker initial; the final slot carries the raw English word.
                initials.append('EN_WORD')
                finals.append(word)
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            if c == 'EN_WORD':
                tokenized_en = tokenizer.tokenize(v)
                phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en)
                # apply offset to tones_en
                en_offset = language_tone_start_map['EN']
                phones_list += phones_en
                tones_list += [t + en_offset for t in tones_en]
                word2ph += word2ph_en
                continue

            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # Initial and final are identical for punctuation marks.
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # Syllable with an initial.
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                elif pinyin in pinyin_rep_map.keys():
                    # Syllable without an initial.
                    pinyin = pinyin_rep_map[pinyin]
                elif pinyin[0] in single_rep_map.keys():
                    pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph
def text_normalize(text):
    """Spell out numbers with ``cn2an.an2cn`` and normalise punctuation."""
    # Numbers are collected up front; each one replaces its first occurrence.
    for number in re.findall(r"\d+(?:\.?\d+)?", text):
        text = text.replace(number, cn2an.an2cn(number), 1)
    return replace_punctuation(text)
def get_bert_feature(text, word2ph, device):
    """Compute BERT features with the multilingual uncased model."""
    from .chinese_bert import get_bert_feature as _bert_feature

    return _bert_feature(
        text,
        word2ph,
        model_id='bert-base-multilingual-uncased',
        device=device,
    )
from .chinese import _g2p as _chinese_g2p
def _g2p_v2(segments):
    """Convert mixed Chinese/English segments by splitting out Latin-letter runs.

    Each run of ASCII letters/whitespace is wrapped in a marker string, the
    segment is split at the markers, and every piece goes either through the
    English g2p (tones shifted by the English tone offset) or through the
    Chinese ``_g2p``.

    Args:
        segments: iterable of text segments; none may contain the marker.

    Returns:
        Tuple ``(phones_list, tones_list, word2ph)``.
    """
    spliter = '#$&^!@'

    phones_list = []
    tones_list = []
    word2ph = []

    for segment in segments:
        assert spliter not in segment
        # Surround every run of Latin letters / whitespace with the marker.
        marked = re.sub(r'([a-zA-Z\s]+)', lambda x: f'{spliter}{x.group(1)}{spliter}', segment)
        pieces = [t for t in marked.split(spliter) if len(t) > 0]

        for piece in pieces:
            if re.match(r'[a-zA-Z\s]+', piece):
                # english
                tokenized_en = tokenizer.tokenize(piece)
                phones_en, tones_en, word2ph_en = g2p_en(text=None, pad_start_end=False, tokenized=tokenized_en)
                # apply offset to tones_en
                tones_en = [t + language_tone_start_map['EN'] for t in tones_en]
                phones_list += phones_en
                tones_list += tones_en
                word2ph += word2ph_en
            else:
                phones_zh, tones_zh, word2ph_zh = _chinese_g2p([piece])
                phones_list += phones_zh
                tones_list += tones_zh
                word2ph += word2ph_zh
    return phones_list, tones_list, word2ph
if __name__ == "__main__":
    # Manual smoke test for the mixed Chinese/English pipeline.
    # from text.chinese_bert import get_bert_feature

    # Each assignment overwrites the previous one; only the last sample is used.
    text = "NFT啊chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
    text = '我最近在学习machine learning希望能够在未来的artificial intelligence领域有所建树。'
    text = '今天下午我们准备去shopping mall购物然后晚上去看一场movie。'
    text = '我们现在 also 能够 help 很多公司 use some machine learning 的 algorithms 啊!'
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text, impl='v2')
    bert = get_bert_feature(text, word2ph, device='cuda:0')
    print(phones)
    # Drops into the debugger so the results can be inspected interactively.
    import pdb; pdb.set_trace()
# # Example usage
# text = "这是一个示例文本:,你好!这是一个测试...."
# print(g2p_paddle(text)) # output: 这是一个示例文本你好这是一个测试

View File

@@ -0,0 +1,36 @@
from . import chinese, japanese, english, chinese_mix, korean, french, spanish
from . import cleaned_text_to_sequence
import copy
language_module_map = {"ZH": chinese, "JP": japanese, "EN": english, 'ZH_MIX_EN': chinese_mix, 'KR': korean,
'FR': french, 'SP': spanish, 'ES': spanish}
def clean_text(text, language):
    """Normalise *text* and run g2p with the module registered for *language*.

    Returns:
        Tuple ``(norm_text, phones, tones, word2ph)``.
    """
    module = language_module_map[language]
    norm_text = module.text_normalize(text)
    return (norm_text, *module.g2p(norm_text))
def clean_text_bert(text, language, device=None):
    """Like ``clean_text`` but also compute BERT features.

    The BERT features are computed with every ``word2ph`` count doubled and
    the first count increased by one; the returned ``word2ph`` is the
    unmodified copy.

    Returns:
        Tuple ``(norm_text, phones, tones, word2ph, bert)``.
    """
    module = language_module_map[language]
    norm_text = module.text_normalize(text)
    phones, tones, word2ph = module.g2p(norm_text)

    # Keep the original counts; the list itself is rescaled in place below.
    word2ph_bak = copy.deepcopy(word2ph)
    for idx, count in enumerate(word2ph):
        word2ph[idx] = count * 2
    word2ph[0] += 1

    bert = module.get_bert_feature(norm_text, word2ph, device=device)
    return norm_text, phones, tones, word2ph_bak, bert
def text_to_sequence(text, language):
    """Clean *text* and convert the resulting phones and tones to id sequences."""
    _norm_text, phones, tones, _word2ph = clean_text(text, language)
    return cleaned_text_to_sequence(phones, tones, language)
if __name__ == "__main__":
    # No script behaviour is defined; running the module directly does nothing.
    pass

View File

@@ -0,0 +1,110 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
# Punctuation normalisation table: every key found in the text is replaced by
# its value (see replace_punctuation).
# NOTE(review): many keys below are empty strings and therefore collide with
# one another -- presumably full-width/typographic punctuation that was lost
# somewhere along the way; verify against the intended table.
rep_map = {
    "": ",",
    "": ",",
    "": ",",
    "": ".",
    "": "!",
    "": "?",
    "\n": ".",
    "·": ",",
    "": ",",
    "...": ".",
    "": ".",
    "$": ".",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "(": "'",
    ")": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "[": "'",
    "]": "'",
    "": "",
    "": "-",
    "~": "-",
    "": "'",
    "": "'",
}
def replace_punctuation(text):
    """Replace every occurrence of a ``rep_map`` key in *text* with its value."""
    alternatives = "|".join(re.escape(key) for key in rep_map.keys())
    matcher = re.compile(alternatives)
    return matcher.sub(lambda match: rep_map[match.group()], text)
def lowercase(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends."""
    collapsed = _whitespace_re.sub(" ", text)
    return collapsed.strip()
def remove_punctuation_at_begin(text):
    """Strip any leading run of ``, . ! ?`` from *text*."""
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, guillemets and quotes."""
    return re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text)
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become commas, ``-`` becomes a space (or is removed for
    ``"ca"``), and ``&`` is replaced by the language's word for "and",
    surrounded by spaces.  For ``"ca"`` and ``"es"`` apostrophes are removed.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text

    Example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        output:
            text: "si lavi cau, diguemho"
    """
    # Word substituted for "&" in each supported language.  The surrounding
    # spaces keep neighbouring words from being glued together ("a&b").
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }

    text = text.replace(";", ",")
    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def unicleaners(text, cased=False, lang='en'):
    """Run the basic cleaning pipeline on *text*.

    Steps: optional lower-casing, punctuation replacement, language-specific
    symbol replacement, removal of auxiliary symbols and leading punctuation,
    whitespace collapsing, and finally appending a full stop when the text
    does not already end in punctuation.  Abbreviations and numbers are not
    expanded here.
    """
    if not cased:
        text = lowercase(text)
    pipeline = (
        replace_punctuation,
        lambda value: replace_symbols(value, lang=lang),
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', text)

129530
MyShellTTSBase/text/cmudict.rep Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -0,0 +1,284 @@
import pickle
import os
import re
from g2p_en import G2p
from . import symbols
from .english_utils.abbreviations import expand_abbreviations
from .english_utils.time_norm import expand_time_english
from .english_utils.number_norm import normalize_numbers
from .japanese import distribute_phone
from transformers import AutoTokenizer
current_file_path = os.path.dirname(__file__)
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
_g2p = G2p()
# Phone labels recognised by g2p_old when post-processing G2p output.
arpa = {
    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B",
    "G", "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2",
    "DH", "IY0", "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1",
    "ER1", "EH2", "OY0", "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2",
    "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH", "AE0", "IH2",
    "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
    "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L",
    "SH",
}
def post_replace_ph(ph):
    """Map a phone/punctuation token onto the symbol inventory.

    The token is first passed through a small replacement table; if the
    result is not in ``symbols`` it becomes ``"UNK"``.
    """
    # NOTE(review): most keys below are empty strings and collide with one
    # another -- presumably full-width punctuation that was lost; verify.
    rep_map = {
        "": ",",
        "": ",",
        "": ",",
        "": ".",
        "": "!",
        "": "?",
        "\n": ".",
        "·": ",",
        "": ",",
        "...": "",
        "v": "V",
    }
    if ph in rep_map.keys():
        ph = rep_map[ph]
    if ph in symbols:
        return ph
    # Anything outside the symbol inventory is replaced by the unknown marker.
    if ph not in symbols:
        ph = "UNK"
    return ph
def read_dict():
    """Parse the pronunciation dictionary at ``CMU_DICT_PATH``.

    Lines before ``start_line`` are skipped.  Every remaining line is split
    into a word and its syllables; each syllable becomes a list of phones.

    Returns:
        Dict mapping each word to a list of syllables (lists of phone strings).
    """
    g2p_dict = {}
    # First line (1-based) that holds a dictionary entry.
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= start_line:
                line = line.strip()
                # NOTE(review): with a single-space separator word_split[1] is
                # only the first token after the word -- presumably the file
                # separates word and pronunciation with a wider delimiter;
                # confirm the separator against the dictionary file.
                word_split = line.split(" ")
                word = word_split[0]
                # Syllables are separated by " - ".
                syllable_split = word_split[1].split(" - ")
                g2p_dict[word] = []
                for syllable in syllable_split:
                    phone_split = syllable.split(" ")
                    g2p_dict[word].append(phone_split)
            line_index = line_index + 1
            line = f.readline()
    return g2p_dict
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` into the file at ``file_path`` (overwriting it)."""
    with open(file_path, "wb") as sink:
        pickle.Pickler(sink).dump(g2p_dict)
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache if present.

    When ``CACHE_PATH`` does not exist, the dictionary is parsed with
    ``read_dict()`` and written to ``CACHE_PATH`` before being returned.
    """
    if not os.path.exists(CACHE_PATH):
        parsed = read_dict()
        cache_dict(parsed, CACHE_PATH)
        return parsed
    # NOTE: unpickling executes whatever the file contains; the cache is the
    # file cache_dict() writes, so it must not be replaced by untrusted data.
    with open(CACHE_PATH, "rb") as cached:
        return pickle.load(cached)
# Pronunciation dictionary, loaded (or built and cached) once at import time.
eng_dict = get_dict()
def refine_ph(phn):
    """Split a phoneme label into a lower-cased base and a tone index.

    A trailing digit ``d`` is removed and reported as tone ``d + 1``; a label
    without a trailing digit gets tone 0.

    Returns:
        tuple: ``(lower-cased label without the digit, tone)``.
    """
    if not re.search(r"\d$", phn):
        return phn.lower(), 0
    return phn[:-1].lower(), int(phn[-1]) + 1
def refine_syllables(syllables):
    """Flatten syllable groups into parallel phoneme and tone lists.

    Args:
        syllables: list of syllables, each a list of phoneme labels.

    Returns:
        tuple: ``(phonemes, tones)``, both in reading order, obtained by
        running every label through ``refine_ph``.
    """
    refined = [refine_ph(label) for group in syllables for label in group]
    phonemes = [base for base, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
def text_normalize(text):
    """Lower-case ``text``, then expand times, numbers and abbreviations.

    The expansion steps run in that fixed order, each on the output of the
    previous one.
    """
    normalized = text.lower()
    for step in (expand_time_english, normalize_numbers, expand_abbreviations):
        normalized = step(normalized)
    return normalized
# Tokenizer that g2p()/g2p_old() use to split text into word pieces; it is
# loaded once, at import time.
model_id = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p_old(text):
    """Legacy grapheme-to-phoneme conversion based on a regex word split.

    Words found in ``eng_dict`` use the dictionary pronunciation; everything
    else is sent through ``_g2p``. Unlike ``g2p`` this does not align phonemes
    with tokenizer word pieces.

    Returns:
        tuple: ``(phones, tones, word2ph)`` where ``word2ph`` is a placeholder
        list of ones, one per phoneme.
    """
    phones = []
    tones = []
    words = re.split(r"([,;.\-\?\!\s+])", text)
    for w in words:
        key = w.upper()
        if key in eng_dict:
            phns, tns = refine_syllables(eng_dict[key])
            phones += phns
            tones += tns
            continue
        for ph in _g2p(w):
            if ph == " ":
                continue
            # Only labels in ``arpa`` are split into base + tone; anything
            # else is kept verbatim with tone 0.
            if ph in arpa:
                ph, tn = refine_ph(ph)
            else:
                tn = 0
            phones.append(ph)
            tones.append(tn)
    # todo: implement word2ph
    word2ph = [1] * len(phones)
    phones = [post_replace_ph(i) for i in phones]
    return phones, tones, word2ph
def g2p(text, pad_start_end=True, tokenized=None):
    """Convert ``text`` to phonemes aligned with tokenizer word pieces.

    Args:
        text: input text; only used when ``tokenized`` is not given.
        pad_start_end: when true, wrap the result in ``"_"`` phones with
            tone 0 and a ``word2ph`` entry of 1 on each side.
        tokenized: optional pre-computed word-piece list; defaults to
            ``tokenizer.tokenize(text)``.

    Returns:
        tuple: ``(phones, tones, word2ph)``. ``word2ph`` is built per word
        from ``distribute_phone(n_phones, n_pieces)`` -- presumably one
        phoneme count per word piece; confirm against ``distribute_phone``.
    """
    if tokenized is None:
        tokenized = tokenizer.tokenize(text)

    # Regroup word pieces into words: a piece starting with "#" continues the
    # previous word.
    ph_groups = []
    for t in tokenized:
        if t.startswith("#") and ph_groups:
            ph_groups[-1].append(t.replace("#", ""))
        else:
            # Also taken for a "#" piece at the very start, which has no
            # previous word to attach to (previously an IndexError).
            ph_groups.append([t])

    phones = []
    tones = []
    word2ph = []
    for group in ph_groups:
        w = "".join(group)
        key = w.upper()
        if key in eng_dict:
            phns, tns = refine_syllables(eng_dict[key])
        else:
            phns, tns = [], []
            for ph in _g2p(w):
                if ph == " ":
                    continue
                # Only labels in ``arpa`` are split into base + tone.
                if ph in arpa:
                    ph, tn = refine_ph(ph)
                else:
                    tn = 0
                phns.append(ph)
                tns.append(tn)
        phones += phns
        tones += tns
        word2ph += distribute_phone(len(phns), len(group))

    phones = [post_replace_ph(i) for i in phones]
    if pad_start_end:
        phones = ["_"] + phones + ["_"]
        tones = [0] + tones + [0]
        word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
def get_bert_feature(text, word2ph, device=None):
    """Return BERT features for ``text`` by delegating to ``english_bert``.

    The import is done lazily inside the function, so the BERT module is only
    loaded when features are actually requested.
    """
    # Package-relative, like the other imports of this module; an absolute
    # ``from text import ...`` only resolves when a top-level ``text`` package
    # happens to be on sys.path.
    from . import english_bert

    return english_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
    # Manual smoke test: normalize a sample sentence, phonemize it and print
    # the result together with the shape of its BERT feature. Uses this
    # module's own get_bert_feature(); no debugger breakpoint is left in.
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    text = "In this paper, we propose 1 DSPGAN, a N-F-T GAN-based universal vocoder."
    text = text_normalize(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)
    print(phones, tones, word2ph, bert.shape)
    # all_phones = set()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)

View File

@@ -0,0 +1,39 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
# Checkpoint name shared by the tokenizer and the model below.
model_id = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Masked-LM model; stays None until the first get_bert_feature() call loads it.
model = None
def get_bert_feature(text, word2ph, device=None):
    """Compute phone-level BERT features for ``text``.

    Args:
        text: input string, tokenized with the module-level ``tokenizer``.
        word2ph: repeat count for every token the tokenizer produces for
            ``text``; its length must equal the number of input ids.
        device: torch device. A falsy value selects ``"cuda"`` when CUDA is
            available and ``"cpu"`` otherwise. On macOS with MPS available,
            ``"cpu"`` is replaced by ``"mps"``.

    Returns:
        CPU tensor with each token's third-from-last hidden state repeated
        ``word2ph[i]`` times, transposed -- presumably of shape
        ``(hidden_size, sum(word2ph))``.

    Raises:
        AssertionError: if ``len(word2ph)`` does not match the token count.
    """
    global model
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        # Fall back to the CPU instead of failing on machines without CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if model is None:
        model = AutoModelForMaskedLM.from_pretrained(model_id)
    # The model is cached across calls; move it every time so it always sits
    # on the same device as the inputs, even when callers change ``device``.
    model = model.to(device)
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        # Third-from-last hidden layer, first (only) batch element.
        res = res["hidden_states"][-3][0].cpu()
    n_tokens = inputs["input_ids"].shape[-1]
    if n_tokens != len(word2ph):
        # Raised explicitly (same exception type as the former assert) so the
        # check is not stripped under ``python -O``.
        raise AssertionError(
            f"word2ph has {len(word2ph)} entries but the text tokenizes to "
            f"{n_tokens} tokens"
        )
    # Repeat each token vector once per phone assigned to that token.
    phone_level_feature = torch.cat(
        [token_vec.repeat(count, 1) for token_vec, count in zip(res, word2ph)],
        dim=0,
    )
    return phone_level_feature.T

View File

@@ -0,0 +1,35 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in english:
# Each entry pairs a case-insensitive regex matching "<abbreviation>." at a
# word boundary with the text that replaces it.
abbreviations_en = [
    (re.compile("\\b%s\\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    )
]
def expand_abbreviations(text, lang="en"):
    """Replace known abbreviations in ``text`` with their spoken form.

    Args:
        text: input text.
        lang: language of the abbreviation table; only ``"en"`` exists.

    Raises:
        NotImplementedError: for any ``lang`` other than ``"en"``.
    """
    if lang != "en":
        raise NotImplementedError()
    for pattern, expansion in abbreviations_en:
        text = pattern.sub(expansion, text)
    return text

View File

@@ -0,0 +1,97 @@
""" from https://github.com/keithito/tacotron """
import re
from typing import Dict
import inflect
# Shared inflect engine used to spell numbers out as words.
_inflect = inflect.engine()
# Digits grouped with commas, e.g. "1,234".
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
# Decimal numbers, e.g. "3.14".
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
# A currency sign (pound, dollar or yen) followed by an amount.
_currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
# Ordinals written with digits, e.g. "2nd".
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
# Any remaining integer, optionally negative.
_number_re = re.compile(r"-?[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
    """Turn a numeric amount into "<units> <unit name> <subunits> <subunit name>".

    Args:
        value: the amount as written, e.g. ``"1,000.50"``; commas are ignored.
        inflection: unit names keyed by 1 / 2 (singular / plural main unit)
            and 0.01 / 0.02 (singular / plural subunit). The plural keys are
            required; the singular ones fall back to the plural.

    Returns:
        The amount with unit names attached (digits are left as digits), or
        ``"zero <plural unit>"`` when both parts are zero. A value with more
        than one "." is returned unchanged, followed by the plural unit.
    """
    parts = value.replace(",", "").split(".")
    if len(parts) > 2:
        return f"{value} {inflection[2]}"  # Unexpected format
    text = []
    integer = int(parts[0]) if parts[0] else 0
    if integer > 0:
        integer_unit = inflection.get(integer, inflection[2])
        text.append(f"{integer} {integer_unit}")
    # A single fractional digit means tenths of the unit ("1.5" is 50 cents),
    # so pad it to two digits before reading it as a subunit count.
    fraction_digits = parts[1].ljust(2, "0") if len(parts) > 1 and parts[1] else ""
    fraction = int(fraction_digits) if fraction_digits else 0
    if fraction > 0:
        fraction_unit = inflection.get(fraction / 100, inflection[0.02])
        text.append(f"{fraction} {fraction_unit}")
    if len(text) == 0:
        return f"zero {inflection[2]}"
    return " ".join(text)
def _expand_currency(m: "re.Match") -> str:
    """Expand a currency match: group 1 is the sign, group 2 the amount.

    The sign selects a table of unit names, which is passed together with the
    amount to ``__expand_currency``.
    """
    # NOTE(review): the second key below is an empty string -- presumably the
    # euro sign lost to an encoding problem; confirm against the upstream
    # file. _currency_re only captures the pound, dollar and yen signs, so
    # this entry cannot be selected through that regex either way.
    currencies = {
        "$": {
            0.01: "cent",
            0.02: "cents",
            1: "dollar",
            2: "dollars",
        },
        "": {
            0.01: "cent",
            0.02: "cents",
            1: "euro",
            2: "euros",
        },
        "£": {
            0.01: "penny",
            0.02: "pence",
            1: "pound sterling",
            2: "pounds sterling",
        },
        "¥": {
            # TODO rin
            0.02: "sen",
            2: "yen",
        },
    }
    unit = m.group(1)
    currency = currencies[unit]
    value = m.group(2)
    return __expand_currency(value, currency)
def _expand_ordinal(m):
    """Spell out the whole matched ordinal (e.g. "2nd") with inflect."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
def _expand_number(m):
    """Spell out the matched integer, reading 1001-2999 like a year.

    Inside that range: 2000 is "two thousand", 2001-2009 are "two thousand
    <n>", exact hundreds are "<n> hundred", and everything else is read in
    two-digit groups with "oh" for zero. Outside it, the number is spelled
    out normally without "and".
    """
    value = int(m.group(0))
    if not 1000 < value < 3000:
        return _inflect.number_to_words(value, andword="")
    if value == 2000:
        return "two thousand"
    remainder = value % 100
    if 2000 < value < 2010:
        return "two thousand " + _inflect.number_to_words(remainder)
    if remainder == 0:
        return _inflect.number_to_words(value // 100) + " hundred"
    pairwise = _inflect.number_to_words(value, andword="", zero="oh", group=2)
    return pairwise.replace(", ", " ")
def normalize_numbers(text):
    """Rewrite the numeric expressions in ``text`` for speech.

    The passes run in a fixed order: comma removal, currency, decimal points,
    ordinals, then plain integers.
    """
    passes = (
        (_comma_number_re, _remove_commas),
        (_currency_re, _expand_currency),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, expand in passes:
        text = pattern.sub(expand, text)
    return text

View File

@@ -0,0 +1,47 @@
import re
import inflect
# Shared inflect engine used by _expand_num() to spell numbers out.
_inflect = inflect.engine()
_time_re = re.compile(
r"""\b
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
:
([0-5][0-9]) # minutes
\s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm
\b""",
re.IGNORECASE | re.X,
)
def _expand_num(n: int) -> str:
    """Spell ``n`` out in words with the shared inflect engine."""
    spelled = _inflect.number_to_words(n)
    return spelled
def _expand_time_english(match: "re.Match") -> str:
    """Spell out one ``_time_re`` match as a 12-hour clock time.

    Group 1 is the hour, group 6 the minutes and group 7 the optional am/pm
    marker. Without a marker, "a m" or "p m" is derived from the 24-hour
    value; a given marker is spelled letter by letter with its dots removed.
    """
    hour = int(match.group(1))
    past_noon = hour >= 12
    if hour > 12:
        hour -= 12
    elif hour == 0:
        # 00:xx is twelve-something in the morning, so past_noon stays False.
        hour = 12
    time = [_expand_num(hour)]
    minute = int(match.group(6))
    if minute > 0:
        if minute < 10:
            # Single-digit minutes are read as "oh five".
            time.append("oh")
        time.append(_expand_num(minute))
    am_pm = match.group(7)
    if am_pm is None:
        time.append("p m" if past_noon else "a m")
    else:
        time.extend(list(am_pm.replace(".", "")))
    return " ".join(time)
def expand_time_english(text: str) -> str:
    """Replace every clock time found in ``text`` with its spoken form."""
    return _time_re.sub(_expand_time_english, text)

View File

@@ -0,0 +1,140 @@
import abc
from typing import List, Tuple
from .punctuation import Punctuation
class BasePhonemizer(abc.ABC):
    """Abstract base class for phonemizer backends.

    Phonemization follows these steps:
        1. Preprocessing:
            - strip leading/trailing whitespace
            - remove punctuation
            - keep track of punctuation marks (only when ``keep_puncs`` is set)

        2. Phonemization:
            - convert text to phonemes (backend-specific ``_phonemize``)

        3. Postprocessing:
            - restore punctuation marks (only when ``keep_puncs`` is set)

    Args:
        language (str):
            Language used by the phonemizer.

        punctuations (List[str]):
            Punctuation marks handed to the ``Punctuation`` helper.

        keep_puncs (bool):
            Whether to preserve punctuation marks or not.

    Raises:
        RuntimeError: if the backend reports itself unavailable or does not
            support ``language``.
    """

    def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError("{} not installed on your system".format(self.name()))  # pragma: nocover

        # ensure the backend support the requested language
        self._language = self._init_language(language)

        # setup punctuation processing
        self._keep_puncs = keep_puncs
        self._punctuator = Punctuation(punctuations)

    def _init_language(self, language):
        """Validate ``language`` and return it.

        This method may be overloaded in child classes (see Segments backend).

        Raises:
            RuntimeError: if ``is_supported_language(language)`` is false.
        """
        if not self.is_supported_language(language):
            raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
        return language

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        ...

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        ...

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""
        ...

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        ...

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method; implemented by each backend."""

    def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
        """Preprocess the text before phonemization

        1. strip leading and trailing whitespace
        2. remove punctuation

        Returns a pair ``(text chunks, punctuation marks)``. With
        ``keep_puncs`` the pair comes from ``Punctuation.strip_to_restore``;
        otherwise it is a single stripped chunk and an empty list.

        Override this if you need a different behaviour
        """
        text = text.strip()
        if self._keep_puncs:
            # a tuple (text, punctuation marks)
            return self._punctuator.strip_to_restore(text)
        return [self._punctuator.strip(text)], []

    def _phonemize_postprocess(self, phonemized, punctuations) -> str:
        """Postprocess the raw phonemized output

        With ``keep_puncs`` the punctuation marks are restored and the first
        restored item is returned; otherwise the first phonemized chunk is
        returned as is.

        Override this if you need a different behaviour
        """
        if self._keep_puncs:
            return self._punctuator.restore(phonemized, punctuations)[0]
        return phonemized[0]

    def phonemize(self, text: str, separator="|", language: str = None) -> str:  # pylint: disable=unused-argument
        """Returns the `text` phonemized for the given language

        Args:
            text (str):
                Text to be phonemized.

            separator (str):
                String separator used between phonemes. Defaults to '|'.

            language (str):
                Not used by this base implementation.

        Returns:
            (str): Phonemized text
        """
        text, punctuations = self._phonemize_preprocess(text)
        phonemized = []
        for t in text:
            p = self._phonemize(t, separator)
            phonemized.append(p)
        phonemized = self._phonemize_postprocess(phonemized, punctuations)
        return phonemized

    def print_logs(self, level: int = 0):
        """Print the configured language and backend name, indented by ``level`` tabs."""
        indent = "\t" * level
        print(f"{indent}| > phoneme language: {self.language}")
        print(f"{indent}| > phoneme backend: {self.name()}")

View File

@@ -0,0 +1,109 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
# Punctuation replacement table used by replace_punctuation().
# NOTE(review): many keys below are empty strings, so they collide and only
# the last "" entry survives in the dict. They look like non-ASCII punctuation
# that was lost to an encoding problem -- confirm against the upstream file.
# As written, replace_punctuation() would build a pattern with an empty
# alternative.
rep_map = {
    "": ",",
    "": ",",
    "": ",",
    "": ".",
    "": "!",
    "": "?",
    "\n": ".",
    "·": ",",
    "": ",",
    "...": ".",
    "": ".",
    "$": ".",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "(": "'",
    ")": "'",
    "": "'",
    "": "'",
    "": "'",
    "": "'",
    "[": "'",
    "]": "'",
    "": "",
    "": "-",
    "~": "-",
    "": "'",
    "": "'",
}
def replace_punctuation(text):
    """Replace every occurrence of a ``rep_map`` key in ``text`` by its value.

    The keys are escaped and joined into one alternation, so the text is
    rewritten in a single regex pass.
    """
    alternation = "|".join(re.escape(p) for p in rep_map.keys())
    return re.sub(alternation, lambda x: rep_map[x.group()], text)
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze each whitespace run to one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
def remove_punctuation_at_begin(text):
    """Drop any run of ``, . ! ?`` characters at the very start of ``text``."""
    leading_punctuation = r'^[,.!?]+'
    return re.sub(leading_punctuation, '', text)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, guillemets and quotes."""
    return re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text)
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space (removed entirely for
    Catalan). ``&`` becomes the language's word for "and", surrounded by
    spaces. Apostrophes are removed for Catalan and Spanish.

    Args:
        text:
            Input text.
        lang:
            Language identifier, e.g. "en", "fr", "pt", "ca", "es". For any
            other value ``&`` is left untouched.

    Returns:
        The modified text.

    Example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")
    # Spoken form of "&" per language. Each is padded with spaces so the
    # neighbouring words are not glued together ("tú&yo" -> "tú y yo").
    conjunctions = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    if lang in conjunctions:
        text = text.replace("&", conjunctions[lang])
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def spanish_cleaners(text):
    """Basic cleaning pipeline for Spanish text.

    Lower-cases the text, replaces symbols for ``lang="es"``, normalizes
    punctuation, removes auxiliary symbols and leading punctuation, and
    collapses whitespace. Finally a "." is appended when the text does not
    already end in one of the punctuation marks listed in the closing regex.
    Abbreviations and numbers are not expanded here.
    """
    cleaned = replace_symbols(lowercase(text), lang="es")
    for stage in (
        replace_punctuation,
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    ):
        cleaned = stage(cleaned)
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', cleaned)

View File

@@ -0,0 +1,79 @@
{
"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"\u0263",
"\u0261",
"r",
"\u0272",
"\u029d",
"\u028e",
"\u02d0"
]
}

View File

@@ -0,0 +1 @@
_,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɡrɲʝɣʎː—¿¡

View File

@@ -0,0 +1,83 @@
{
"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"\u0261",
"r",
"\u0272",
"\u029d",
"\u0263",
"\u028e",
"\u02d0",
"\u2014",
"\u00bf",
"\u00a1"
]
}

View File

@@ -0,0 +1,12 @@
from .cleaner import spanish_cleaners
from .gruut_wrapper import Gruut
def es2ipa(text):
    """Phonemize Spanish ``text`` with the Gruut wrapper.

    A new ``Gruut`` phonemizer for ``"es-es"`` is created on every call, with
    punctuation and stress kept and espeak phonemes enabled; phonemes are
    joined with an empty separator.
    """
    # text = spanish_cleaners(text)
    phonemizer = Gruut(
        language="es-es",
        keep_puncs=True,
        keep_stress=True,
        use_espeak_phonemes=True,
    )
    return phonemizer.phonemize(text, separator="")
if __name__ == '__main__':
    # Manual smoke test: print the phonemization of a sample Spanish sentence.
    print(es2ipa('¿Y a quién echaría de menos, en el mundo si no fuese a vos?'))

View File

@@ -0,0 +1,400 @@
kapˈitulo ˈuno de daβˈid kˌoppeɾfjˈelð o el soβɾˈino de mi tˈia de tʃˈaɾles dˌiθjˈens.
ˈesta ɡɾˌaβaθjˈon de lˌiβɾˈiβoks ˈes de domˈinjo pˈuβliko.
si dˈeβo o nˈo sˈer el ˈeɾoe de mi pɾˈopja istˈoɾja, o si ˈeste kˌometˈiðo seɾˈa dˌesempeɲˈaðo poɾ ˈotɾa peɾsˈona ke nˈo ʝˈo,
ˈeso ˈes pɾeθˈisamˈente lo ke el lektˈoɾ beɾˈa en las siɣjˈɛntes pˈaxinas.
pˌaɾa pɾˌoθeðˈeɾ kon ˈoɾðen diɾˈe ke naθˈi, seɣˈun me ˈan dˈitʃo i lo kɾˈeo, ˈun bjˈeɾnes a las dˈoθe de la nˈotʃe.
notˈaɾon ke al mˈismo tjˈempo ke dˈaβa el ɾɾelˈox su pɾimˈeɾa kˌampanˈaða, lanθˈaβa ʝˈo sˌimultˈaneamˈente mi pɾimˈeɾ kexˈiðo.
tenjˈɛndo en kˌonsiðˌeɾaθjˈon el dˈia i ˈoɾa de mi nˌaθimjˈɛnto, la ˌemfeɾmˈeɾa de la paɾˈiða i bˈaɾjas komˈaðɾes de la bˌeθindˈad,
a kjˌenes aβˈia ˌinspiɾˈaðo ˈun bˈiβo ˌinteɾˈes alɣˈunos mˈeses ˈantes de tɾaβˈaɾ kˌonoθimjˈɛnto kon ˈeʎas, dˌeklaɾˈaɾon dˈos kˈosas.
la pɾimˈeɾa, ke estˈaβa pɾˌeðestinˈaðo a sˈer dˌesɣɾaθjˈaðo, i la seɣˈunda, ke ɡˌoθaɾˈia el pɾˌiβilˈexjo de bˈeɾ espˈektɾos i espˈiɾitus,
kˈɣa ˌineβitˈaβle de tˈoðas las ˌimfoɾtunˈaðas kɾiatˈuɾas de ˈambos sˈeksos ke nˈaθen en bjˈeɾnes dˌesðe las dˈoθe de la nˈotʃe ˌasta el ˌamaneθˈeɾ.
ɾɾespˈekto al pɾimˈeɾ pˈunto nˈo me ˌeksplikaɾˈe akˈi, pwˈes mi istˈoɾja dˌemostɾaɾˈa sˌufiθjˈɛntemˈente si la pɾˌeðikθjˈon fwˈe o nˈo beɾˈiðika.
en kwˌanto al seɣˈundo, bˌastˈeme deθˈiɾ ke, a mˈenos de aβˈeɾ bˈisto espˈektɾos i espˈiɾitus kwˌando estˈaβa en la kˈuna,
sˈiɣo aˈun ˌaɣwaɾðˈandolos.
nˈo se kɾˈea ke me kˌondwˈelo poɾ la pɾˌiβaθjˈon de ˈesta pˈaɾte de mi eɾˈɛnθja, i si ˈalɣjen,
poɾ kˌaswaliðˈad, ˌembiðjaɾˈe mi pwˈesto, se lo θˈeðo kon ˈalma i bˈiða.
naθˈi de pjˈes, kˌomo deθˈiɾse swˈele, i la θjuðˈad ke me djˈo el sˈer fwˈe blundeɾstoun en el kondˈaðo de sˌuffolˈu o poɾ aʎˈi θˈeɾka,
fwˈi ˈun ˈixo pˈostumo, pwˈes aβɾˈi los ˈoxos al dˈia a los sˈeɪs mˈeses de aβˈeɾ θeɾɾˈaðo mi pˈaðɾe los sˈujjos pˌaɾa sjˈempɾe.
xamˈas ˌolβiðaɾˈe la ˌindeskɾipːtˈiβle lˈastima ke se ˌapoðeɾˈo de mˈi al fˌiɣˈaɾme ke mi pˈaðɾe se beˈia aʎˈi ˌaβandonˈaðo,
sˈolo, en mˈeðjo de las tinjˈeβlas de la nˈotʃe, mjˌentɾas ke nwˌestɾa biβjˈɛnda, bjˈen templˈaða i ʎˈena de lˈuθ,
le θeɾɾˈaβa kɾuˈelmˈente sus pwˈeɾtas.
la peɾsˈona mˈas ˌimpoɾtˈante de nwˌestɾa famˈilja ˈeɾa ˈuna tˈia de mi pˈaðɾe, ke dˌesempˌeɲaɾˈa ˈun ˌimpoɾtˈante papˈel en mi ɾɾelˈato.
mˈiss tɾotwˈooð o mˈiss betse, kˌomo la ʎamˈaβa mi pˈoβɾe mˈaðɾe, kˈaða bˈeθ ke loɣɾˈaβa dˌominˈaɾ el teɾɾˈoɾ ke le ˌinspiɾˈaβa el aβlˈaɾ de sˌemexˈante peɾsˈona,
kˈosa ɾɾˈaɾa ˌentɾe paɾˈentesis, mˈiss betse se aβˈia kasˈaðo kon ˈun ˈombɾe mˈas xˈoβen ke ˈeʎa,
suxˈeto mˈujj ɡwˈapo, ˌaʊnke nˈo mˈujj bwˈeno, pwˈes seɣˈun el ɾɾumˈoɾ pˈuβliko, el maɾˈiðo θuɾɾˈo mˈas de ˈuna bˈeθ a su muxˈeɾ,
amˈen de ke θjˈeɾto dˈia, a pɾopˈosito de ˈuna kwestjˈon de suβsˈiðjos, tɾatˈo de ɾɾˌespondˈeɾ a la ˌoposiθjˈon de su kˈaɾa a mitˈad tiɾˈandola poɾ la bentˈana de ˈun pˈiso seɣˈundo.
sˌemexˈantes pɾuˈeβas de ˌinkompˌatiβˌiliðˈað de kaɾˈakteɾ aβˈian ˌoβliɣˈaðo a mˈiss betse a dˌesembˌaɾaθˈaɾse de ˈel poɾ mˈeðjo de dinˈeɾo i ˈambos espˈosos se sˌepaɾˈaɾon ˌamiɣˈaβlemˈente.
el tiɾˈano maɾtʃˈo kon su kˌapitˈal a la ˈindja, dˌonde seɣˈun ˈuna tɾˌaðiθjˈon de famˈilja, le aβˈian bˈisto ˈuna bˈeθ montˈaðo en ˈun ˌelefˈante i en kˌompaɲˈia de ˈun ɡɾˈan mˈono.
nˈo aβˈia poðˈiðo ˌaβeɾiɣwˈaɾse si ˈeste ˈeɾa ˈuna makˈaka o ˈun ˌaβeˈun, pɾinθˈesa del moɣˈol, kˌonoθˈiða tambjˈen kon el nˈombɾe de ˈuna baβˈu,
ˌaʊnke ʝˈo me inklˈino poɾ ˈesto ˈultimo.
sˈea de ˈeʎo lo ke kjˈeɾa, el kˈaso ˈes ke al kˈaβo de djˈˈaɲos ʎeɣˈo a ˌiŋɡlatˈeɾɾa la notˈiθja de su fˌaʎeθimjˈɛnto,
sin ke nˈaðje supjˈeɾa kˈomo, pwˈes asˈi ke se sˌepaɾˈaɾon, ˈeʎa tomˈo su ˌapeʎˈiðo de soltˈeɾa,
kompɾˈo ˈuna kˈasa pekˈeɲa en ˈuna alðˈea a oɾˈiʎas del mˈaɾ, dˌonde se ˌinstalˈo en kˌompaɲˈia de ˈuna kɾiˈaða,
kˌomo ˈuna bˌeɾðaðˈeɾa ɾɾeklˈusa.
mi pˈaðɾe aβˈia sˈiðo su soβɾˈino pɾˌeðilˈekto, seɣˈun tˈɛŋɡo ˌentendˈiðo, pˌeɾo ˈeʎa se djˈo poɾ sˈumamˈente ˌofendˈiða a pɾopˈosito de su mˌatɾimˈonjo bˌaxo pɾetˈeksto de ke mi mˈaðɾe nˈo ˈeɾa sˈino ˈuna muɲˈeka de θˈeɾa.
ˌaʊnke nˈo aβˈia bˈisto nˈunka a mi mˈaðɾe, saβˈia ke sˈolo kontˈaβa bˈeɪnte ˈaɲos.
mi pˈaðɾe i mis betse nˈo bolβjˈeɾon a bˈeɾse.
ˈel, al kasˈaɾse kon mi mˈaðɾe, tenˈia dˈoβles ˈaɲos ke ˈeʎa, i kˌomo su salˈud ˈeɾa dˌelikˈaða,
muɾjˈo al kˈaβo de ˈun ˈaɲo, o sˈea, kˌomo ʎˈeβo dˈitʃo, sˈeɪs mˈeses ˈantes de mi benˈiða al mˈundo.
ˈe akˈi el estˈaðo de la sˌitwaθjˈon en la tˈaɾðe de akˈel dˈia del mˈes de mˈaɾθo,
ke me pˌeɾmitiɾˈe kˌalifikˈaɾ de mˌemoɾˈaβle bjˈeɾnes.
ˈa bˈase mi mˈaðɾe, sentˈaða θˈeɾka del fwˈeɣo, emfˈeɾma, tɾˈiste, pensˈando en el pˈoβɾe wˈeɾfano ke ˈiβa a benˈiɾ al mˈundo,
kwˌando, alθˈando la bˈista, despwˈes de aβˈeɾ ˌeŋxuɣˈaðo alɣˈunas lˈaɣɾimas, ˌapeɾθiβjˈo a tɾaβˈes de la bentˈana ˈuna muxˈeɾ dˌeskonoθˈiða ke benˈia poɾ el xaɾðˈin.
mi mˈaðɾe tˈuβo el pɾˌesentimjˈɛnto de ke ˈeɾa mis betse.
ˈia en su tˈaʎe, en su mˈoðo de andˈaɾ, en tˈoðo o en fˈin, tˈal ɾɾˌixiðˈeθ ke a bjˈen seɣˈuɾo nˈo poðˈia sˈer ˈotɾa mˈas ke ˈeʎa.
al ˌaθeɾkˈaɾse a la kˈasa, djˈo ˈuna nwˈeβa pɾuˈeβa de su ˌiðentiðˈað.
mi pˈaðɾe aβˈia ɾɾˌepetˈiðo mˈas de ˈuna bˈeθ ke la tˈal seɲˈoɾa nˈo se kˌonduθˈia nˈunka kˌomo los demˈas.
en bˈeθ de ʎamˈaɾ, se ˌaθeɾkˈo en dˌeɾetʃˈuɾa a la bentˈana poɾ dˌonde la aβˈia bˈisto mi mˈaðɾe i peɣˈo su ɾɾˈostɾo a los kɾistˈales.
ˈesta bisˈita pɾoðˈuxo tˈal ˌimpɾesjˈon ke sjˈempɾe ˈe tenˈiðo el kˌombenθimjˈɛnto ke si naθˈi en ˈun bjˈeɾnes se lo dˈeβo a mis betse.
mi mˈaðɾe, ʎˈena de espˈanto, se lˌeβantˈo de su sˈiʎa i se ɾɾˌetiɾˈo a ˈun ɾɾinkˈon,
mjˌentɾas ke mis betse ˌeskuðɾiɲˈaβa kon ˈoxos ˌinkisitoɾjˈales tˈoða la ˌaβitaθjˈon.
nˈo taɾðˈo en dˌistiŋɡˈiɾ a su soβɾˈina i le ˈiθo ˈun xˈesto pˌaɾa ke ˌakuðjˈese a aβɾˈiɾle la pwˈeɾta,
i kˌomo el tˈal xˈesto ˈeɾa el de ˈuna peɾsˈona ˌakostumbɾˈaða a aθˈeɾse ˌoβeðeθˈeɾ, mi mˈaðɾe ˌoβeðeθjˈo.
—supˈɡo ke sˈoɪs mi tɾˈes daβˈid kˌoppeɾfjˈelð —dˈixo mis betse—
su supˈɡo sˌiɡnifikˈaβa ke nˈo aβˈia matˈeɾja a ˌekiβˌokaθjˈon al bˈeɾ la bestˈiða de lˈuto i ˌaβokˈaða a sˈer mˈaðɾe.
—sˈi —ɾɾˌespondjˈo mi mˈaðɾe tˈimiðamˈente—
—sˈoɪ mˈiss tɾoˌutwˈooð —dˈixo la ɾɾeθjˈen ʎeɣˈaða—i espˈeɾo ke ˈantes de aˈoɾa aβɾˈeis oˈiðo aβlˈaɾ de mˈi.
ˈe tenˈiðo ˈese ɡˈusto—ɾɾˌespondjˈo mi mˈaðɾe—
pwˈes bjˈen, sˈoɪ ʝˈo mˈisma en peɾsˈona.
mi mˈaðɾe baxˈo la kaβˈeθa, ɾɾoɣˈando a mis betse ke entɾˈase.
sˌentˈaɾonse xˈunto a la tʃˌimenˈea i mi mˈaðɾe se etʃˈo a ʎoɾˈaɾ.
—ta, ta, ta——dˈixo mis betse kon ˌimpaθjˈɛnθja—
—mi mˈaðɾe nˈo pˈuðo kˌontenˈeɾ sus lˈaɣɾimas sˈino al kˈaβo de alɣˈunos minˈutos.
—kitˈaos el sombɾˈeɾo, ˈixa mˈia, pˌaɾa ke pwˈeða bˈeɾos—ˌaɲaðjˈo la bjˈexa.
mi mˈaðɾe la tenˈia dˌemasjˈaðo mjˈeðo pˌaɾa neɣˈaɾse.
asˈi ˈes ke se dˌespoxˈo de su sombɾˈeɾo, ˌaʊnke kon tˈal ˌaxitaθjˈon, ke sus kaβˈeʎos, ke ˈeɾan sˈumamˈente eɾmˈosos,
ˈa, djˈos de bondˈad, sˈoɪs ˈuna tʃikˈiʎa i nˈaða mˈas.
sin dˈuða ke mi mˈaðɾe tenˈia el ˈaɪɾe sˈumamˈente ˌaliɲˈaðo, pˌeɾo la bwˈena seɲˈoɾa ˌaθepːtˈo la ˌeksklamaθjˈon kˌomo ˈun ɾɾepɾˈotʃe mˌeɾeθˈiðo,
i ɾɾˌespondjˈo ke, en efˈekto, temˈia tenˈeɾ pˈoka ˌekspeɾjˈɛnθja kˌomo bjˈuða i kˌomo mˈaðɾe.
mˈiss betse pˌaɾeθjˈo ˌamansˈaɾse, i ˌenseɣˈiða, pasˈando bɾˈuskamˈente a ˈotɾa ˌinteɾpˌelaθjˈon, ˌeksklamˈo.
—¿poɾ kˈe se ʎˈama ˈesta kˈasa ɾɾoˌojˈeɾo?
—fwˈe el nˈombɾe ke le djˈo mɾ.
kˌoppeɾfjˈelð kwˌando kompɾˈo la kˈasa—ɾɾˌeplikˈo mi mˈaðɾe.
kɾejjˈo ke aβˈia en los ˈaɾβoles mˈutʃas kˌoɾnˈexas.
en akˈel momˈɛnto, ˈuna ɾɾˈafaɣa de bjˈɛnto sˌakuðjˈo ˌasta tˈal pˈunto los ˈolmos del ekstɾˈemo del xaɾðˈin ke mi mˈaðɾe i mˈiss betse dˌiɾixjˈeɾon sus miɾˈaðas a akˈel pˈunto.
los ˈaɾβoles se ˌinklinˈaɾon ˈunos sˌoβɾe ˈotɾos, ˌasemexˈandose a ˈunos xiɣˈantes ke se kˌomfjaɾˈian ˈun sekɾˈeto.
ˌenseɣˈiða, de ɾɾepˈɛnte, kˌomo si se uβjˈese ˌentuɾβˈaðo kon sus oɾɾˈiβles komfjˈanθas, ˌaxitˈaɾon kˌombulsiβˈamente sus fˌoɾmiðˈaβles bɾˈaθos,
ˌaɾɾoxˈando a lo lˈexos los antˈiɣwos nˈiðos de kˌoɾnˈexas pˌaɾeθˈiðos a los ɾɾˈestos de ˈun naʊfɾˈaxjo ke aθˈota la tˌempestˈad.
—¿dˈonde estˈan las kˌoɾnˈexas?
—pɾˌeɣuntˈo mˈiss betse.
—las mi mˈaðɾe pensˈaβa en akˈel momˈɛnto en ˈotɾa kˈosa.
—¿kˈe se ˈan ˈetʃo las kˌoɾnˈexas?
—dˌesðe ke estˈamos akˈi nˈo las ˈemos bˈisto, ɾɾˌespondjˈo mi mˈaðɾe.
—kɾeˈiamos —mɾ.
kˌoppeɾfjˈelð kɾeˈia ke ˈuna nˌumeɾˈosa famˈilja de kˌoɾnˈexas poβlˈaβa ˈestos ˈaɾβoles, pˌeɾo los nˈiðos ˈeɾan antˈiɣwos i aθˈia mˈutʃo tjˈempo ke los pˈaxaɾos los aβˈian ˌaβandonˈaðo.
ˈese detˈaʎe pˈinta ˌadmiɾˈaβlemˈente a daβˈid kˌoppeɾfjˈelð de la kaβˈeθa a los pjˈes.
ʎamˈaɾ a ˈuna kˈasa ɾɾˈukeɾˈi, kwˌando en ˈeʎa nˈo eksˈiste nˈi ˈuna kˌoɾnˈexa, sˌuponˈeɾ ke ˈaɪ pˈaxaɾos pˌoɾke eksˈisten nˈiðos.
i si aβˈeis benˈiðo pˌaɾa aβlˈaɾme mˈal de ˈel —mi pˈoβɾe mˈaðɾe, a lo ke supˈɡo,
tˈuβo poɾ ˈun momˈɛnto la iðˈea de ponˈeɾ kˈoto a las ˌimpeɾtinˈɛnθjas de mi tˈia, ke nˈo ˈeɾa muxˈeɾ ke se dexˈaβa dˌominˈaɾ tˈan fˈaθilmˈente.
pˌeɾo aˈun nˈo aβˈia ˌakaβˈaðo de ˌaɾtikulˈaɾ su pɾimˈeɾa fɾˈase kwˌando el esfwˈeɾθo ˌaβasaʎˈando su balˈoɾ le pɾoðˈuxo ˈuna kɾˈisis neɾβjˈosa.
—¿kˈomo se ʎˈama bwˌestɾa kɾiˈaða?
—pɾˌeɣuntˈo mi tˈia tiɾˈando al mˈismo tjˈempo del koɾðˈon de la kˌampanˈiʎa.
—pˌeɣotj, tˌaɾtamˌuðeˈo mi mˈaðɾe.
—pˌeɣotj, dixˈisteɪs.
bˈajja ˈun nˈombɾe pˌaɾa ˈuna peɾsˈona kɾistjˈana.
ˈes su ˌapeʎˈiðo, —ɾɾˌeplikˈo mi mˈaðɾe.
mi maɾˈiðo la ʎamˈaβa asˈi pˌoɾke su nˈombɾe de pˈila ˈeɾa lo mˈismo ke el mˈio.
la kɾiˈaða ˌapaɾeθjˈo i la tˈia le dˈixo.
—pˌeɣotj, bwˌestɾa ˈama estˈa ˈalɣo ˌindispwˈesta.
ˈˈuna tˈaθa de tˈe sin peɾðˈeɾ el tjˈempo miɾˈando las mˌusaɾˈaɲas.
aβjˈɛndo dˈaðo ˈesta ˈoɾðen, kˌomo si en la kˈasa se uβjˈese ɾɾˌekonoθˈiðo sjˈempɾe su ˌaʊtoɾiðˈað sˌoβeɾˈana,
i dexˈando a pˌeɣotj ke se fwˈese a kumplˈiɾ lo mandˈaðo, mˈiss betse bolβjˈo a ˌokupˈaɾ su pwˈesto al lˈaðo del alˈumbɾe i kɾuθˈo ˈambas mˈanos sˌoβɾe ˈuna de sus ɾɾoðˈiʎas.
—nˈo dˈuðo —dˈixo la bjˈexa, kˌomo si pɾˌosiɣjˈese ˈuna kˌombeɾsaθjˈon ˌinteɾɾumpˈiða.
—nˈo dˈuðo ke tendɾˈeis ˈuna ˈixa.
—pwˈes bjˈen, a paɾtˈiɾ del momˈɛnto de su nˌaθimjˈɛnto, ˈesa ˈixa… —kiθˈas ˈeɾa ˈun nˈiɲo.
—se ˌatɾeβjˈo a ˌinsinwˈaɾ mi mˈaðɾe.
—os dˈiɣo —ɾɾˌeplikˈo mˈiss betse—ke dˈeβe sˈer ˈuna ˈixa.
—tɾatˈad de nˈo kˌontɾaðeθˈiɾme.
asˈi ke, nˈaθka, os dˈiɣo kjˈeɾo pɾoβˈaɾle mi ˌamistˈað.
seɾˈe su maðɾˈina i la pondɾˈeis poɾ nˈombɾe betse tɾotwˈooð kˌoppeɾfjˈelð.
i nˈo tjˈene ke aβˈeɾ eŋɡˈaɲos en la bˈiða de ˈesta betse tɾotwˈooð.
nˈo se bˌuɾlaɾˈan de sus ˌafekθjˈones, nˈo, ˈixa mˈia.
se la ˌeðukaɾˈa bjˈen i saβɾˈa ke nˈo ˈes pɾeθˈiso dˈaɾ su kˌoɾaθˈon a kjen nˈo lo meɾˈeθe.
ʝˈo mˈisma me ˌenkaɾɣˈe de ˈeʎo, si tˈal.
mi mˈaðɾe, dˌemasjˈaðo kˌonmoβˈiða pˌaɾa aβˈeɾ poðˈiðo ˌanaliθˈaɾ kon sˌeɣuɾiðˈad tˈoðas las ˌimfleksjˈones de bˈoθ de mi tˈia,
kɾejjˈo kˌompɾendˈeɾ, sin embˈɣo, ke en akˈeʎa ˌokasjˈon ˌaluðˈia a antˈiɣwos ɾɾekwˈeɾðos pˌeɾsonˈales.
—¿i daβˈid se poɾtˈo bjˈen kon bˈos?
—pɾˌeɣuntˈo mˈiss betse despwˈes de ˈuna lixˈeɾa pˈaʊsa.
—¿biβˈisteɪs en bwˈena ˌintelixˈɛnθja?
ˈeɾamos mˈujj felˈiθes, ɾɾˌespondjˈo mi mˈaðɾe.
mi maɾˈiðo nˈo pˈuðo sˈer mexˈoɾ pˌaɾa konmˈiɣo.
ˈa, os mˌimaɾˈia, supˈɡo, dˈixo mˈiss betse.
sˌoβɾe tˈoðo ˈoɪ ke me ˈaʎo sˈola en el mˈundo, ɾɾˌespondjˈo mi mˈaðɾe ɾɾompjˈɛndo a ʎoɾˈaɾ.
—bˈajja, nˈo ʎoɾˈeis.
bjˈen se bˈe ke os ʎeβˈaβaɪs kˌomo ˈunos ˈaŋxeles.
poɾ ˈeso os ˈe dˌiɾixˈiðo ˈesta pɾeɣˈunta.
¿ˈeɾaɪs wˈeɾfana, beɾðˈad?
—¿e ˌinstitutɾˈiθ?
ˈeɾa ˌinstitutɾˈiθ en ˈuna kˈasa a dˌonde solˈia ˈiɾ de bisˈita de kwˌando en kwˌando, mɾ.
tˈuβo la bondˈad de fixˈaɾ su ˌatenθjˈon en mˈi.
me aβlˈo ˌamistˈosamˈente i me pɾopˈuso kasˈaɾse konmˈiɣo.
—ˌaθepːtˈe i nos kasˈamos, ɾɾˌespondjˈo mi mˈaðɾe kon ˌiŋxenwiðˈað.
ˈa, pˈoβɾe nˈiɲa, ˌaɲaðjˈo mˈiss betse en bˈoθ bˈaxa i miɾˈando al fwˈeɣo kon ˈaɪɾe ˌensimismˈaðo.
¿kˈe ˈes lo ke saβˈeis?
—nˈo os kompɾˈɛndo.
kwiðˈˈuna kˈasa, poɾ exˈemplo.
tˈemo ke nˈo sˈepa lo sˌufiθjˈɛnte, kˌomo ʝˈo kisjˈeɾa, pˌeɾo mɾ.
fˈalta le aθˈia ˌapɾendˈeɾ pɾimˈeɾo, ˌeksklamˈo mˈiss betse en fˈoɾma de paɾˈentesis.
ˈeo ke uβjˈeɾa ˌapɾoβetʃˈaðo, poɾ el desˈeo ke tenˈia de ˌapɾendˈeɾ i poɾ la paθjˈɛnθja kon ke me ˌinstɾuˈia,
si la desɣɾˈaθja de su mwˈeɾte al ʎeɣˈaɾ akˈi, mi mˈaðɾe pɾˌoɾɾumpjˈo de nwˈeβo ˌensoʎˈosos i nˈo pˈuðo kˌontinwˈaɾ.
—bˈajja, nˈo ʎoɾˈeis, dˈixo mˈiss betse.
os bˈaɪs a ponˈeɾ mˈala i nˈo aɾˈeis ɡɾˈan bjˈen a mi ˌixˈaða.
ˈeste ˈultimo ˌaɾɣumˈɛnto pˌaɾeθjˈo kalmˈaɾ alɣˈun tˈanto a mi mˈaðɾe.
ɾɾeɪnˈo ˈun momˈɛnto de pˈaʊsa i mi nˈoβle tˈia kˌontinwˈo kon los pjˈes en los moɾˈiʎos de la tʃˌimenˈea.
daβˈid, pɾˌosiɣjˈo, aβˈia kompɾˈaðo ˈuna ˌanwaliðˈað, seɣˈun me ˈan ˌaseɣˈaðo.
—¿kˈe ˈa ˈetʃo poɾ bˈos?
kˌoppeɾfjˈelð —ɾɾˌespondjˈo la ˌinteɾpelˈaða, aθjˈɛndo ˈun pˌoðeɾˈoso esfwˈeɾθo—
ˈa sˈiðo lo bastˈante bwˈeno pˌaɾa ˌaseɣˈaɾme ˈuna pˈaɾte de dˈitʃa ɾɾˈɛnta.
—kinjˈɛntas lˈiβɾas ˌesteɾlˈinas.
—uβjˈeɾa poðˈiðo aθˈeɾ mˈenos, —ˌaɲaðjˈo mˈiss betse.
al ʎeɣˈaɾ akˈi, ɾɾˌeðoβlˈaɾon los soʎˈoθos de mi mˈaðɾe.
pˌeɣotte, ke entɾˈaβa en akˈel momˈɛnto kon ˈuna tˈaθa de tˈe en ˈuna mˈano i ˈun kˌandelˈeɾo en la ˈotɾa aʎˈo tˈan mˈal a su seɲˈoɾa,
kˈosa ke uβjˈeɾa notˈaðo fˈaθilmˈente mˈiss betse a estˈaɾ mexˈoɾ ˌalumbɾˈaða a la estˈanθja, ke se ˌapɾesuɾˈo a ʎeβˈaɾla a su kˈama.
luˈeɣo, ʎamˈando a su soβɾˈino, sˈean pˌeɣotte, ke aθˈia alɣˈunos dˈias se aʎˈaβa ˌeskondˈiðo en la kˈasa sin ke lo supjˈese su mˈaðɾe,
le dˈixo, ˈið kˌoɾɾiˈɛndo en bˈuska del mˈeðiko i de la ˌemfeɾmˈeɾa.
ˈuno i ˈotɾa ˌasombɾˈaɾonse ˈun pˈoko kwˌando ʎeɣˈaɾon sˌuθesˈiβamˈente, kon alɣˈunos minˈutos de ˌinteɾβˈalo, al aʎˈˈuna seɲˈoɾa dˌeskonoθˈiða,
de ɾɾˈostɾo ˌimponˈɛnte, sentˈaða en fɾˈɛnte del alˈumbɾe, kon ˈun sombɾˈeɾo ke kolɣˈaβa del bɾˈaθo deɾˈetʃo i ˌokupˈaða en ˌintɾoðuθˈiɾse ˌalɣˈon en las oɾˈexas.
kˌomo pˌeɣotte nˈo saβˈia kjˈen ˈeɾa i su mˈaðɾe nˈo deθˈia nˈaða, la dˌeskonoθˈiða se keðˈo en la sˈala sin ke nˈaðje se ˌokupˈase de ˈeʎa.
el doktˈoɾ, al bˈeɾla en el mˈismo sˈitjo kˈaða bˈeθ ke suβˈia o baxˈaβa del kwˈaɾto de la emfˈeɾma,
kɾejjˈo ke benˈia poɾ iðˈentiko motˈiβo ke ˈel i la dˌiɾixjˈo ˈuna fɾˈase de kˌoɾtesanˈia.
ˈeɾa el ˈombɾe mˈas tˈimiðo i melˈoso, ˌeskiβˈandose kontˈinwamˈente i ˌaβandonˈando su pwˈesto poɾ temˈoɾ de sˈer ˌimpoɾtˈuno.
en bˈeθ de andˈaɾ, pwˈeðe deθˈiɾse ke se ˌeskuɾɾˈia sin ɾɾuˈiðo i mˈas lˈɛntamˈente ke el espˈektɾo de amlˈet.
kon la kaβˈeθa ˌenkoxˈiða ˌentɾe los ˈombɾos, kon la ˌekspɾesjˈon de ˈuna moðˈestja ke peðˈia peɾðˈon,
poɾ nˈaða de ˈeste mˈundo uβjˈeɾa dˈitʃo ˈuna palˈaβɾa dˈuɾa i dˌesaɣɾaðˈaβle nˈi a ˈun pˈeɾɾo,
poɾ mˈas ke fwˈese ˈun pˈeɾɾo ɾɾaβjˈoso.
pensˈo ke a mi tˈia le dolˈian los oˈiðos i le pɾˌeɣuntˈo kon ˈun aθˈɛnto sˈumamˈente melˈoso si sufɾˈia de alɣˈuna ˌiɾɾitaθjˈon lokˈal.
—¿i kˈe djˈaβlo sˌiɡnifˈika ˈeso?
—ɾɾˌespondjˈo mi tˈia tˈan bɾˈuskamˈente ke el doktˈoɾ siʎˈiβ, kˌomo eɾˈiðo de mutˈismo, fwˈe a sentˈaɾse al lˈaðo del alˈumbɾe.
en bɾˈeβe fwˈe ʎamˈaðo de nwˈeβo al lˈaðo de mi mˈaðɾe, dˌonde pˌeɾmaneθjˈo alɣˈunos instˈantes.
suβjˈo, bolβjˈo a baxˈaɾ i kwˌando se ˌeskuɾɾiˈo poɾ ˈultima bˈeθ en la sˈala kɾejjˈo tenˈˈun maɡnˈifiko pɾetˈeksto pˌaɾa ɾɾˌenoβˈaɾ la kˌombeɾsaθjˈon.
—seɲˈoɾa, tˈɛŋɡo el majjˈɡˈusto en dˈaɾos mi ˌenoɾaβwˈena.
—se pwˈeðe saβˈeɾ poɾ kˈe, —ɾɾˌeplikˈo mi tˈia seβˈeɾamˈente.
el doktˈoɾ kɾejjˈo aβˈeɾ paɾtˈiðo de lixˈeɾo, ˌolβiðˈando la ˌintɾoðukθjˈon ˌimbaɾjˈaβle de tˈoðos sus diskˈuɾsos.
ˈantes de kˌontinwˈaɾ bolβjˈo a sˌaluðˈaɾ kon majjˈoɾ ɾɾespˈeto si kˈaβe ke la pɾimˈeɾa bˈeθ.
—seɲˈoɾa, tɾˌankilˈˈaos.
felˈiθ ʝˈo ke pwˈeðo dˈaɾos la ˌenoɾaβwˈena.
ʝˈa nˈo tenˈeis ke temˈeɾ nˈaða.
en ˈuna de ˈestas fɾˈases ˌenroʎˈose el doktˈoɾ i mi tˈia kˌontinwˈaβa miɾˈandole ɾɾˌepɾimjˈɛndo kon mˈutʃo tɾaβˈaxo su ˌimpaθjˈɛnθja,
ˌasta ke poɾ fˈin mɾ.
siʎˈiβ ˌeksklamˈo pˌaɾa tˌeɾminˈaɾ su diskˈuɾso.
—felˈiθ ʝˈo ke pwˈeðo deθˈiɾos.
ʝˈa se ˌakaβˈo tˈoðo, komplˈetamˈente tˈoðo.
—¿i kˈomo estˈa la mˈaðɾe?
—pɾˌeɣuntˈo mi tˈia kɾuθˈandose de bɾˈaθos i sin ˌaβandonˈaɾ el sombɾˈeɾo.
—mˈujj bjˈen, seɲˈoɾa, i espˈeɾo ke kˈaða bˈeθ sˌeɣˈa mexˈoɾ.
bˈa tˈoðo lo bjˈen ke pwˈeðe ˈˈuna xˈoβen pɾˌimeɾˈiθa en su sˌitwaθjˈon.
¿poðˈeis bˈeɾla sin ˌinkombenjˈɛnte a niŋɡˈuno?
—pɾˌeɣuntˈo mi tˈia kon la mˈisma ˌaspeɾˈeθa.
el doktˈoɾ siʎˈiβ ˌenkoxjˈo la kaβˈeθa ˌentɾe los ˈombɾos ˈun pˈoko mˈas ke de kostˈumbɾe.
—la tʃˌikitˈina, la ɾɾeθjˈen naθˈiða, ɾɾepˈito.
—seɲˈoɾa —ɾɾˌeplikˈo el doktˈoɾ—kɾeˈia ke saβˈiaɪs ke nˈo ˈes ˈuna nˈiɲa, sˈino ˈun nˈiɲo.
mi tˈia nˈo pɾˌonunθjˈo nˈi ˈuna sˈilaβa, pˌeɾo koxjˈɛndo su sombɾˈeɾo poɾ las θˈintas a manˈeɾa de ˈuna ˈonda,
ˌamenaθˈo kon ˈel la kaβˈeθa del doktˈoɾ.
se lo ˌenkasketˈo a tɾaβˈes en la sˈujja, saljˈo i nˈo bolβjˈo mˈas.
dˌesapˌaɾeθjˈo kˌomo nˈaða ˌenoxˈaða, o kˌomo ˈuno de ˈesos espˈiɾitus ke estˈaβa pɾˌeðestinˈaðo a bˈeɾ, seɣˈun el ɾɾumˈoɾ pˌopulˈaɾ.
sann pˌeɣotj pɾˌetendˈia aβˈeɾse ˌenkontɾˈaðo kon ˈeʎa a la pwˈeɾta de la kˈasa sin poðˈeɾ kˌompɾendˈeɾ klˈaɾamˈente lo ke le pɾˌeɣuntˈaɾa mˈiss betse,
ke le ˌaplikˈo ˈun pˈaɾ de pˌeskoθˈones pˌaɾa ˌaɣˈaɾ su ˌintelixˈɛnθja.
la tˈia del mutʃˈatʃo ˌafiɾmˈo a la maɲˈana siɣjˈɛnte ke sann tenˈia los kaɾɾˈiʎos kˌomo ˈuna ˌamapˈola a kˌonsekwˈɛnθja de la ˌinteɾɾˌoɣaθjˈon de la bwˈena seɲˈoɾa.
mi bwˈena tˈia nˈo bolβjˈo, nˈo tˈal.
ʝˈo me ʎˈaβa en mi kˈuna i mi mˈaðɾe en su kˈama.
mˈiss betse tɾotwˈooð kˌoppeɾfjˈelð, la sˌoβɾinˈita ke mi tˈia aβˈia ˌespeɾˈaðo ˌasta las dˈoθe de la nˈotʃe,
pˌeɾmaneθjˈo en el lˈimbo, en ˈesa fˌoɾmiðˈaβle ɾɾexjˈon de dˌonde ʝˈo ʎeɣˈaβa i de dˌonde pɾˌoβenˈian tˈoðos los bjaxˈeɾos de la bˈiða.
la lˈuθ del dˈia pɾˌojjektˈo sus ɾɾˈajjos en la mansjˈon de la nˈaða, i a sus ɾɾeflˈexos mi sˈer dexˈo la inˈeɾθja i bˈino a tomˈaɾ pwˈesto ˌentɾe los moɾtˈales.
kapˈitulo dˈos de daβˈid kˌoppeɾfjˈelð o el soβɾˈino de metˈia de tʃˈaɾles dˌiθjˈens.
ˈesta ɡɾˌaβaθjˈon de lˌiβɾˈiβoks ˈes de domˈinjo pˈuβliko.
kapˈitulo dˈos.
mi mˈaðɾe i pˌeɣotte sˈon pˌaɾa mˈi los dˈos pɾimˈeɾos sˈeɾes ke ɾɾekwˈeɾðan mi memˈoɾja en ˈeste kwˈaðɾo ɾɾˌetɾospektˈiβo.
mi mˈaðɾe kon sus eɾmˈosos kaβˈeʎos i su esβˈelto tˈaʎe.
pˌeɣotte ke nˈo tenˈia tˈaʎe de niŋɡˈuna klˈase, pˌeɾo ke pˌoseˈia ˈunos ɡɾˈandes ˈoxos nˈeɣɾos, ˈunos moflˈetes mˈujj kˌoloɾˈaðos i ˈunos bɾˈaθos mˈas kˌoloɾˈaðos aˈun.
a bˈeθes me ekstɾˈaɲa kˈomo los pˈaxaɾos nˈo akˈuðen a pˌikoteˈaɾlos kon pɾˌefeɾˈɛnθja a las manθˈanas.
se me fiɣˈuɾa estˈaɾ bjˈɛndo mˈujj θˈeɾka de mˈi akˈeʎas dˈos kɾiatˈuɾas, bjˈen ˌaɣatʃˈandose pˌaɾa ke puðjˈeɾa sˌolˈito etʃˈaɾme en sus bɾˈaθos o ponjˈendose de ɾɾoðˈiʎas mjˌentɾas ke ʝˈo ˈiβa de ˈuna a ˈotɾa.
aˈun kɾˈeo sentˈiɾ la ˌimpɾesjˈon de la mˈano ke me ˌalaɾɣˈaβa pˌeɣotte, akˈeʎa mˈano ke la kostˈuɾa aβˈia bwˈelto mˈas ˈaspeɾa ke ˈuna lˈima.
kiθˈas sˈea ˈun kapɾˈitʃo de mi ˌimaxˌinaθjˈon al pensˈaɾ ke nwˌestɾa memˈoɾja pwˈeðe ˈiɾ mˈas aʎˈa de lo ke se kɾˈee xˌeneɾˈalmˈente en lo pasˈaðo,
asˈi kˌomo tambjˈen pjˈɛnso ke mˈutʃos nˈiɲos estˈan dotˈaðos de ˈuna fˌakultˈad de ˌoβseɾβaθjˈon ˈes mˈas,
nˈo se dˈeβe deθˈiɾ ke la majjˈoɾ pˈaɾte de los ˈombɾes ke sˈon notˈaβles ɾɾespˈekto a ˈeste pˌaɾtikulˈˈan ˌadkiɾˈiðo ˈeste dˈon.
ˈantes, poɾ el kontɾˈaɾjo, ˌestaɾˈian mˈas bjˈen dispwˈestos a peɾðˈeɾlo, kon tˈanta mˈas ɾɾaθˈon kwˌanto ke ˈestos mˈismos ˈombɾes konsˈeɾβan θjˈeɾta lˌuθiðˈeθ de iðˈeas i θjˈeɾta pɾˌeðispˌosiθjˈon a sˈer felˈiθes,
ke ˈes ˈotɾa de las eɾˈɛnθjas de su imfˈanθja.
en tˈoðo kˈaso, al xuθɣˈaɾ poɾ mˈi a los demˈas, lo ˈaɣo poɾ aβˈeɾ sˈiðo kwˌando nˈiɲo sˈumamˈente ˌoβseɾβaðˈoɾ,
i ˈoɪ ke sˈoɪ ˈombɾe, ɾɾekwˈeɾðo peɾfˈektamˈente mi bˈiða de la imfˈanθja.
beˈamos de kˈe mˈas me akwˈeɾðo.
de mi kˈasa kon tˈoðos sus ˌeskondˈites.
en el pˈiso bˌaxo se ˈaʎa la koθˈina kˈujja pwˈeɾta dˈa ˈun pˈatjo.
en mˈeðjo de ˈeste pˈatjo, ˈun pˌalomˈaɾ sin palˈomas.
en ˈun ɾɾinkˈon la kasˈeta del pˈeɾɾo, poɾ supwˈesto en ˌinkilˈino.
ˌaðemˈas, ˈuna poɾθjˈon de ˈaβes de tamˈaɲo ɾɾˌespetˈaβle, ʝˈɛndo i binjˈɛndo kon ˈaɪɾe fˌosko i ˌamenˌaθaðˈoɾ.
sˌoβɾe tˈoðo ˈun ɡˈaʎo, suβˈiðo en ˈun maðˈeɾo, ke pˌaɾeθˈia fixˈaɾ tˈoða su ˌatenθjˈon en mˈi kˈaða bˈeθ ke miɾˈaβa a tɾaβˈes de la bentˈana,
kˈosa ke aˈun me ˈaθe temblˈaɾ, pwˈes el tˈal ɡˈaʎo nˈo ˈeɾa nˈaða bwˈeno.
ˈunos kwˌantos pˈaβos ke kˌaminˈaβan kon su ˈaɪɾe dˌeɾɾeŋɡˈaðo i me pˌeɾseɣˈian ˌalaɾɣˈando el peskwˈeθo.
poɾ de nˈotʃe swˈeɲo kon ˈeʎos, kˌomo el dˌomaðˈoɾ de fjˈeɾas swˈeɲa kon sus leˈones.
ˈe akˈi ˈun pasˈiʎo tˈan lˈɣo ke a mˈi se me fiɣˈuɾa ke nˈo tjˈene fˈin,
i ke bˈa de la koθˈina a la pwˈeɾta de la kˈaʎe.
en ˈeste kˌoɾɾeðˈˈaɪ ˈun kwˈaɾto oskˈuɾo ke sˈiɾβe pˌaɾa ɡwaɾðˈaɾ tɾˈastos bjˈexos.
asˈi ke ˈes de nˈotʃe, pˈaso mˈujj depɾˈisa poɾ delˈante del tˈal kwˈaɾto, pwˈes nˈo se a θjˈɛnθja θjˈeɾta lo ke ˈaɪ ˌentɾe los bjˈexos tonˈeles i las kˈaxas de tˈe,
a pesˈaɾ ke eksˈala ˈun olˈoɾ de xaβˈon, pimjˈɛnta, bˈelas de sˈeβo i kafˈe.
tambjˈen ˈaɪ dˈos sˈalas, ˈuna de ˈeʎas pekˈeɲa, dˌonde solˈemos pasˈaɾ las belˈaðas mi mˈaðɾe, pˌeɣotj i ʝˈo.
pwˈes bwˈeno ˈes deθˈiɾ ke pˌeɣotj fˈoɾma nwˌestɾa teɾtˈulja, asˈi ke ˈa ˌakaβˈaðo sus kˌeaθˈeɾes i se mˈaɾtʃa tˈoðo el mˈundo.
ˌenseɣˈiða bjˈene la sˈala pɾˌinθipˈal, en la kwˈal ɾɾˌeθiβˈimos los domˈɡos.
ˌaʊnke ˈesta ˌaβitaθjˈon ˈes majjˈoɾ ke la ˈotɾa, nˈo ˈes tˈan kˈomoða, i pˌaɾa mˈi ɾɾˈeɪna en ˈeʎa ˈuna espˈeθje de lˈuɣuβɾe tɾistˈeθa,
pwˈes pˌeɣotj me ˈa kontˈaðo ke kwˌando los fˌuneɾˈales de mi pˈaðɾe se bjˈo ʎˈena kon tˈanta xˈɛnte kˌomo bˈino,
bestˈiða de lˈuto pˌaɾa ˌakompaɲˈaɾ su ˌataˈuð.
tambjˈen en ˈesta sˈala, θjˈeɾto domˈɡo poɾ la nˈotʃe, mi mˈaðɾe nos leˈia a pˌeɣotj i a mˈi la ɾɾˌesuɾɾekθjˈon del ˈaxaɾo ˌentɾe los mwˈeɾtos.
nˈo konˈoθko nˈaða tˈan bˈeɾðe kˌomo el θˈesped de ˈeste θˌementˈeɾjo, nˈi ˌaɾβolˈeðas mˈas sombɾˈias ke las sˈujjas,
nˈi kˈalma iɣwˈal a la de las lˈosas de los tˈumulos.
ˈi se ˌapaθjˈɛnta el ɡanˈaðo, i kwˌando poɾ la mˌaɲanˈita me pˈɡo de ɾɾoðˈiʎas en mi kˈama pˌaɾa bˈeɾ los koɾðˈeɾos,
distˈɡo el pɾimˈeɾ ɾɾˈajjo de sˈol ke pɾojjˈekta su ɾɾeflˈexo en la esfˈeɾa solˈaɾ i me pɾeɣˈunto ¿ˌestaɾˈa alˈeɣɾe la esfˈeɾa kwˌando mˈaɾka aˈun las ˈoɾas?
mˈas aʎˈa se enkwˈɛntɾa el bˈanko de la iɣlˈesja, ˈun bˈanko kon ˈun ɾɾespˈalðo ˈalto, kˌolokˈaðo xˈunto a ˈuna de las bentˈanas bˈaxas,
dˌesðe dˌonde se pwˈeðe bˈeɾ nwˌestɾa kˈasa duɾˈante el seɾβˈiθjo, lo kwˈal eksplˈika la kostˈumbɾe de pˌeɣotj ke miɾˈaβa fɾekwˈɛntemˈente ˌaθja akˈel lˈaðo pˌaɾa ˌaθeɾkˈaɾse de ke nˈo ˈaɪ laðɾˈones nˈi fwˈeɣo.
pˌeɾo aˈun kwˌando ˈeʎa mˈiɾa a ˈuno i ˈotɾo lˈaðo, se emfˈaða si ʝˈo bwˈelβo la kaβˈeθa i me ˈaθe sˈeɲas pˌaɾa ke nˈo sepˈaɾe la bˈista del minˈistɾo ke ofˈiθja.
nˈo pwˈeðo miɾˈaɾle kontˈinwamˈente pˌoɾke le konˈoθko bastˈante kon sˌoβɾepeʎˈiθos en ˈeʎa, i de kwˌando en kwˌando me ˈetʃa ˈunas miɾˈaðas,
mˈiɾo a mi mˈaðɾe ke ˈaθe kˌomo ke nˈo me bˈe, luˈeɣo a ˈun tʃikˈiʎo ke me ˈaθe ˈuna mwˈeka,
al ˈotɾo lˈaðo del pˈoɾtiko bˈeo ˈun kaɾnˈeɾo ke paɾˈeθe keɾˈeɾ entɾˈaɾ en la iɣlˈesja i me sjˈɛnto dispwˈesto a ɡɾitˈaɾle ke se bˈajja,
pˌeɾo ¿kˈe seɾˈia de mˈi si tˈal iθjˈeɾe?
ˈun pˈoko mˈas lˈexos estˈa el pˈulpito.
ˈi sˈi ke poðɾˈia xˌuɣˈaɾse bjˈen.
¡kˈe ɡˈoθo si ʝˈo me bjˈeɾa en akˈeʎa fˌoɾtalˈeθa i binjˈeɾa ˈuno de mis kˌompaɲˈeɾos a ponˈeɾme sˈitjo!
le tˌiɾaɾˈia el koxˈin de tˌeɾθjopˈelo del pɾˌeðikaðˈoɾ.
ˌinsensˈiβlemˈente, a fwˈeɾθa de miɾˈaɾ, θjˌeɾɾanse mis ˈoxos i mis oˈiðos nˈo ˈojjen a fwˈeɾθa de aθˈeɾ kˌomo ke eskˈutʃo al minˈistɾo ke kˈanta ˈun sˈalmo kon bˈoθ dˌesafinˈaða de bˌaxo pɾofˈundo.
me kˈaɪɣo del bˈanko metjˈɛndo ˈun estɾˈepito ˌimfeɾnˈal i peɣˈote me leβˈanta del swˈelo mˈas mwˈeɾto ke bˈiβo.
aˈoɾa bˈeo la fatʃˈaða de nwˌestɾa kˈasa i las bentˈanas ɾɾˌoðeˈaðas de ˈun ˌenrexˈaðo de maðˈeɾa.
distˈɡo el pˌaɾteɾ, el θˈesped i los ˈaltos ˈalamos kon sus nˈiðos de kˌoɾnˈexas.
ˌatɾaβjˈeso el pasˈiʎo i la koθˈina.
me ɾɾeˈuno en la wˈeɾta kon mi mamˈa i mjˌentɾas kˈoxe la fɾˈuta maðˈuɾa de la ˌempaliθˈaða,
ɾɾˈoβo a ˌoɾtaðiʎas alɣˈuna ke ˈotɾa ɡɾosˈeʎa.
en el imbjˈeɾno xuɣˈamos en la sˈala mˈas pekˈeɲa.
kwˌando mi mˈaðɾe se kˈansa, se sjˈɛnta en la butˈaka.
alɣˈunas bˈeθes se diɾˈixe al espˈexo.
en soɾtˈixa en los dˈeðos los ɾɾˈiθos de su eɾmˈosa kˌaβeʎˈeɾa.
se axˈusta su esβˈelto tˈaʎe i bjˈen sˈe ke nˈo le emfˈaða el aʎˈaɾse sjˈempɾe bonˈita.
ˌaɲaðiɾˈe a ˈestas pɾimˈeɾas ˌimpɾesjˈones el sˌentimjˈɛnto de ˈun bˌeɾðaðˈeɾo ˌasθendjˈɛnte ke pˌeɣotj ˌexeɾθˈia sˌoβɾe mi mˈaðɾe i sˌoβɾe mˈi.
la kˌonsultˈaβamos a pɾopˈosito de tˈoðo i ˌasta le tenˈiamos θjˈeɾto mjˈeðo.
ˈun dˈia pˌeɣotj i ʝˈo nos aʎˈaβamos sentˈaðos los dˈos al lˈaðo del alˈumbɾe, pwˈes mi mˈaðɾe aβˈia ˈiðo de bisˈita a kˈasa de ˈuna beθˈina.
leˈiale ˈun kapˈitulo sˌoβɾe los kˌokoðɾˈilos i ˈun pˈoko poɾ fˈalta del lektˈoɾ i ˈotɾo pˈoko poɾ fˈalta de ˌintelixˈɛnθja.
estˈoɪ seɣˈuɾo de ke pˌeɣotj nˈo poðˈia deθˈiɾ a pˈunto fˈixo si el kˌokoðɾˈilo ˈeɾa ˈun ˌanimˈal o ˈuna leɣˈumbɾe ˌekstɾaˌoɾðinˈaɾja,
kwˌando en ˈesto se ˌapoðeɾˈo de mˈi el swˈeɲo, pˌeɾo nˈo keɾˈia ˌakostˈaɾme poɾ nˈaða de ˈeste mˈundo.
tɾatˈe de ɾɾˌesistˈiɾ al swˈeɲo miɾˈando fˈixamˈente a pˌeɣotj, kˈujjo tˈaʎe tomˈaβa kˈaða bˈeθ a mis ˈoxos majjˈoɾes pɾˌopoɾθjˈones i se me pɾˌesentˈaβa kˌomo ˈun bˌeɾðaðˈeɾo xiɣˈante.
me fɾotˈe los ˈoxos i apˈenas sˈi poðˈia aβɾˈiɾ los pˈaɾpaðos, nˈo peɾðjˈɛndo de bˈista nˈi mi kɾiˈaða nˈi el kˈaβo de θˈeɾa ke el ˈilo ʎenˈaβa de sˈuɾkos,
nˈi su θˈinta pˌaɾa meðˈiɾ, nˈi su θˈesta de kostˈuɾa, en kˈujja tˈapa aβˈia dˌiβuxˈaða la kˌateðɾˈal de sˈan pˈaβlo kon su kˈupula ˌenkaɾnˈaða,
nˈi el deðˈal de kˈoβɾe ke la ɾɾˌesɣwaɾðˈaβa de las pˌikaðˈuɾas de la aɣˈuxa.
pˌeɾo sentˈi ke pˌaɾa nˈo sˌukumbˈiɾ nˌeθesitˈaβa ˈun nwˈeβo esfwˈeɾθo i dˌiɾixˈi bɾˈuskamˈente a pˌeɣotj ˈesta sˌiŋɡulˈaɾ pɾeɣˈunta.
—¿pˌeɣotj, aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?
djˈos de mi bˈiða, dˈime kjˈen djˈaβlos te ˈa aβlˈaðo de kˌasamjˈɛnto.
pˌeɣotj se ˌestɾemeθjˈo de tˈal mˈoðo ke ʝˈo me dˌespeɾtˈe del tˈoðo.
dexˈo de kosˈeɾ i me miɾˈo sin soltˈaɾ la aɣˈuxa de su mˈano.
—¿aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?
—sˈoɪ ˈuna ɡwˈapa tʃˈika, ¿nˈo ˈes ˈesto?
a deθˈiɾ beɾðˈad, se me fˌiɣˈaβa ke ˈeɾa ɡwˈapa, de ˈuna beʎˈeθa dˌifeɾˈɛnte a la de mi mˈaðɾe,
pˌeɾo en su xˈeneɾo nˈo aβˈia nˈaða ke peðˈiɾ.
su ˌenθendˈiðo kˈutis me pˌaɾeθˈia tˈan bɾiʎˈante kˌomo el fˈondo de ˈun tˌaβuɾˈete de tˌeɾθjopˈelo ˌenkaɾnˈaðo en ke mi mˈaðɾe aβˈia boɾðˈaðo ˈunas flˈoɾes.
tˈal bˈˈeɾa ˈun pˈoko mˈas swˈaβe el tˈakto, pˌeɾo ˈesta ˈeɾa su ˈunika dˌifeɾˈɛnθja.
—kon ke sˈoɪ eɾmˈosa, dˈixo pˌeɣotj.
ˈo, nˈo, ˈixo mˈio, pˌeɾo ¿kjˈen djˈaβlos te ˈa aβlˈaðo de kˌasamjˈɛntos?
—nˈo sˈe, ɾɾˌeplikˈe.
dˈime si se pwˈeðe kasˈˈuno kon bˈaɾjas peɾsˈonas a ˈun mˈismo tjˈempo.
—nˈo tˈal, ɾɾˌespondjˈo pˌeɣotj sin bˌaθilˈaɾ.
pˌeɾo kwˌando se ˈa mwˈeɾto la peɾsˈona kon kjen ˈuno se ˈa kasˈaðo, pwˈeðe el ke sˌoβɾeβˈiβe kasˈaɾse ˈotɾa bˈeθ.
—se pwˈeðe, si se kjˈeɾe, ɾɾˌeplikˈo la xˈoβen.
ˈeso depˈɛnde de la ˌopinjˈon de kˈaða ˈuno.
—¿i kwˈal ˈes bwˌestɾa ˌopinjˈon?
ˌaɲaðˈi kon tˈanta majjˈoɾ kˌuɾjosiðˈad kwˌanto ke ˈeʎa me ˌeksaminˈaβa kon ɡɾˈan ˌatenθjˈon.
pˌeɣotj dexˈo de fixˈaɾ sus nˈeɣɾos ˈoxos en los mˈios, pˈusˈose a kosˈeɾ i ˌeksklamˈo despwˈes de tˌituβeˈˈun pˈoko.
—tˈoðo lo ke pwˈeðo deθˈˈes ke nˈunka ˈe estˈaðo kasˈaða i ke xamˈas me kˌasaɾˈe.
—bˈeo ke estˈais de mˈal umˈoɾ, pˌeɣotj.
le dˈixe i ɡwaɾðˈe silˈɛnθjo, kɾejjˈɛndo en efˈekto ke la aβˈia kˌontɾaɾjˈaðo.
pˌeɾo me ˌeŋɡˈaβa, pˌoɾke duɾˈante alɣˈunos minˈutos tɾatˈo de tɾˌaβaxˈaɾ i nˈo kˌonsiɣjˈɛndˈolo ˌaβɾiˈo de ɾɾepˈɛnte sus bɾˈaθos,
i me atɾˈaxo a ˈeʎos besˈando ɾɾˌepetˈiðas bˈeθes mi ɾɾiθˈaða kˌaβeʎˈeɾa.
ˌapeɾθiβˈime de la ˌeneɾxˈia de sus kaɾˈiθjas al bˈeɾ ke saltˈaβan dˈos botˈones de su bestˈiðo,
pwˈes kˌomo nˈo aβˈia wˈeko poɾ niŋɡˈun lˈaðo, kwalkjˈeɾ ˌexeɾθˈiθjo le ˌeksponˈia sˌemexˈante ˌinkombenjˈɛnte.
—bˈamos, ˌeksklamˈo.
siɣˈamos la istˈoɾja de los kˌokoðɾˈilos.
nˈo pˈuðe kˌompɾendˈeɾ poɾ kˈe pˌeɣotj mostɾˈaβa tˈanta tˌuɾβaθjˈon i dˌeseˈaβa bolβˈeɾ a los kˌokoðɾˈilos, seɣˈun ˈeʎa los ʎamˈaβa.
nˈo oβstˈante, seɣˈimos lejjˈɛndo la istˈoɾja de ˈestos mˈonstɾuos, o mexˈoɾ dˈitʃo, biβˈimos en su kˌompaɲˈia poɾ espˈaθjo de mˈeðja ˈoɾa.
dexˈamos sus wˈeβos en la aɾˈena pˌaɾa ke el sˈol puðjˈeɾa ˌempoʎˈaɾlos.
nos bˈimos pˌeɾseɣˈiðos poɾ el pˈaðɾe i la mˈaðɾe, kˈujja kˈoleɾa buɾlˈamos dˈando bwˈeltas, kˈosa ke nˈo poðˈian aθˈeɾ kˌomo nosˈotɾos a kˈaʊsa de la pˌesaðˈeθ de sus mˌoβimjˈɛntos.
ˌenseɣˈiða les pˌeɾseɣˈimos a nwˌestɾa bˈeθ en el ˈaɣwa kon los kˌaθaðˈoɾes indˈixenas.
les ˌintɾoðuθˈimos a buθˈaðos pˈintʃos en la bˈoka.
en ˈuna palˈaβɾa, nˈo taɾðˈamos en ˌapɾendˈeɾ de memˈoɾja tˈoða la istˈoɾja de los kˌokoðɾˈilos, al mˈenos ʝˈo,
pwˈes en kwˌanto a pˌeɣotj se me fˌiɣˈaβa ke poɾ momˈɛntos pˌaðeθˈia alɣˈunas dˌistɾakθjˈones i se pikˈaβa los dˈeðos kon la aɣˈuxa.
ˌiβamos a kˌontinwˈaɾ nwˌestɾa lektˈuɾa kwˌando ʎamˈaɾon a la pwˈeɾta.
la ke ʎeɣˈaβa ˈeɾa mi mˈaðɾe i benˈia ˌakompaɲˈaða de ˈun kˌaβaʎˈeɾo de pˌatiʎas nˈeɣɾas, ke ɾɾˌekonoθˈi poɾ aβˈeɾnos ˌakompaɲˈaðo ʝˈa el domˈɡo ˌanteɾjˈoɾ dˌesðe la iɣlˈesja ˌasta nwˌestɾa kˈasa.
kwˌando mi mamˈa en el dintˈel de la pwˈeɾta me koxjˈo en sus bɾˈaθos i me besˈo,
el kˌaβaʎˈeɾo dˈixo ke ʝˈo ˈeɾa mˈas felˈiθ poɾ mi pɾˌiβilˈexjo ke ˈun monˈaɾka o ˈuna kˈosa pˌaɾeθˈiða,
pwˈes dˈeβo kˌomfesˈaɾ ke a mi memˈoɾja bjˈene a ˌajjuðˈaɾ mi ˌekspeɾjˈɛnθja sˌuβsiɣjˈɛnte.
kˈiso, poɾ su pˈaɾte, ˌakaɾiθjˈaɾme poɾ enθˈima del ˈombɾo de mi mˈaðɾe, pˌeɾo malðˈita la sˌimpatˈia ke sentˈi ˌaθja ˈel i poɾ su ˈaspeɾa bˈoθ.
tˈuβe θˈelos al notˈaɾ ke su mˈano ɾɾoθˈaβa a mi mˈaðɾe i la sˌepaɾˈe kwˌanto me fwˈe posˈiβle.
¿kˈomo se entjˈɛnde, daβˈid?
dˈixo mi mˈaðɾe kon ˈaɪɾe de ɾɾepɾˈotʃe.
keɾˈiðo nˈiɲo, ˌeksklamˈo el kˌaβaʎˈeɾo, nˈo pwˈeðo ˌenoxˈaɾme de su θˌelofilˈeal.
xamˈas aβˈia bˈisto ˈun kaɾmˈin tˈan suβˈiðo en las mexˈiʎas de mi mˈaðɾe.
ɾɾiɲˈo me kˌaɾiɲˈosamˈente i, al mˈismo tjˈempo ke me ˌestɾetʃˈaβa kˈontɾa su kˌoɾaθˈon, djˈo las ɡɾˈaθjas a akˈel seɲˈoɾ poɾ la molˈestja de aβˈeɾla ˌakompaɲˈaðo.
ˈes pɾeθˈiso ke nos dˈemos las bwˈenas nˈotʃes, ˈixo mˈio—dˈixo el kˌaβaʎˈeɾo, ke a su bˈeθ koxjˈo la mˈano de mi mˈaðɾe i besˈo el ɡwˈante ke la kuβɾˈia.
—bwˈenas nˈotʃes—le ɾɾˌespondˈi.
—bˈamos, seˈamos bwˈenos amˈiɣos—ɾɾˌepitjˈo el kˌaβaʎˈeɾo ɾɾiɲˈando—bˈɛŋɡa la mˈano.
ʝˈo tenˈia mi mˈano deɾˈetʃa ˌentɾe las mˈanos de mi mˈaðɾe, asˈi fwˈe ke la ˌalaɾɣˈe la ˈotɾa.
—nˈo ˈes ˈesta la bwˈena, daβˈid—ˌoβseɾβˈo el kˌaβaʎˈeɾo sin dexˈaɾ de ɾɾeˈiɾ.
mi mˈaðɾe kˈiso aθˈeɾme dˈaɾ la mˈano deɾˈetʃa, pˌeɾo, kˌomo me aʎˈaβa bjˈen dˌeθiðˈiðo a nˈo dˈaɾ sˈino la iθkjˈeɾða,
el kˌaβaʎˈeɾo ˌakaβˈo poɾ ˌestɾetʃˈaɾla koɾðjˈalmˈente.
ˌenseɣˈiða ɾɾˌepitjˈo ke ʝˈo ˈeɾa ˈuna kɾiatˈuɾa ˌeksθelˈɛnte i se ɾɾˌetiɾˈo.
bˈi ke ɾɾˌeβolβˈia la ˈultima ˌaβenˈiða del xaɾðˈin i ke nos embjˈaβa ˈuna miɾˈaða de dˌespeðˈiða kon sus nˈeɣɾos ˈoxos de mˌalaɣˈeɾo.
la pwˈeɾta, ˈuna bˈeθ θeɾɾˈaða, peɣˈo tˈik ke nˈo aβˈia aβlˈaðo nˈi ˈuna sˈola palˈaβɾa, sˌuxetˈo el bˈaɾɾo de jˈeɾɾo i los tɾˈes entɾˈamos en el salˈon.
ˈi, kˈontɾa su kostˈumbɾe, mi mˈaðɾe, en bˈeθ de sentˈaɾse en su butˈaka, al lˈaðo de la lˈumbɾe,
pˌeɾmaneθjˈo al ˈotɾo ekstɾˈemo de la ˌaβitaθjˈon ˌinstalˈandose en ˈuna sˈiʎa i tˌaɾaɾeˈando.
mjˌentɾas ke aθˈia ɡˌoɾɣˈitos, ˌempeθˈe a doɾmˈiɾme, pˌeɾo mi swˈeɲo fwˈe bastˈante lixˈeɾo pˌaɾa poðˈeɾ oˈiɾ a pˌeɣotj ke,
de pjˈe e inmˈoβil en mˈeðjo del salˈon, kon ˈun kˌandelˈeɾo en la mˈano, deθˈia a mi mˈaðɾe.
—¿os aβˈeis dˌiβeɾtˈiðo ˈesta nˈotʃe, seɲˈoɾa?
—sˈi, ɡɾˈaθjas, pˌeɣotj, bastˈante.
me ˌaβentˈuɾo a ˌaɲaðˈiɾ ke aβˈeis pasˈaðo ˈuna swaɣe ke nˈo uβjˈeɾa kˌomplaθˈiðo mˈutʃo a mɾ.
—ˌeksklamˈo mi mˈaðɾe—me bˌolβeɾˈeis lˈoka.
nˈo ˈaɪ ˈuna muxˈeɾ en el mˈundo ke se bˈea peˈoɾ tɾatˈaða poɾ su kɾiˈaða ke ʝˈo.
nˈo θˈeso de pɾˌeɣuntˈaɾme si sˈoɪ ˈuna tʃikˈiʎa o ˈuna muxˈeɾ bjˈuða.
—nˈaðje iɡnˈoɾa ke aβˈeis sˈiðo kasˈaða, seɲˈoɾa, —ɾɾˌeplikˈo pˌeɣotj—
—en ˈeste kˈaso, ¿kˈomo os ˌatɾeβˈeis?
o, mexˈoɾ dˈitʃo, ¿kˈomo tenˈeis balˈoɾ pˌaɾa aθˈeɾme tˈan dˌesɣɾaθjˈaða i ˌatoɾmentˈaɾme asˈi kwˌando saβˈeis ke nˈo tˈɛŋɡo nˈi ˈuna sˈola amˈiɣa?
—ɾɾaθˈon de mˈas pˌaɾa sˈer mˈas pɾˌekaβˈiða, —dˈixo pˌeɣotj—
—pwˈeðo ˌimpeðˈiɾ, —ˌaɲaðjˈo mi mˈaðɾe—ke sˈean fˈinos i atˈɛntos konmˈiɣo.
ˈes pɾeθˈiso ke me kˈambje, ke me eskˈalðe el ɾɾˈostɾo.
kwalkjˈeɾa diɾˈia ke nˈo dˌeseˈaβaɪs ˈotɾa kˈosa, —ˌaɲaðjˈo mi mˈaðɾe ɾɾompjˈɛndo a ʎoɾˈaɾ i ʝˈɛndo a sentˈaɾse en la butˈaka pˌaɾa ˌakaɾiθjˈaɾme.
ˈa, mi keɾˈiðo daβˈid, pˈoβɾe ˈixo mˈio, tambjˈen seɾˈeis kapˈaθ de deθˈiɾ ke nˈo kjˈeɾo ˈeste tesˈoɾo kwˌando nˈo ˈaɪ kɾiatˈuɾa en el mˈundo mˈas amˈaða.
nˈaðje ˈa dˈitʃo tˈal kˈosa, seɲˈoɾa, —ˌeksklamˈo pˌeɣotj, ˌempeθˈando kon moβˈeɾse—
—lo aβˈeis dˈitʃo, o a lo mˈenos ˈesa ˈa sˈiðo bwˌestɾa ˌintenθjˈon, —pɾˌosiɣjˈo mi mˈaðɾe sin dexˈaɾ de ʎoɾˈaɾ—
pˌeɾo mi ˈixo sˈaβe ke le kjˈeɾo.
daβˈid ɾɾespˈonde, —¿sˈoɪ ˈuna mˈala mˈaðɾe?—
me pɾˌeɣuntˈo al bˈeɾ ke sus kaɾˈiθjas me aβˈian dˌespeɾtˈaðo.
ˈaβla, ˈixo mˈio, —sˈoɪ ˈuna mˈaðɾe ˌeɣoˈista i kɾuˈel.
a ˈesto los tɾˈes nos pusˈimos a sˌoʎoθˈaɾ, ʝˈo mˈutʃo mˈas fwˈeɾte ke mi mˈaðɾe i pˌeɣotj,
ˌaʊnke estˈoɪ seɣˈuɾo ke nwˌestɾas lˈaɣɾimas ˈeɾan iɣwˈalmˈente sinθˈeɾas.
asˈi ke uβˈimos ʎoɾˈaðo lo bastˈante nos fwˈimos a ˌakostˈaɾ.
nˈo bjˈen me aβˈia doɾmˈiðo kwˌando mis ˌoleˈoθos bolβjˈeɾon a dˌespeɾtˈaɾme, i bˈi a mi mˈaðɾe sentˈaða al lˈaðo de mi kˈama.
me koxjˈo en sus bɾˈaθos i akˈeʎa bˈeθ me doɾmˈi de bˈeɾas ˌasta la maɲˈana siɣjˈɛnte.
nˈo pwˈeðo deθˈiɾ si fwˈe el domˈɡo siɣjˈɛnte u ˈotɾo ke bolβˈi a bˈeɾ al kˌaβaʎˈeɾo de las paðˈiʎas nˈeɣɾas.
nˈo ˌaseɣˈuɾo la ˌeksaktitˈuð de mis fˈetʃas, pˌeɾo el kˈaso ˈes ke tˈoðos los domˈɡos le aʎˈaβamos en la iɣlˈesja i nos ˌakompaɲˈaβa a kˈasa.
ˈuna bˈeθ nos ˈiθo ˈuna bisˈita bˌaxo el pɾetˈeksto de bˈˈun xeɾˈanjo ke estˈaβa al balkˈon.
se me fˌiɣˈo ke nˈo ɾɾˌepaɾˈaβa mˈutʃo en el xeɾˈanjo, pˌeɾo ˈantes de ˈiɾse sˌuplikˈo a mi mˈaðɾe ke le djˈeɾa ˈuna mˌatˈita.
ɾɾˌespondjˈole ke poðˈia koxˈeɾla ˈel mˈismo, a lo kwˈal se neɣˈo, ˌinsistjˈɛndo pˌaɾa ke se la djˈese de su mˈano.
mi mˈaðɾe ˌakθeðjˈo i el kˌaβaʎˈeɾo dˈixo ke la kˌonseɾβaɾˈia etˈeɾnamˈente, lo kwˈal me ˈiθo sˌospetʃˈaɾ ke nˈo ˈeɾan ɡɾˈandes sus kˌonoθimjˈɛntos,
pwˈesto ke ˌiɡnoɾˈaβa ke la flˈoɾ sˌepaɾˈaða de su tˈaʎo se mˌaɾtʃitaɾˈia al kˈaβo de ˈuno o dˈos dˈias.
pˌeɣotj nˈo pasˈaβa kon tˈanta fɾekwˈɛnθja las nˈotʃes en nwˌestɾa kˌompaɲˈia.
mi mˈaðɾe la miɾˈaβa kon ɡɾˈan dˌefeɾˈɛnθja, aˈun mˈas ke ˈantes, seɣˈun notˈe, i los tɾˈes kˌontinwˈaβamos sjˈɛndo los mexˈoɾes amˈiɣos del mˈundo.
sin embˈɣo, ˌeksistˈia θjˈeɾta dˌifeɾˈɛnθja, ˈuna espˈeθje de kˌoɾteðˈad ˌindefinˈiβle.
alɣˈunas bˈeθes pˌeɣotj pˌaɾeθˈia ke ɾɾˌepɾotʃˈaβa a mi mˈaðɾe el ke se pusjˈese tˈoðos los lˈindos tɾˈaxes ke ʎenˈaβan sus aɾmˈaɾjos o el ke fwˈese de bisˈita kon fɾekwˈɛnθja a kˈasa de la beθˈina,
pˌeɾo tˈoðo ˈesto me lo ˌeksplikˈaβa ʝˈo ˌimpeɾfˈektamˈente.
pˈoko a pˈoko me ˌakostumbɾˈe a bˈeɾ al kˌaβaʎˈeɾo de las pˌatiʎas nˈeɣɾas, sin keɾˈeɾle poɾ ˈeso mˈas,
sin dexˈaɾ de tenˈeɾ los mˈismos θˈelos, pˌeɾo nˈo me saβˈia dˈaɾ kwˈɛnta de akˈeʎos sˌentimjˈɛntos pˈuɾamˈente ˌinstintˈiβos.
akˈeʎo sˌoβɾepasˈaβa a mi ɾɾˌaθonamjˈɛnto de nˈiɲo.
ˈuna ˌeɾmosˈisima maɲˈana de otˈoɲo me aʎˈaβa en nwˌestɾo paɾtˈeɾɾe kon mi mˈaðɾe, kwˌando mɾ.
muɾston, ke ˈeɾa su nˈombɾe, ʎeɣˈo a kaβˈaʎo.
sˌaluðˈo a mi mˈaðɾe, dˌixˈole ke se dˌiɾixˈia a lˌoˈustoˈe a bˈeɾ a ˈunos amˈiɣos ke le ˌespeɾˈaβan kon su ʝaɣ,
i pɾopˈuso ʎeβˈaɾme si akˈel pasˈeo poðˈia sˈer de mi aɣɾˈaðo.
el ˈaɪɾe ˈeɾa tˈan swˈaβe i el kaβˈaʎo pjafˈaβa tˈan nˈoβlemˈente a la pwˈeɾta del xaɾðˈin ke me dexˈe sˌeðuθˈiɾ.
fwˈi en bˈuska de pˌeɣotj pˌaɾa ke me bistjˈeɾa.
ˌentɾetˈanto, mɾ.
muɾston etʃˈo pjˈe a tjˈeɾɾa, se etʃˈo las ɾɾiˈɛndas al bɾˈaθo i siɣjˈo la ˌempaliθˈaða ke mi mˈaðɾe,
pˌaɾa aθˈeɾle kˌompaɲˈia, seɣˈia tambjˈen poɾ la pˈaɾte de aðˈɛntɾo.
me akwˈeɾðo ke pˌeɣotj i ʝˈo miɾˈaβamos de kwˌando en kwˌando poɾ la bentˈana, i los dˈos ke se pˌaseˈaβan pˌaɾeθˈian ˌeksaminˈaɾ el espˈino mˈujj de θˈeɾka.
de ɾɾepˈɛnte, pˌeɣotj, ke estˈaβa de mˈujj bwˈen umˈoɾ, ˌekspeɾˌimentˈo θjˈeɾta kˌontɾaɾjeðˈad i me peɪnˈo kon fwˈeɾθa,
kˈosa ke me ˌoβliɣˈo a aθˈˈun xˈesto.
muɾston i ʝˈo nˈo taɾðˈamos en ˌalexˈaɾnos tɾotˈando poɾ la kˌaɾɾetˈeɾa.
me ʎeβˈaβa delˈante de su sˈiʎa, koxˈiðo kon ˈuno de sus bɾˈaθos, i nˈo poðˈia mˈenos dˌeβolβˈeɾ de kwˌando en kwˌando la kaβˈeθa pˌaɾa miɾˈaɾ su ɾɾˈostɾo.
tenˈia ˈuna espˈeθje de ˈoxos nˈeɣɾos de aβˈismo.
nˈo konˈoθko ˈotɾa ˌekspɾesjˈon ke pwˈeða dˌefinˈˈun ˈoxo kˈujja pɾˌofundiðˈad ˈes ˌimpenetɾˈaβle, ke en ˈuna lixˈeɾa dˌistɾakθjˈon paɾˈeθen de ɾɾepˈɛnte belˈaɾse o ˌapaɣˈaɾse.
ˌeksaminˈe akˈeʎa kˈaɾa kon θjˈeɾto espˈanto, i me pɾˌeɣuntˈe kˈe seɾˈia lo ke asˈi pɾˌeokupˈaβa su ˌimaxˌinaθjˈon.
nˈo dexˈe de ˌadmiɾˈaɾ sus nˈeɣɾas pˌatiʎas i su bjˈen ˌafeɪtˈaða bˈaɾβa ke nˈo mostɾˈaβa sˈino los pˈuntos nˈeɣɾos ke tambjˈen imˈitan la bˈaɾβa en ˈuna fiɣˈuɾa de θˈeɾa.

View File

@@ -0,0 +1,253 @@
import importlib
from typing import List
import gruut
from gruut_ipa import IPA # pip install gruut_ipa
from .base import BasePhonemizer
from .punctuation import Punctuation
# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
class Gruut(BasePhonemizer):
    """Gruut wrapper for G2P.

    Args:
        language (str):
            Valid language code for the used backend.
        punctuations (str):
            Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
        keep_puncs (bool):
            If true, keep the punctuations after phonemization. Defaults to True.
        use_espeak_phonemes (bool):
            If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
        keep_stress (bool):
            If true, keep the stress characters after phonemization. Defaults to False.

    Example:

        >>> phonemizer = Gruut('en-us')
        >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
        'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
    """

    def __init__(
        self,
        language: str,
        punctuations=Punctuation.default_puncs(),
        keep_puncs=True,
        use_espeak_phonemes=False,
        keep_stress=False,
    ):
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
        self.use_espeak_phonemes = use_espeak_phonemes
        self.keep_stress = keep_stress

    @staticmethod
    def name():
        """Return the backend name."""
        return "gruut"

    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:  # pylint: disable=unused-argument
        """Convert input text to phonemes.

        Every phoneme character is separated with `separator`, even characters that
        together constitute a single sound. Words are joined with `separator` followed
        by a space, and break tokens (e.g. a comma) are attached to the preceding word.

        Examples::

            "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`

        Args:
            text (str):
                Text to be converted to phonemes.
            separator (str):
                String placed between phoneme characters. Defaults to "|".
            tie (bool, optional):
                Accepted for interface compatibility only; this method never reads it.

        Returns:
            str: The phonemized text.
        """
        ph_list = []
        for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
            for word in sentence:
                if word.is_break:
                    # Use actual character for break phoneme (e.g., comma)
                    if ph_list:
                        # Join with previous word
                        ph_list[-1].append(word.text)
                    else:
                        # First word is punctuation
                        ph_list.append([word.text])
                elif word.phonemes:
                    # Add phonemes for word
                    word_phonemes = []
                    for word_phoneme in word.phonemes:
                        if not self.keep_stress:
                            # Remove primary/secondary stress
                            word_phoneme = IPA.without_stress(word_phoneme)
                        # Map ASCII "g" onto the IPA "ɡ" used by the symbol table
                        word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
                        if word_phoneme:
                            # Flatten phonemes into single characters
                            word_phonemes.extend(word_phoneme)
                    if word_phonemes:
                        ph_list.append(word_phonemes)
        ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
        return f"{separator} ".join(ph_words)

    def _phonemize(self, text, separator):
        """Phonemize one punctuation-free chunk; hook called by `phonemize`."""
        return self.phonemize_gruut(text, separator, tie=False)

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return gruut.is_language_supported(language)

    @staticmethod
    def supported_languages() -> List:
        """Get the supported languages.

        Returns:
            List: List of language codes.
        """
        return list(gruut.get_supported_languages())

    def version(self):
        """Get the version of the used backend.

        Returns:
            str: Version of the used backend.
        """
        return gruut.__version__

    @classmethod
    def is_available(cls):
        """Return True if the `gruut` package can be located, else False."""
        # A bare `import importlib` does not guarantee that the `util` submodule is
        # loaded, so import it explicitly before using `find_spec`.
        import importlib.util

        return importlib.util.find_spec("gruut") is not None
if __name__ == "__main__":
    # Developer script: phonemize the first 400 lines of the Spanish corpus, report
    # which characters fall outside the base symbol table, and dump the results.
    import json

    # This module uses relative imports itself, so it can only run with package
    # context (``python -m ...``); `es_to_ipa` also uses relative imports and
    # therefore has to be imported relatively as well.
    from .es_to_ipa import es2ipa

    symbols = [
        "_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q",
        "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p",
        "s", "t", "u", "v", "w", "x", "y", "z",
        "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254",
        "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a",
        "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0",
        "`", "^", "#", "*", "=",
        "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191",
        " ",
    ]

    # The corpus and every output contain IPA characters: always use UTF-8 instead
    # of whatever the platform default encoding happens to be.
    with open('./text/es_phonemizer/spanish_text.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    used_sym = []
    not_existed_sym = []
    phonemes = []

    for line in lines[:400]:
        # The text is the last "|"-separated field of each line.
        text = line.split('|')[-1].strip()
        ipa = es2ipa(text)
        phonemes.append(ipa + '\n')
        for s in ipa:
            if s not in symbols:
                if s not in not_existed_sym:
                    print(f'not_existed char: {s}')
                    not_existed_sym.append(s)
            else:
                if s not in used_sym:
                    used_sym.append(s)

    print(used_sym)
    print(not_existed_sym)

    with open('./text/es_phonemizer/es_symbols.txt', 'w', encoding='utf-8') as g:
        g.writelines(symbols + not_existed_sym)

    with open('./text/es_phonemizer/example_ipa.txt', 'w', encoding='utf-8') as g:
        g.writelines(phonemes)

    data = {'symbols': symbols + not_existed_sym}
    with open('./text/es_phonemizer/es_symbols_v2.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

View File

@@ -0,0 +1,174 @@
import collections
import re
from enum import Enum
import six
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'

# One stripped punctuation run: the matched text and where it sat in the input.
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])


class PuncPosition(Enum):
    """Enum for the punctuations positions"""

    BEGIN = 0
    END = 1
    MIDDLE = 2
    ALONE = 3


class Punctuation:
    """Handle punctuations in text.

    Just strip punctuations from text or strip and restore them later.

    Args:
        puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.

    Example:
        >>> punc = Punctuation()
        >>> punc.strip("This is. example !")
        'This is example'

        >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
        >>> ' '.join(text_striped)
        'This is example'

        >>> text_restored = punc.restore(text_striped, punc_map)
        >>> text_restored[0]
        'This is. example !'
    """

    def __init__(self, puncs: str = _DEF_PUNCS):
        self.puncs = puncs

    @staticmethod
    def default_puncs():
        """Return default set of punctuations."""
        return _DEF_PUNCS

    @property
    def puncs(self):
        """The punctuation characters handled by this instance (duplicates removed)."""
        return self._puncs

    @puncs.setter
    def puncs(self, value):
        # The module only runs on Python 3, where `six.string_types` is `(str,)`.
        if not isinstance(value, str):
            raise ValueError("[!] Punctuations must be of type str.")
        # Remove duplicates without changing the order.
        self._puncs = "".join(dict.fromkeys(value))
        # One or more runs of punctuation, including the whitespace around them.
        self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")

    def strip(self, text):
        """Remove all the punctuations by replacing with `space`.

        Leading and trailing whitespace of the result is removed.

        Args:
            text (str): The text to be processed.

        Example::

            "This is. example !" -> "This is example"
        """
        return re.sub(self.puncs_regular_exp, " ", text).strip()

    def strip_to_restore(self, text):
        """Remove punctuations from text to restore them later.

        Args:
            text (str): The text to be processed.

        Returns:
            A tuple `(chunks, puncs)`: the punctuation-free text chunks and the list of
            `_PUNC_IDX` entries needed by `restore`.

        Examples ::

            "This is. example !" -> ["This is", "example"] plus the marks ". " and " !"
        """
        text, puncs = self._strip_to_restore(text)
        return text, puncs

    def _strip_to_restore(self, text):
        """Auxiliary method for `strip_to_restore`."""
        matches = list(re.finditer(self.puncs_regular_exp, text))
        if not matches:
            return [text], []
        # the text is only punctuations
        if len(matches) == 1 and matches[0].group() == text:
            return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
        # build a punctuation map to be used later to restore punctuations
        puncs = []
        last_index = len(matches) - 1
        for index, match in enumerate(matches):
            mark = match.group()
            if index == 0 and text.startswith(mark):
                position = PuncPosition.BEGIN
            elif index == last_index and text.endswith(mark):
                position = PuncPosition.END
            else:
                position = PuncPosition.MIDDLE
            puncs.append(_PUNC_IDX(mark, position))
        # convert str text to a List[str], each item is separated by a punctuation
        splitted_text = []
        for idx, punc in enumerate(puncs):
            # Marks are consumed left to right, so the first occurrence is the right one.
            prefix, _, suffix = text.partition(punc.punc)
            splitted_text.append(prefix)
            # if the text does not end with a punctuation, add it to the last item
            if idx == len(puncs) - 1 and len(suffix) > 0:
                splitted_text.append(suffix)
            text = suffix
        # Drop the empty chunk(s) produced by leading punctuation; the emptiness guard
        # prevents an IndexError should every chunk turn out to be empty.
        while splitted_text and splitted_text[0] == '':
            splitted_text = splitted_text[1:]
        return splitted_text, puncs

    @classmethod
    def restore(cls, text, puncs):
        """Restore punctuation in a text.

        Args:
            text (List[str]): The (phonemized) chunks produced from `strip_to_restore`.
            puncs (List[_PUNC_IDX]): The punctuation map returned by `strip_to_restore`.

        Returns:
            List[str]: The chunks merged back together with their punctuation.

        Examples ::

            ['This is', 'example'] with marks '. ' and ' !' -> ['This is. example !']
        """
        return cls._restore(text, puncs, 0)

    @classmethod
    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
        """Auxiliary method for Punctuation.restore()"""
        if not puncs:
            return text

        # nothing have been phonemized, returns the puncs alone
        if not text:
            return ["".join(m.punc for m in puncs)]

        current = puncs[0]

        if current.position == PuncPosition.BEGIN:
            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)

        if current.position == PuncPosition.END:
            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)

        if current.position == PuncPosition.ALONE:
            # The namedtuple field is `punc`; reading `.mark` raised AttributeError.
            return [current.punc] + cls._restore(text, puncs[1:], num + 1)

        # POSITION == MIDDLE
        if len(text) == 1:  # pragma: nocover
            # a corner case where the final part of an intermediate
            # mark (I) has not been phonemized
            return cls._restore([text[0] + current.punc], puncs[1:], num)

        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
# if __name__ == "__main__":
# punc = Punctuation()
# text = "This is. This is, example!"
# print(punc.strip(text))
# split_text, puncs = punc.strip_to_restore(text)
# print(split_text, " ---- ", puncs)
# restored_text = punc.restore(split_text, puncs)
# print(restored_text)

View File

@@ -0,0 +1 @@
dˌaβˈiðkopeɾfjl unθsbmtʃwɛxɪŋʊɣɡrɲʝʎː

View File

@@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ImportError",
"evalue": "attempted relative import with no known parent package",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb Cell 1\u001b[0m line \u001b[0;36m5\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39msys\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=3'>4</a>\u001b[0m sys\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mappend(\u001b[39m'\u001b[39m\u001b[39m/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mes_to_ipa\u001b[39;00m \u001b[39mimport\u001b[39;00m es2ipa\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=8'>9</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msplit_sentences_en\u001b[39m(text, min_len\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m):\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39m# 将文本中的换行符、空格和制表符替换为空格\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bcatams4/home/xumin/workspace/Bert-VITS2/text/es_phonemizer/test.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=10'>11</a>\u001b[0m text \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\u001b[39m'\u001b[39m\u001b[39m[\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m ]+\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m 
\u001b[39m\u001b[39m'\u001b[39m, text)\n",
"File \u001b[0;32m/data/workspace/Bert-VITS2/text/es_phonemizer/es_to_ipa.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mcleaner\u001b[39;00m \u001b[39mimport\u001b[39;00m spanish_cleaners\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mgruut_wrapper\u001b[39;00m \u001b[39mimport\u001b[39;00m Gruut\n\u001b[1;32m 4\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mes2ipa\u001b[39m(text):\n",
"\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
]
}
],
"source": [
"import re\n",
"import os\n",
"import os, sys\n",
"sys.path.append('/home/xumin/workspace/MyShell-VC-Training/text/es_phonemizer/')\n",
"from es_to_ipa import es2ipa\n",
"\n",
"\n",
"\n",
"def split_sentences_en(text, min_len=10):\n",
" # 将文本中的换行符、空格和制表符替换为空格\n",
" text = re.sub('[\\n\\t ]+', ' ', text)\n",
" # 在标点符号后添加一个空格\n",
" text = re.sub('([¿—¡])', r'\\1 $#!', text)\n",
" # 分隔句子并去除前后空格\n",
" \n",
" sentences = [s.strip() for s in text.split(' $#!')]\n",
" if len(sentences[-1]) == 0: del sentences[-1]\n",
"\n",
" new_sentences = []\n",
" new_sent = []\n",
" for ind, sent in enumerate(sentences):\n",
" if sent in ['¿', '—', '¡']:\n",
" new_sent.append(sent)\n",
" else:\n",
" new_sent.append(es2ipa(sent))\n",
" \n",
" \n",
" new_sentences = ''.join(new_sent)\n",
"\n",
" return new_sentences"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'—¿aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"split_sentences_en('—¿Habéis estado casada alguna vez?')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'aβˈeis estˈaðo kasˈaða alɣˈuna bˈeθ?'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"es2ipa('—¿Habéis estado casada alguna vez?')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,140 @@
import abc
from typing import List, Tuple
from .punctuation import Punctuation
class BasePhonemizer(abc.ABC):
    """Abstract base class shared by the phonemizer backends.

    `phonemize` runs three stages:

    1. Preprocessing: strip surrounding whitespace and remove punctuation,
       remembering the removed marks when `keep_puncs` is set.
    2. Phonemization: convert every text chunk with the backend's `_phonemize`.
    3. Postprocessing: put the remembered punctuation back (when `keep_puncs` is
       set) and return the first resulting string.

    Args:
        language (str):
            Language used by the phonemizer.
        punctuations (str):
            Punctuation characters handed to `Punctuation`.
        keep_puncs (bool):
            Whether to preserve punctuation marks or not.
    """

    def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
        # The backend has to be installed before anything else can be checked.
        if not self.is_available():
            raise RuntimeError(f"{self.name()} not installed on your system")  # pragma: nocover

        # Raises when the backend does not support the requested language.
        self._language = self._init_language(language)

        # Punctuation handling configuration.
        self._keep_puncs = keep_puncs
        self._punctuator = Punctuation(punctuations)

    def _init_language(self, language):
        """Validate `language` and return it.

        Child classes may override this hook.

        Raises:
            RuntimeError: if `is_supported_language` rejects `language`.
        """
        if self.is_supported_language(language):
            return language
        raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend')

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version"""

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return the collection of language codes supported by the backend"""

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method"""

    def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
        """Prepare `text` for phonemization.

        Surrounding whitespace is removed, then punctuation is stripped. The second
        element of the result holds the marks to restore later; it is empty when
        punctuation is not kept.

        Override this if you need a different behaviour
        """
        trimmed = text.strip()
        if not self._keep_puncs:
            return [self._punctuator.strip(trimmed)], []
        # a tuple (text chunks, punctuation marks)
        return self._punctuator.strip_to_restore(trimmed)

    def _phonemize_postprocess(self, phonemized, punctuations) -> str:
        """Turn the phonemized chunks back into one string.

        Override this if you need a different behaviour
        """
        if not self._keep_puncs:
            return phonemized[0]
        restored = self._punctuator.restore(phonemized, punctuations)
        return restored[0]

    def phonemize(self, text: str, separator="|", language: str = None) -> str:  # pylint: disable=unused-argument
        """Returns the `text` phonemized for the configured language

        Args:
            text (str):
                Text to be phonemized.

            separator (str):
                string separator used between phonemes. Defaults to '|'.

            language (str):
                Not used by this implementation.

        Returns:
            (str): Phonemized text
        """
        chunks, punctuations = self._phonemize_preprocess(text)
        phonemized = [self._phonemize(chunk, separator) for chunk in chunks]
        return self._phonemize_postprocess(phonemized, punctuations)

    def print_logs(self, level: int = 0):
        """Print the configured language and backend name, indented by `level` tabs."""
        prefix = "\t" * level
        for label, value in (("language", self.language), ("backend", self.name())):
            print(f"{prefix}| > phoneme {label}: {value}")

View File

@@ -0,0 +1,122 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
from .french_abbreviations import abbreviations_fr
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
rep_map = {
"": ",",
"": ",",
"": ",",
"": ".",
"": "!",
"": "?",
"\n": ".",
"·": ",",
"": ",",
"...": ".",
"": ".",
"$": ".",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"(": "",
")": "",
"": "",
"": "",
"": "",
"": "",
"[": "",
"]": "",
"": "",
"": "-",
"~": "-",
"": "",
"": "",
"¿" : "",
"¡" : ""
}
def replace_punctuation(text):
    """Replace every occurrence of a `rep_map` key in `text` with its mapped value."""
    alternatives = [re.escape(symbol) for symbol in rep_map]
    matcher = re.compile("|".join(alternatives))
    return matcher.sub(lambda hit: rep_map[hit.group()], text)
def expand_abbreviations(text, lang="fr"):
    """Expand known abbreviations in `text`.

    Args:
        text (str): Input text.
        lang (str): Language identifier. Only "fr" has an abbreviation table.

    Returns:
        str: The text with abbreviations expanded, or the unchanged text when no
        table exists for `lang`.
    """
    if lang != "fr":
        # No table for other languages: leave the text alone instead of failing on
        # an unbound local variable.
        return text
    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze each whitespace run into one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
def remove_punctuation_at_begin(text):
    """Drop any run of `,`, `.`, `!` or `?` found at the very start of `text`."""
    leading = re.match(r'^[,.!?]+', text)
    if leading is None:
        return text
    return text[leading.end():]
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes and guillemets."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»]+")
    return aux_symbols.sub("", text)
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    `;` and `:` become `,`. `-` becomes a space (or is removed for "ca"). `&` is
    spelled out as a separate word for the supported languages, and apostrophes
    are removed for "ca" and "es".

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text

    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    # Spelled-out "&" per language. Every entry is padded with spaces so that
    # "a&b" yields separate words ("es" previously produced "ayb").
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }

    text = text.replace(";", ",")
    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def french_cleaners(text):
    """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
    # Lowercasing is deliberately not part of the pipeline (a cased BERT is used).
    stages = (
        lambda value: expand_abbreviations(value, lang="fr"),
        replace_punctuation,
        lambda value: replace_symbols(value, lang="fr"),
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    )
    for stage in stages:
        text = stage(text)
    # Append a final period when the text does not already end with punctuation.
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', text)

View File

@@ -0,0 +1,78 @@
{"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"ɣ",
"ɡ",
"r",
"ɲ",
"ʝ",
"ʎ",
"ː"
]
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,89 @@
{
"symbols": [
"_",
",",
".",
"!",
"?",
"-",
"~",
"\u2026",
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"\u0251",
"\u00e6",
"\u0283",
"\u0291",
"\u00e7",
"\u026f",
"\u026a",
"\u0254",
"\u025b",
"\u0279",
"\u00f0",
"\u0259",
"\u026b",
"\u0265",
"\u0278",
"\u028a",
"\u027e",
"\u0292",
"\u03b8",
"\u03b2",
"\u014b",
"\u0266",
"\u207c",
"\u02b0",
"`",
"^",
"#",
"*",
"=",
"\u02c8",
"\u02cc",
"\u2192",
"\u2193",
"\u2191",
" ",
"\u0263",
"\u0261",
"r",
"\u0272",
"\u029d",
"\u028e",
"\u02d0",
"\u0303",
"\u0153",
"\u00f8",
"\u0281",
"\u0252",
"\u028c",
"\u2014",
"\u025c",
"\u0250"
]
}

View File

@@ -0,0 +1,30 @@
from .cleaner import french_cleaners
from .gruut_wrapper import Gruut
def remove_consecutive_t(input_str):
    """Remove every run of three or more consecutive 't' characters.

    Runs of one or two 't' are kept unchanged, as is every other character.
    """
    kept = []
    pos = 0
    total = len(input_str)
    while pos < total:
        if input_str[pos] != 't':
            kept.append(input_str[pos])
            pos += 1
            continue
        # Measure the whole run of 't' starting at `pos`.
        run_end = pos
        while run_end < total and input_str[run_end] == 't':
            run_end += 1
        if run_end - pos < 3:
            kept.append(input_str[pos:run_end])
        pos = run_end
    return ''.join(kept)
def fr2ipa(text):
    """Phonemize French `text` with Gruut and drop runs of three or more 't'.

    The text is phonemized as given; `french_cleaners` is not applied here.
    """
    phonemizer = Gruut(
        language="fr-fr",
        keep_puncs=True,
        keep_stress=True,
        use_espeak_phonemes=True,
    )
    raw_ipa = phonemizer.phonemize(text, separator="")
    return remove_consecutive_t(raw_ipa)

View File

@@ -0,0 +1,48 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in french:
# Abbreviations expanded only when followed by a period (matched case-insensitively).
_DOTTED_ABBREVIATIONS_FR = [
    ("M", "monsieur"),
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
    ("N.B", "nota bene"),
    ("p.c.q", "parce que"),
    ("Pr", "professeur"),
    ("qqch", "quelque chose"),
    ("rdv", "rendez-vous"),
    ("max", "maximum"),
    ("min", "minimum"),
    ("no", "numéro"),
    ("adr", "adresse"),
    ("dr", "docteur"),
    ("st", "saint"),
    ("co", "compagnie"),
    ("jr", "junior"),
    ("sgt", "sergent"),
    ("capt", "capitaine"),
    ("col", "colonel"),
    ("av", "avenue"),
    ("av. J.-C", "avant Jésus-Christ"),
    ("apr. J.-C", "après Jésus-Christ"),
    ("art", "article"),
    ("boul", "boulevard"),
    ("c.-à-d", "c'est-à-dire"),
    ("etc", "et cetera"),
    ("ex", "exemple"),
    ("excl", "exclusivement"),
]

# Titles that are also expanded without a trailing period (matched case-sensitively).
_BARE_TITLES_FR = [
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
]


def _longest_first(pairs):
    """Order pairs so a long abbreviation is tried before any prefix of it."""
    return sorted(pairs, key=lambda pair: len(pair[0]), reverse=True)


# The patterns are applied in list order, so longer abbreviations come first:
# otherwise "av." would consume the start of "av. J.-C." and "Mlle" the start of
# "Mlles". Abbreviations are escaped so that "." only matches a literal period.
abbreviations_fr = [
    (re.compile(r"\b%s\." % re.escape(abbreviation), re.IGNORECASE), expansion)
    for abbreviation, expansion in _longest_first(_DOTTED_ABBREVIATIONS_FR)
] + [
    (re.compile(r"\b%s" % re.escape(abbreviation)), expansion)
    for abbreviation, expansion in _longest_first(_BARE_TITLES_FR)
]

View File

@@ -0,0 +1 @@
_,.!?-~…NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ ɣɡrɲʝʎː̃œøʁɒʌ—ɜɐ

View File

@@ -0,0 +1,258 @@
import importlib
from typing import List
import gruut
from gruut_ipa import IPA # pip install gruut_ipa
from .base import BasePhonemizer
from .punctuation import Punctuation
# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
class Gruut(BasePhonemizer):
    """Gruut wrapper for G2P.

    Args:
        language (str):
            Valid language code for the used backend.
        punctuations (str):
            Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
        keep_puncs (bool):
            If true, keep the punctuations after phonemization. Defaults to True.
        use_espeak_phonemes (bool):
            If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
        keep_stress (bool):
            If true, keep the stress characters after phonemization. Defaults to False.

    Example:

        >>> phonemizer = Gruut('en-us')
        >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
        'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
    """

    def __init__(
        self,
        language: str,
        punctuations=Punctuation.default_puncs(),
        keep_puncs=True,
        use_espeak_phonemes=False,
        keep_stress=False,
    ):
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
        self.use_espeak_phonemes = use_espeak_phonemes
        self.keep_stress = keep_stress

    @staticmethod
    def name():
        """Return the backend name."""
        return "gruut"

    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:  # pylint: disable=unused-argument
        """Convert input text to phonemes.

        Every phoneme character is separated with `separator`, even characters that
        together constitute a single sound. Words are joined with `separator` followed
        by a space, and break tokens (e.g. a comma) are attached to the preceding word.

        Examples::

            "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`

        Args:
            text (str):
                Text to be converted to phonemes.
            separator (str):
                String placed between phoneme characters. Defaults to "|".
            tie (bool, optional):
                Accepted for interface compatibility only; this method never reads it.

        Returns:
            str: The phonemized text.
        """
        ph_list = []
        for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
            for word in sentence:
                if word.is_break:
                    # Use actual character for break phoneme (e.g., comma)
                    if ph_list:
                        # Join with previous word
                        ph_list[-1].append(word.text)
                    else:
                        # First word is punctuation
                        ph_list.append([word.text])
                elif word.phonemes:
                    # Add phonemes for word
                    word_phonemes = []
                    for word_phoneme in word.phonemes:
                        if not self.keep_stress:
                            # Remove primary/secondary stress
                            word_phoneme = IPA.without_stress(word_phoneme)
                        # Map ASCII "g" onto the IPA "ɡ" used by the symbol table
                        word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
                        if word_phoneme:
                            # Flatten phonemes into single characters
                            word_phonemes.extend(word_phoneme)
                    if word_phonemes:
                        ph_list.append(word_phonemes)
        ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
        return f"{separator} ".join(ph_words)

    def _phonemize(self, text, separator):
        """Phonemize one punctuation-free chunk; hook called by `phonemize`."""
        return self.phonemize_gruut(text, separator, tie=False)

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return gruut.is_language_supported(language)

    @staticmethod
    def supported_languages() -> List:
        """Get the supported languages.

        Returns:
            List: List of language codes.
        """
        return list(gruut.get_supported_languages())

    def version(self):
        """Get the version of the used backend.

        Returns:
            str: Version of the used backend.
        """
        return gruut.__version__

    @classmethod
    def is_available(cls):
        """Return True if the `gruut` package can be located, else False."""
        # A bare `import importlib` does not guarantee that the `util` submodule is
        # loaded, so import it explicitly before using `find_spec`.
        import importlib.util

        return importlib.util.find_spec("gruut") is not None
if __name__ == "__main__":
    # Developer script: phonemize the French metadata file, report which characters
    # fall outside the base symbol table, and dump the results.
    import json

    # This module uses relative imports itself, so it can only run with package
    # context (``python -m ...``); `cleaner` also uses relative imports and
    # therefore has to be imported relatively as well.
    from .cleaner import french_cleaners

    e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)
    symbols = [  # en + sp
        "_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q",
        "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p",
        "s", "t", "u", "v", "w", "x", "y", "z",
        "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254",
        "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a",
        "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0",
        "`", "^", "#", "*", "=",
        "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191",
        " ",
        "ɣ", "ɡ", "r", "ɲ", "ʝ", "ʎ", "ː",
    ]

    # The corpus and every output contain non-ASCII characters: always use UTF-8
    # instead of whatever the platform default encoding happens to be.
    with open('/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    used_sym = []
    not_existed_sym = []
    phonemes = []

    for line in lines:
        # The text is the last "|"-separated field of each line.
        text = line.split('|')[-1].strip()
        text = french_cleaners(text)
        ipa = e.phonemize(text, separator="")
        # One transcription per line, like the Spanish script; without the newline
        # the whole corpus ends up on a single line of example_ipa.txt.
        phonemes.append(ipa + '\n')
        for s in ipa:
            if s not in symbols:
                if s not in not_existed_sym:
                    print(f'not_existed char: {s}')
                    not_existed_sym.append(s)
            else:
                if s not in used_sym:
                    used_sym.append(s)

    print(used_sym)
    print(not_existed_sym)

    with open('./text/fr_phonemizer/french_symbols.txt', 'w', encoding='utf-8') as g:
        g.writelines(symbols + not_existed_sym)

    with open('./text/fr_phonemizer/example_ipa.txt', 'w', encoding='utf-8') as g:
        g.writelines(phonemes)

    data = {'symbols': symbols + not_existed_sym}
    with open('./text/fr_phonemizer/fr_symbols.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

View File

@@ -0,0 +1,172 @@
import collections
import re
from enum import Enum
import six
# Characters treated as punctuation when no explicit set is supplied.
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'

# One stripped punctuation run: the matched text and where it sat in the input.
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])


class PuncPosition(Enum):
    """Enum for the punctuations positions"""

    BEGIN = 0
    END = 1
    MIDDLE = 2
    ALONE = 3


class Punctuation:
    """Handle punctuations in text.

    Just strip punctuations from text or strip and restore them later.

    Args:
        puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.

    Example:
        >>> punc = Punctuation()
        >>> punc.strip("This is. example !")
        'This is example'
        >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
        >>> ' '.join(text_striped)
        'This is example'
        >>> text_restored = punc.restore(text_striped, punc_map)
        >>> text_restored[0]
        'This is. example !'
    """

    def __init__(self, puncs: str = _DEF_PUNCS):
        self.puncs = puncs

    @staticmethod
    def default_puncs():
        """Return default set of punctuations."""
        return _DEF_PUNCS

    @property
    def puncs(self):
        """The de-duplicated punctuation characters currently in use."""
        return self._puncs

    @puncs.setter
    def puncs(self, value):
        """Store ``value`` without duplicates and rebuild the matching regex.

        Raises:
            ValueError: If ``value`` is not a ``str``.
        """
        if not isinstance(value, str):
            raise ValueError("[!] Punctuations must be of type str.")
        # dict.fromkeys drops duplicates while keeping first-seen order.
        self._puncs = "".join(dict.fromkeys(value))
        # One match = a run of punctuation characters plus surrounding whitespace.
        self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")

    def strip(self, text):
        """Remove all the punctuations by replacing with `space`.

        Args:
            text (str): The text to be processed.

        Example::

            "This is. example !" -> "This is example"
        """
        return self.puncs_regular_exp.sub(" ", text).strip()

    def strip_to_restore(self, text):
        """Remove punctuations from text to restore them later.

        Args:
            text (str): The text to be processed.

        Returns:
            A pair ``(chunks, puncs)``: the text pieces between punctuation
            runs and the list of ``_PUNC_IDX`` entries describing each run.

        Examples ::

            "This is. example !" -> [["This is", "example"], [". ", " !"]]
        """
        text, puncs = self._strip_to_restore(text)
        return text, puncs

    def _strip_to_restore(self, text):
        """Auxiliary method for Punctuation.strip_to_restore()"""
        matches = list(self.puncs_regular_exp.finditer(text))
        if not matches:
            return [text], []
        # the text is only punctuations
        if len(matches) == 1 and matches[0].group() == text:
            return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
        # build a punctuation map to be used later to restore punctuations
        puncs = []
        last = len(matches) - 1
        for idx, match in enumerate(matches):
            found = match.group()
            position = PuncPosition.MIDDLE
            if idx == 0 and text.startswith(found):
                position = PuncPosition.BEGIN
            elif idx == last and text.endswith(found):
                position = PuncPosition.END
            puncs.append(_PUNC_IDX(found, position))
        # convert str text to a List[str], each item is separated by a punctuation
        splitted_text = []
        for idx, punc in enumerate(puncs):
            # Cut at the first occurrence of this run; the rest is processed next.
            prefix, _, suffix = text.partition(punc.punc)
            splitted_text.append(prefix)
            # if the text does not end with a punctuation, add it to the last item
            if idx == len(puncs) - 1 and len(suffix) > 0:
                splitted_text.append(suffix)
            text = suffix
        return splitted_text, puncs

    @classmethod
    def restore(cls, text, puncs):
        """Restore punctuation in a text.

        Args:
            text (List[str]): The text chunks to be processed.
            puncs (List[_PUNC_IDX]): The punctuation map produced by
                ``strip_to_restore``.

        Returns:
            List[str]: The chunks with punctuation re-inserted.

        Examples ::

            ['This is', 'example'], ['. ', ' !'] -> ["This is. example !"]
        """
        return cls._restore(text, puncs, 0)

    @classmethod
    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
        """Auxiliary method for Punctuation.restore()"""
        if not puncs:
            return text
        # nothing have been phonemized, returns the puncs alone
        if not text:
            return ["".join(m.punc for m in puncs)]
        current = puncs[0]
        if current.position == PuncPosition.BEGIN:
            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
        if current.position == PuncPosition.END:
            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
        if current.position == PuncPosition.ALONE:
            # The namedtuple field is ``punc``; reading ``mark`` here raised
            # AttributeError whenever this branch was reached.
            return [current.punc] + cls._restore(text, puncs[1:], num + 1)
        # POSITION == MIDDLE
        if len(text) == 1:  # pragma: nocover
            # a corner case where the final part of an intermediate
            # mark (I) has not been phonemized
            return cls._restore([text[0] + current.punc], puncs[1:], num)
        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
# if __name__ == "__main__":
# punc = Punctuation()
# text = "This is. This is, example!"
# print(punc.strip(text))
# split_text, puncs = punc.strip_to_restore(text)
# print(split_text, " ---- ", puncs)
# restored_text = punc.restore(split_text, puncs)
# print(restored_text)

View File

@@ -0,0 +1,94 @@
import pickle
import os
import re
from . import symbols
from .fr_phonemizer import cleaner as fr_cleaner
from .fr_phonemizer import fr_to_ipa
from transformers import AutoTokenizer
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phonemes over ``n_word`` word pieces as evenly as possible.

    Phonemes are handed out one at a time to the piece that currently holds
    the fewest, preferring the left-most piece on ties.

    Returns:
        A list of length ``n_word`` whose entries sum to ``n_phone``.
    """
    counts = [0 for _ in range(n_word)]
    for _ in range(n_phone):
        target = counts.index(min(counts))
        counts[target] += 1
    return counts
def text_normalize(text):
    """Return ``text`` after running it through ``fr_cleaner.french_cleaners``."""
    return fr_cleaner.french_cleaners(text)
# Hugging Face model identifier whose tokenizer g2p() uses to split text.
model_id = 'dbmdz/bert-base-french-europeana-cased'
# Loaded once, at import time, and shared by every g2p() call.
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(text, pad_start_end=True, tokenized=None):
    """Convert French text into phonemes, tones and a token-to-phoneme alignment.

    Args:
        text: Text to convert; it is tokenized with the module tokenizer
            unless ``tokenized`` is supplied.
        pad_start_end: When true, wrap the result in ``"_"`` padding phonemes
            (tone 0, one alignment slot each).
        tokenized: Optional pre-computed token list used instead of
            tokenizing ``text``.

    Returns:
        ``(phones, tones, word2ph)``: the phoneme list, a parallel list of
        zero tones, and one phoneme count per token.
    """
    if tokenized is None:
        tokenized = tokenizer.tokenize(text)

    # Re-assemble words: a token starting with "#" continues the previous word.
    ph_groups = []
    for t in tokenized:
        if ph_groups and t.startswith("#"):
            ph_groups[-1].append(t.replace("#", ""))
        else:
            # Also taken by a leading "#" token, which has no previous word to
            # join and used to raise IndexError.
            ph_groups.append([t])

    phones = []
    tones = []
    word2ph = []
    for group in ph_groups:
        w = "".join(group)
        if w == '[UNK]':
            phone_list = ['UNK']
        else:
            phone_list = [p for p in fr_to_ipa.fr2ipa(w) if p != " "]
        phones.extend(phone_list)
        tones.extend([0] * len(phone_list))
        # Share this word's phonemes across the tokens it was split into.
        word2ph += distribute_phone(len(phone_list), len(group))

    if pad_start_end:
        phones = ["_"] + phones + ["_"]
        tones = [0] + tones + [0]
        word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
def get_bert_feature(text, word2ph, device=None):
    """Return phone-level BERT features for French ``text``.

    Delegates to ``french_bert.get_bert_feature``. The helper is imported
    relative to this package, like the other imports of this module, so the
    lookup does not depend on a top-level ``text`` package being importable.
    """
    from . import french_bert

    return french_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
    # Manual comparison of two French pipelines on one sample sentence:
    # first this module's cleaner + fr2ipa, then unicleaners + MultiPhonemizer.
    ori_text = 'Ce service gratuit est“”"" 【disponible》 en chinois 【simplifié] et autres 123'
    # ori_text = "Ils essayaient vainement de faire comprendre à ma mère qu'avec les cent mille francs que m'avait laissé mon père,"

    text = text_normalize(ori_text)
    print(text)
    phoneme = fr_to_ipa.fr2ipa(text)
    print(phoneme)

    from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
    from text.cleaner_multiling import unicleaners

    def text_normalize(text):
        # Rebinds the module-level name to the unicleaners-based variant.
        return unicleaners(text, cased=True, lang='fr')

    text = text_normalize(ori_text)
    print(text)
    phonemizer = MultiPhonemizer({"fr-fr": "espeak"})
    # phonemizer.lang_to_phonemizer['fr'].keep_stress = True
    # phonemizer.lang_to_phonemizer['fr'].use_espeak_phonemes = True
    phoneme = phonemizer.phonemize(text, separator="", language='fr-fr')
    print(phoneme)

View File

@@ -0,0 +1,39 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
# Hugging Face model identifier used for both the tokenizer and the model.
model_id = 'dbmdz/bert-base-french-europeana-cased'
# The tokenizer is loaded at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The model is loaded lazily by the first get_bert_feature() call.
model = None
def get_bert_feature(text, word2ph, device=None):
    """Compute phone-level BERT features for ``text``.

    Args:
        text: Text passed to the module tokenizer.
        word2ph: One repeat count per tokenizer position (its length must
            equal the number of input ids).
        device: Target device; a falsy value selects ``"cuda"``, and ``"cpu"``
            is swapped for ``"mps"`` on macOS when MPS is available.

    Returns:
        The third-from-last hidden state of each token, repeated
        ``word2ph[i]`` times, concatenated and transposed.
    """
    global model
    mps_usable = sys.platform == "darwin" and torch.backends.mps.is_available()
    if mps_usable and device == "cpu":
        device = "mps"
    device = device or "cuda"
    if model is None:
        # NOTE(review): the model stays on the device of the first call.
        model = AutoModelForMaskedLM.from_pretrained(model_id).to(device)
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        outputs = model(**encoded, output_hidden_states=True)
        hidden = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].cpu()
    assert encoded["input_ids"].shape[-1] == len(word2ph)
    repeated = [hidden[idx].repeat(count, 1) for idx, count in enumerate(word2ph)]
    return torch.cat(repeated, dim=0).T

View File

@@ -0,0 +1,647 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from . import symbols
# Marks kept by replace_punctuation() and passed through g2p() as their own
# phoneme. The third entry is the horizontal ellipsis (U+2026); it had been
# reduced to an empty string.
punctuation = ["!", "?", "\u2026", ",", ".", "'", "-"]
try:
import MeCab
except ImportError as e:
raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
from num2words import num2words
# Katakana -> phoneme rules, each written "<kana>/<space-separated phonemes>".
# _makerulemap() only keeps rules whose kana part is one or two characters
# long, and when a kana appears more than once the later rule wins.
# NOTE(review): the two rules flagged below repeat their neighbour's kana and
# look like copy slips; they are left unchanged because fixing them would
# change the phonemes produced for existing input.
_CONVRULES = [
    # Conversion of 2 letters
    "アァ/ a a",
    "イィ/ i i",
    "イェ/ i e",
    "イャ/ y a",
    "ウゥ/ u:",
    "エェ/ e e",
    "オォ/ o:",
    "カァ/ k a:",
    "キィ/ k i:",
    "クゥ/ k u:",
    "クャ/ ky a",
    "クュ/ ky u",
    "クョ/ ky o",
    "ケェ/ k e:",
    "コォ/ k o:",
    "ガァ/ g a:",
    "ギィ/ g i:",
    "グゥ/ g u:",
    "グャ/ gy a",
    "グュ/ gy u",
    "グョ/ gy o",
    "ゲェ/ g e:",
    "ゴォ/ g o:",
    "サァ/ s a:",
    "シィ/ sh i:",
    "スゥ/ s u:",
    "スャ/ sh a",
    "スュ/ sh u",
    "スョ/ sh o",
    "セェ/ s e:",
    "ソォ/ s o:",
    "ザァ/ z a:",
    "ジィ/ j i:",
    "ズゥ/ z u:",
    "ズャ/ zy a",
    "ズュ/ zy u",
    "ズョ/ zy o",
    "ゼェ/ z e:",
    "ゾォ/ z o:",
    "タァ/ t a:",
    "チィ/ ch i:",
    "ツァ/ ts a",
    "ツィ/ ts i",
    "ツゥ/ ts u:",
    "ツャ/ ch a",
    "ツュ/ ch u",
    "ツョ/ ch o",
    "ツェ/ ts e",
    "ツォ/ ts o",
    "テェ/ t e:",
    "トォ/ t o:",
    "ダァ/ d a:",
    "ヂィ/ j i:",
    "ヅゥ/ d u:",
    "ヅャ/ zy a",
    "ヅュ/ zy u",
    "ヅョ/ zy o",
    "デェ/ d e:",
    "ドォ/ d o:",
    "ナァ/ n a:",
    "ニィ/ n i:",
    "ヌゥ/ n u:",
    "ヌャ/ ny a",
    "ヌュ/ ny u",
    "ヌョ/ ny o",
    "ネェ/ n e:",
    "ノォ/ n o:",
    "ハァ/ h a:",
    "ヒィ/ h i:",
    "フゥ/ f u:",
    "フャ/ hy a",
    "フュ/ hy u",
    "フョ/ hy o",
    "ヘェ/ h e:",
    "ホォ/ h o:",
    "バァ/ b a:",
    "ビィ/ b i:",
    "ブゥ/ b u:",
    "フャ/ hy a",  # NOTE(review): sits in the "ブ" group; possibly meant "ブャ/ by a"
    "ブュ/ by u",
    "フョ/ hy o",  # NOTE(review): sits in the "ブ" group; possibly meant "ブョ/ by o"
    "ベェ/ b e:",
    "ボォ/ b o:",
    "パァ/ p a:",
    "ピィ/ p i:",
    "プゥ/ p u:",
    "プャ/ py a",
    "プュ/ py u",
    "プョ/ py o",
    "ペェ/ p e:",
    "ポォ/ p o:",
    "マァ/ m a:",
    "ミィ/ m i:",
    "ムゥ/ m u:",
    "ムャ/ my a",
    "ムュ/ my u",
    "ムョ/ my o",
    "メェ/ m e:",
    "モォ/ m o:",
    "ヤァ/ y a:",
    "ユゥ/ y u:",
    "ユャ/ y a:",
    "ユュ/ y u:",
    "ユョ/ y o:",
    "ヨォ/ y o:",
    "ラァ/ r a:",
    "リィ/ r i:",
    "ルゥ/ r u:",
    "ルャ/ ry a",
    "ルュ/ ry u",
    "ルョ/ ry o",
    "レェ/ r e:",
    "ロォ/ r o:",
    "ワァ/ w a:",
    "ヲォ/ o:",
    "ディ/ d i",
    "デェ/ d e:",
    "デャ/ dy a",
    "デュ/ dy u",
    "デョ/ dy o",
    "ティ/ t i",
    "テェ/ t e:",
    "テャ/ ty a",
    "テュ/ ty u",
    "テョ/ ty o",
    "スィ/ s i",
    "ズァ/ z u a",
    "ズィ/ z i",
    "ズゥ/ z u",
    "ズャ/ zy a",
    "ズュ/ zy u",
    "ズョ/ zy o",
    "ズェ/ z e",
    "ズォ/ z o",
    "キャ/ ky a",
    "キュ/ ky u",
    "キョ/ ky o",
    "シャ/ sh a",
    "シュ/ sh u",
    "シェ/ sh e",
    "ショ/ sh o",
    "チャ/ ch a",
    "チュ/ ch u",
    "チェ/ ch e",
    "チョ/ ch o",
    "トゥ/ t u",
    "トャ/ ty a",
    "トュ/ ty u",
    "トョ/ ty o",
    "ドァ/ d o a",
    "ドゥ/ d u",
    "ドャ/ dy a",
    "ドュ/ dy u",
    "ドョ/ dy o",
    "ドォ/ d o:",
    "ニャ/ ny a",
    "ニュ/ ny u",
    "ニョ/ ny o",
    "ヒャ/ hy a",
    "ヒュ/ hy u",
    "ヒョ/ hy o",
    "ミャ/ my a",
    "ミュ/ my u",
    "ミョ/ my o",
    "リャ/ ry a",
    "リュ/ ry u",
    "リョ/ ry o",
    "ギャ/ gy a",
    "ギュ/ gy u",
    "ギョ/ gy o",
    "ヂェ/ j e",
    "ヂャ/ j a",
    "ヂュ/ j u",
    "ヂョ/ j o",
    "ジェ/ j e",
    "ジャ/ j a",
    "ジュ/ j u",
    "ジョ/ j o",
    "ビャ/ by a",
    "ビュ/ by u",
    "ビョ/ by o",
    "ピャ/ py a",
    "ピュ/ py u",
    "ピョ/ py o",
    "ウァ/ u a",
    "ウィ/ w i",
    "ウェ/ w e",
    "ウォ/ w o",
    "ファ/ f a",
    "フィ/ f i",
    "フゥ/ f u",
    "フャ/ hy a",
    "フュ/ hy u",
    "フョ/ hy o",
    "フェ/ f e",
    "フォ/ f o",
    "ヴァ/ b a",
    "ヴィ/ b i",
    "ヴェ/ b e",
    "ヴォ/ b o",
    "ヴュ/ by u",
    # Conversion of 1 letter
    "ア/ a",
    "イ/ i",
    "ウ/ u",
    "エ/ e",
    "オ/ o",
    "カ/ k a",
    "キ/ k i",
    "ク/ k u",
    "ケ/ k e",
    "コ/ k o",
    "サ/ s a",
    "シ/ sh i",
    "ス/ s u",
    "セ/ s e",
    "ソ/ s o",
    "タ/ t a",
    "チ/ ch i",
    "ツ/ ts u",
    "テ/ t e",
    "ト/ t o",
    "ナ/ n a",
    "ニ/ n i",
    "ヌ/ n u",
    "ネ/ n e",
    "ノ/ n o",  # kana restored: the rule read "/ n o" and was dropped by the length filter
    "ハ/ h a",
    "ヒ/ h i",
    "フ/ f u",
    "ヘ/ h e",
    "ホ/ h o",
    "マ/ m a",
    "ミ/ m i",
    "ム/ m u",
    "メ/ m e",
    "モ/ m o",
    "ラ/ r a",
    "リ/ r i",
    "ル/ r u",
    "レ/ r e",
    "ロ/ r o",
    "ガ/ g a",
    "ギ/ g i",
    "グ/ g u",
    "ゲ/ g e",
    "ゴ/ g o",
    "ザ/ z a",
    "ジ/ j i",
    "ズ/ z u",
    "ゼ/ z e",
    "ゾ/ z o",
    "ダ/ d a",
    "ヂ/ j i",
    "ヅ/ z u",
    "デ/ d e",
    "ド/ d o",
    "バ/ b a",
    "ビ/ b i",
    "ブ/ b u",
    "ベ/ b e",
    "ボ/ b o",
    "パ/ p a",
    "ピ/ p i",
    "プ/ p u",
    "ペ/ p e",
    "ポ/ p o",
    "ヤ/ y a",
    "ユ/ y u",
    "ヨ/ y o",
    "ワ/ w a",
    "ヰ/ i",
    "ヱ/ e",
    "ヲ/ o",
    "ン/ N",
    "ッ/ q",
    "ヴ/ b u",
    "ー/:",
    # Try converting broken text
    "ァ/ a",
    "ィ/ i",
    "ゥ/ u",
    "ェ/ e",
    "ォ/ o",
    "ヮ/ w a",
    "ォ/ o",
    # Try converting broken text
    "ャ/ y a",
    "ョ/ y o",
    "ュ/ y u",
    "琦/ ch i",
    "ヶ/ k e",
    "髙/ t a k a",
    "煞/ sh y a",
    # Symbols
    "、/ ,",
    "。/ .",
    "\uff01/ !",  # fullwidth exclamation mark (key restored)
    "\uff1f/ ?",  # fullwidth question mark (key restored)
    "・/ ,",
]
# Matches a run of one or more ":"; its only visible use is a commented-out
# line in kata2phoneme().
_COLON_RX = re.compile(":+")
# Matches any single character other than space, ASCII letters and ":,.?".
_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
def _makerulemap():
    """Build the katakana lookup tables from ``_CONVRULES``.

    Returns:
        A 2-tuple ``(single, double)`` of dicts mapping one-character and
        two-character kana keys to their phoneme strings. Rules whose key has
        any other length are left out; later duplicates override earlier ones.
    """
    pairs = [tuple(rule.split("/")) for rule in _CONVRULES]
    single = {kana: phones for kana, phones in pairs if len(kana) == 1}
    double = {kana: phones for kana, phones in pairs if len(kana) == 2}
    return single, double


_RULEMAP1, _RULEMAP2 = _makerulemap()
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phoneme tokens.

    At every position the two-character rule table is tried first, then the
    one-character table; a character matching neither is emitted unchanged.

    Args:
        text: Katakana text; surrounding whitespace is stripped first.

    Returns:
        The phoneme tokens in order. The result is a list, not a string.
    """
    text = text.strip()
    res = []
    pos = 0
    # Advance an index instead of re-slicing the remaining text each step.
    while pos < len(text):
        # A one-character tail cannot match the two-character table,
        # so no explicit length check is needed.
        rule = _RULEMAP2.get(text[pos:pos + 2])
        width = 2
        if rule is None:
            rule = _RULEMAP1.get(text[pos])
            width = 1
        if rule is None:
            res.append(text[pos])
        else:
            # Rule values start with a space, so element 0 of the split is "".
            # A value with no space at all (the ":" rule) contributes nothing.
            res += rule.split(" ")[1:]
        pos += width
    # res = _COLON_RX.sub(":", res)
    return res
# Parallel, equally long tables: katakana U+30A1..U+30F3 and hiragana
# U+3041..U+3093. The range endpoints had been reduced to ord(""), which
# raises TypeError at import time.
_KATAKANA = "".join(chr(ch) for ch in range(ord("\u30a1"), ord("\u30f3") + 1))
_HIRAGANA = "".join(chr(ch) for ch in range(ord("\u3041"), ord("\u3093") + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)


def hira2kata(text: str) -> str:
    """Return ``text`` with hiragana replaced by the matching katakana.

    Characters outside the hiragana table are left untouched.
    """
    text = text.translate(_HIRA2KATATRANS)
    # NOTE(review): this runs after translation, by which point the hiragana
    # "u" is already katakana, so the pattern looks unreachable -- confirm intent.
    return text.replace("う゛", "\u30f4")
# Surface forms that text2kata() copies through unchanged when MeCab gives no reading.
_SYMBOL_TOKENS = set(list("・、。?!"))
# Surface forms that text2kata() silently drops when MeCab gives no reading.
_NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
# A single MeCab tagger is created at import time and shared by all calls.
_TAGGER = MeCab.Tagger()
def text2kata(text: str) -> str:
    """Convert Japanese text to katakana using MeCab readings.

    Each MeCab output line up to ``EOS`` is split on a tab into the surface
    form and the feature string. When features are present, comma-separated
    field 6 is used as the reading; otherwise the surface form is kept,
    replaced or dropped depending on which token set it belongs to.

    Returns:
        The concatenated pieces passed through ``hira2kata``.
    """
    parsed = _TAGGER.parse(text)
    res = []
    for line in parsed.split("\n"):
        if line == "EOS":
            break
        parts = line.split("\t")
        word, yomi = parts[0], parts[1]
        if yomi:
            fields = yomi.split(',')
            # Entries with fewer than seven fields carry no reading at index 6.
            # Keep the surface form instead of dropping into a debugger, which
            # is what the former bare ``except`` did.
            res.append(fields[6] if len(fields) > 6 else word)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                # Small tsu in either script is emitted as the katakana form
                # (these literals had been reduced to empty strings).
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata("".join(res))
# Japanese readings for ASCII symbols, lower-case Latin letters and lower-case
# Greek letters. japanese_convert_alpha_symbols_to_words() looks up each
# lower-cased input character here and keeps characters that have no entry.
_ALPHASYMBOL_YOMI = {
"#": "シャープ",
"%": "パーセント",
"&": "アンド",
"+": "プラス",
"-": "マイナス",
":": "コロン",
";": "セミコロン",
"<": "小なり",
"=": "イコール",
">": "大なり",
"@": "アット",
"a": "エー",
"b": "ビー",
"c": "シー",
"d": "ディー",
"e": "イー",
"f": "エフ",
"g": "ジー",
"h": "エイチ",
"i": "アイ",
"j": "ジェー",
"k": "ケー",
"l": "エル",
"m": "エム",
"n": "エヌ",
"o": "オー",
"p": "ピー",
"q": "キュー",
"r": "アール",
"s": "エス",
"t": "ティー",
"u": "ユー",
"v": "ブイ",
"w": "ダブリュー",
"x": "エックス",
"y": "ワイ",
"z": "ゼット",
"α": "アルファ",
"β": "ベータ",
"γ": "ガンマ",
"δ": "デルタ",
"ε": "イプシロン",
"ζ": "ゼータ",
"η": "イータ",
"θ": "シータ",
"ι": "イオタ",
"κ": "カッパ",
"λ": "ラムダ",
"μ": "ミュー",
"ν": "ニュー",
"ξ": "クサイ",
"ο": "オミクロン",
"π": "パイ",
"ρ": "ロー",
"σ": "シグマ",
"τ": "タウ",
"υ": "ウプシロン",
"φ": "ファイ",
"χ": "カイ",
"ψ": "プサイ",
"ω": "オメガ",
}
# A number written with thousands separators, e.g. "1,234,567".
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
# Currency symbol -> spoken unit, appended after the amount. The yen value
# (U+5186) and the euro key (U+20AC) had been reduced to empty strings, so yen
# amounts lost their unit and the euro sign was never translated.
_CURRENCY_MAP = {"$": "ドル", "¥": "\u5186", "£": "ポンド", "\u20ac": "ユーロ"}
# Group 1: currency symbol; group 2: the amount that follows it.
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# An integer or decimal number.
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
    """Spell out the numbers in ``text`` as Japanese words.

    Thousands separators are removed first, then each currency symbol is moved
    behind its amount as a spoken unit, and finally every remaining number is
    converted with ``num2words(..., lang="ja")``.
    """

    def _drop_separators(match):
        return match[0].replace(",", "")

    def _unit_after_amount(match):
        return match[2] + _CURRENCY_MAP.get(match[1], match[1])

    def _spell(match):
        return num2words(match[0], lang="ja")

    without_separators = _NUMBER_WITH_SEPARATOR_RX.sub(_drop_separators, text)
    with_units = _CURRENCY_RX.sub(_unit_after_amount, without_separators)
    return _NUMBER_RX.sub(_spell, with_units)
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    """Replace letters and symbols listed in ``_ALPHASYMBOL_YOMI`` with their readings.

    The text is lower-cased first; characters without an entry are kept.
    """
    pieces = []
    for ch in text.lower():
        pieces.append(_ALPHASYMBOL_YOMI.get(ch, ch))
    return "".join(pieces)
def japanese_text_to_phonemes(text: str) -> str:
    """Convert Japanese text to phonemes.

    Applies NFKC normalisation, then number spelling, letter/symbol spelling,
    katakana conversion and ``kata2phoneme`` in that order, returning whatever
    the last step returns.
    """
    res = unicodedata.normalize("NFKC", text)
    pipeline = (
        japanese_convert_numbers_to_words,
        japanese_convert_alpha_symbols_to_words,
        text2kata,
        kata2phoneme,
    )
    for step in pipeline:
        res = step(res)
    return res
def is_japanese_character(char):
    """Return True when ``char`` lies in one of the listed Japanese code-point ranges."""
    # Inclusive Unicode ranges of the Japanese writing system.
    japanese_ranges = (
        (0x3040, 0x309F),  # hiragana
        (0x30A0, 0x30FF),  # katakana
        (0x4E00, 0x9FFF),  # kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # kanji extension A
        (0x20000, 0x2A6DF),  # kanji extension B
        # further extension ranges can be added here if needed
    )
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in japanese_ranges)
# Punctuation normalisation table used by replace_punctuation(). Seven keys and
# one value had been reduced to empty strings; an empty key makes the compiled
# alternation match at every position. They are written as escapes so that
# width-folding tools cannot alter them again.
rep_map = {
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "\u3001": ",",  # ideographic comma
    "...": "\u2026",  # three ASCII dots -> horizontal ellipsis
}
def replace_punctuation(text):
    """Normalise punctuation and drop everything that is neither kana/kanji nor punctuation.

    Every key of ``rep_map`` found in ``text`` is replaced by its value; then
    each run of characters outside the hiragana, katakana and CJK ranges and
    outside ``punctuation`` is removed.
    """
    alternation = "|".join(map(re.escape, rep_map))
    mapped = re.sub(alternation, lambda hit: rep_map[hit.group()], text)
    disallowed = (
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
        + "".join(punctuation)
        + r"]+"
    )
    return re.sub(disallowed, "", mapped)
from pykakasi import kakasi
# Create the kakasi instance; this rebinds the imported ``kakasi`` name from
# the class to the instance.
kakasi = kakasi()
# Render both kanji and hiragana as katakana.
kakasi.setMode("J", "K") # kanji (mode letter "J") to katakana
kakasi.setMode("H", "K") # Hiragana to Katakana
# Module-level converter used by text_normalize().
conv = kakasi.getConverter()
def text_normalize(text):
    """Normalise Japanese text into the form ``g2p`` expects.

    Steps: NFKC normalisation, number spelling, removal of every character
    ``is_japanese_character`` rejects, punctuation replacement, and conversion
    through the module-level kakasi converter.
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = japanese_convert_numbers_to_words(normalized)
    # This filter runs before replace_punctuation(), so only characters inside
    # the Japanese ranges reach it.
    kept = "".join(filter(is_japanese_character, normalized))
    return conv.do(replace_punctuation(kept))
def distribute_phone(n_phone, n_word):
    """Share ``n_phone`` phonemes among ``n_word`` tokens as evenly as possible.

    Each phoneme goes to the token that currently has the fewest, the
    left-most one winning ties.

    Returns:
        A list of ``n_word`` counts summing to ``n_phone``.
    """
    counts = [0 for _ in range(n_word)]
    for _ in range(n_phone):
        target = counts.index(min(counts))
        counts[target] += 1
    return counts
# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
# Hugging Face model identifier whose tokenizer g2p() uses.
model_id = 'cl-tohoku/bert-base-japanese-v3'
# Loaded once, at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(norm_text):
    """Convert normalised Japanese text to phonemes, tones and a token alignment.

    Args:
        norm_text: Text as produced by ``text_normalize``.

    Returns:
        ``(phones, tones, word2ph)``: the phonemes padded with ``"_"`` at both
        ends, an all-zero tone list of the same length, and one phoneme count
        per tokenizer token plus one slot for each pad.
    """
    tokenized = tokenizer.tokenize(norm_text)

    # Re-assemble words: a token starting with "#" continues the previous one.
    ph_groups = []
    for token in tokenized:
        if token.startswith("#"):
            ph_groups[-1].append(token.replace("#", ""))
        else:
            ph_groups.append([token])

    phs = []
    word2ph = []
    for group in ph_groups:
        word = "".join(group)
        if word == '[UNK]':
            phs.append('_')
            word2ph.append(1)
        elif word in punctuation:
            phs.append(word)
            word2ph.append(1)
        else:
            phonemes = kata2phoneme(word)
            for ph in phonemes:
                assert ph in symbols, (group, norm_text, tokenized, ph)
            counts = distribute_phone(len(phonemes), len(group))
            assert len(counts) == len(group)
            word2ph.extend(counts)
            phs.extend(phonemes)

    phones = ["_", *phs, "_"]
    tones = [0] * len(phones)
    word2ph = [1, *word2ph, 1]
    assert len(word2ph) == len(tokenized) + 2
    return phones, tones, word2ph
def get_bert_feature(text, word2ph, device):
    """Return phone-level BERT features for Japanese ``text``.

    Delegates to ``japanese_bert.get_bert_feature``. The helper is imported
    relative to this package, like ``from . import symbols`` at the top of the
    module, so the lookup does not depend on a top-level ``text`` package
    being importable.
    """
    from . import japanese_bert

    return japanese_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
    # Manual smoke test: normalise one sample, phonemize it, print the
    # alignment and the shape of the BERT feature tensor.
    # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
    samples = (
        "こんにちは、世界!...",
        'ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?',
        'あの、お前以外のみんなは、全員生きてること?',
    )
    # Only the last sample is processed, as with the successive reassignments
    # this replaces.
    text = samples[-1]

    from text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)
    print(phones, tones, word2ph, bert.shape)
# if __name__ == '__main__':
# from pykakasi import kakasi
# # Initialize kakasi object
# kakasi = kakasi()
# # Set options for converting Chinese characters to Katakana
# kakasi.setMode("J", "H") # Chinese to Katakana
# kakasi.setMode("K", "H") # Hiragana to Katakana
# # Convert Chinese characters to Katakana
# conv = kakasi.getConverter()
# katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text
# print(katakana_text) # Output: ニーハオセカイ

View File

@@ -0,0 +1,53 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
# model = None
# model_id = 'cl-tohoku/bert-base-japanese-v3'
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# Per-model-id caches filled lazily by get_bert_feature().
models = {}
tokenizers = {}
def get_bert_feature(text, word2ph, device=None, model_id='cl-tohoku/bert-base-japanese-v3'):
    """Compute phone-level BERT features for ``text``.

    Args:
        text: Text passed to the tokenizer of ``model_id``.
        word2ph: One repeat count per tokenizer position (its length must
            equal the number of input ids).
        device: Target device; a falsy value selects ``"cuda"``, and ``"cpu"``
            is swapped for ``"mps"`` on macOS when MPS is available.
        model_id: Hugging Face identifier of the model/tokenizer pair; each
            pair is loaded once and cached in ``models`` / ``tokenizers``.

    Returns:
        The third-from-last hidden state of each token, repeated
        ``word2ph[i]`` times, concatenated and transposed.
    """
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if model_id not in models:
        # Load both halves before caching either, so a failed tokenizer load
        # cannot leave a model cached without its tokenizer.
        # NOTE(review): a cached model stays on the device of its first call.
        loaded_model = AutoModelForMaskedLM.from_pretrained(model_id).to(device)
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
        models[model_id] = loaded_model
        tokenizers[model_id] = loaded_tokenizer
    model = models[model_id]
    tokenizer = tokenizers[model_id]
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
    assert inputs["input_ids"].shape[-1] == len(word2ph), f"{inputs['input_ids'].shape[-1]}/{len(word2ph)}"
    word2phone = word2ph
    phone_level_feature = [
        res[i].repeat(count, 1) for i, count in enumerate(word2phone)
    ]
    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    return phone_level_feature.T

View File

@@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
# Literal phrases replaced by their Korean reading.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
# Korean readings for upper-case English words and single letters. Every
# one-syllable reading had been reduced to an empty string, which made the
# matching word or letter disappear; those readings are restored here.
english_dictionary = {
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}

View File

@@ -0,0 +1,192 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from . import punctuation, symbols
from num2words import num2words
from MyShellTTSBase.text.ko_dictionary import english_dictionary, etc_dictionary
from anyascii import anyascii
from jamo import hangul_to_jamo
def normalize(text):
    """Prepare Korean text for phonemization.

    Strips surrounding whitespace, deletes the characters matched by the CJK
    character class below, applies ``etc_dictionary`` and the English-word
    dictionary, and lower-cases the result.
    """
    cleaned = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text.strip())
    cleaned = normalize_with_dictionary(cleaned, etc_dictionary)
    return normalize_english(cleaned).lower()
def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of ``dic`` in ``text`` with its value.

    The text is returned unchanged when none of the keys occurs in it, which
    also covers an empty dictionary.
    """
    if not any(key in text for key in dic):
        return text
    pattern = re.compile("|".join(map(re.escape, dic)))
    return pattern.sub(lambda hit: dic[hit.group()], text)
def normalize_english(text):
    """Replace runs of ASCII letters that are keys of ``english_dictionary``.

    Runs without an entry are left as they are.
    """

    def _spell_out(match):
        token = match.group()
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", _spell_out, text)
# Lazily created g2pkk converter, shared across calls.
g2p_kr = None


def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
    """Convert Korean text to its pronounced form.

    The text is normalised and run through the g2pkk converter. With
    ``character == "english"`` the result is transliterated with ``anyascii``;
    otherwise it is decomposed into jamo, so the output can look the same as
    the input while consisting of different code points (the syllables U+D558
    U+B298 become U+1112 U+1161 U+1102 U+1173 U+11AF).
    """
    global g2p_kr  # pylint: disable=global-statement
    if g2p_kr is None:
        from g2pkk import G2p

        g2p_kr = G2p()

    pronounced = g2p_kr(normalize(text))
    if character == "english":
        return anyascii(pronounced)
    return "".join(hangul_to_jamo(pronounced))
def text_normalize(text):
    """Return ``text`` after applying ``normalize``."""
    return normalize(text)
def distribute_phone(n_phone, n_word):
    """Split ``n_phone`` phonemes over ``n_word`` tokens as evenly as possible.

    Phonemes are assigned one by one to the token holding the fewest so far,
    the left-most one winning ties.

    Returns:
        A list of ``n_word`` counts summing to ``n_phone``.
    """
    counts = [0 for _ in range(n_word)]
    for _ in range(n_phone):
        target = counts.index(min(counts))
        counts[target] += 1
    return counts
# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
# Hugging Face model identifier used for the tokenizer here and passed on to
# the shared BERT helper by get_bert_feature().
model_id = 'kykim/bert-kor-base'
# Loaded once, at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(norm_text):
    """Convert normalised Korean text to phonemes, tones and a token alignment.

    Args:
        norm_text: Text as produced by ``text_normalize``.

    Returns:
        ``(phones, tones, word2ph)``: the phonemes padded with ``"_"`` at both
        ends, an all-zero tone list of the same length, and one phoneme count
        per tokenizer token plus one slot for each pad.
    """
    tokenized = tokenizer.tokenize(norm_text)

    # Re-assemble words: a token starting with "#" continues the previous one.
    ph_groups = []
    for t in tokenized:
        if ph_groups and t.startswith("#"):
            ph_groups[-1].append(t.replace("#", ""))
        else:
            # Also taken by a leading "#" token, which has no previous word to
            # join and used to raise IndexError.
            ph_groups.append([t])

    phs = []
    word2ph = []
    for group in ph_groups:
        text = "".join(group)
        if text == '[UNK]':
            phs += ['_']
            word2ph += [1]
            continue
        elif text in punctuation:
            phs += [text]
            word2ph += [1]
            continue
        # korean_text_to_phonemes returns a string, so each character of it
        # becomes one phoneme entry.
        phonemes = korean_text_to_phonemes(text)
        phone_len = len(phonemes)
        word_len = len(group)
        aaa = distribute_phone(phone_len, word_len)
        assert len(aaa) == word_len
        word2ph += aaa
        phs += phonemes

    phones = ["_"] + phs + ["_"]
    tones = [0] * len(phones)
    word2ph = [1] + word2ph + [1]
    assert len(word2ph) == len(tokenized) + 2
    return phones, tones, word2ph
def get_bert_feature(text, word2ph, device='cuda'):
    """Return phone-level BERT features for Korean ``text``.

    Reuses the helper in ``japanese_bert``, passing this module's ``model_id``.
    """
    from .japanese_bert import get_bert_feature as _shared_bert_feature

    return _shared_bert_feature(text, word2ph, device=device, model_id=model_id)
if __name__ == "__main__":
    # Developer utility: run every voice line of a dataset index through the
    # Korean front-end and collect phonemes missing from the symbol inventory.
    # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
    from text.symbols import symbols
    import json

    # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
    with open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json', encoding='utf-8') as index_file:
        genshin_data = json.load(index_file)

    from tqdm import tqdm

    new_symbols = []
    for key, item in tqdm(genshin_data.items()):
        texts = item.get('voiceContent', '')
        if isinstance(texts, list):
            texts = ','.join(texts)
        if texts is None:
            continue
        if len(texts) == 0:
            continue
        # Process the current item's text. The loop used to re-normalise one
        # fixed sample sentence on every iteration and never looked at
        # ``texts``; a leftover pdb breakpoint here was removed as well.
        text = text_normalize(texts)
        phones, tones, word2ph = g2p(text)
        bert = get_bert_feature(text, word2ph)
        for ph in phones:
            if ph not in symbols and ph not in new_symbols:
                new_symbols.append(ph)
                print('update!, now symbols:')
                print(new_symbols)
                with open('korean_symbol.txt', 'w', encoding='utf-8') as f:
                    f.write(f'{new_symbols}')
# if __name__ == '__main__':
# from pykakasi import kakasi
# # Initialize kakasi object
# kakasi = kakasi()
# # Set options for converting Chinese characters to Katakana
# kakasi.setMode("J", "H") # Chinese to Katakana
# kakasi.setMode("K", "H") # Hiragana to Katakana
# # Convert Chinese characters to Katakana
# conv = kakasi.getConverter()
# katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text
# print(katakana_text) # Output: ニーハオセカイ

View File

@@ -0,0 +1,429 @@
a AA a
ai AA ai
an AA an
ang AA ang
ao AA ao
ba b a
bai b ai
ban b an
bang b ang
bao b ao
bei b ei
ben b en
beng b eng
bi b i
bian b ian
biao b iao
bie b ie
bin b in
bing b ing
bo b o
bu b u
ca c a
cai c ai
can c an
cang c ang
cao c ao
ce c e
cei c ei
cen c en
ceng c eng
cha ch a
chai ch ai
chan ch an
chang ch ang
chao ch ao
che ch e
chen ch en
cheng ch eng
chi ch ir
chong ch ong
chou ch ou
chu ch u
chua ch ua
chuai ch uai
chuan ch uan
chuang ch uang
chui ch ui
chun ch un
chuo ch uo
ci c i0
cong c ong
cou c ou
cu c u
cuan c uan
cui c ui
cun c un
cuo c uo
da d a
dai d ai
dan d an
dang d ang
dao d ao
de d e
dei d ei
den d en
deng d eng
di d i
dia d ia
dian d ian
diao d iao
die d ie
ding d ing
diu d iu
dong d ong
dou d ou
du d u
duan d uan
dui d ui
dun d un
duo d uo
e EE e
ei EE ei
en EE en
eng EE eng
er EE er
fa f a
fan f an
fang f ang
fei f ei
fen f en
feng f eng
fo f o
fou f ou
fu f u
ga g a
gai g ai
gan g an
gang g ang
gao g ao
ge g e
gei g ei
gen g en
geng g eng
gong g ong
gou g ou
gu g u
gua g ua
guai g uai
guan g uan
guang g uang
gui g ui
gun g un
guo g uo
ha h a
hai h ai
han h an
hang h ang
hao h ao
he h e
hei h ei
hen h en
heng h eng
hong h ong
hou h ou
hu h u
hua h ua
huai h uai
huan h uan
huang h uang
hui h ui
hun h un
huo h uo
ji j i
jia j ia
jian j ian
jiang j iang
jiao j iao
jie j ie
jin j in
jing j ing
jiong j iong
jiu j iu
ju j v
jv j v
juan j van
jvan j van
jue j ve
jve j ve
jun j vn
jvn j vn
ka k a
kai k ai
kan k an
kang k ang
kao k ao
ke k e
kei k ei
ken k en
keng k eng
kong k ong
kou k ou
ku k u
kua k ua
kuai k uai
kuan k uan
kuang k uang
kui k ui
kun k un
kuo k uo
la l a
lai l ai
lan l an
lang l ang
lao l ao
le l e
lei l ei
leng l eng
li l i
lia l ia
lian l ian
liang l iang
liao l iao
lie l ie
lin l in
ling l ing
liu l iu
lo l o
long l ong
lou l ou
lu l u
luan l uan
lun l un
luo l uo
lv l v
lve l ve
ma m a
mai m ai
man m an
mang m ang
mao m ao
me m e
mei m ei
men m en
meng m eng
mi m i
mian m ian
miao m iao
mie m ie
min m in
ming m ing
miu m iu
mo m o
mou m ou
mu m u
na n a
nai n ai
nan n an
nang n ang
nao n ao
ne n e
nei n ei
nen n en
neng n eng
ni n i
nian n ian
niang n iang
niao n iao
nie n ie
nin n in
ning n ing
niu n iu
nong n ong
nou n ou
nu n u
nuan n uan
nun n un
nuo n uo
nv n v
nve n ve
o OO o
ou OO ou
pa p a
pai p ai
pan p an
pang p ang
pao p ao
pei p ei
pen p en
peng p eng
pi p i
pian p ian
piao p iao
pie p ie
pin p in
ping p ing
po p o
pou p ou
pu p u
qi q i
qia q ia
qian q ian
qiang q iang
qiao q iao
qie q ie
qin q in
qing q ing
qiong q iong
qiu q iu
qu q v
qv q v
quan q van
qvan q van
que q ve
qve q ve
qun q vn
qvn q vn
ran r an
rang r ang
rao r ao
re r e
ren r en
reng r eng
ri r ir
rong r ong
rou r ou
ru r u
rua r ua
ruan r uan
rui r ui
run r un
ruo r uo
sa s a
sai s ai
san s an
sang s ang
sao s ao
se s e
sen s en
seng s eng
sha sh a
shai sh ai
shan sh an
shang sh ang
shao sh ao
she sh e
shei sh ei
shen sh en
sheng sh eng
shi sh ir
shou sh ou
shu sh u
shua sh ua
shuai sh uai
shuan sh uan
shuang sh uang
shui sh ui
shun sh un
shuo sh uo
si s i0
song s ong
sou s ou
su s u
suan s uan
sui s ui
sun s un
suo s uo
ta t a
tai t ai
tan t an
tang t ang
tao t ao
te t e
tei t ei
teng t eng
ti t i
tian t ian
tiao t iao
tie t ie
ting t ing
tong t ong
tou t ou
tu t u
tuan t uan
tui t ui
tun t un
tuo t uo
wa w a
wai w ai
wan w an
wang w ang
wei w ei
wen w en
weng w eng
wo w o
wu w u
xi x i
xia x ia
xian x ian
xiang x iang
xiao x iao
xie x ie
xin x in
xing x ing
xiong x iong
xiu x iu
xu x v
xv x v
xuan x van
xvan x van
xue x ve
xve x ve
xun x vn
xvn x vn
ya y a
yan y En
yang y ang
yao y ao
ye y E
yi y i
yin y in
ying y ing
yo y o
yong y ong
you y ou
yu y v
yv y v
yuan y van
yvan y van
yue y ve
yve y ve
yun y vn
yvn y vn
za z a
zai z ai
zan z an
zang z ang
zao z ao
ze z e
zei z ei
zen z en
zeng z eng
zha zh a
zhai zh ai
zhan zh an
zhang zh ang
zhao zh ao
zhe zh e
zhei zh ei
zhen zh en
zheng zh eng
zhi zh ir
zhong zh ong
zhou zh ou
zhu zh u
zhua zh ua
zhuai zh uai
zhuan zh uan
zhuang zh uang
zhui zh ui
zhun zh un
zhuo zh uo
zi z i0
zong z ong
zou z ou
zu z u
zuan z uan
zui z ui
zun z un
zuo z uo

View File

@@ -0,0 +1,122 @@
import pickle
import os
import re
from . import symbols
from .es_phonemizer import cleaner as es_cleaner
from .es_phonemizer import es_to_ipa
from transformers import AutoTokenizer
def distribute_phone(n_phone, n_word):
    """Allocate ``n_phone`` phonemes to ``n_word`` tokens as evenly as possible.

    One phoneme at a time goes to the token with the smallest count so far,
    the left-most one winning ties.

    Returns:
        A list of ``n_word`` counts summing to ``n_phone``.
    """
    counts = [0 for _ in range(n_word)]
    for _ in range(n_phone):
        target = counts.index(min(counts))
        counts[target] += 1
    return counts
def text_normalize(text):
    """Return ``text`` after running it through ``es_cleaner.spanish_cleaners``."""
    return es_cleaner.spanish_cleaners(text)
# Map a single phoneme/punctuation token onto the symbol inventory.
# The token is first translated through ``rep_map``; the result is returned if
# it is a known symbol, otherwise "UNK" is returned.
# NOTE(review): most ``rep_map`` keys below are empty strings, so they collapse
# into a single "" entry (the last duplicate wins) and "..." maps to "". These
# look like punctuation characters lost in an encoding step -- confirm against
# the intended table before relying on this function.
def post_replace_ph(ph):
rep_map = {
"": ",",
"": ",",
"": ",",
"": ".",
"": "!",
"": "?",
"\n": ".",
"·": ",",
"": ",",
"...": ""
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
# Anything outside the symbol inventory is replaced by the unknown marker.
if ph not in symbols:
ph = "UNK"
return ph
def refine_ph(phn):
    """Split a trailing digit off ``phn`` and return ``(phoneme, tone)``.

    When ``phn`` ends in a digit, that digit plus one becomes the tone and is
    removed from the phoneme; otherwise the tone is 0. The phoneme is returned
    lower-cased in both cases.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 0
    tone = int(phn[-1]) + 1
    return phn[:-1].lower(), tone
def refine_syllables(syllables):
    """Flatten ``syllables`` into parallel phoneme and tone lists.

    Every entry of every inner list is passed through ``refine_ph``; the
    resulting phonemes and tones are collected in input order.

    Returns:
        ``(phonemes, tones)``, two lists of equal length.
    """
    refined = [refine_ph(raw) for phn_list in syllables for raw in phn_list]
    phonemes = [phn for phn, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
# model_id = 'bert-base-uncased'
# Hugging Face model id of the tokenizer that g2p() uses to split text into tokens.
model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
# Created once at import time and shared by every g2p() call.
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(text, pad_start_end=True, tokenized=None):
    """Convert Spanish text into phones, tones and a per-token phone count.

    Args:
        text: Text to phonemize.
        pad_start_end: When true, a "_" pad phone (tone 0, count 1) is added at
            both ends of the result.
        tokenized: Optional pre-computed token list; when ``None`` the
            module-level ``tokenizer`` is applied to ``text``.

    Returns:
        ``(phones, tones, word2ph)``. ``tones`` is all zeros and has the same
        length as ``phones``; ``word2ph`` holds one phone count per token (plus
        the two pads), so ``sum(word2ph) == len(phones)``.
    """
    if tokenized is None:
        tokenized = tokenizer.tokenize(text)

    # Group continuation pieces (tokens starting with "#") with the token that
    # opened the word.
    ph_groups = []
    for t in tokenized:
        if t.startswith("#") and ph_groups:
            ph_groups[-1].append(t.replace("#", ""))
        else:
            # Also taken when a "#" token comes first: there is no earlier
            # group to extend, so it opens its own group instead of raising
            # IndexError.
            ph_groups.append([t])

    phones = []
    tones = []
    word2ph = []
    for group in ph_groups:
        w = "".join(group)
        if w == '[UNK]':
            phone_list = ['UNK']
        else:
            phone_list = [p for p in es_to_ipa.es2ipa(w) if p != " "]

        phones.extend(phone_list)
        tones.extend([0] * len(phone_list))
        # Share the word's phones between the tokens that make it up.
        word2ph += distribute_phone(len(phone_list), len(group))

    if pad_start_end:
        phones = ["_"] + phones + ["_"]
        tones = [0] + tones + [0]
        word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
def get_bert_feature(text, word2ph, device=None):
    """Return BERT features for ``text`` by delegating to ``spanish_bert``.

    All arguments are forwarded unchanged to
    ``spanish_bert.get_bert_feature``.
    """
    # Imported inside the function, so spanish_bert is only loaded on first use.
    # Relative import, matching the other package-local imports of this module;
    # the previous absolute ``from text import ...`` only resolved when a
    # top-level ``text`` package happened to be on sys.path.
    from . import spanish_bert

    return spanish_bert.get_bert_feature(text, word2ph, device=device)
# Manual smoke test: normalize a sample sentence, phonemize it, fetch BERT
# features and print the resulting sizes.
if __name__ == "__main__":
text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789."
# print(text)
text = text_normalize(text)
print(text)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
print(phones)
# sum(word2ph) is printed next to len(phones) so the two can be compared by eye.
print(len(phones), tones, sum(word2ph), bert.shape)

View File

@@ -0,0 +1,39 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
# Hugging Face model id used for both the tokenizer and the masked-LM model.
model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
# The tokenizer is created eagerly at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The model is created lazily by get_bert_feature() on its first call.
model = None
def get_bert_feature(text, word2ph, device=None):
    """Compute BERT features for ``text`` expanded to one column per phone.

    Args:
        text: Input sentence; tokenized with the module-level ``tokenizer``.
        word2ph: Number of phones for each tokenizer position; its length must
            equal the number of input ids produced for ``text``.
        device: Torch device string. On macOS with MPS available, "cpu" is
            replaced by "mps". When empty, "cuda" is used if available,
            otherwise "cpu".

    Returns:
        A CPU tensor whose second dimension has ``sum(word2ph)`` entries (the
        transpose of the per-phone feature rows).

    Raises:
        AssertionError: If ``len(word2ph)`` differs from the number of input ids.
    """
    global model
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        # Fall back to the CPU instead of failing on machines without CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if model is None:
        model = AutoModelForMaskedLM.from_pretrained(model_id)
    # The model is cached across calls, so move it every time: without this a
    # later call with a different device would feed inputs on one device to a
    # model living on another. Moving to the current device is a no-op.
    model.to(device)
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        # Third-from-last hidden state, first batch element.
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()

    assert inputs["input_ids"].shape[-1] == len(word2ph)
    # Repeat each token's feature row once per phone assigned to that token.
    phone_level_feature = [
        res[i].repeat(n_phones, 1) for i, n_phones in enumerate(word2ph)
    ]
    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    return phone_level_feature.T

View File

@@ -0,0 +1,290 @@
# punctuation = ["!", "?", "…", ",", ".", "'", "-"]
# Punctuation symbols kept in the symbol inventory.
# NOTE(review): the third entry is an empty string here, where the commented-out
# list above has "…" -- confirm the character was not lost in an encoding step.
punctuation = ["!", "?", "", ",", ".", "'", "-", "¿", "¡"]
# Punctuation plus the special "SP" and "UNK" symbols; appended last in `symbols`.
pu_symbols = punctuation + ["SP", "UNK"]
# Padding symbol; placed first in `symbols`, so it gets index 0.
pad = "_"
# Chinese symbol inventory (merged into `normal_symbols` below).
zh_symbols = [
"E",
"En",
"a",
"ai",
"an",
"ang",
"ao",
"b",
"c",
"ch",
"d",
"e",
"ei",
"en",
"eng",
"er",
"f",
"g",
"h",
"i",
"i0",
"ia",
"ian",
"iang",
"iao",
"ie",
"in",
"ing",
"iong",
"ir",
"iu",
"j",
"k",
"l",
"m",
"n",
"o",
"ong",
"ou",
"p",
"q",
"r",
"s",
"sh",
"t",
"u",
"ua",
"uai",
"uan",
"uang",
"ui",
"un",
"uo",
"v",
"van",
"ve",
"vn",
"w",
"x",
"y",
"z",
"zh",
"AA",
"EE",
"OO",
]
# Number of tone ids reserved for Chinese in the shared tone id space.
num_zh_tones = 6
# Japanese symbol inventory (merged into `normal_symbols` below).
ja_symbols = [
"N",
"a",
"a:",
"b",
"by",
"ch",
"d",
"dy",
"e",
"e:",
"f",
"g",
"gy",
"h",
"hy",
"i",
"i:",
"j",
"k",
"ky",
"m",
"my",
"n",
"ny",
"o",
"o:",
"p",
"py",
"q",
"r",
"ry",
"s",
"sh",
"t",
"ts",
"ty",
"u",
"u:",
"w",
"y",
"z",
"zy",
]
# Number of tone ids reserved for Japanese in the shared tone id space.
num_ja_tones = 1
# English symbol inventory (merged into `normal_symbols` below).
# NOTE(review): "V" is the only upper-case entry in this list -- confirm it is intentional.
en_symbols = [
"aa",
"ae",
"ah",
"ao",
"aw",
"ay",
"b",
"ch",
"d",
"dh",
"eh",
"er",
"ey",
"f",
"g",
"hh",
"ih",
"iy",
"jh",
"k",
"l",
"m",
"n",
"ng",
"ow",
"oy",
"p",
"r",
"s",
"sh",
"t",
"th",
"uh",
"uw",
"V",
"w",
"y",
"z",
"zh",
]
# Number of tone ids reserved for English in the shared tone id space.
num_en_tones = 4
# Korean symbol inventory (merged into `normal_symbols` below).
# NOTE(review): most entries are empty strings here and collapse into a single ""
# symbol once de-duplicated -- the characters look lost in an encoding step;
# confirm against the intended list.
kr_symbols = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '(', '', ')', '', '', '', '', '', '', '', '', '~', '\\', '[', ']', '/', '^', ':', '', '*']
# Number of tone ids reserved for Korean in the shared tone id space.
num_kr_tones = 1
# Spanish symbol inventory (merged into `normal_symbols` below).
es_symbols = [
"N",
"Q",
"a",
"b",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"ɑ",
"æ",
"ʃ",
"ʑ",
"ç",
"ɯ",
"ɪ",
"ɔ",
"ɛ",
"ɹ",
"ð",
"ə",
"ɫ",
"ɥ",
"ɸ",
"ʊ",
"ɾ",
"ʒ",
"θ",
"β",
"ŋ",
"ɦ",
"ɡ",
"r",
"ɲ",
"ʝ",
"ɣ",
"ʎ",
"ˈ",
"ˌ",
"ː"
]
# Number of tone ids reserved for Spanish in the shared tone id space.
num_es_tones = 1
# French symbols; only the entries listed here are contributed by this list,
# everything else comes from the other inventories it is merged with.
fr_symbols = [
"\u0303",
"œ",
"ø",
"ʁ",
"ɒ",
"ʌ",
"ɜ",
"ɐ"
]
# Number of tone ids reserved for French in the shared tone id space.
num_fr_tones = 1
# German symbols (merged with the other inventories below).
de_symbols = [
"ʏ",
"̩"
]
# Counted in `num_tones` below.
num_de_tones = 1
# Russian symbols (merged with the other inventories below).
# "^" also appears in kr_symbols; the duplicate is removed by the set() below.
ru_symbols = [
"ɭ",
"ʲ",
"ɕ",
"\"",
"ɵ",
"^",
"ɬ"
]
# Counted in `num_tones` below.
num_ru_tones = 1
# combine all symbols
# De-duplicated, sorted union of every per-language inventory.
normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols + kr_symbols + es_symbols + fr_symbols + de_symbols + ru_symbols))
# Final inventory: pad first (index 0), language symbols, then punctuation/special symbols.
symbols = [pad] + normal_symbols + pu_symbols
# Indices of the punctuation/special symbols within `symbols`.
sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
# combine all tones
# Total size of the shared tone id space (sum of every language's tone count).
num_tones = num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones + num_fr_tones + num_de_tones + num_ru_tones
# language maps
# Language code -> language id. 'ES' and 'SP' are two names for the same id.
# NOTE(review): 'DE' and 'RU' have symbol lists and tone counts above but no
# entry in either map below -- confirm whether that is intended.
language_id_map = {"ZH": 0, "JP": 1, "EN": 2, "ZH_MIX_EN": 3, 'KR': 4, 'ES': 5, 'SP': 5 ,'FR': 6}
# Counts the map's keys, so the 'ES'/'SP' alias is counted twice.
num_languages = len(language_id_map.keys())
# Language code -> first tone id of that language inside the shared tone id
# space; each language starts where the previous one's tones end.
language_tone_start_map = {
"ZH": 0,
"ZH_MIX_EN": 0,
"JP": num_zh_tones,
"EN": num_zh_tones + num_ja_tones,
'KR': num_zh_tones + num_ja_tones + num_en_tones,
"ES": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
"SP": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
"FR": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones,
}
# When run as a script, print the symbols shared by the Chinese and English inventories.
if __name__ == "__main__":
a = set(zh_symbols)
b = set(en_symbols)
print(sorted(a & b))

View File

@@ -0,0 +1,769 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple
import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
class ToneSandhi:
# Build the lookup tables used by the sandhi rules of this class.
def __init__(self):
# Words whose last syllable is forced to the neutral tone ("5") by
# _neural_sandhi; that method also checks the last two characters of a word
# against this set.
self.must_neural_tone_words = {
"麻烦",
"麻利",
"鸳鸯",
"高粱",
"骨头",
"骆驼",
"马虎",
"首饰",
"馒头",
"馄饨",
"风筝",
"难为",
"队伍",
"阔气",
"闺女",
"门道",
"锄头",
"铺盖",
"铃铛",
"铁匠",
"钥匙",
"里脊",
"里头",
"部分",
"那么",
"道士",
"造化",
"迷糊",
"连累",
"这么",
"这个",
"运气",
"过去",
"软和",
"转悠",
"踏实",
"跳蚤",
"跟头",
"趔趄",
"财主",
"豆腐",
"讲究",
"记性",
"记号",
"认识",
"规矩",
"见识",
"裁缝",
"补丁",
"衣裳",
"衣服",
"衙门",
"街坊",
"行李",
"行当",
"蛤蟆",
"蘑菇",
"薄荷",
"葫芦",
"葡萄",
"萝卜",
"荸荠",
"苗条",
"苗头",
"苍蝇",
"芝麻",
"舒服",
"舒坦",
"舌头",
"自在",
"膏药",
"脾气",
"脑袋",
"脊梁",
"能耐",
"胳膊",
"胭脂",
"胡萝",
"胡琴",
"胡同",
"聪明",
"耽误",
"耽搁",
"耷拉",
"耳朵",
"老爷",
"老实",
"老婆",
"老头",
"老太",
"翻腾",
"罗嗦",
"罐头",
"编辑",
"结实",
"红火",
"累赘",
"糨糊",
"糊涂",
"精神",
"粮食",
"簸箕",
"篱笆",
"算计",
"算盘",
"答应",
"笤帚",
"笑语",
"笑话",
"窟窿",
"窝囊",
"窗户",
"稳当",
"稀罕",
"称呼",
"秧歌",
"秀气",
"秀才",
"福气",
"祖宗",
"砚台",
"码头",
"石榴",
"石头",
"石匠",
"知识",
"眼睛",
"眯缝",
"眨巴",
"眉毛",
"相声",
"盘算",
"白净",
"痢疾",
"痛快",
"疟疾",
"疙瘩",
"疏忽",
"畜生",
"生意",
"甘蔗",
"琵琶",
"琢磨",
"琉璃",
"玻璃",
"玫瑰",
"玄乎",
"狐狸",
"状元",
"特务",
"牲口",
"牙碜",
"牌楼",
"爽快",
"爱人",
"热闹",
"烧饼",
"烟筒",
"烂糊",
"点心",
"炊帚",
"灯笼",
"火候",
"漂亮",
"滑溜",
"溜达",
"温和",
"清楚",
"消息",
"浪头",
"活泼",
"比方",
"正经",
"欺负",
"模糊",
"槟榔",
"棺材",
"棒槌",
"棉花",
"核桃",
"栅栏",
"柴火",
"架势",
"枕头",
"枇杷",
"机灵",
"本事",
"木头",
"木匠",
"朋友",
"月饼",
"月亮",
"暖和",
"明白",
"时候",
"新鲜",
"故事",
"收拾",
"收成",
"提防",
"挖苦",
"挑剔",
"指甲",
"指头",
"拾掇",
"拳头",
"拨弄",
"招牌",
"招呼",
"抬举",
"护士",
"折腾",
"扫帚",
"打量",
"打算",
"打点",
"打扮",
"打听",
"打发",
"扎实",
"扁担",
"戒指",
"懒得",
"意识",
"意思",
"情形",
"悟性",
"怪物",
"思量",
"怎么",
"念头",
"念叨",
"快活",
"忙活",
"志气",
"心思",
"得罪",
"张罗",
"弟兄",
"开通",
"应酬",
"庄稼",
"干事",
"帮手",
"帐篷",
"希罕",
"师父",
"师傅",
"巴结",
"巴掌",
"差事",
"工夫",
"岁数",
"屁股",
"尾巴",
"少爷",
"小气",
"小伙",
"将就",
"对头",
"对付",
"寡妇",
"家伙",
"客气",
"实在",
"官司",
"学问",
"学生",
"字号",
"嫁妆",
"媳妇",
"媒人",
"婆家",
"娘家",
"委屈",
"姑娘",
"姐夫",
"妯娌",
"妥当",
"妖精",
"奴才",
"女婿",
"头发",
"太阳",
"大爷",
"大方",
"大意",
"大夫",
"多少",
"多么",
"外甥",
"壮实",
"地道",
"地方",
"在乎",
"困难",
"嘴巴",
"嘱咐",
"嘟囔",
"嘀咕",
"喜欢",
"喇嘛",
"喇叭",
"商量",
"唾沫",
"哑巴",
"哈欠",
"哆嗦",
"咳嗽",
"和尚",
"告诉",
"告示",
"含糊",
"吓唬",
"后头",
"名字",
"名堂",
"合同",
"吆喝",
"叫唤",
"口袋",
"厚道",
"厉害",
"千斤",
"包袱",
"包涵",
"匀称",
"勤快",
"动静",
"动弹",
"功夫",
"力气",
"前头",
"刺猬",
"刺激",
"别扭",
"利落",
"利索",
"利害",
"分析",
"出息",
"凑合",
"凉快",
"冷战",
"冤枉",
"冒失",
"养活",
"关系",
"先生",
"兄弟",
"便宜",
"使唤",
"佩服",
"作坊",
"体面",
"位置",
"似的",
"伙计",
"休息",
"什么",
"人家",
"亲戚",
"亲家",
"交情",
"云彩",
"事情",
"买卖",
"主意",
"丫头",
"丧气",
"两口",
"东西",
"东家",
"世故",
"不由",
"不在",
"下水",
"下巴",
"上头",
"上司",
"丈夫",
"丈人",
"一辈",
"那个",
"菩萨",
"父亲",
"母亲",
"咕噜",
"邋遢",
"费用",
"冤家",
"甜头",
"介绍",
"荒唐",
"大人",
"泥鳅",
"幸福",
"熟悉",
"计划",
"扑腾",
"蜡烛",
"姥爷",
"照顾",
"喉咙",
"吉他",
"弄堂",
"蚂蚱",
"凤凰",
"拖沓",
"寒碜",
"糟蹋",
"倒腾",
"报复",
"逻辑",
"盘缠",
"喽啰",
"牢骚",
"咖喱",
"扫把",
"惦记",
}
# Words exempted from the reduplication and suffix rules of _neural_sandhi.
self.must_not_neural_tone_words = {
"男子",
"女子",
"分子",
"原子",
"量子",
"莲子",
"石子",
"瓜子",
"电子",
"人人",
"虎虎",
}
# Punctuation characters; _yi_sandhi checks the character after "一" against this string.
self.punc = ":,;。?!“”‘’':,;.?!"
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "家里"
# pos: "s"
# finals: ['ia1', 'i3']
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Apply the neutral-tone rules to ``finals`` (tone digit becomes "5").

    Args:
        word: The segmented word.
        pos: Its jieba part-of-speech tag.
        finals: One tone-suffixed final per character of ``word``; entries may
            be modified in place.

    Returns:
        The finals after the neutral-tone rules; a new list is returned
        because the per-sub-word pieces are re-joined at the end.
    """
    # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
    for j, item in enumerate(word):
        if (
            j - 1 >= 0
            and item == word[j - 1]
            and pos[0] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[j] = finals[j][:-1] + "5"
    # The literal was empty, which made find() always return 0 and the measure-word
    # rule below unreachable; the comment on that rule names the character.
    ge_idx = word.find("个")
    if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
        finals[-1] = finals[-1][:-1] + "5"
    elif len(word) >= 1 and word[-1] in "的地得":
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 走了, 看着, 去过
    # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
    #     finals[-1] = finals[-1][:-1] + "5"
    elif (
        len(word) > 1
        and word[-1] in "们子"
        and pos in {"r", "n"}
        and word not in self.must_not_neural_tone_words
    ):
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 桌上, 地下, 家里
    elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 上来, 下去
    elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
        finals[-1] = finals[-1][:-1] + "5"
    # 个 used as a measure word
    elif (
        ge_idx >= 1
        and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
    ) or word == "个":
        finals[ge_idx] = finals[ge_idx][:-1] + "5"
    else:
        if (
            word in self.must_neural_tone_words
            or word[-2:] in self.must_neural_tone_words
        ):
            finals[-1] = finals[-1][:-1] + "5"

    # Split the word in two and apply the lexical neutral-tone list to each part.
    word_list = self._split_word(word)
    finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
    for i, sub_word in enumerate(word_list):
        # conventional neural in Chinese
        if (
            sub_word in self.must_neural_tone_words
            or sub_word[-2:] in self.must_neural_tone_words
        ):
            finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
    finals = sum(finals_list, [])
    return finals
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
# e.g. 看不懂
if len(word) == 3 and word[1] == "":
finals[1] = finals[1][:-1] + "5"
else:
for i, char in enumerate(word):
# "不" before tone4 should be bu2, e.g. 不怕
if char == "" and i + 1 < len(word) and finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
return finals
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
# "一" in number sequences, e.g. 一零零, 二一零
if word.find("") != -1 and all(
[item.isnumeric() for item in word if item != ""]
):
return finals
# "一" between reduplication words should be yi5, e.g. 看一看
elif len(word) == 3 and word[1] == "" and word[0] == word[-1]:
finals[1] = finals[1][:-1] + "5"
# when "一" is ordinal word, it should be yi1
elif word.startswith("第一"):
finals[1] = finals[1][:-1] + "1"
else:
for i, char in enumerate(word):
if char == "" and i + 1 < len(word):
# "一" before tone4 should be yi2, e.g. 一段
if finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
# "一" before non-tone4 should be yi4, e.g. 一天
else:
# "一" 后面如果是标点,还读一声
if word[i + 1] not in self.punc:
finals[i] = finals[i][:-1] + "4"
return finals
def _split_word(self, word: str) -> List[str]:
    """Split ``word`` into two pieces around its shortest jieba sub-word.

    The shortest sub-word reported by ``jieba.cut_for_search`` is taken as one
    piece. If ``word`` starts with it, the rest of the word is the second
    piece; otherwise the sub-word is treated as the tail and the text before it
    is the first piece.

    Returns:
        ``[first_piece, second_piece]`` in word order.
    """
    candidates = sorted(jieba.cut_for_search(word), key=len)
    shortest = candidates[0]
    if word.find(shortest) == 0:
        return [shortest, word[len(shortest) :]]
    return [word[: -len(shortest)], shortest]
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
if len(word) == 2 and self._all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
word_list = self._split_word(word)
if self._all_tone_three(finals):
# disyllabic + monosyllabic, e.g. 蒙古/包
if len(word_list[0]) == 2:
finals[0] = finals[0][:-1] + "2"
finals[1] = finals[1][:-1] + "2"
# monosyllabic + disyllabic, e.g. 纸/老虎
elif len(word_list[0]) == 1:
finals[1] = finals[1][:-1] + "2"
else:
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
if len(finals_list) == 2:
for i, sub in enumerate(finals_list):
# e.g. 所有/人
if self._all_tone_three(sub) and len(sub) == 2:
finals_list[i][0] = finals_list[i][0][:-1] + "2"
# e.g. 好/喜欢
elif (
i == 1
and not self._all_tone_three(sub)
and finals_list[i][0][-1] == "3"
and finals_list[0][-1][-1] == "3"
):
finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
finals = sum(finals_list, [])
# split idiom into two words who's length is 2
elif len(word) == 4:
finals_list = [finals[:2], finals[2:]]
finals = []
for sub in finals_list:
if self._all_tone_three(sub):
sub[0] = sub[0][:-1] + "2"
finals += sub
return finals
def _all_tone_three(self, finals: List[str]) -> bool:
return all(x[-1] == "3" for x in finals)
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
last_word = ""
for word, pos in seg:
if last_word == "":
word = last_word + word
if word != "":
new_seg.append((word, pos))
last_word = word[:]
if last_word == "":
new_seg.append((last_word, "d"))
last_word = ""
return new_seg
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
# function 2: merge single "一" and the word behind it
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
# e.g.
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
# output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
# function 1
for i, (word, pos) in enumerate(seg):
if (
i - 1 >= 0
and word == ""
and i + 1 < len(seg)
and seg[i - 1][0] == seg[i + 1][0]
and seg[i - 1][1] == "v"
):
new_seg[i - 1][0] = new_seg[i - 1][0] + "" + new_seg[i - 1][0]
else:
if (
i - 2 >= 0
and seg[i - 1][0] == ""
and seg[i - 2][0] == word
and pos == "v"
):
continue
else:
new_seg.append([word, pos])
seg = new_seg
new_seg = []
# function 2
for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == "":
new_seg[-1][0] = new_seg[-1][0] + word
else:
new_seg.append([word, pos])
return new_seg
# the first and the second words are all_tone_three
def _merge_continuous_three_tones(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge adjacent words whose syllables are all third tone.

    A word is appended to the previous output entry when both words consist
    only of tone-3 syllables, the previous word was not itself just merged, the
    previous word is not a reduplication, and the two words together have at
    most three characters.

    Returns:
        A new list of ``[word, pos]`` lists.
    """
    finals_per_word = []
    for text, _ in seg:
        finals_per_word.append(
            lazy_pinyin(text, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        )
    assert len(finals_per_word) == len(seg)

    was_merged = [False] * len(seg)
    merged = []
    for i, (word, pos) in enumerate(seg):
        should_merge = (
            i >= 1
            and self._all_tone_three(finals_per_word[i - 1])
            and self._all_tone_three(finals_per_word[i])
            and not was_merged[i - 1]
            # a reduplicated previous word is left alone so that
            # _neural_sandhi can handle it
            and not self._is_reduplication(seg[i - 1][0])
            and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
        )
        if should_merge:
            merged[-1][0] = merged[-1][0] + seg[i][0]
            was_merged[i] = True
        else:
            merged.append([word, pos])
    return merged
def _is_reduplication(self, word: str) -> bool:
return len(word) == 2 and word[0] == word[1]
# the last char of first word and the first char of second word is tone_three
def _merge_continuous_three_tones_2(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge adjacent words that meet at two third-tone syllables.

    A word is appended to the previous output entry when the last syllable of
    the previous word and the first syllable of this word are both tone 3, the
    previous word was not itself just merged, the previous word is not a
    reduplication, and the two words together have at most three characters.

    Returns:
        A new list of ``[word, pos]`` lists.
    """
    finals_per_word = []
    for text, _ in seg:
        finals_per_word.append(
            lazy_pinyin(text, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        )
    assert len(finals_per_word) == len(seg)

    was_merged = [False] * len(seg)
    merged = []
    for i, (word, pos) in enumerate(seg):
        should_merge = (
            i >= 1
            and finals_per_word[i - 1][-1][-1] == "3"
            and finals_per_word[i][0][-1] == "3"
            and not was_merged[i - 1]
            # a reduplicated previous word is left alone so that
            # _neural_sandhi can handle it
            and not self._is_reduplication(seg[i - 1][0])
            and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
        )
        if should_merge:
            merged[-1][0] = merged[-1][0] + seg[i][0]
            was_merged[i] = True
        else:
            merged.append([word, pos])
    return merged
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "" and seg[i - 1][0] != "#":
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if new_seg and word == new_seg[-1][0]:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Run every segment-merging pass that must precede tone modification.

    The passes run in a fixed order: 不-merge, 一-merge, reduplication merge,
    the two third-tone merges and finally the 儿-merge. A failure in the
    一-merge is reported and that pass is skipped; the other passes still run.

    Returns:
        The merged segment list.
    """
    seg = self._merge_bu(seg)
    try:
        seg = self._merge_yi(seg)
    except Exception:
        # Best effort: keep the un-merged segments. Only ordinary errors are
        # swallowed, so KeyboardInterrupt/SystemExit still propagate.
        print("_merge_yi failed")
    seg = self._merge_reduplication(seg)
    seg = self._merge_continuous_three_tones(seg)
    seg = self._merge_continuous_three_tones_2(seg)
    seg = self._merge_er(seg)
    return seg
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Run all sandhi rules on ``finals`` and return the result.

    The rules are applied in this order: 不, 一, neutral tone, third tone.
    """
    for rule in (self._bu_sandhi, self._yi_sandhi):
        finals = rule(word, finals)
    finals = self._neural_sandhi(word, pos, finals)
    return self._three_sandhi(word, finals)

View File

@@ -0,0 +1,209 @@
import torch
from torch.nn import functional as F
import numpy as np
# Default lower bounds used by the spline functions below: the smallest bin
# width, the smallest bin height and the offset added to the softplus-ed
# derivatives.
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a rational-quadratic spline transform, with or without tails.

    With ``tails=None`` the call goes to ``rational_quadratic_spline`` using
    its default interval; otherwise it goes to
    ``unconstrained_rational_quadratic_spline`` with the given ``tails`` and
    ``tail_bound``.

    Returns:
        ``(outputs, logabsdet)`` as produced by the selected spline function.
    """
    common = dict(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    if tails is None:
        outputs, logabsdet = rational_quadratic_spline(**common)
    else:
        outputs, logabsdet = unconstrained_rational_quadratic_spline(
            tails=tails, tail_bound=tail_bound, **common
        )
    return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin that contains it.

    Args:
        bin_locations: Bin edges along the last dimension.
        inputs: Values to locate; compared against the edges after gaining a
            trailing dimension.
        eps: Amount added to the last edge so that an input equal to it still
            falls into the last bin.

    Returns:
        Integer tensor: the number of edges less than or equal to each input,
        minus one.
    """
    # Work on a copy: the previous in-place ``+=`` modified the caller's tensor,
    # so repeated calls on the same edges kept pushing the last edge outwards.
    edges = bin_locations.clone()
    edges[..., -1] += eps
    return torch.sum(inputs[..., None] >= edges, dim=-1) - 1
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Spline transform on ``[-tail_bound, tail_bound]`` with identity tails.

    Inputs inside the interval go through ``rational_quadratic_spline``;
    inputs outside it are passed through unchanged with a log-determinant of 0.

    Returns:
        ``(outputs, logabsdet)``, both shaped like ``inputs``.

    Raises:
        RuntimeError: If ``tails`` is anything other than "linear".
    """
    inside = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside = ~inside

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails != "linear":
        raise RuntimeError("{} tails are not implemented.".format(tails))

    # Add one boundary derivative on each side. The constant is chosen so that
    # min_derivative + softplus(constant) equals 1, the slope of the tails.
    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
    constant = np.log(np.exp(1 - min_derivative) - 1)
    unnormalized_derivatives[..., 0] = constant
    unnormalized_derivatives[..., -1] = constant

    outputs[outside] = inputs[outside]
    logabsdet[outside] = 0

    spline_outputs, spline_logabsdet = rational_quadratic_spline(
        inputs=inputs[inside],
        unnormalized_widths=unnormalized_widths[inside, :],
        unnormalized_heights=unnormalized_heights[inside, :],
        unnormalized_derivatives=unnormalized_derivatives[inside, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    outputs[inside] = spline_outputs
    logabsdet[inside] = spline_logabsdet

    return outputs, logabsdet
# Monotonic rational-quadratic spline mapping [left, right] onto [bottom, top].
#
# The unnormalized widths/heights are turned into bin sizes with a softmax (each
# bin is at least min_bin_width / min_bin_height of the unit interval) and the
# unnormalized derivatives into knot slopes with min_derivative + softplus.
# Returns (outputs, logabsdet); with inverse=True the inverse map is evaluated
# and the negated log-derivative is returned.
# Raises ValueError when an input lies outside [left, right] or when the minimum
# bin size is too large for the number of bins.
def rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0.0,
right=1.0,
bottom=0.0,
top=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError("Input to a transform is not within its domain")
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError("Minimal bin width too large for the number of bins")
if min_bin_height * num_bins > 1.0:
raise ValueError("Minimal bin height too large for the number of bins")
# Bin widths: normalize, enforce the minimum, accumulate into knot positions
# and rescale them to [left, right]; the end knots are pinned exactly.
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
# Knot slopes, bounded below by min_derivative.
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
# Bin heights: same construction as the widths, over [bottom, top].
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
# Locate each input's bin: along the height knots for the inverse map, along
# the width knots for the forward map.
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
# Gather the parameters of the selected bin for every input.
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
# Average slope of each bin.
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
# Slopes at the left and right knot of the selected bin.
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
# Coefficients of the quadratic a*root^2 + b*root + c = 0 whose root is the
# relative position inside the bin.
a = (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
) + input_heights * (input_delta - input_derivatives)
b = input_heights * input_derivatives - (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
)
c = -input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
# Root written as 2c / (-b - sqrt(disc)) rather than the textbook form.
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
# The log-derivative of the inverse is the negative of the forward one.
return outputs, -logabsdet
else:
# Relative position of the input inside its bin.
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (
input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet

420
MyShellTTSBase/utils.py Normal file
View File

@@ -0,0 +1,420 @@
import os
import glob
import argparse
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch
import torchaudio
import librosa
from .text import cleaned_text_to_sequence, get_bert
from .text.cleaner import clean_text
from . import commons
# Set to True by the plotting helpers once the matplotlib backend has been selected.
MATPLOTLIB_FLAG = False
# Module-level logger used by the checkpoint helpers.
logger = logging.getLogger(__name__)
def get_text_for_tts_infer(text, language_str, hps, device, symbol_to_id=None):
    """Turn raw text into the tensors fed to the synthesizer at inference time.

    The text is cleaned and converted to phone/tone/language id sequences. When
    ``hps.data.add_blank`` is set, a 0 is interspersed into each sequence and
    the word-to-phone counts are doubled (plus one on the first entry) to
    match. BERT features are then computed, or replaced by zeros when
    ``hps.data.disable_bert`` is set.

    Returns:
        ``(bert, ja_bert, phone, tone, language)``; the last three are
        ``LongTensor``s.

    Raises:
        NotImplementedError: For a language code not handled below.
    """
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, symbol_to_id)

    if hps.data.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        # Each phone now has a blank next to it, so every count doubles; the
        # extra leading blank is credited to the first entry.
        for idx, count in enumerate(word2ph):
            word2ph[idx] = count * 2
        word2ph[0] += 1

    n_phones = len(phone)
    if getattr(hps.data, "disable_bert", False):
        bert = torch.zeros(1024, n_phones)
        ja_bert = torch.zeros(768, n_phones)
    else:
        features = get_bert(norm_text, word2ph, language_str, device)
        assert features.shape[-1] == n_phones, phone

        # Only one of the two feature slots carries real features; the other
        # one is zero-filled.
        if language_str == "ZH":
            bert = features
            ja_bert = torch.zeros(768, n_phones)
        elif language_str in ["JP", "EN", "ZH_MIX_EN", 'KR', 'SP', 'ES', 'FR', 'DE', 'RU']:
            ja_bert = features
            bert = torch.zeros(1024, n_phones)
        else:
            raise NotImplementedError()

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, phone, tone, language
def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
    """Load model (and optionally optimizer) state from a checkpoint file.

    Args:
        checkpoint_path: Path to a file written by ``save_checkpoint``.
        model: Module to load into; a ``.module`` attribute (wrapped model) is
            used when present.
        optimizer: Optional optimizer to restore. Its state is loaded only when
            ``skip_optimizer`` is false and the checkpoint holds optimizer state.
        skip_optimizer: When true the optimizer is left untouched.

    Returns:
        ``(model, optimizer, learning_rate, iteration)``.

    Parameters missing from the checkpoint, or stored with a different shape,
    keep the model's current value and are logged.
    """
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    iteration = checkpoint_dict["iteration"]
    learning_rate = checkpoint_dict["learning_rate"]
    # With no optimizer there is nothing to restore. The previous
    # ``elif optimizer is None`` branch called ``optimizer.state_dict()`` on
    # None and therefore raised AttributeError on every default call.
    if (
        optimizer is not None
        and not skip_optimizer
        and checkpoint_dict["optimizer"] is not None
    ):
        optimizer.load_state_dict(checkpoint_dict["optimizer"])

    saved_state_dict = checkpoint_dict["model"]
    if hasattr(model, "module"):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()

    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            # assert "emb_g" not in k
            new_state_dict[k] = saved_state_dict[k]
            assert saved_state_dict[k].shape == v.shape, (
                saved_state_dict[k].shape,
                v.shape,
            )
        except Exception:
            # Deliberately broad: any problem with this entry falls back to the
            # model's own value.
            # For upgrading from the old version
            if "ja_bert_proj" in k:
                v = torch.zeros_like(v)
                logger.warning(
                    f"Seems you are using the old version of the model, the {k} is automatically set to zero for backward compatibility"
                )
            else:
                logger.error(f"{k} is not in the checkpoint")

            new_state_dict[k] = v

    if hasattr(model, "module"):
        model.module.load_state_dict(new_state_dict, strict=False)
    else:
        model.load_state_dict(new_state_dict, strict=False)

    logger.info(
        "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration)
    )

    return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    """Write model and optimizer state to ``checkpoint_path`` with ``torch.save``.

    The saved dict has the keys "model", "iteration", "optimizer" and
    "learning_rate". A ``.module`` attribute (wrapped model) is used when present.
    """
    logger.info(
        "Saving model and optimizer state at iteration {} to {}".format(
            iteration, checkpoint_path
        )
    )
    target = model.module if hasattr(model, "module") else model
    payload = {
        "model": target.state_dict(),
        "iteration": iteration,
        "optimizer": optimizer.state_dict(),
        "learning_rate": learning_rate,
    }
    torch.save(payload, checkpoint_path)
def summarize(
    writer,
    global_step,
    scalars=None,
    histograms=None,
    images=None,
    audios=None,
    audio_sampling_rate=22050,
):
    """Log scalars, histograms, images and audio clips to a summary writer.

    Args:
        writer: Object providing ``add_scalar``, ``add_histogram``,
            ``add_image`` and ``add_audio``.
        global_step: Step value attached to every entry.
        scalars, histograms, images, audios: Optional ``{tag: value}`` mappings;
            ``None`` means nothing to log for that kind.
        audio_sampling_rate: Sampling rate passed along with each audio clip.
    """
    # The defaults are None instead of shared dict objects; the mappings are
    # only read, so callers see the same behaviour.
    for k, v in (scalars or {}).items():
        writer.add_scalar(k, v, global_step)
    for k, v in (histograms or {}).items():
        writer.add_histogram(k, v, global_step)
    for k, v in (images or {}).items():
        writer.add_image(k, v, global_step, dataformats="HWC")
    for k, v in (audios or {}).items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """Return the matching checkpoint path with the largest number.

    Files in ``dir_path`` matching the glob pattern ``regex`` are ordered by
    the integer formed from all digits in their path; the last one is returned.
    """

    def digits_as_int(path):
        return int("".join(ch for ch in path if ch.isdigit()))

    candidates = sorted(glob.glob(os.path.join(dir_path, regex)), key=digits_as_int)
    return candidates[-1]
def plot_spectrogram_to_numpy(spectrogram):
    """Render ``spectrogram`` with matplotlib and return the image as an array.

    Returns:
        A ``uint8`` array of shape ``(height, width, 3)`` holding the RGB pixels
        of the rendered figure.
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib

        # Select the off-screen Agg backend once per process.
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # Read the pixels from the canvas's RGBA buffer and drop the alpha channel.
    # This replaces np.fromstring(..., sep="") and canvas.tostring_rgb(), both
    # deprecated; the copy detaches the result from the canvas buffer.
    data = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    plt.close(fig)
    return data
def plot_alignment_to_numpy(alignment, info=None):
    """Render an alignment matrix with matplotlib and return it as an array.

    Args:
        alignment: 2-D array; it is transposed before plotting.
        info: Optional text appended below the x-axis label.

    Returns:
        A ``uint8`` array of shape ``(height, width, 3)`` holding the RGB pixels
        of the rendered figure.
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib

        # Select the off-screen Agg backend once per process.
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(
        alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
    )
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"
    if info is not None:
        xlabel += "\n\n" + info
    plt.xlabel(xlabel)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()

    fig.canvas.draw()
    # Read the pixels from the canvas's RGBA buffer and drop the alpha channel.
    # This replaces np.fromstring(..., sep="") and canvas.tostring_rgb(), both
    # deprecated; the copy detaches the result from the canvas buffer.
    data = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    plt.close(fig)
    return data
def load_wav_to_torch(full_path):
    """Read a WAV file with scipy and return ``(samples, sampling_rate)``.

    The samples are converted to float32 without rescaling and wrapped in a
    ``torch.FloatTensor``.
    """
    sampling_rate, samples = read(full_path)
    waveform = torch.FloatTensor(samples.astype(np.float32))
    return waveform, sampling_rate
def load_wav_to_torch_new(full_path):
    """Load an audio file with torchaudio and return ``(mono_audio, sampling_rate)``.

    The whole file is read with normalization enabled and channels first; the
    channels are then averaged into a single one.
    """
    waveform, sampling_rate = torchaudio.load(
        full_path,
        frame_offset=0,
        num_frames=-1,
        normalize=True,
        channels_first=True,
    )
    mono = waveform.mean(dim=0)
    return mono, sampling_rate
def load_wav_to_torch_librosa(full_path, sr):
    """Load an audio file as mono with librosa at rate ``sr``.

    Returns:
        ``(samples, sampling_rate)`` where ``samples`` is a float32
        ``torch.FloatTensor``.
    """
    samples, sampling_rate = librosa.load(full_path, sr=sr, mono=True)
    waveform = torch.FloatTensor(samples.astype(np.float32))
    return waveform, sampling_rate
def load_filepaths_and_text(filename, split="|"):
    """Read a UTF-8 text file and split every stripped line on ``split``.

    Returns:
        A list with one list of fields per line.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for line in handle:
            rows.append(line.strip().split(split))
    return rows
def get_hparams(init=True):
    """Parse the command line and build the hyper-parameter object for a run.

    The run directory is ``./logs/<model>`` and is created if missing.

    Args:
        init: When true, the JSON file given by ``--config`` is read and
            copied to ``<run dir>/config.json``. When false, the previously
            saved ``<run dir>/config.json`` is read instead.

    Returns:
        An ``HParams`` built from the JSON config, with ``model_dir``,
        ``pretrain_G``, ``pretrain_D`` and ``pretrain_dur`` set from the
        command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="./configs/base.json",
        help="JSON file for configuration",
    )
    # Accepted but not read in this function.
    # NOTE(review): presumably supplied by a distributed launcher -- confirm.
    parser.add_argument('--local-rank', type=int, default=0)
    parser.add_argument("-m", "--model", type=str, required=True, help="Model name")
    parser.add_argument('--pretrain_G', type=str, default=None,
                        help='pretrain model')
    parser.add_argument('--pretrain_D', type=str, default=None,
                        help='pretrain model D')
    parser.add_argument('--pretrain_dur', type=str, default=None,
                        help='pretrain model duration')
    args = parser.parse_args()

    model_dir = os.path.join("./logs", args.model)
    os.makedirs(model_dir, exist_ok=True)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    # Explicit UTF-8 keeps this consistent with get_hparams_from_dir /
    # get_hparams_from_file, which read the same config.json as UTF-8.
    if init:
        with open(config_path, "r", encoding="utf-8") as f:
            data = f.read()
        with open(config_save_path, "w", encoding="utf-8") as f:
            f.write(data)
    else:
        with open(config_save_path, "r", encoding="utf-8") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    hparams.pretrain_G = args.pretrain_G
    hparams.pretrain_D = args.pretrain_D
    hparams.pretrain_dur = args.pretrain_dur
    return hparams
def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True):
    """Free up disk space by deleting old saved checkpoints.

    Files directly inside ``path_to_models`` are grouped by the plain name
    prefixes ``"G"`` and ``"D"``; names ending in ``_0.pth`` are never
    candidates. Within each group the oldest entries are removed so that at
    most ``n_ckpts_to_keep`` remain.

    Arguments:
        path_to_models -- Path to the model directory
        n_ckpts_to_keep -- Number of ckpts to keep per prefix, excluding
                           G_0.pth and D_0.pth (0 deletes every candidate)
        sort_by_time -- True  -> delete in order of file modification time
                        False -> delete in order of the step number parsed
                                 from the name (``<char>_<digits>.pth``)

    Raises:
        ValueError: If ``n_ckpts_to_keep`` is negative.
    """
    import re

    if n_ckpts_to_keep < 0:
        raise ValueError("n_ckpts_to_keep must be non-negative")

    step_pattern = re.compile(r"._(\d+)\.pth")
    ckpts_files = [
        f
        for f in os.listdir(path_to_models)
        if os.path.isfile(os.path.join(path_to_models, f))
    ]

    def name_key(_f):
        # Raises AttributeError for names that do not match the pattern.
        return int(step_pattern.match(_f).group(1))

    def time_key(_f):
        return os.path.getmtime(os.path.join(path_to_models, _f))

    sort_key = time_key if sort_by_time else name_key

    def stale(prefix):
        """Return the candidates for *prefix* that exceed the keep budget."""
        ordered = sorted(
            [f for f in ckpts_files if f.startswith(prefix) and not f.endswith("_0.pth")],
            key=sort_key,
        )
        # An explicit count avoids the `[:-0]` pitfall, where a keep count of
        # zero would select nothing instead of everything.
        return ordered[: max(len(ordered) - n_ckpts_to_keep, 0)]

    for name in stale("G") + stale("D"):
        fn = os.path.join(path_to_models, name)
        os.remove(fn)
        logger.info(f".. Free up space by deleting ckpt {fn}")
def get_hparams_from_dir(model_dir):
    """Build an ``HParams`` from the ``config.json`` stored in a run directory.

    Args:
        model_dir: Directory containing ``config.json`` (read as UTF-8).

    Returns:
        The ``HParams`` built from that JSON, with ``model_dir`` set to the
        given directory.
    """
    saved_config = os.path.join(model_dir, "config.json")
    with open(saved_config, "r", encoding="utf-8") as handle:
        settings = json.load(handle)

    hps = HParams(**settings)
    hps.model_dir = model_dir
    return hps
def get_hparams_from_file(config_path):
    """Build an ``HParams`` from a JSON configuration file.

    Args:
        config_path: Path of the JSON file (read as UTF-8).

    Returns:
        The ``HParams`` built from the file's top-level keys.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        settings = json.load(handle)
    return HParams(**settings)
def check_git_hash(model_dir):
    """Compare the current git commit with the one recorded for a run.

    If the directory containing this source file has no ``.git`` entry, a
    warning is logged and nothing else happens. Otherwise the output of
    ``git rev-parse HEAD`` is compared with ``<model_dir>/githash``: a
    mismatch is logged as a warning, and a missing file is created with the
    current value.

    Args:
        model_dir: Run directory holding (or receiving) the ``githash`` file.
    """
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warning(
            "{} is not a git repository, therefore hash value comparison will be ignored.".format(
                source_dir
            )
        )
        return

    # No working directory is passed, so git runs in the process's current
    # directory rather than in source_dir.
    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        with open(path) as f:
            saved_hash = f.read()
        if saved_hash != cur_hash:
            logger.warning(
                "git hash values are different. {}(saved) != {}(current)".format(
                    saved_hash[:8], cur_hash[:8]
                )
            )
    else:
        with open(path, "w") as f:
            f.write(cur_hash)
def get_logger(model_dir, filename="train.log"):
    """Return a DEBUG-level logger that writes to a file in ``model_dir``.

    The logger is named after the last path component of ``model_dir`` and is
    also stored in the module-level ``logger`` variable. ``model_dir`` is
    created if it does not exist.

    Args:
        model_dir: Directory that receives the log file.
        filename: Name of the log file inside ``model_dir``.

    Returns:
        The configured ``logging.Logger``.
    """
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    os.makedirs(model_dir, exist_ok=True)
    log_path = os.path.abspath(os.path.join(model_dir, filename))

    # logging.getLogger returns the same object for the same name, so a
    # repeated call must not attach a second handler for the same file;
    # otherwise every record would be written twice.
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == log_path
        for existing in logger.handlers
    )
    if not already_attached:
        formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
        h = logging.FileHandler(log_path)
        h.setLevel(logging.DEBUG)
        h.setFormatter(formatter)
        logger.addHandler(h)
    return logger
class HParams:
    """Attribute-style container for (possibly nested) hyper-parameters.

    Every keyword argument becomes an attribute. Dict values are converted
    recursively into nested ``HParams``; because they are expanded as keyword
    arguments, their keys must be strings. Entries can also be read and
    written with mapping syntax (``hp["key"]``), and ``keys()`` together with
    ``__getitem__`` allow ``**hp`` unpacking.
    """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # isinstance rather than an exact type comparison, so that dict
            # subclasses (e.g. OrderedDict) are nested as well.
            if isinstance(v, dict):
                v = HParams(**v)
            self[k] = v

    def keys(self):
        """Return a view of the stored parameter names."""
        return self.__dict__.keys()

    def items(self):
        """Return a view of ``(name, value)`` pairs."""
        return self.__dict__.items()

    def values(self):
        """Return a view of the stored values."""
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()

13
requirements.txt Normal file
View File

@@ -0,0 +1,13 @@
transformers==4.27.4
mecab-python3==1.0.5
num2words==0.5.12
unidic_lite==1.0.8
unidic==1.1.0
mecab-python3==1.0.5
pykakasi==2.2.1
fugashi==1.3.0
g2p_en==2.1.0
anyascii
jamo
gruut[de,es,fr]==2.2.3
g2pkk>=0.1.1

16
setup.py Normal file
View File

@@ -0,0 +1,16 @@
"""Packaging script for MyShellTTSBase."""
from setuptools import setup, find_packages
import os

cwd = os.path.dirname(os.path.abspath(__file__))

# Read the dependency list that sits next to this script. Lines are stripped
# of whitespace/newlines, and blank or comment lines are skipped so that only
# real requirement specifiers reach install_requires.
with open(os.path.join(cwd, "requirements.txt"), "r", encoding="utf-8") as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

setup(
    name='MyShellTTSBase',
    version='0.1.0',
    packages=find_packages(),
    include_package_data=True,
    install_requires=requirements,
    package_data={
        '': ['*.txt', 'cmudict_*'],
    },
)

View File

@@ -0,0 +1,99 @@
Did you ever hear a folk tale about a giant turtle?
Can you name five cars that were popular in the 1970s?
May I ask what's your favorite university and why?
Well, have you ever experienced violence in your life?
Have you ever imposed restrictions?
Did you ever feel guilty for not providing enough care for your pet?
Would you prefer barbecue-flavored chips or plain chips?
Are contractions common in English?
Well, have you ever seen a slam poetry competition?
Am I correct in assuming that bilateral trade agreements favor developed countries?
Are there any scientific theories on why love exists in humans?
Well, do you think figure skating is harder than gymnastics?
Can you tell me if the apartment has a balcony or not?
Have you ever overcome a challenging obstacle positively?
Could you elaborate on the meaning behind that quote?
Shall seniors receive higher taxes?
Do you think adding a liquid flavor to coffee ruins it?
Well, in our conversation about the restaurant, how would you review it overall?
Have you consistently followed through with goals?
Can pilots hear passengers coughing?
Well, have you tried rainbow sprinkles?
Are there any golden retrievers at the local animal shelter?
Have you seen Tyler?
Had you ever deployed to Mars?
Well, have you ever felt intimidated by your competition's tactics?
Are there any specific rules about when you can continue?
Can you describe Antarctica's temperatures?
May I ask, have you ever tasted a bloody mary before?
Did anyone mention the order yet?
Are automatic transmissions more fuel efficient?
Shall we discuss the impact of self-control on personal success?
Have you traveled internationally this May?
Well, have you ever tried shrimp ceviche?
Have you ever seen an act of extraordinary courage in person?
Have you ever wondered how proceed affects the outcome of a project?
Have you calculated the mean weight of all the participants?
Should we bring confetti to the parade?
Do influencers control behavior?
Shall we discuss the price of the new car lease?
Had Nice ever been your home?
Have you ever encountered a gifted child who struggled academically?
Can everyone work together?
Did you know how long an ostrich can survive without water?
Do nurses in long-term care facilities receive adequate training for dementia care?
Has separation ever felt liberating?
Would you prefer a flexible or fixed schedule for work?
Does pension plan have rollover?
Has Vital's mission expanded beyond health supplements?
Have you ever witnessed a bombing attack?
May I predict the outcome of the election based on polls?
Do you think strict parenting leads to more successful children later in life?
Shall we explore nearby parks?
Are there any ways to verify the credibility of online reviews?
Have you ever witnessed a roundabout accident?
Well, upon reflection, do we really want sushi?
Well, have you ever experienced workplace harassment?
Do you think it's sure that the rain will stop soon?
Would you say distance affects relationships?
Can we truly deny the existence of higher power?
Do you think crop yields will be affected by the drought?
Do you think the backup plan is good enough?
Can you tell me, meanwhile, what happened while I was gone?
Did the wise old owl speak?
Well, have you ever been to a retreat that truly transformed you?
Have you ever had to calculate the exact measurements for a recipe?
Can warning signs prevent accidents while driving on icy roads?
Do you think the current job market offers equal opportunity?
Have you ever analyzed your own dreams?
May I ask if colonialism affected your ancestry?
Well, what chest exercises target the upper pecs?
Are there occasionally unexpected consequences of honesty?
Do you think the new restaurant is overpriced?
Do critics take into account audience preferences?
Has translation technology reached a point where it can accurately translate idioms?
Have you ever been to a music festival in another country?
Do you think our taste in food is genetic?
Are you a hopeless romantic at heart?
Shall we explore abandoned urban places?
Does agency promote individualism?
Well, what implementing strategies?
Have you ever noticed the smallest detail that changed your perspective?
Have you ever seen a normal ghost?
Have you ever considered the considerable effort?
Are there holistic chronic cure?
Did unemployment rates change recently?
Does change come from within or without?
Does the length of the patent term affect innovation rates?
Can Junior play basketball?
Shall we analyze the data?
Have you ever tried the Szechuan cuisine before?
Had you ever debated a controversial topic before?
Have you ever analyzed case?
Is it true that stripping originated in ancient Egypt or Greece?
Have you ever dyed your hair a crazy color?
Shall we compare the top-rated pizza places in our city?
May people in different countries play soccer?
Well, have you recycled?
Shall we precisely measure ingredients?
Can you embrace someone you don't love?

View File

@@ -0,0 +1,20 @@
El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.
Las estrellas bailan en la noche, creando un espectáculo celestial que despierta el alma.
Las majestuosas montañas se alzan en silencio, guardianas inmutables del tiempo que pasa.
El amor, como un suave perfume, envuelve nuestros corazones con un calor reconfortante.
El susurro suave del viento atraviesa los campos de lavanda, llevándose consigo el aroma de la Provenza.
El resplandor de la luna baña la ciudad dormida en una luz mística.
Las calles empedradas revelan historias antiguas, cada piedra llevando el peso del pasado.
La risa de los niños resuena como una melodía encantada en el suave aire de la primavera.
Los jardines floridos estallan con colores vibrantes, creando un cuadro viviente de la naturaleza.
Las olas acarician suavemente la playa, dejando tras de sí huellas efímeras en la arena.
La Torre Eiffel se yergue con orgullo, testigo silencioso del amor eterno en París.
Las mariposas danzan entre las flores, creando una coreografía grácil en el jardín.
Los animados cafés resuenan con conversaciones apasionadas y el embriagador aroma del café recién molido.
Los ríos serpenteantes atraviesan el campo, reflejando el cielo azul en sus aguas tranquilas.
Los imponentes castillos cuentan historias de caballeros y princesas en un pasado lejano.
Los viñedos se extienden hasta donde alcanza la vista, sus filas ordenadas testimonio de la antigua tradición vinícola.
Las risas resuenan en las estrechas callejuelas, despertando la vieja ciudad de su quietud.
Los campos de girasoles saludan al sol con sus caras doradas, un mar de oro bajo un cielo azul.
Las notas melódicas de un acordeón flotan en el aire, capturando la esencia musical de las calles parisinas.
Las cumbres nevadas de los Alpes brillan bajo la luz de la luna, un paisaje invernal de ensueño.

View File

@@ -0,0 +1,20 @@
La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.
Les étoiles dansent dans la nuit, créant un spectacle céleste qui éveille l'âme.
Les montagnes majestueuses se dressent en silence, gardiennes immuables du temps qui passe.
L'amour, tel un doux parfum, enveloppe nos cœurs d'une chaleur réconfortante.
Le doux murmure du vent traverse les champs de lavande, emportant avec lui le parfum de la Provence.
La lueur de la lune baigne la ville endormie dans une lumière mystique.
Les ruelles pavées révèlent des histoires anciennes, chaque pierre portant le poids du passé.
Le rire des enfants résonne comme une mélodie enchantée dans l'air doux du printemps.
Les jardins fleuris éclatent de couleurs vives, créant un tableau vivant de la nature.
Les vagues caressent doucement la plage, laissant derrière elles des traces éphémères dans le sable.
La Tour Eiffel se dresse fièrement, témoin silencieux de l'amour éternel à Paris.
Les papillons dansent parmi les fleurs, créant une chorégraphie gracieuse dans le jardin.
Les cafés animés résonnent de conversations passionnées et du parfum enivrant du café fraîchement moulu.
Les rivières sinueuses traversent la campagne, reflétant le ciel azur dans leurs eaux calmes.
Les châteaux imposants racontent des contes de chevaliers et de princesses dans un passé lointain.
Les vignobles s'étendent à perte de vue, leurs rangées ordonnées témoignant du savoir-faire viticole ancestral.
Les éclats de rire résonnent dans les ruelles étroites, réveillant la vieille ville de sa quiétude.
Les champs de tournesols saluent le soleil avec leurs visages dorés, une mer d'or sous un ciel d'azur.
Les notes mélodieuses d'un accordéon flottent dans l'air, capturant l'essence musicale des rues parisiennes.
Les sommets enneigés des Alpes brillent sous la lumière de la lune, un paysage hivernal féérique.

View File

@@ -0,0 +1,10 @@
彼は毎朝ジョギングをして体を健康に保っています。
私たちは来年、友人たちと一緒にヨーロッパ旅行を計画しています。
新しいレストランで美味しい料理を試すことが楽しみです。
彼女の絵は情熱と芸術性が溢れていて、見る人を魅了します。
最近、忙しさに追われていて、ゆっくり休む時間がありません。
日本の文化は多様で魅力的であり、世界中から注目されています。
彼の犬は忠実で賢く、家族にとって大切な存在です。
私の友達は常に私をサポートしてくれる信頼できる存在です。
家族と一緒に過ごす時間は、私にとって何よりも大切です。
彼の夢は大きく、努力と決意でそれを実現しようとしています。

View File

@@ -0,0 +1,5 @@
안녕하세요! 오늘은 날씨가 정말 좋네요.
한국 음식을 먹어보고 싶어요. 불고기랑 김치찌개가 제가 좋아하는 음식이에요.
요즘에는 한국 드라마를 자주 보고 있어요. 정말 재미있어요.
한글을 배우는 것이 재미있어요. 조금씩 읽고 쓸 수 있게 되고 있어요.
친구들과 함께 한국 여행을 계획 중이에요. 서울과 부산을 방문할 예정이에요.

View File

@@ -0,0 +1,11 @@
人工智能是一种非常适合和促进自上而下集中控制的技术,而加密货币则是一种完全关注自下而上分散合作的技术。
Web 3的一个目标是支持艺术家。
欢迎来到Web 3与A6Z,一个由团队打造的构建下一代互联网的节目。
我最喜欢的fruit是苹果。
今天我们要学习Python programming。
她在library看书。
你喜欢听pop music吗
今天下午我们准备去shopping mall购物然后晚上去看一场movie。
我最近在学习machine learning希望能够在未来的artificial intelligence领域有所建树。
在这次vacation中我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。
今天天气真不错我们去Paris吃蒸汽海鲜吧。

View File

@@ -0,0 +1,43 @@
"""Synthesize the bundled example sentences with every speaker of one model.

The single command-line argument is handed to ``TTS(language=...)`` and also
selects which example-sentence file is read. Output is written below
``basetts_outputs_package/``.
"""
from MyShellTTSBase.api import TTS
import os
import glob
import sys

language = sys.argv[1]
model = TTS(language=language)
speaker_ids = model.hps.data.spk2id
speakers = list(speaker_ids.keys())
root_folder = language.lower()

# (substring searched for in the lower-cased argument, sentence file, tag).
# Order matters: the first entry whose substring occurs in the argument wins.
EXAMPLE_SETS = [
    ('zh', 'basetts_test_resources/zh_mix_en_egs_text.txt', 'ZH_MIX_EN'),
    ('es', 'basetts_test_resources/es_egs_text.txt', 'SP'),
    ('fr', 'basetts_test_resources/fr_egs_text.txt', 'FR'),
    ('en', 'basetts_test_resources/en_egs_text.txt', 'EN'),
    ('jp', 'basetts_test_resources/jp_egs_text.txt', 'JP'),
    ('kr', 'basetts_test_resources/kr_egs_text.txt', 'KR'),
]
for key, text_path, language_tag in EXAMPLE_SETS:
    if key in root_folder:
        # The sentence files contain non-ASCII text, so decode them as UTF-8
        # explicitly instead of relying on the platform default encoding.
        with open(text_path, 'r', encoding='utf-8') as f:
            texts = f.readlines()
        language = language_tag
        break
else:
    raise NotImplementedError()

save_dir = os.path.join('basetts_outputs_package', root_folder.split('/')[-1])
for speed in [1.0]:
    for speaker in speakers:
        for sent_id, text in enumerate(texts):
            output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav'
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed)