Skip to content

Commit 36c6ff8

Browse files
authored
Merge pull request #544 from idiap/xtts-hf
Download models via `huggingface_hub`
2 parents c694e59 + af20c85 commit 36c6ff8

File tree

85 files changed

+1225
-1412
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+1225
-1412
lines changed

TTS/.models.json

Lines changed: 63 additions & 188 deletions
Large diffs are not rendered by default.

TTS/__init__.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,24 +23,6 @@
2323
# Fail fast at import time if the core torch stack is missing.
# BUG FIX: `is_torchaudio_available` must be *called* — a bare reference to the
# function object is always truthy, so `not is_torchaudio_available` was always
# False and the torchaudio check could never trigger.
if not is_torch_available() or not is_torchaudio_available():
    raise ImportError(PYTORCH_IMPORT_ERROR)

if is_torch_greater_or_equal("2.9"):
    # On torch >= 2.9 this package additionally requires torchcodec
    # (presumably for audio decoding — confirm against the loaders using it).
    if not is_torchcodec_available():
        raise ImportError(TORCHCODEC_IMPORT_ERROR)

TTS/api.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Any
99

1010
from torch import nn
11+
from trainer.io import get_user_data_dir
1112

1213
from TTS.config import load_config
1314
from TTS.utils.manage import ModelManager
@@ -81,6 +82,7 @@ def __init__(
8182
self.synthesizer: Synthesizer | None = None
8283
self.voice_converter: Synthesizer | None = None
8384
self.model_name = ""
85+
self.voice_dir = None
8486

8587
self.vocoder_path = vocoder_path
8688
self.vocoder_config_path = vocoder_config_path
@@ -93,6 +95,7 @@ def __init__(
9395
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
9496

9597
if model_name is not None and len(model_name) > 0:
98+
self.voice_dir = get_user_data_dir("tts") / model_name / "voices"
9699
if "tts_models" in model_name:
97100
self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
98101
elif "voice_conversion_models" in model_name:
@@ -158,22 +161,10 @@ def list_models() -> list[str]:
158161

159162
def download_model_by_name(
160163
self, model_name: str, vocoder_name: str | None = None
161-
) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]:
164+
) -> tuple[Path | None, Path | None, Path | None, Path | None]:
162165
model_path, config_path, model_item = self.manager.download_model(model_name)
163-
if (
164-
"fairseq" in model_name
165-
or "openvoice" in model_name
166-
or (
167-
model_item is not None
168-
and isinstance(model_item["model_url"], list)
169-
and len(model_item["model_url"]) > 2
170-
)
171-
):
172-
# return model directory if there are multiple files
173-
# we assume that the model knows how to load itself
174-
return None, None, None, None, model_path
175166
if model_item.get("default_vocoder") is None:
176-
return model_path, config_path, None, None, None
167+
return model_path, config_path, None, None
177168
if vocoder_name is None:
178169
vocoder_name = model_item["default_vocoder"]
179170
vocoder_path, vocoder_config_path = None, None
@@ -183,7 +174,7 @@ def download_model_by_name(
183174
vocoder_config_path = self.vocoder_config_path
184175
if vocoder_path is None or vocoder_config_path is None:
185176
vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
186-
return model_path, config_path, vocoder_path, vocoder_config_path, None
177+
return model_path, config_path, vocoder_path, vocoder_config_path
187178

188179
def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None:
189180
"""Load one of the 🐸TTS models by name.
@@ -202,15 +193,15 @@ def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None
202193
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
203194
"""
204195
self.model_name = model_name
205-
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
196+
model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(
206197
model_name, vocoder_name
207198
)
208199
self.voice_converter = Synthesizer(
209200
vc_checkpoint=model_path,
210201
vc_config=config_path,
211202
vocoder_checkpoint=vocoder_path,
212203
vocoder_config=vocoder_config_path,
213-
model_dir=model_dir,
204+
voice_dir=self.voice_dir,
214205
use_cuda=gpu,
215206
)
216207

@@ -225,7 +216,7 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = Non
225216
"""
226217
self.model_name = model_name
227218

228-
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
219+
model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(
229220
model_name, vocoder_name
230221
)
231222

@@ -240,7 +231,7 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = Non
240231
vocoder_config=vocoder_config_path,
241232
encoder_checkpoint=self.encoder_path,
242233
encoder_config=self.encoder_config_path,
243-
model_dir=model_dir,
234+
voice_dir=self.voice_dir,
244235
use_cuda=gpu,
245236
)
246237

@@ -266,6 +257,7 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool
266257
encoder_config=self.encoder_config_path,
267258
use_cuda=gpu,
268259
)
260+
self.voice_dir = self.synthesizer.voice_dir
269261

270262
def _check_arguments(
271263
self,

TTS/bin/synthesize.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,6 @@ def main(arg_list: list[str] | None = None) -> None:
343343
encoder_config_path = None
344344
vc_path = None
345345
vc_config_path = None
346-
model_dir = None
347346

348347
# 0) Print version number
349348
if args.version:

TTS/demos/xtts_ft_demo/utils/gpt_train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from trainer import Trainer, TrainerArgs
55

66
from TTS.config.shared_configs import BaseDatasetConfig
7+
from TTS.tts.configs.xtts_config import XttsAudioConfig
78
from TTS.tts.datasets import load_tts_samples
89
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
9-
from TTS.tts.models.xtts import XttsAudioConfig
1010
from TTS.utils.manage import ModelManager
1111

1212

TTS/model.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@
77
from trainer import TrainerModel
88
from trainer.io import load_fsspec
99

10+
from TTS.config.shared_configs import BaseTrainingConfig
11+
1012

1113
class BaseTrainerModel(TrainerModel):
1214
"""BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
1315
1416
Every new Coqui model must inherit it.
1517
"""
1618

19+
config: BaseTrainingConfig
20+
1721
@staticmethod
1822
@abstractmethod
1923
def init_from_config(config: Coqpit) -> "BaseTrainerModel":

TTS/tts/configs/align_tts_config.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,56 @@
11
from dataclasses import dataclass, field
22

3+
from TTS.config.shared_configs import ModelArgs
34
from TTS.tts.configs.shared_configs import BaseTTSConfig
4-
from TTS.tts.models.align_tts import AlignTTSArgs
5+
6+
7+
@dataclass
class AlignTTSArgs(ModelArgs):
    """Model arguments for the AlignTTS model.

    Args:
        num_chars (int | None):
            Number of unique input characters. None until the character set
            is known; must be set before the model is built. Defaults to None.
        out_channels (int):
            Number of output tensor channels. It is equal to the expected
            spectrogram size. Defaults to 80.
        hidden_channels (int):
            Number of channels in all the model layers. Defaults to 256.
        hidden_channels_dp (int):
            Number of channels in the duration predictor network. Defaults to 256.
        encoder_type (str):
            Type of the encoder module. Defaults to "fftransformer".
        encoder_params (dict):
            Parameters of the encoder transformer
            (hidden_channels_ffn, num_heads, num_layers, dropout_p).
        decoder_type (str):
            Type of the decoder module. Defaults to "fftransformer".
        decoder_params (dict):
            Parameters of the decoder transformer
            (hidden_channels_ffn, num_heads, num_layers, dropout_p).
        length_scale (float, optional):
            Coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.0.
        num_speakers (int, optional):
            Number of speakers for multi-speaker training. Defaults to 0.
        use_speaker_embedding (bool, optional):
            Enable learned speaker embeddings. Defaults to False.
        use_d_vector_file (bool, optional):
            Enable external (precomputed) speaker embedding vectors. Defaults to False.
        d_vector_dim (int, optional):
            Number of channels in the external speaker embedding vectors. Defaults to 0.
    """

    # Annotated as optional: the default is None, which `int` alone mis-declares.
    num_chars: int | None = None
    out_channels: int = 80
    hidden_channels: int = 256
    hidden_channels_dp: int = 256
    encoder_type: str = "fftransformer"
    encoder_params: dict = field(
        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
    )
    decoder_type: str = "fftransformer"
    decoder_params: dict = field(
        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
    )
    length_scale: float = 1.0
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
554

655

756
@dataclass

TTS/tts/configs/bark_config.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
import os
22
from dataclasses import dataclass, field
33

4+
from coqpit import Coqpit
45
from trainer.io import get_user_data_dir
56

67
from TTS.tts.configs.shared_configs import BaseTTSConfig
78
from TTS.tts.layers.bark.model import GPTConfig
89
from TTS.tts.layers.bark.model_fine import FineGPTConfig
9-
from TTS.tts.models.bark import BarkAudioConfig
10+
11+
12+
@dataclass
class BarkAudioConfig(Coqpit):
    """Audio configuration for the Bark model."""

    # presumably the rate of audio the model generates internally — confirm
    # against the Bark model/loader code
    sample_rate: int = 24000
    # presumably the rate of the final audio written out — confirm
    output_sample_rate: int = 24000
1016

1117

1218
@dataclass

TTS/tts/configs/delightful_tts_config.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,78 @@
11
from dataclasses import dataclass, field
22

3+
from coqpit import Coqpit
4+
5+
from TTS.config.shared_configs import BaseAudioConfig, ModelArgs
36
from TTS.tts.configs.shared_configs import BaseTTSConfig
4-
from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
7+
8+
9+
@dataclass
class VocoderConfig(Coqpit):
    """Vocoder configuration for DelightfulTTS.

    Field names suggest a HiFiGAN-style generator (resblocks + transposed-conv
    upsampling) and multi-period discriminator — confirm against the vocoder
    implementation that consumes this config.
    """

    # Decoder (generator) resblock layout.
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
    # Upsampling stack of the decoder; the product of the rates fixes the
    # overall hop upsampling factor.
    upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4])
    # Discriminator settings.
    use_spectral_norm_discriminator: bool = False
    upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4])
    periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    # Optional path to pretrained vocoder weights; None means train from scratch.
    pretrained_model_path: str | None = None
21+
22+
23+
@dataclass
class DelightfulTtsAudioConfig(BaseAudioConfig):
    """Audio configuration for DelightfulTTS.

    Overrides two BaseAudioConfig defaults; all other audio settings are
    inherited unchanged.
    """

    # Upper mel filterbank frequency cut-off (Hz).
    mel_fmax: float = 8000
    # Number of mel bands in the spectrogram.
    num_mels: int = 100
27+
28+
29+
@dataclass
class DelightfulTtsArgs(ModelArgs):
    """Model arguments for DelightfulTTS.

    Groups (by field-name suffix): conformer encoder/decoder hyperparameters,
    reference-encoder settings (``*_reference_encoder``), variance-adaptor
    settings (``*_variance_adaptor``), speaker handling, and ``freeze_*``
    switches for partial fine-tuning.

    Fixes over the previous version:
    - ``ref_enc_filters_reference_encoder``, ``ref_enc_strides_reference_encoder``
      and ``ref_enc_pad_reference_encoder`` were *unannotated* class attributes
      holding mutable lists — not dataclass fields, and the single list object
      was shared by every instance. They are now proper fields with
      ``default_factory`` (backward-compatible: same names, same defaults).
    - ``speakers_file`` / ``d_vector_file`` default to None, so they are
      annotated ``str | None`` (matching ``pretrained_model_path`` elsewhere
      in this file) instead of the incorrect bare ``str``.
    """

    num_chars: int = 100
    spec_segment_size: int = 32
    # Conformer encoder.
    n_hidden_conformer_encoder: int = 512
    n_layers_conformer_encoder: int = 6
    n_heads_conformer_encoder: int = 8
    dropout_conformer_encoder: float = 0.1
    kernel_size_conv_mod_conformer_encoder: int = 7
    kernel_size_depthwise_conformer_encoder: int = 7
    lrelu_slope: float = 0.3
    # Conformer decoder.
    n_hidden_conformer_decoder: int = 512
    n_layers_conformer_decoder: int = 6
    n_heads_conformer_decoder: int = 8
    dropout_conformer_decoder: float = 0.1
    kernel_size_conv_mod_conformer_decoder: int = 11
    kernel_size_depthwise_conformer_decoder: int = 11
    # Reference encoder.
    bottleneck_size_p_reference_encoder: int = 4
    bottleneck_size_u_reference_encoder: int = 512
    # Mutable defaults must come from a factory so instances don't share state.
    ref_enc_filters_reference_encoder: list[int] = field(default_factory=lambda: [32, 32, 64, 64, 128, 128])
    ref_enc_size_reference_encoder: int = 3
    ref_enc_strides_reference_encoder: list[int] = field(default_factory=lambda: [1, 2, 1, 2, 1])
    ref_enc_pad_reference_encoder: list[int] = field(default_factory=lambda: [1, 1])
    ref_enc_gru_size_reference_encoder: int = 32
    ref_attention_dropout_reference_encoder: float = 0.2
    token_num_reference_encoder: int = 32
    predictor_kernel_size_reference_encoder: int = 5
    # Variance adaptor.
    n_hidden_variance_adaptor: int = 512
    kernel_size_variance_adaptor: int = 5
    dropout_variance_adaptor: float = 0.5
    n_bins_variance_adaptor: int = 256
    emb_kernel_size_variance_adaptor: int = 3
    # Speaker handling.
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str | None = None
    d_vector_file: str | None = None
    speaker_embedding_channels: int = 384
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    # Fine-tuning switches: freeze the named sub-module during training.
    freeze_vocoder: bool = False
    freeze_text_encoder: bool = False
    freeze_duration_predictor: bool = False
    freeze_pitch_predictor: bool = False
    freeze_energy_predictor: bool = False
    freeze_basis_vectors_predictor: bool = False
    freeze_decoder: bool = False
    # Speech-speed coefficient: <1 slower, >1 faster.
    length_scale: float = 1.0
576

677

778
@dataclass

TTS/tts/configs/fast_pitch_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from dataclasses import dataclass, field
22

3+
from TTS.tts.configs.forward_tts_config import ForwardTTSArgs
34
from TTS.tts.configs.shared_configs import BaseTTSConfig
4-
from TTS.tts.models.forward_tts import ForwardTTSArgs
55

66

77
@dataclass

0 commit comments

Comments
 (0)