From 162c05c8a6eaf2d35311c70babb2349c0fc2c31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20G=C4=99bala?= Date: Wed, 7 Jan 2026 08:47:09 +0100 Subject: [PATCH 1/4] fix: fixed typo in config.yml --- Configs/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Configs/config.yml b/Configs/config.yml index 3dad1af..f299e4d 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -5,7 +5,7 @@ log_interval: 10 device: "cuda" multigpu: false epochs_1st: 200 # number of epochs for first stage training -epochs_2nd: 100 # number of peochs for second stage training +epochs_2nd: 100 # number of epochs for second stage training batch_size: 32 pretrained_model: "" second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage From 55cfeb02553c4d54801ee900a55d4c1aa3c39666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20G=C4=99bala?= Date: Wed, 7 Jan 2026 08:49:49 +0100 Subject: [PATCH 2/4] fix: remove unnecessary LinearNorm class definition --- models.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/models.py b/models.py index 1a8b109..cf1f3e6 100644 --- a/models.py +++ b/models.py @@ -488,17 +488,6 @@ def forward(self, x, s): x = (1 + gamma) * x + beta return x.transpose(1, -1).transpose(-1, -2) -class LinearNorm(torch.nn.Module): - def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): - super(LinearNorm, self).__init__() - self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) - - torch.nn.init.xavier_uniform_( - self.linear_layer.weight, - gain=torch.nn.init.calculate_gain(w_init_gain)) - - def forward(self, x): - return self.linear_layer(x) class ProsodyPredictor(nn.Module): From 7e639d3f5a695a067720842207d9c4d4f96018ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20G=C4=99bala?= Date: Wed, 7 Jan 2026 09:00:27 +0100 Subject: [PATCH 3/4] fix: remove unused imports across multiple files --- Utils/ASR/layers.py | 4 ---- Utils/ASR/models.py | 1 - meldataset.py | 8 -------- models.py | 5 ----- optimizers.py | 5 ----- train_first.py | 2 -- train_second.py | 2 -- utils.py | 5 ----- 8 files changed, 32 deletions(-) diff --git a/Utils/ASR/layers.py b/Utils/ASR/layers.py index ae76628..282cddb 100644 --- a/Utils/ASR/layers.py +++ b/Utils/ASR/layers.py @@ -1,10 +1,6 @@ -import math import torch from torch import nn -from typing import Optional, Any -from torch import Tensor import torch.nn.functional as F -import torchaudio import torchaudio.functional as audio_F import random diff --git a/Utils/ASR/models.py b/Utils/ASR/models.py index 69a28d5..917ff82 100644 --- a/Utils/ASR/models.py +++ b/Utils/ASR/models.py @@ -1,7 +1,6 @@ import math import torch from torch import nn -from torch.nn import TransformerEncoder import torch.nn.functional as F from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock diff --git a/meldataset.py b/meldataset.py index b79d88a..7cb0024 100644 --- a/meldataset.py +++ b/meldataset.py @@ -1,7 +1,4 @@ #coding: utf-8 -import os -import os.path as osp -import time import random import numpy as np import random @@ -9,8 +6,6 @@ import librosa import torch -from torch import nn -import torch.nn.functional as F import torchaudio from torch.utils.data import DataLoader @@ -18,9 +13,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -import os -import os.path as osp -import pandas as pd _pad = "$" _punctuation = ';:,.!?¡¿—…"«»“” ' diff --git a/models.py b/models.py index cf1f3e6..9e92664 100644 --- a/models.py +++ b/models.py @@ -1,12 +1,7 @@ #coding:utf-8 -import os -import os.path as osp - -import copy import math -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F diff --git a/optimizers.py b/optimizers.py index 25770bc..c4ec795 100644 --- a/optimizers.py +++ b/optimizers.py @@ -1,10 +1,5 @@ #coding:utf-8 -import os, sys -import os.path as osp -import numpy as np import torch -from torch import nn -from torch.optim import Optimizer from functools import reduce from torch.optim import AdamW diff --git a/train_first.py b/train_first.py index 8a319d4..8a7467c 100644 --- a/train_first.py +++ b/train_first.py @@ -1,7 +1,5 @@ import os import os.path as osp -import re -import sys import yaml import shutil import numpy as np diff --git a/train_second.py b/train_second.py index 32fd30a..12fe90d 100644 --- a/train_second.py +++ b/train_second.py @@ -1,7 +1,5 @@ import os import os.path as osp -import re -import sys import yaml import shutil import numpy as np diff --git a/utils.py b/utils.py index 0d38583..0cbe498 100644 --- a/utils.py +++ b/utils.py @@ -1,12 +1,7 @@ -from monotonic_align import maximum_path -from monotonic_align import mask_from_lens from monotonic_align.core import maximum_path_c import numpy as np import torch -import copy -from torch import nn import torch.nn.functional as F -import torchaudio import librosa import matplotlib.pyplot as plt From dfd2e08ac55136886afade3a19517b672de30bba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20G=C4=99bala?= Date: Wed, 7 Jan 2026 09:31:27 +0100 Subject: [PATCH 4/4] fix: update installation instructions and add requirements.txt --- README.md | 2 +- requirements.txt | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index e4c3c61..100f3b3 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ cd StyleTTS ``` 3. Install python requirements: ```bash -pip install SoundFile torchaudio munch torch pydub pyyaml librosa git+https://github.com/resemble-ai/monotonic_align.git +pip install -r requirements.txt ``` 4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip to the data folder and upsample the data to 24 kHz. The vocoder, text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing. I will provide more receipes and pre-trained models later if I have time. If you are willing to help, feel free to work on other preprocessing methods. For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example). diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5919aa0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +SoundFile +torchaudio +torch +munch +pydub +pyyaml +librosa +git+https://github.com/resemble-ai/monotonic_align.git +click \ No newline at end of file