From 162c05c8a6eaf2d35311c70babb2349c0fc2c31b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20G=C4=99bala?= <filip.gebala@interia.pl>
Date: Wed, 7 Jan 2026 08:47:09 +0100
Subject: [PATCH 1/4] fix: correct typo in config.yml comment

---
 Configs/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Configs/config.yml b/Configs/config.yml
index 3dad1af..f299e4d 100644
--- a/Configs/config.yml
+++ b/Configs/config.yml
@@ -5,7 +5,7 @@ log_interval: 10
 device: "cuda"
 multigpu: false
 epochs_1st: 200 # number of epochs for first stage training
-epochs_2nd: 100 # number of peochs for second stage training
+epochs_2nd: 100 # number of epochs for second stage training
 batch_size: 32
 pretrained_model: ""
 second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage

From 55cfeb02553c4d54801ee900a55d4c1aa3c39666 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20G=C4=99bala?= <filip.gebala@interia.pl>
Date: Wed, 7 Jan 2026 08:49:49 +0100
Subject: [PATCH 2/4] fix: remove unnecessary LinearNorm class definition

---
 models.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/models.py b/models.py
index 1a8b109..cf1f3e6 100644
--- a/models.py
+++ b/models.py
@@ -488,17 +488,6 @@ def forward(self, x, s):
         x = (1 + gamma) * x + beta
         return x.transpose(1, -1).transpose(-1, -2)
     
-class LinearNorm(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
-        super(LinearNorm, self).__init__()
-        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
-
-        torch.nn.init.xavier_uniform_(
-            self.linear_layer.weight,
-            gain=torch.nn.init.calculate_gain(w_init_gain))
-
-    def forward(self, x):
-        return self.linear_layer(x)
 
 class ProsodyPredictor(nn.Module):
 

From 7e639d3f5a695a067720842207d9c4d4f96018ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20G=C4=99bala?= <filip.gebala@interia.pl>
Date: Wed, 7 Jan 2026 09:00:27 +0100
Subject: [PATCH 3/4] fix: remove unused imports across multiple files

---
 Utils/ASR/layers.py | 4 ----
 Utils/ASR/models.py | 1 -
 meldataset.py       | 8 --------
 models.py           | 5 -----
 optimizers.py       | 5 -----
 train_first.py      | 2 --
 train_second.py     | 2 --
 utils.py            | 5 -----
 8 files changed, 32 deletions(-)

diff --git a/Utils/ASR/layers.py b/Utils/ASR/layers.py
index ae76628..282cddb 100644
--- a/Utils/ASR/layers.py
+++ b/Utils/ASR/layers.py
@@ -1,10 +1,6 @@
-import math
 import torch
 from torch import nn
-from typing import Optional, Any
-from torch import Tensor
 import torch.nn.functional as F
-import torchaudio
 import torchaudio.functional as audio_F
 
 import random
diff --git a/Utils/ASR/models.py b/Utils/ASR/models.py
index 69a28d5..917ff82 100644
--- a/Utils/ASR/models.py
+++ b/Utils/ASR/models.py
@@ -1,7 +1,6 @@
 import math
 import torch
 from torch import nn
-from torch.nn import TransformerEncoder
 import torch.nn.functional as F
 from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock
 
diff --git a/meldataset.py b/meldataset.py
index b79d88a..7cb0024 100644
--- a/meldataset.py
+++ b/meldataset.py
@@ -1,7 +1,4 @@
 #coding: utf-8
-import os
-import os.path as osp
-import time
 import random
 import numpy as np
 import random
@@ -9,8 +6,6 @@
 import librosa
 
 import torch
-from torch import nn
-import torch.nn.functional as F
 import torchaudio
 from torch.utils.data import DataLoader
 
@@ -18,9 +13,6 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
-import os
-import os.path as osp
-import pandas as pd
 
 _pad = "$"
 _punctuation = ';:,.!?¡¿—…"«»“” '
diff --git a/models.py b/models.py
index cf1f3e6..9e92664 100644
--- a/models.py
+++ b/models.py
@@ -1,12 +1,7 @@
 #coding:utf-8
 
-import os
-import os.path as osp
-
-import copy
 import math
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/optimizers.py b/optimizers.py
index 25770bc..c4ec795 100644
--- a/optimizers.py
+++ b/optimizers.py
@@ -1,10 +1,5 @@
 #coding:utf-8
-import os, sys
-import os.path as osp
-import numpy as np
 import torch
-from torch import nn
-from torch.optim import Optimizer
 from functools import reduce
 from torch.optim import AdamW
 
diff --git a/train_first.py b/train_first.py
index 8a319d4..8a7467c 100644
--- a/train_first.py
+++ b/train_first.py
@@ -1,7 +1,5 @@
 import os
 import os.path as osp
-import re
-import sys
 import yaml
 import shutil
 import numpy as np
diff --git a/train_second.py b/train_second.py
index 32fd30a..12fe90d 100644
--- a/train_second.py
+++ b/train_second.py
@@ -1,7 +1,5 @@
 import os
 import os.path as osp
-import re
-import sys
 import yaml
 import shutil
 import numpy as np
diff --git a/utils.py b/utils.py
index 0d38583..0cbe498 100644
--- a/utils.py
+++ b/utils.py
@@ -1,12 +1,7 @@
-from monotonic_align import maximum_path
-from monotonic_align import mask_from_lens
 from monotonic_align.core import maximum_path_c
 import numpy as np
 import torch
-import copy
-from torch import nn
 import torch.nn.functional as F
-import torchaudio
 import librosa
 import matplotlib.pyplot as plt
 

From dfd2e08ac55136886afade3a19517b672de30bba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20G=C4=99bala?= <filip.gebala@interia.pl>
Date: Wed, 7 Jan 2026 09:31:27 +0100
Subject: [PATCH 4/4] fix: update installation instructions and add
 requirements.txt

---
 README.md        | 2 +-
 requirements.txt | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index e4c3c61..100f3b3 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ cd StyleTTS
 ```
 3. Install python requirements: 
 ```bash
-pip install SoundFile torchaudio munch torch pydub pyyaml librosa git+https://github.com/resemble-ai/monotonic_align.git
+pip install -r requirements.txt
 ```
 4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip to the data folder and upsample the data to 24 kHz. The vocoder, text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing. I will provide more receipes and pre-trained models later if I have time. If you are willing to help, feel free to work on other preprocessing methods. 
 For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example).
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5919aa0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+SoundFile
+torchaudio
+torch
+munch
+pydub
+pyyaml
+librosa
+git+https://github.com/resemble-ai/monotonic_align.git
+click
\ No newline at end of file