Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ log_interval: 10
device: "cuda"
multigpu: false
epochs_1st: 200 # number of epochs for first stage training
epochs_2nd: 100 # number of peochs for second stage training
epochs_2nd: 100 # number of epochs for second stage training
batch_size: 32
pretrained_model: ""
second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cd StyleTTS
```
3. Install python requirements:
```bash
pip install SoundFile torchaudio munch torch pydub pyyaml librosa git+https://github.com/resemble-ai/monotonic_align.git
pip install -r requirements.txt
```
4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip to the data folder and upsample the data to 24 kHz. The vocoder, text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing. I will provide more recipes and pre-trained models later if I have time. If you are willing to help, feel free to work on other preprocessing methods.
For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example).
Expand Down
4 changes: 0 additions & 4 deletions Utils/ASR/layers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import math
import torch
from torch import nn
from typing import Optional, Any
from torch import Tensor
import torch.nn.functional as F
import torchaudio
import torchaudio.functional as audio_F

import random
Expand Down
1 change: 0 additions & 1 deletion Utils/ASR/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import math
import torch
from torch import nn
from torch.nn import TransformerEncoder
import torch.nn.functional as F
from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock

Expand Down
8 changes: 0 additions & 8 deletions meldataset.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,18 @@
#coding: utf-8
import os
import os.path as osp
import time
import random
import numpy as np
import random
import soundfile as sf
import librosa

import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import DataLoader

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

import os
import os.path as osp
import pandas as pd

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
Expand Down
16 changes: 0 additions & 16 deletions models.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
#coding:utf-8

import os
import os.path as osp

import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
Expand Down Expand Up @@ -488,17 +483,6 @@ def forward(self, x, s):
x = (1 + gamma) * x + beta
return x.transpose(1, -1).transpose(-1, -2)

class LinearNorm(torch.nn.Module):
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
super(LinearNorm, self).__init__()
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(w_init_gain))

def forward(self, x):
return self.linear_layer(x)

class ProsodyPredictor(nn.Module):

Expand Down
5 changes: 0 additions & 5 deletions optimizers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#coding:utf-8
import os, sys
import os.path as osp
import numpy as np
import torch
from torch import nn
from torch.optim import Optimizer
from functools import reduce
from torch.optim import AdamW

Expand Down
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SoundFile
torchaudio
torch
munch
pydub
pyyaml
librosa
git+https://github.com/resemble-ai/monotonic_align.git
click
2 changes: 0 additions & 2 deletions train_first.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import os.path as osp
import re
import sys
import yaml
import shutil
import numpy as np
Expand Down
2 changes: 0 additions & 2 deletions train_second.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import os.path as osp
import re
import sys
import yaml
import shutil
import numpy as np
Expand Down
5 changes: 0 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
from monotonic_align import maximum_path
from monotonic_align import mask_from_lens
from monotonic_align.core import maximum_path_c
import numpy as np
import torch
import copy
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import matplotlib.pyplot as plt

Expand Down