Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ log_interval: 10
device: "cuda"
multigpu: false
epochs_1st: 200 # number of epochs for first stage training
epochs_2nd: 100 # number of peochs for second stage training
epochs_2nd: 100 # number of epochs for second stage training
batch_size: 32
pretrained_model: ""
second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cd StyleTTS
```
3. Install python requirements:
```bash
pip install SoundFile torchaudio munch torch pydub pyyaml librosa git+https://github.com/resemble-ai/monotonic_align.git
pip install -r requirements.txt
```
4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip to the data folder and upsample the data to 24 kHz. The vocoder, text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing. I will provide more recipes and pre-trained models later if I have time. If you are willing to help, feel free to work on other preprocessing methods.
For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example).
Expand Down
4 changes: 0 additions & 4 deletions Utils/ASR/layers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import math
import torch
from torch import nn
from typing import Optional, Any
from torch import Tensor
import torch.nn.functional as F
import torchaudio
import torchaudio.functional as audio_F

import random
Expand Down
1 change: 0 additions & 1 deletion Utils/ASR/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import math
import torch
from torch import nn
from torch.nn import TransformerEncoder
import torch.nn.functional as F
from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock

Expand Down
8 changes: 0 additions & 8 deletions meldataset.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,18 @@
#coding: utf-8
import os
import os.path as osp
import time
import random
import numpy as np
import random
import soundfile as sf
import librosa

import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import DataLoader

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

import os
import os.path as osp
import pandas as pd

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
Expand Down
16 changes: 0 additions & 16 deletions models.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
#coding:utf-8

import os
import os.path as osp

import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
Expand Down Expand Up @@ -488,17 +483,6 @@ def forward(self, x, s):
x = (1 + gamma) * x + beta
return x.transpose(1, -1).transpose(-1, -2)

class LinearNorm(torch.nn.Module):
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
super(LinearNorm, self).__init__()
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(w_init_gain))

def forward(self, x):
return self.linear_layer(x)

class ProsodyPredictor(nn.Module):

Expand Down
5 changes: 0 additions & 5 deletions optimizers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#coding:utf-8
import os, sys
import os.path as osp
import numpy as np
import torch
from torch import nn
from torch.optim import Optimizer
from functools import reduce
from torch.optim import AdamW

Expand Down
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SoundFile
torchaudio
torch
munch
pydub
pyyaml
librosa
git+https://github.com/resemble-ai/monotonic_align.git
click
2 changes: 0 additions & 2 deletions train_first.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import os.path as osp
import re
import sys
import yaml
import shutil
import numpy as np
Expand Down
2 changes: 0 additions & 2 deletions train_second.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import os.path as osp
import re
import sys
import yaml
import shutil
import numpy as np
Expand Down
5 changes: 0 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
from monotonic_align import maximum_path
from monotonic_align import mask_from_lens
from monotonic_align.core import maximum_path_c
import numpy as np
import torch
import copy
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import matplotlib.pyplot as plt

Expand Down