Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
notes.txt
.venv
venv
54 changes: 54 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Build configuration: the project is built with setuptools as the build backend.
[build-system]
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

# Core distribution metadata for the "dmospeech2" package.
[project]
name = "dmospeech2"
version = "0.1.0"
description = "DMOSpeech 2 - Reinforcement learning for duration prediction in speech synthesis"
readme = "README.md"
requires-python = ">=3.9"
license = {file = "LICENSE"}
authors = [{name = "Yinghao Aaron Li", email = "71044569+yl4579@users.noreply.github.com"}]
# Runtime dependencies. Entries without a version specifier accept any release.
dependencies = [
"accelerate>=0.33.0",
"bitsandbytes>0.37.0",
"cached_path",
"click",
"datasets",
"ema_pytorch>=0.5.2",
"gradio>=3.45.2",
"hydra-core>=1.3.0",
"jieba",
"librosa",
"matplotlib",
# The next two entries are upper-bounded; the reason is not recorded here.
# NOTE(review): presumably compatibility caps (e.g. staying on NumPy 1.x) — confirm.
"numpy<=1.26.4",
"pydantic<=2.10.6",
"pydub",
"pypinyin",
"safetensors",
"soundfile",
"tomli",
"torch>=2.0.0",
"torchaudio>=2.0.0",
"torchdiffeq",
"tqdm>=4.65.0",
"transformers",
"transformers_stream_generator",
"unidecode",
"vocos",
"wandb",
"x_transformers>=1.31.14",
]

[tool.setuptools]
# Ask setuptools to ship declared package data files with the distribution.
include-package-data = true

# Package discovery: importable packages are searched for under src/.
[tool.setuptools.packages.find]
where = ["src"]

# Data files bundled with the "f5_tts" package.
# NOTE(review): the key is "f5_tts" while the project is named "dmospeech2";
# presumably an f5_tts package also lives under src/ — confirm.
[tool.setuptools.package-data]
"f5_tts" = [
"data/Emilia_ZH_EN_pinyin/vocab.txt",
"data/librispeech_pc_test_clean_cross_sentence.lst",
]
Empty file added src/dmospeech2/__init__.py
Empty file.
77 changes: 29 additions & 48 deletions src/ctcmodel.py → src/dmospeech2/ctcmodel.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,13 @@
from torch import nn
import torch
import copy

from pathlib import Path
from torchaudio.models import Conformer

import torch
from torch import nn
from torchaudio.models import Conformer

from f5_tts.model.utils import default
from f5_tts.model.utils import exists
from f5_tts.model.utils import list_str_to_idx
from f5_tts.model.utils import list_str_to_tensor
from f5_tts.model.utils import lens_to_mask
from f5_tts.model.utils import mask_from_frac_lengths

from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
list_str_to_tensor, mask_from_frac_lengths)

from f5_tts.model.utils import (
default,
exists,
list_str_to_idx,
list_str_to_tensor,
lens_to_mask,
mask_from_frac_lengths,
)

class ResBlock(nn.Module):
def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
Expand All @@ -31,7 +17,6 @@ def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
self._get_conv(hidden_dim, dilation=3**i, dropout_p=dropout_p)
for i in range(n_conv)])


def forward(self, x):
for block in self.blocks:
res = x
Expand All @@ -55,26 +40,25 @@ def _get_conv(self, hidden_dim, dilation, dropout_p=0.2):
class ConformerCTC(nn.Module):
def __init__(self,
vocab_size,
mel_dim=100,
num_heads=8,
d_hid=512,
mel_dim=100,
num_heads=8,
d_hid=512,
nlayers=6):
super().__init__()

self.mel_proj = nn.Conv1d(mel_dim, d_hid, kernel_size=3, padding=1)

self.d_hid = d_hid

self.resblock1 = nn.Sequential(
ResBlock(d_hid),
nn.GroupNorm(num_groups=1, num_channels=d_hid)
)
ResBlock(d_hid),
nn.GroupNorm(num_groups=1, num_channels=d_hid)
)

self.resblock2 = nn.Sequential(
ResBlock(d_hid),
nn.GroupNorm(num_groups=1, num_channels=d_hid)
)

ResBlock(d_hid),
nn.GroupNorm(num_groups=1, num_channels=d_hid)
)

self.conf_pre = torch.nn.ModuleList(
[Conformer(
Expand All @@ -85,9 +69,9 @@ def __init__(self,
depthwise_conv_kernel_size=15,
use_group_norm=True,)
for _ in range(nlayers // 2)
]
]
)

self.conf_after = torch.nn.ModuleList(
[Conformer(
input_dim=d_hid,
Expand All @@ -97,14 +81,13 @@ def __init__(self,
depthwise_conv_kernel_size=7,
use_group_norm=True,)
for _ in range(nlayers // 2)
]
]
)

self.out = nn.Linear(d_hid, 1 + vocab_size) # 1 for blank
self.out = nn.Linear(d_hid, 1 + vocab_size) # 1 for blank

self.ctc_loss = nn.CTCLoss(blank=vocab_size, zero_infinity=True).cuda()


def forward(self, latent, text=None, text_lens=None):
layers = []

Expand Down Expand Up @@ -147,9 +130,8 @@ def forward(self, latent, text=None, text_lens=None):
if __name__ == "__main__":
from f5_tts.model.utils import get_tokenizer


bsz = 16

tokenizer = "pinyin" # 'pinyin', 'char', or 'custom'
tokenizer_path = None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
dataset_name = "Emilia_ZH_EN"
Expand All @@ -158,15 +140,15 @@ def forward(self, latent, text=None, text_lens=None):
else:
tokenizer_path = dataset_name
vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

model = ConformerCTC(vocab_size, mel_dim=80, num_heads=8, d_hid=512, nlayers=6).cuda()

text = ["hello world"] * bsz
lens = torch.randint(1, 1000, (bsz,)).cuda()
inp = torch.randn(bsz, lens.max(), 80).cuda()

batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device

# handle text as string
text_lens = torch.tensor([len(t) for t in text], device=device)
if isinstance(text, list):
Expand Down Expand Up @@ -198,7 +180,6 @@ def forward(self, latent, text=None, text_lens=None):

char_vocab_map = list(vocab_char_map.keys())


for batch in best_path:
decoded_sequence = []
previous_token = None
Expand All @@ -216,6 +197,6 @@ def forward(self, latent, text=None, text_lens=None):
gt_texts = []
for i in range(text_lens.size(0)):
gt_texts.append(''.join([char_vocab_map[token] for token in text[i, :text_lens[i]]]))

print(decoded_texts)
print(gt_texts)
print(gt_texts)
File renamed without changes.
Loading