yl4579 · eschmidbauer · Jul 22, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+notes.txt
+.venv
+venv
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["setuptools>=77.0.3", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "dmospeech2"
+version = "0.1.0"
+description = "DMOSpeech 2 - Reinforcement learning for duration prediction in speech synthesis"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {file = "LICENSE"}
+authors = [{name = "Yinghao Aaron Li", email = "71044569+yl4579@users.noreply.github.com"}]
+dependencies = [
+    "accelerate>=0.33.0",
+    "bitsandbytes>0.37.0",
+    "cached_path",
+    "click",
+    "datasets",
+    "ema_pytorch>=0.5.2",
+    "gradio>=3.45.2",
+    "hydra-core>=1.3.0",
+    "jieba",
+    "librosa",
+    "matplotlib",
+    "numpy<=1.26.4",
+    "pydantic<=2.10.6",
+    "pydub",
+    "pypinyin",
+    "safetensors",
+    "soundfile",
+    "tomli",
+    "torch>=2.0.0",
+    "torchaudio>=2.0.0",
+    "torchdiffeq",
+    "tqdm>=4.65.0",
+    "transformers",
+    "transformers_stream_generator",
+    "unidecode",
+    "vocos",
+    "wandb",
+    "x_transformers>=1.31.14",
+]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+"f5_tts" = [
+  "data/Emilia_ZH_EN_pinyin/vocab.txt",
+  "data/librispeech_pc_test_clean_cross_sentence.lst",
+]
diff --git a/src/dmospeech2/__init__.py b/src/dmospeech2/__init__.py
diff --git a/src/ctcmodel.py → src/dmospeech2/ctcmodel.py b/src/ctcmodel.py → src/dmospeech2/ctcmodel.py
@@ -1,27 +1,13 @@
-from torch import nn
-import torch 
 import copy
-
 from pathlib import Path
-from torchaudio.models import Conformer
 
+import torch
+from torch import nn
+from torchaudio.models import Conformer
 
-from f5_tts.model.utils import default
-from f5_tts.model.utils import exists
-from f5_tts.model.utils import list_str_to_idx
-from f5_tts.model.utils import list_str_to_tensor
-from f5_tts.model.utils import lens_to_mask
-from f5_tts.model.utils import mask_from_frac_lengths
-
+from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
+                                list_str_to_tensor, mask_from_frac_lengths)
 
-from f5_tts.model.utils import (
-    default,
-    exists,
-    list_str_to_idx,
-    list_str_to_tensor,
-    lens_to_mask,
-    mask_from_frac_lengths,
-)
 
 class ResBlock(nn.Module):
     def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
@@ -31,7 +17,6 @@ def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
             self._get_conv(hidden_dim, dilation=3**i, dropout_p=dropout_p)
             for i in range(n_conv)])
 
-
     def forward(self, x):
         for block in self.blocks:
             res = x
@@ -55,26 +40,25 @@ def _get_conv(self, hidden_dim, dilation, dropout_p=0.2):
 class ConformerCTC(nn.Module):
     def __init__(self,
                  vocab_size,
-                 mel_dim=100, 
-                 num_heads=8, 
-                 d_hid=512, 
+                 mel_dim=100,
+                 num_heads=8,
+                 d_hid=512,
                  nlayers=6):
         super().__init__()
-        
+
         self.mel_proj = nn.Conv1d(mel_dim, d_hid, kernel_size=3, padding=1)
-        
+
         self.d_hid = d_hid
-        
+
         self.resblock1 = nn.Sequential(
-                ResBlock(d_hid),
-                nn.GroupNorm(num_groups=1, num_channels=d_hid)
-            )
-        
+            ResBlock(d_hid),
+            nn.GroupNorm(num_groups=1, num_channels=d_hid)
+        )
+
         self.resblock2 = nn.Sequential(
-                ResBlock(d_hid),
-                nn.GroupNorm(num_groups=1, num_channels=d_hid)
-            )
-
+            ResBlock(d_hid),
+            nn.GroupNorm(num_groups=1, num_channels=d_hid)
+        )
 
         self.conf_pre = torch.nn.ModuleList(
             [Conformer(
@@ -85,9 +69,9 @@ def __init__(self,
              depthwise_conv_kernel_size=15,
              use_group_norm=True,)
                 for _ in range(nlayers // 2)
-            ]
+             ]
         )
-        
+
         self.conf_after = torch.nn.ModuleList(
             [Conformer(
              input_dim=d_hid,
@@ -97,14 +81,13 @@ def __init__(self,
              depthwise_conv_kernel_size=7,
              use_group_norm=True,)
                 for _ in range(nlayers // 2)
-            ]
+             ]
         )
 
-        self.out = nn.Linear(d_hid, 1 + vocab_size) # 1 for blank
+        self.out = nn.Linear(d_hid, 1 + vocab_size)  # 1 for blank
 
         self.ctc_loss = nn.CTCLoss(blank=vocab_size, zero_infinity=True).cuda()
 
-
     def forward(self, latent, text=None, text_lens=None):
         layers = []
 
@@ -147,9 +130,8 @@ def forward(self, latent, text=None, text_lens=None):
 if __name__ == "__main__":
     from f5_tts.model.utils import get_tokenizer
 
-
     bsz = 16
-    
+
     tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
     tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
     dataset_name = "Emilia_ZH_EN"
@@ -158,15 +140,15 @@ def forward(self, latent, text=None, text_lens=None):
     else:
         tokenizer_path = dataset_name
     vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
-    
+
     model = ConformerCTC(vocab_size, mel_dim=80, num_heads=8, d_hid=512, nlayers=6).cuda()
-    
+
     text = ["hello world"] * bsz
     lens = torch.randint(1, 1000, (bsz,)).cuda()
     inp = torch.randn(bsz, lens.max(), 80).cuda()
-    
+
     batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device
-    
+
     # handle text as string
     text_lens = torch.tensor([len(t) for t in text], device=device)
     if isinstance(text, list):
@@ -198,7 +180,6 @@ def forward(self, latent, text=None, text_lens=None):
 
     char_vocab_map = list(vocab_char_map.keys())
 
-
     for batch in best_path:
         decoded_sequence = []
         previous_token = None
@@ -216,6 +197,6 @@ def forward(self, latent, text=None, text_lens=None):
     gt_texts = []
     for i in range(text_lens.size(0)):
         gt_texts.append(''.join([char_vocab_map[token] for token in text[i, :text_lens[i]]]))
-    
+
     print(decoded_texts)
-    print(gt_texts)
+    print(gt_texts)
diff --git a/src/demo.ipynb → src/dmospeech2/demo.ipynb b/src/demo.ipynb → src/dmospeech2/demo.ipynb