From 312bfa9e49eda7c16572598f2a6609773a352347 Mon Sep 17 00:00:00 2001 From: engineer Date: Sat, 14 Feb 2026 19:14:25 -0800 Subject: [PATCH] feat(tts): switch default model from vctk_vits (~9GB) to vits (~1-2GB) Switch default Coqui TTS model from vctk_vits (multi-speaker, 109 speakers, ~9GB memory) to vits (LJSpeech, single speaker, ~1-2GB). The vctk model loaded all 109 speaker embeddings into GPU memory even when only one speaker was used. Changes: - Default model: vctk_vits -> vits (LJSpeech) - Default voice: p339 -> default (single speaker, no selection needed) - Model lists: LJSpeech now listed first as recommended - Documentation updated with memory usage notes --- README.md | 9 +++++---- backend/src/services/coqui.ts | 18 +++++++++--------- .../src/components/settings/TTSSettings.tsx | 2 +- scripts/coqui-server.py | 18 +++++++++--------- shared/src/schemas/settings.ts | 2 +- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index dc86a355..0e70df5b 100644 --- a/README.md +++ b/README.md @@ -168,9 +168,10 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature - **Dual Provider Support** - Browser-native Web Speech API + external OpenAI-compatible endpoints - **Browser-Native TTS** - Built-in Web Speech API for instant playback without API keys - **Coqui TTS with Multi-Model Support** - 7+ high-quality English voice models with runtime switching: - - VCTK VITS (109 multi-speaker voices, default — **p339**, soft American female) + - LJSpeech VITS (single speaker, female, **default** — low memory ~1-2GB) + - VCTK VITS (109 multi-speaker voices — ~9GB memory) - Jenny (single-speaker, fast) - - LJSpeech VITS, Tacotron2, Glow-TTS, FastPitch + - LJSpeech Tacotron2, Glow-TTS, FastPitch - XTTS v2 (multilingual voice cloning) - **17 American Female Voices** - Curated selection from the VCTK corpus with detailed metadata (region, age, vocal quality) - **AI Message Playback** - Listen to assistant responses with TTS @@ -350,8 +351,8 @@ stt: tts: status: running provider: coqui - model: tts_models/en/vctk/vits - voice: p339 + model: tts_models/en/ljspeech/vits + voice: default tunnel: status: connected diff --git a/backend/src/services/coqui.ts b/backend/src/services/coqui.ts index 451ed2e6..58076ad1 100644 --- a/backend/src/services/coqui.ts +++ b/backend/src/services/coqui.ts @@ -11,25 +11,25 @@ const __dirname = path.dirname(__filename) const COQUI_PORT = parseInt(process.env.COQUI_PORT || '5554') const COQUI_HOST = process.env.COQUI_HOST || '127.0.0.1' const COQUI_DEVICE = process.env.COQUI_DEVICE || 'auto' -const COQUI_MODEL = process.env.COQUI_MODEL || 'tts_models/en/vctk/vits' +const COQUI_MODEL = process.env.COQUI_MODEL || 'tts_models/en/ljspeech/vits' const DEFAULT_VENV_DIR = path.join(os.homedir(), '.opencode-manager', 'coqui-venv') const RECOMMENDED_MODELS = [ { - id: "tts_models/en/vctk/vits", - name: "VCTK VITS", - description: "VCTK VITS (109 speakers, recommended)", + id: "tts_models/en/ljspeech/vits", + name: "LJSpeech VITS", + description: "LJSpeech single speaker (recommended, low memory ~1-2GB)", quality: "high", speed: "fast", - multi_speaker: true + multi_speaker: false }, { - id: "tts_models/en/ljspeech/vits", - name: "LJSpeech VITS", - description: "LJSpeech single speaker", + id: "tts_models/en/vctk/vits", + name: "VCTK VITS", + description: "VCTK VITS (109 speakers, ~9GB memory)", quality: "high", speed: "fast", - multi_speaker: false + multi_speaker: true }, { id: "tts_models/en/jenny/jenny", diff --git a/frontend/src/components/settings/TTSSettings.tsx b/frontend/src/components/settings/TTSSettings.tsx index 1bdbd3e5..caca491a 100644 --- a/frontend/src/components/settings/TTSSettings.tsx +++ b/frontend/src/components/settings/TTSSettings.tsx @@ -635,7 +635,7 @@ export function TTSSettings() {
{ if (modelId !== coquiModels?.currentModel) { changeCoquiModelMutation.mutate(modelId) diff --git a/scripts/coqui-server.py b/scripts/coqui-server.py index 6c7cc689..ca776584 100644 --- a/scripts/coqui-server.py +++ b/scripts/coqui-server.py @@ -55,7 +55,7 @@ COQUI_PORT = int(os.environ.get("COQUI_PORT", "5554")) COQUI_HOST = os.environ.get("COQUI_HOST", "127.0.0.1") COQUI_DEVICE = os.environ.get("COQUI_DEVICE", "auto") -COQUI_MODEL = os.environ.get("COQUI_MODEL", "tts_models/en/vctk/vits") +COQUI_MODEL = os.environ.get("COQUI_MODEL", "tts_models/en/ljspeech/vits") # Global state model: Optional[TTS] = None @@ -67,23 +67,23 @@ # Curated list of high-quality TTS models RECOMMENDED_MODELS = [ { - "id": "tts_models/en/vctk/vits", - "name": "VCTK VITS", - "description": "VCTK VITS (109 speakers, recommended)", + "id": "tts_models/en/ljspeech/vits", + "name": "LJSpeech VITS", + "description": "LJSpeech single speaker (recommended, low memory ~1-2GB)", "language": "en", "quality": "high", "speed": "fast", - "multi_speaker": True, + "multi_speaker": False, "recommended": True }, { - "id": "tts_models/en/ljspeech/vits", - "name": "LJSpeech VITS", - "description": "LJSpeech single speaker", + "id": "tts_models/en/vctk/vits", + "name": "VCTK VITS", + "description": "VCTK VITS (109 speakers, ~9GB memory)", "language": "en", "quality": "high", "speed": "fast", - "multi_speaker": False + "multi_speaker": True }, { "id": "tts_models/en/jenny/jenny", diff --git a/shared/src/schemas/settings.ts b/shared/src/schemas/settings.ts index 1f4d0519..921aa363 100644 --- a/shared/src/schemas/settings.ts +++ b/shared/src/schemas/settings.ts @@ -190,7 +190,7 @@ export const DEFAULT_TTS_CONFIG: TTSConfig = { provider: 'coqui', endpoint: "https://api.openai.com", apiKey: "", - voice: "p339", + voice: "default", model: "tts-1", speed: 1.0, availableVoices: [],