From 312bfa9e49eda7c16572598f2a6609773a352347 Mon Sep 17 00:00:00 2001
From: engineer <engineer@opencode.ai>
Date: Sat, 14 Feb 2026 19:14:25 -0800
Subject: [PATCH] feat(tts): switch default model from vctk_vits (~9GB) to vits
 (~1-2GB)

Switch the default Coqui TTS model from tts_models/en/vctk/vits
(multi-speaker, 109 speakers, ~9GB memory) to
tts_models/en/ljspeech/vits (LJSpeech, single speaker, ~1-2GB).
The VCTK model loaded all 109 speaker embeddings into GPU memory even
when only one speaker was used.

Changes:
- Default model: tts_models/en/vctk/vits -> tts_models/en/ljspeech/vits
- Default voice: p339 -> default (single speaker, no selection needed)
- Model lists: LJSpeech now listed first as recommended
- Documentation updated with memory usage notes
---
 README.md                                      |  9 +++++----
 backend/src/services/coqui.ts                  | 18 +++++++++---------
 .../src/components/settings/TTSSettings.tsx    |  2 +-
 scripts/coqui-server.py                        | 18 +++++++++---------
 shared/src/schemas/settings.ts                 |  2 +-
 5 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index dc86a355..0e70df5b 100644
--- a/README.md
+++ b/README.md
@@ -168,9 +168,10 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature
 - **Dual Provider Support** - Browser-native Web Speech API + external OpenAI-compatible endpoints
 - **Browser-Native TTS** - Built-in Web Speech API for instant playback without API keys
 - **Coqui TTS with Multi-Model Support** - 7+ high-quality English voice models with runtime switching:
-  - VCTK VITS (109 multi-speaker voices, default — **p339**, soft American female)
+  - LJSpeech VITS (single speaker, female, **default** — low memory ~1-2GB)
+  - VCTK VITS (109 multi-speaker voices — ~9GB memory)
   - Jenny (single-speaker, fast)
-  - LJSpeech VITS, Tacotron2, Glow-TTS, FastPitch
+  - LJSpeech Tacotron2, Glow-TTS, FastPitch
   - XTTS v2 (multilingual voice cloning)
 - **17 American Female Voices** - Curated selection from the VCTK corpus with detailed metadata (region, age, vocal quality)
 - **AI Message Playback** - Listen to assistant responses with TTS
@@ -350,8 +351,8 @@ stt:
 tts:
   status: running
   provider: coqui
-  model: tts_models/en/vctk/vits
-  voice: p339
+  model: tts_models/en/ljspeech/vits
+  voice: default
 
 tunnel:
   status: connected
diff --git a/backend/src/services/coqui.ts b/backend/src/services/coqui.ts
index 451ed2e6..58076ad1 100644
--- a/backend/src/services/coqui.ts
+++ b/backend/src/services/coqui.ts
@@ -11,25 +11,25 @@ const __dirname = path.dirname(__filename)
 const COQUI_PORT = parseInt(process.env.COQUI_PORT || '5554')
 const COQUI_HOST = process.env.COQUI_HOST || '127.0.0.1'
 const COQUI_DEVICE = process.env.COQUI_DEVICE || 'auto'
-const COQUI_MODEL = process.env.COQUI_MODEL || 'tts_models/en/vctk/vits'
+const COQUI_MODEL = process.env.COQUI_MODEL || 'tts_models/en/ljspeech/vits'
 const DEFAULT_VENV_DIR = path.join(os.homedir(), '.opencode-manager', 'coqui-venv')
 
 const RECOMMENDED_MODELS = [
   {
-    id: "tts_models/en/vctk/vits",
-    name: "VCTK VITS",
-    description: "VCTK VITS (109 speakers, recommended)",
+    id: "tts_models/en/ljspeech/vits",
+    name: "LJSpeech VITS",
+    description: "LJSpeech single speaker (recommended, low memory ~1-2GB)",
     quality: "high",
     speed: "fast",
-    multi_speaker: true
+    multi_speaker: false
   },
   {
-    id: "tts_models/en/ljspeech/vits",
-    name: "LJSpeech VITS",
-    description: "LJSpeech single speaker",
+    id: "tts_models/en/vctk/vits",
+    name: "VCTK VITS",
+    description: "VCTK VITS (109 speakers, ~9GB memory)",
     quality: "high",
     speed: "fast",
-    multi_speaker: false
+    multi_speaker: true
   },
   {
     id: "tts_models/en/jenny/jenny",
diff --git a/frontend/src/components/settings/TTSSettings.tsx b/frontend/src/components/settings/TTSSettings.tsx
index 1bdbd3e5..caca491a 100644
--- a/frontend/src/components/settings/TTSSettings.tsx
+++ b/frontend/src/components/settings/TTSSettings.tsx
@@ -635,7 +635,7 @@ export function TTSSettings() {
                   <div className="space-y-2">
                     <Label>TTS Model</Label>
                       <Combobox
-                        value={coquiModels?.currentModel || 'tts_models/en/vctk/vits'}
+                        value={coquiModels?.currentModel || 'tts_models/en/ljspeech/vits'}
                         onChange={(modelId) => {
                           if (modelId !== coquiModels?.currentModel) {
                             changeCoquiModelMutation.mutate(modelId)
diff --git a/scripts/coqui-server.py b/scripts/coqui-server.py
index 6c7cc689..ca776584 100644
--- a/scripts/coqui-server.py
+++ b/scripts/coqui-server.py
@@ -55,7 +55,7 @@
 COQUI_PORT = int(os.environ.get("COQUI_PORT", "5554"))
 COQUI_HOST = os.environ.get("COQUI_HOST", "127.0.0.1")
 COQUI_DEVICE = os.environ.get("COQUI_DEVICE", "auto")
-COQUI_MODEL = os.environ.get("COQUI_MODEL", "tts_models/en/vctk/vits")
+COQUI_MODEL = os.environ.get("COQUI_MODEL", "tts_models/en/ljspeech/vits")
 
 # Global state
 model: Optional[TTS] = None
@@ -67,23 +67,23 @@
 # Curated list of high-quality TTS models
 RECOMMENDED_MODELS = [
     {
-        "id": "tts_models/en/vctk/vits",
-        "name": "VCTK VITS",
-        "description": "VCTK VITS (109 speakers, recommended)",
+        "id": "tts_models/en/ljspeech/vits",
+        "name": "LJSpeech VITS",
+        "description": "LJSpeech single speaker (recommended, low memory ~1-2GB)",
         "language": "en",
         "quality": "high",
         "speed": "fast",
-        "multi_speaker": True,
+        "multi_speaker": False,
         "recommended": True
     },
     {
-        "id": "tts_models/en/ljspeech/vits",
-        "name": "LJSpeech VITS",
-        "description": "LJSpeech single speaker",
+        "id": "tts_models/en/vctk/vits",
+        "name": "VCTK VITS",
+        "description": "VCTK VITS (109 speakers, ~9GB memory)",
         "language": "en",
         "quality": "high",
         "speed": "fast",
-        "multi_speaker": False
+        "multi_speaker": True
     },
     {
         "id": "tts_models/en/jenny/jenny",
diff --git a/shared/src/schemas/settings.ts b/shared/src/schemas/settings.ts
index 1f4d0519..921aa363 100644
--- a/shared/src/schemas/settings.ts
+++ b/shared/src/schemas/settings.ts
@@ -190,7 +190,7 @@ export const DEFAULT_TTS_CONFIG: TTSConfig = {
   provider: 'coqui',
   endpoint: "https://api.openai.com",
   apiKey: "",
-  voice: "p339",
+  voice: "default",
   model: "tts-1",
   speed: 1.0,
   availableVoices: [],