From 49449e516c4e6f0ecfaec37508639f499668be2b Mon Sep 17 00:00:00 2001 From: engineer Date: Sat, 14 Feb 2026 18:33:24 -0800 Subject: [PATCH 1/2] feat: change default TTS voice to p339 (American female) and add all American female VCTK speakers to metadata --- scripts/coqui-server.py | 47 +++++++++++++++++++++++++--------- shared/src/schemas/settings.ts | 2 +- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/scripts/coqui-server.py b/scripts/coqui-server.py index 17d6c386..6c7cc689 100644 --- a/scripts/coqui-server.py +++ b/scripts/coqui-server.py @@ -115,34 +115,57 @@ } ] -# Detailed metadata for popular VCTK speakers +# Detailed metadata for VCTK speakers +# Source: VCTK corpus speaker-info.txt (University of Edinburgh) VCTK_METADATA = { - "p226": {"gender": "Male", "accent": "English", "desc": "Clear, professional (recommended)"}, + # British English speakers "p225": {"gender": "Female", "accent": "English", "desc": "Clear, neutral"}, + "p226": {"gender": "Male", "accent": "English", "desc": "Clear, professional"}, "p227": {"gender": "Male", "accent": "English", "desc": "Deep voice"}, "p228": {"gender": "Female", "accent": "English", "desc": "Warm tone"}, "p229": {"gender": "Female", "accent": "English", "desc": "Higher pitch"}, "p230": {"gender": "Female", "accent": "English", "desc": "Soft voice"}, "p231": {"gender": "Male", "accent": "English", "desc": "Standard"}, "p232": {"gender": "Male", "accent": "English", "desc": "Casual"}, - "p233": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"}, - "p234": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"}, "p236": {"gender": "Female", "accent": "English", "desc": "Professional"}, - "p237": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, - "p238": {"gender": "Female", "accent": "N. Irish", "desc": "Northern Irish"}, "p239": {"gender": "Female", "accent": "English", "desc": "Young voice"}, "p240": {"gender": "Female", "accent": "English", "desc": "Mature voice"}, - "p241": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, "p243": {"gender": "Male", "accent": "English", "desc": "Deep, authoritative"}, "p244": {"gender": "Female", "accent": "English", "desc": "Bright voice"}, - "p245": {"gender": "Male", "accent": "Irish", "desc": "Irish accent"}, + "p250": {"gender": "Female", "accent": "English", "desc": "Standard"}, + "p256": {"gender": "Male", "accent": "English", "desc": "Natural, clear"}, + # Scottish speakers + "p233": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"}, + "p234": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"}, + "p237": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, + "p241": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, "p246": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, "p247": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"}, - "p248": {"gender": "Female", "accent": "Indian", "desc": "Indian English"}, "p249": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"}, - "p250": {"gender": "Female", "accent": "English", "desc": "Standard"}, + # Irish speakers + "p238": {"gender": "Female", "accent": "N. Irish", "desc": "Northern Irish"}, + "p245": {"gender": "Male", "accent": "Irish", "desc": "Irish accent"}, + # Indian English speakers + "p248": {"gender": "Female", "accent": "Indian", "desc": "Indian English"}, "p251": {"gender": "Male", "accent": "Indian", "desc": "Indian English"}, - "p256": {"gender": "Male", "accent": "English", "desc": "Natural, clear (default)"}, + # American female speakers + "p294": {"gender": "Female", "accent": "American", "desc": "San Francisco, age 33, mature"}, + "p297": {"gender": "Female", "accent": "American", "desc": "New York, age 20, young"}, + "p299": {"gender": "Female", "accent": "American", "desc": "California, age 25, soft"}, + "p300": {"gender": "Female", "accent": "American", "desc": "California, age 23, relaxed"}, + "p301": {"gender": "Female", "accent": "American", "desc": "North Carolina, age 23"}, + "p305": {"gender": "Female", "accent": "American", "desc": "Philadelphia, age 19, young"}, + "p306": {"gender": "Female", "accent": "American", "desc": "New York, age 21"}, + "p308": {"gender": "Female", "accent": "American", "desc": "Alabama, age 18, Southern"}, + "p310": {"gender": "Female", "accent": "American", "desc": "Tennessee, age 21, Southern"}, + "p318": {"gender": "Female", "accent": "American", "desc": "Napa, age 32, mature California"}, + "p329": {"gender": "Female", "accent": "American", "desc": "Age 23"}, + "p330": {"gender": "Female", "accent": "American", "desc": "Age 26"}, + "p333": {"gender": "Female", "accent": "American", "desc": "Indiana, age 19, young"}, + "p339": {"gender": "Female", "accent": "American", "desc": "Pennsylvania, age 21, soft (default)"}, + "p341": {"gender": "Female", "accent": "American", "desc": "Ohio, age 26"}, + "p361": {"gender": "Female", "accent": "American", "desc": "New Jersey, age 19, young"}, + "p362": {"gender": "Female", "accent": "American", "desc": "Age 29"}, } @@ -358,7 +381,7 @@ async def list_voices(): if meta: desc = f"{meta['desc']} ({meta['gender']}, {meta['accent']})" - is_recommended = speaker == "p226" + is_recommended = speaker == "p339" else: desc = f"Speaker: {speaker}" is_recommended = False diff --git a/shared/src/schemas/settings.ts b/shared/src/schemas/settings.ts index 2b3326f0..1f4d0519 100644 --- a/shared/src/schemas/settings.ts +++ b/shared/src/schemas/settings.ts @@ -190,7 +190,7 @@ export const DEFAULT_TTS_CONFIG: TTSConfig = { provider: 'coqui', endpoint: "https://api.openai.com", apiKey: "", - voice: "p256", + voice: "p339", model: "tts-1", speed: 1.0, availableVoices: [], From 9f4f01c1026efc90ef5f1b291791fd060e6f1983 Mon Sep 17 00:00:00 2001 From: engineer Date: Sat, 14 Feb 2026 18:35:59 -0800 Subject: [PATCH 2/2] docs: add voice selection guide and American female speaker table to README --- README.md | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cbe7bdd8..dc86a355 100644 --- a/README.md +++ b/README.md @@ -168,10 +168,11 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature - **Dual Provider Support** - Browser-native Web Speech API + external OpenAI-compatible endpoints - **Browser-Native TTS** - Built-in Web Speech API for instant playback without API keys - **Coqui TTS with Multi-Model Support** - 7+ high-quality English voice models with runtime switching: - - Jenny (default, fastest) + - VCTK VITS (109 multi-speaker voices, default — **p339**, soft American female) + - Jenny (single-speaker, fast) - LJSpeech VITS, Tacotron2, Glow-TTS, FastPitch - - VCTK VITS (109 multi-speaker voices) - XTTS v2 (multilingual voice cloning) +- **17 American Female Voices** - Curated selection from the VCTK corpus with detailed metadata (region, age, vocal quality) - **AI Message Playback** - Listen to assistant responses with TTS - **OpenAI-Compatible** - Works with any OpenAI-compatible TTS endpoint - **Voice & Speed Discovery** - Automatic voice detection with caching (1hr TTL) @@ -181,6 +182,30 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature - **Floating Controls** - Persistent stop button for audio control - **Custom Endpoints** - Connect to local or self-hosted TTS services +**Available American Female Voices (VCTK model):** + +| Voice ID | Region | Age | Description | +|----------|--------|-----|-------------| +| **p339** (default) | Pennsylvania | 21 | Soft, warm tone | +| p294 | San Francisco | 33 | Mature | +| p297 | New York | 20 | Young | +| p299 | California | 25 | Soft | +| p300 | California | 23 | Relaxed | +| p301 | North Carolina | 23 | — | +| p305 | Philadelphia | 19 | Young | +| p306 | New York | 21 | — | +| p308 | Alabama | 18 | Southern | +| p310 | Tennessee | 21 | Southern | +| p318 | Napa | 32 | Mature California | +| p329 | — | 23 | — | +| p330 | — | 26 | — | +| p333 | Indiana | 19 | Young | +| p341 | Ohio | 26 | — | +| p361 | New Jersey | 19 | Young | +| p362 | — | 29 | — | + +The VCTK model also includes British, Scottish, Irish, and Indian English speakers (109 total). Change the voice in Settings → Voice → TTS Voice. + ### Session Management - **Session Pruning** - Automatic cleanup of old sessions to save disk space - **Auto-Prune on Startup** - Configurable retention period (default: 30 days) @@ -325,7 +350,8 @@ stt: tts: status: running provider: coqui - model: tts_models/en/jenny/jenny + model: tts_models/en/vctk/vits + voice: p339 tunnel: status: connected