Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,11 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature
- **Dual Provider Support** - Browser-native Web Speech API + external OpenAI-compatible endpoints
- **Browser-Native TTS** - Built-in Web Speech API for instant playback without API keys
- **Coqui TTS with Multi-Model Support** - 7+ high-quality English voice models with runtime switching:
VCTK VITS (109 multi-speaker voices, default — **p339**, soft American female)
Jenny (single-speaker, fast)
LJSpeech VITS, Tacotron2, Glow-TTS, FastPitch
XTTS v2 (multilingual voice cloning)
- **17 American Female Voices** - Curated selection from the VCTK corpus with detailed metadata (region, age, vocal quality)
- **AI Message Playback** - Listen to assistant responses with TTS
- **OpenAI-Compatible** - Works with any OpenAI-compatible TTS endpoint
- **Voice & Speed Discovery** - Automatic voice detection with caching (1hr TTL)
Expand All @@ -181,6 +182,30 @@ We regularly sync our fork with upstream sst/opencode to incorporate new feature
- **Floating Controls** - Persistent stop button for audio control
- **Custom Endpoints** - Connect to local or self-hosted TTS services

**Available American Female Voices (VCTK model):**

| Voice ID | Region | Age | Description |
|----------|--------|-----|-------------|
| **p339** (default) | Pennsylvania | 21 | Soft, warm tone |
| p294 | San Francisco | 33 | Mature |
| p297 | New York | 20 | Young |
| p299 | California | 25 | Soft |
| p300 | California | 23 | Relaxed |
| p301 | North Carolina | 23 | — |
| p305 | Philadelphia | 19 | Young |
| p306 | New York | 21 | — |
| p308 | Alabama | 18 | Southern |
| p310 | Tennessee | 21 | Southern |
| p318 | Napa | 32 | Mature California |
| p329 | — | 23 | — |
| p330 | — | 26 | — |
| p333 | Indiana | 19 | Young |
| p341 | Ohio | 26 | — |
| p361 | New Jersey | 19 | Young |
| p362 | — | 29 | — |

The VCTK model also includes British, Scottish, Irish, and Indian English speakers (109 total). Change the voice in Settings → Voice → TTS Voice.

### Session Management
- **Session Pruning** - Automatic cleanup of old sessions to save disk space
- **Auto-Prune on Startup** - Configurable retention period (default: 30 days)
Expand Down Expand Up @@ -325,7 +350,8 @@ stt:
tts:
status: running
provider: coqui
model: tts_models/en/vctk/vits
voice: p339

tunnel:
status: connected
Expand Down
47 changes: 35 additions & 12 deletions scripts/coqui-server.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,34 +115,57 @@
}
]

# Detailed metadata for VCTK speakers, keyed by speaker ID.
# Source: VCTK corpus speaker-info.txt (University of Edinburgh)
#
# Each entry has three string fields:
#   gender - "Male" or "Female"
#   accent - accent label shown to the user
#   desc   - short free-text description
#
# Every speaker ID must appear exactly once: a repeated key in a dict literal
# silently overrides the earlier entry. Only one entry (p339) carries the
# "(default)" marker in its description.
VCTK_METADATA = {
    # British English speakers
    "p225": {"gender": "Female", "accent": "English", "desc": "Clear, neutral"},
    "p226": {"gender": "Male", "accent": "English", "desc": "Clear, professional"},
    "p227": {"gender": "Male", "accent": "English", "desc": "Deep voice"},
    "p228": {"gender": "Female", "accent": "English", "desc": "Warm tone"},
    "p229": {"gender": "Female", "accent": "English", "desc": "Higher pitch"},
    "p230": {"gender": "Female", "accent": "English", "desc": "Soft voice"},
    "p231": {"gender": "Male", "accent": "English", "desc": "Standard"},
    "p232": {"gender": "Male", "accent": "English", "desc": "Casual"},
    "p236": {"gender": "Female", "accent": "English", "desc": "Professional"},
    "p239": {"gender": "Female", "accent": "English", "desc": "Young voice"},
    "p240": {"gender": "Female", "accent": "English", "desc": "Mature voice"},
    "p243": {"gender": "Male", "accent": "English", "desc": "Deep, authoritative"},
    "p244": {"gender": "Female", "accent": "English", "desc": "Bright voice"},
    "p250": {"gender": "Female", "accent": "English", "desc": "Standard"},
    "p256": {"gender": "Male", "accent": "English", "desc": "Natural, clear"},
    # Scottish speakers
    "p233": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"},
    "p234": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"},
    "p237": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"},
    "p241": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"},
    "p246": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"},
    "p247": {"gender": "Male", "accent": "Scottish", "desc": "Scottish accent"},
    "p249": {"gender": "Female", "accent": "Scottish", "desc": "Scottish accent"},
    # Irish speakers
    "p238": {"gender": "Female", "accent": "N. Irish", "desc": "Northern Irish"},
    "p245": {"gender": "Male", "accent": "Irish", "desc": "Irish accent"},
    # Indian English speakers
    "p248": {"gender": "Female", "accent": "Indian", "desc": "Indian English"},
    "p251": {"gender": "Male", "accent": "Indian", "desc": "Indian English"},
    # American female speakers
    "p294": {"gender": "Female", "accent": "American", "desc": "San Francisco, age 33, mature"},
    "p297": {"gender": "Female", "accent": "American", "desc": "New York, age 20, young"},
    "p299": {"gender": "Female", "accent": "American", "desc": "California, age 25, soft"},
    "p300": {"gender": "Female", "accent": "American", "desc": "California, age 23, relaxed"},
    "p301": {"gender": "Female", "accent": "American", "desc": "North Carolina, age 23"},
    "p305": {"gender": "Female", "accent": "American", "desc": "Philadelphia, age 19, young"},
    "p306": {"gender": "Female", "accent": "American", "desc": "New York, age 21"},
    "p308": {"gender": "Female", "accent": "American", "desc": "Alabama, age 18, Southern"},
    "p310": {"gender": "Female", "accent": "American", "desc": "Tennessee, age 21, Southern"},
    "p318": {"gender": "Female", "accent": "American", "desc": "Napa, age 32, mature California"},
    "p329": {"gender": "Female", "accent": "American", "desc": "Age 23"},
    "p330": {"gender": "Female", "accent": "American", "desc": "Age 26"},
    "p333": {"gender": "Female", "accent": "American", "desc": "Indiana, age 19, young"},
    "p339": {"gender": "Female", "accent": "American", "desc": "Pennsylvania, age 21, soft (default)"},
    "p341": {"gender": "Female", "accent": "American", "desc": "Ohio, age 26"},
    "p361": {"gender": "Female", "accent": "American", "desc": "New Jersey, age 19, young"},
    "p362": {"gender": "Female", "accent": "American", "desc": "Age 29"},
}


Expand Down Expand Up @@ -358,7 +381,7 @@ async def list_voices():

if meta:
desc = f"{meta['desc']} ({meta['gender']}, {meta['accent']})"
is_recommended = speaker == "p226"
is_recommended = speaker == "p339"
else:
desc = f"Speaker: {speaker}"
is_recommended = False
Expand Down
2 changes: 1 addition & 1 deletion shared/src/schemas/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ export const DEFAULT_TTS_CONFIG: TTSConfig = {
provider: 'coqui',
endpoint: "https://api.openai.com",
apiKey: "",
voice: "p256",
voice: "p339",
model: "tts-1",
speed: 1.0,
availableVoices: [],
Expand Down
Loading