diff --git a/agentscope-core/src/main/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoice.java b/agentscope-core/src/main/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoice.java new file mode 100644 index 000000000..f5f7aac54 --- /dev/null +++ b/agentscope-core/src/main/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoice.java @@ -0,0 +1,238 @@ +/* + * Copyright 2024-2026 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.agentscope.core.model.tts; + +import java.util.Locale; +import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; + +/** + * Predefined voices for Qwen3 TTS Flash / Realtime models. + * + *

The {@code voiceId} values correspond to the {@code voice} parameter + * accepted by qwen3-tts-flash and qwen3-tts-flash-realtime. + */ +public enum Qwen3TTSFlashVoice { + + /** + * 芊悦 (Cherry) - A sunny, positive, friendly, and natural young woman. + */ + CHERRY("Cherry", "芊悦", Gender.FEMALE, "A sunny, positive, friendly, and natural young woman"), + + /** + * 晨煦 (Ethan) - A bright, warm, energetic, and vibrant male voice with a standard Mandarin pronunciation and a slight northern accent. + */ + ETHAN( + "Ethan", + "晨煦", + Gender.MALE, + "A bright, warm, energetic, and vibrant male voice with a standard Mandarin" + + " pronunciation and a slight northern accent"), + + /** + * 不吃鱼 (Nofish) - A male designer who cannot pronounce retroflex sounds. + */ + NOFISH("Nofish", "不吃鱼", Gender.MALE, "A male designer who cannot pronounce retroflex sounds"), + + /** + * 詹妮弗 (Jennifer) - A premium, cinematic American English female voice. + */ + JENNIFER( + "Jennifer", "詹妮弗", Gender.FEMALE, "A premium, cinematic American English female voice"), + + /** + * 甜茶 (Ryan) - A rhythmic and dramatic voice with a sense of realism and tension. + */ + RYAN( + "Ryan", + "甜茶", + Gender.MALE, + "A rhythmic and dramatic voice with a sense of realism and tension"), + + /** + * 卡捷琳娜 (Katerina) - A mature female voice with a rich rhythm and lingering resonance. + */ + KATERINA( + "Katerina", + "卡捷琳娜", + Gender.FEMALE, + "A mature female voice with a rich rhythm and lingering resonance"), + + /** + * 墨讲师 (Elias) - A voice that maintains academic rigor while using storytelling techniques to transform complex knowledge into digestible cognitive modules. + */ + ELIAS( + "Elias", + "墨讲师", + Gender.MALE, + "A voice that maintains academic rigor while using storytelling techniques to transform" + + " complex knowledge into digestible cognitive modules"), + + /** + * 上海-阿珍 (Jada) - An energetic woman from Shanghai. + */ + JADA("Jada", "上海-阿珍", Gender.FEMALE, "An energetic woman from Shanghai"), + + /** + * 北京-晓东 (Dylan) - A teenage boy who grew up in the hutongs of Beijing. + */ + DYLAN("Dylan", "北京-晓东", Gender.MALE, "A teenage boy who grew up in the hutongs of Beijing"), + + /** + * 四川-晴儿 (Sunny) - The voice of a Sichuan girl whose sweetness melts your heart. + */ + SUNNY( + "Sunny", + "四川-晴儿", + Gender.FEMALE, + "The voice of a Sichuan girl whose sweetness melts your heart"), + + /** + * 南京-老李 (li) - Patient male yoga instructor. + */ + LI("li", "南京-老李", Gender.MALE, "Patient male yoga instructor"), + + /** + * 陕西-秦川 (Marcus) - A voice that is broad-faced and brief-spoken, sincere-hearted and deep-voiced—the authentic flavor of Shaanxi. + */ + MARCUS( + "Marcus", + "陕西-秦川", + Gender.MALE, + "A voice that is broad-faced and brief-spoken, sincere-hearted and deep-voiced—the" + + " authentic flavor of Shaanxi"), + + /** + * 闽南-阿杰 (Roy) - The voice of a humorous, straightforward, and lively young Taiwanese man. + */ + ROY( + "Roy", + "闽南-阿杰", + Gender.MALE, + "The voice of a humorous, straightforward, and lively young Taiwanese man"), + + /** + * 天津-李彼得 (Peter) - The voice of a professional straight man in Tianjin crosstalk. + */ + PETER( + "Peter", + "天津-李彼得", + Gender.MALE, + "The voice of a professional straight man in Tianjin crosstalk"), + + /** + * 粤语-阿强 (Rocky) - The voice of the humorous and witty Rocky, here for online chatting. + */ + ROCKY( + "Rocky", + "粤语-阿强", + Gender.MALE, + "The voice of the humorous and witty Rocky, here for online chatting"), + + /** + * 粤语-阿清 (Kiki) - A sweet female companion from Hong Kong. + */ + KIKI("Kiki", "粤语-阿清", Gender.FEMALE, "A sweet female companion from Hong Kong"), + + /** + * 四川-程川 (Eric) - An unconventional man from Chengdu, Sichuan. + */ + ERIC("Eric", "四川-程川", Gender.MALE, "An unconventional man from Chengdu, Sichuan"); + + private final String voiceId; + private final String displayName; + private final Gender gender; + private final String description; + + Qwen3TTSFlashVoice(String voiceId, String displayName, Gender gender, String description) { + this.voiceId = voiceId; + this.displayName = displayName; + this.gender = gender; + this.description = description; + } + + /** + * Voice id to use as the {@code voice} parameter in DashScope TTS requests. + */ + public String getVoiceId() { + return voiceId; + } + + /** + * Human friendly display name (typically Chinese). + */ + public String getDisplayName() { + return displayName; + } + + /** + * Gender of this voice (for informational / filtering purposes). + */ + public Gender getGender() { + return gender; + } + + /** + * Short description of the voice characteristics. + */ + public String getDescription() { + return description; + } + + /** + * Find a voice enum by its voiceId (case-insensitive). + * + * @param voiceId the voice id string, e.g. "Cherry" + * @return matching enum value, or {@code null} if not found + */ + public static Qwen3TTSFlashVoice fromVoiceId(String voiceId) { + if (voiceId == null || voiceId.isEmpty()) { + return null; + } + String normalized = voiceId.toLowerCase(Locale.ROOT); + for (Qwen3TTSFlashVoice v : values()) { + if (v.voiceId.toLowerCase(Locale.ROOT).equals(normalized)) { + return v; + } + } + return null; + } + + /** + * Pick a random voice using {@link ThreadLocalRandom}. + */ + public static Qwen3TTSFlashVoice random() { + return random(ThreadLocalRandom.current()); + } + + /** + * Pick a random voice using the provided {@link Random} instance. + */ + public static Qwen3TTSFlashVoice random(Random random) { + Qwen3TTSFlashVoice[] all = values(); + if (all.length == 0) { + throw new IllegalStateException("No Qwen3TTSFlashVoice defined"); + } + int idx = random.nextInt(all.length); + return all[idx]; + } + + /** Simple gender enum for voices. */ + public enum Gender { + MALE, + FEMALE + } +} diff --git a/agentscope-core/src/test/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoiceTest.java b/agentscope-core/src/test/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoiceTest.java new file mode 100644 index 000000000..8a52530ca --- /dev/null +++ b/agentscope-core/src/test/java/io/agentscope/core/model/tts/Qwen3TTSFlashVoiceTest.java @@ -0,0 +1,167 @@ +/* + * Copyright 2024-2026 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.agentscope.core.model.tts; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashSet; +import java.util.Random; +import java.util.Set; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for Qwen3TTSFlashVoice enum. + */ +class Qwen3TTSFlashVoiceTest { + + @Test + @DisplayName("should have 17 voice constants defined") + void shouldHave17Voices() { + assertEquals(17, Qwen3TTSFlashVoice.values().length); + } + + @Test + @DisplayName("should have correct voiceId for CHERRY") + void shouldHaveCorrectVoiceIdForCherry() { + assertEquals("Cherry", Qwen3TTSFlashVoice.CHERRY.getVoiceId()); + assertEquals("芊悦", Qwen3TTSFlashVoice.CHERRY.getDisplayName()); + assertEquals(Qwen3TTSFlashVoice.Gender.FEMALE, Qwen3TTSFlashVoice.CHERRY.getGender()); + assertNotNull(Qwen3TTSFlashVoice.CHERRY.getDescription()); + } + + @Test + @DisplayName("should have correct voiceId for ETHAN") + void shouldHaveCorrectVoiceIdForEthan() { + assertEquals("Ethan", Qwen3TTSFlashVoice.ETHAN.getVoiceId()); + assertEquals("晨煦", Qwen3TTSFlashVoice.ETHAN.getDisplayName()); + assertEquals(Qwen3TTSFlashVoice.Gender.MALE, Qwen3TTSFlashVoice.ETHAN.getGender()); + assertNotNull(Qwen3TTSFlashVoice.ETHAN.getDescription()); + } + + @Test + @DisplayName("should have correct gender for ELIAS") + void shouldHaveCorrectGenderForElias() { + assertEquals("Elias", Qwen3TTSFlashVoice.ELIAS.getVoiceId()); + assertEquals(Qwen3TTSFlashVoice.Gender.MALE, Qwen3TTSFlashVoice.ELIAS.getGender()); + } + + @Test + @DisplayName("should find voice by voiceId case-insensitively") + void shouldFindVoiceByVoiceId() { + assertEquals(Qwen3TTSFlashVoice.CHERRY, Qwen3TTSFlashVoice.fromVoiceId("Cherry")); + assertEquals(Qwen3TTSFlashVoice.CHERRY, Qwen3TTSFlashVoice.fromVoiceId("cherry")); + assertEquals(Qwen3TTSFlashVoice.CHERRY, Qwen3TTSFlashVoice.fromVoiceId("CHERRY")); + + assertEquals(Qwen3TTSFlashVoice.ETHAN, Qwen3TTSFlashVoice.fromVoiceId("Ethan")); + assertEquals(Qwen3TTSFlashVoice.LI, Qwen3TTSFlashVoice.fromVoiceId("li")); + assertEquals(Qwen3TTSFlashVoice.KIKI, Qwen3TTSFlashVoice.fromVoiceId("Kiki")); + } + + @Test + @DisplayName("should return null for non-existent voiceId") + void shouldReturnNullForNonExistentVoiceId() { + assertNull(Qwen3TTSFlashVoice.fromVoiceId("NonExistent")); + assertNull(Qwen3TTSFlashVoice.fromVoiceId("Unknown")); + } + + @Test + @DisplayName("should return null for null or empty voiceId") + void shouldReturnNullForNullOrEmptyVoiceId() { + assertNull(Qwen3TTSFlashVoice.fromVoiceId(null)); + assertNull(Qwen3TTSFlashVoice.fromVoiceId("")); + } + + @Test + @DisplayName("should return random voice using ThreadLocalRandom") + void shouldReturnRandomVoice() { + Qwen3TTSFlashVoice voice1 = Qwen3TTSFlashVoice.random(); + assertNotNull(voice1); + + // Call multiple times to verify randomness (not guaranteed to be different but should + // work) + Set voices = new HashSet<>(); + for (int i = 0; i < 50; i++) { + voices.add(Qwen3TTSFlashVoice.random()); + } + // With 17 voices and 50 calls, we should get at least 2 different voices + assertTrue(voices.size() >= 2); + } + + @Test + @DisplayName("should return random voice using provided Random instance") + void shouldReturnRandomVoiceWithProvidedRandom() { + Random random = new Random(12345); // Fixed seed for reproducibility + Qwen3TTSFlashVoice voice1 = Qwen3TTSFlashVoice.random(random); + assertNotNull(voice1); + + // Reset random with same seed to get same result + random = new Random(12345); + Qwen3TTSFlashVoice voice2 = Qwen3TTSFlashVoice.random(random); + assertEquals(voice1, voice2); + } + + @Test + @DisplayName("should have all voices with non-null properties") + void shouldHaveAllVoicesWithNonNullProperties() { + for (Qwen3TTSFlashVoice voice : Qwen3TTSFlashVoice.values()) { + assertNotNull(voice.getVoiceId(), "voiceId should not be null for " + voice); + assertNotNull(voice.getDisplayName(), "displayName should not be null for " + voice); + assertNotNull(voice.getGender(), "gender should not be null for " + voice); + assertNotNull(voice.getDescription(), "description should not be null for " + voice); + } + } + + @Test + @DisplayName("should have unique voiceIds for all voices") + void shouldHaveUniqueVoiceIds() { + Set voiceIds = new HashSet<>(); + for (Qwen3TTSFlashVoice voice : Qwen3TTSFlashVoice.values()) { + assertTrue( + voiceIds.add(voice.getVoiceId()), + "Duplicate voiceId found: " + voice.getVoiceId()); + } + assertEquals(17, voiceIds.size()); + } + + @Test + @DisplayName("Gender enum should have MALE and FEMALE") + void genderEnumShouldHaveMaleAndFemale() { + assertEquals(2, Qwen3TTSFlashVoice.Gender.values().length); + assertEquals(Qwen3TTSFlashVoice.Gender.MALE, Qwen3TTSFlashVoice.Gender.valueOf("MALE")); + assertEquals(Qwen3TTSFlashVoice.Gender.FEMALE, Qwen3TTSFlashVoice.Gender.valueOf("FEMALE")); + } + + @Test + @DisplayName("should have correct distribution of male and female voices") + void shouldHaveCorrectGenderDistribution() { + int maleCount = 0; + int femaleCount = 0; + for (Qwen3TTSFlashVoice voice : Qwen3TTSFlashVoice.values()) { + if (voice.getGender() == Qwen3TTSFlashVoice.Gender.MALE) { + maleCount++; + } else if (voice.getGender() == Qwen3TTSFlashVoice.Gender.FEMALE) { + femaleCount++; + } + } + assertEquals(17, maleCount + femaleCount, "Total male + female should equal total voices"); + assertTrue(maleCount > 0, "Should have at least one male voice"); + assertTrue(femaleCount > 0, "Should have at least one female voice"); + } +} diff --git a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEvent.java b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEvent.java index 7a0a771be..e785ce523 100644 --- a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEvent.java +++ b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEvent.java @@ -166,4 +166,16 @@ public static GameEvent userInputReceived(String inputType, String content) { GameEventType.USER_INPUT_RECEIVED, Map.of("inputType", inputType, "content", content)); } + + /** + * Create an audio chunk event for TTS. + * + * @param playerName The name of the player speaking + * @param audioBase64 Base64 encoded audio data + * @return The event + */ + public static GameEvent audioChunk(String playerName, String audioBase64) { + return new GameEvent( + GameEventType.AUDIO_CHUNK, Map.of("player", playerName, "audio", audioBase64)); + } } diff --git a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventEmitter.java b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventEmitter.java index c38c35236..9fc7d455c 100644 --- a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventEmitter.java +++ b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventEmitter.java @@ -363,6 +363,19 @@ public void emitUserInputReceived(String inputType, String content) { playerSink.tryEmitNext(event); } + /** + * Emit an audio chunk for TTS. + * Audio is always public (everyone can hear day discussion). + * + * @param playerName The name of the player speaking + * @param audioBase64 Base64 encoded audio data + */ + public void emitAudioChunk(String playerName, String audioBase64) { + GameEvent event = GameEvent.audioChunk(playerName, audioBase64); + godViewHistory.add(event); + playerSink.tryEmitNext(event); + } + /** * Get the player event stream as a Flux. * This stream contains events visible to the human player based on their role. diff --git a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventType.java b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventType.java index 758bd3fb9..02305009f 100644 --- a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventType.java +++ b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/GameEventType.java @@ -59,5 +59,8 @@ public enum GameEventType { WAIT_USER_INPUT, /** User input received confirmation. */ - USER_INPUT_RECEIVED + USER_INPUT_RECEIVED, + + /** Audio chunk for TTS (text-to-speech). */ + AUDIO_CHUNK } diff --git a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/WerewolfWebGame.java b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/WerewolfWebGame.java index d883d52a4..4eaac57eb 100644 --- a/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/WerewolfWebGame.java +++ b/agentscope-examples/werewolf-hitl/src/main/java/io/agentscope/examples/werewolf/web/WerewolfWebGame.java @@ -23,6 +23,7 @@ import io.agentscope.core.agent.user.UserAgent; import io.agentscope.core.formatter.dashscope.DashScopeMultiAgentFormatter; import io.agentscope.core.memory.InMemoryMemory; +import io.agentscope.core.message.Base64Source; import io.agentscope.core.message.MessageMetadataKeys; import io.agentscope.core.message.Msg; import io.agentscope.core.message.MsgRole; @@ -30,6 +31,8 @@ import io.agentscope.core.model.DashScopeChatModel; import io.agentscope.core.model.GenerateOptions; import io.agentscope.core.model.StructuredOutputReminder; +import io.agentscope.core.model.tts.DashScopeRealtimeTTSModel; +import io.agentscope.core.model.tts.Qwen3TTSFlashVoice; import io.agentscope.core.pipeline.MsgHub; import io.agentscope.core.tool.Toolkit; import io.agentscope.examples.werewolf.GameConfiguration; @@ -79,6 +82,8 @@ public class WerewolfWebGame { private DashScopeChatModel model; private GameState gameState; private Player humanPlayer; + // Mapping from player name to assigned TTS voice (randomized per game) + private Map playerVoices; public WerewolfWebGame(GameEventEmitter emitter, LocalizationBundle bundle) { this(emitter, bundle, null, null, new GameConfiguration()); @@ -165,6 +170,9 @@ public void start() throws Exception { } private GameState initializeGame() { + // Initialize per-game TTS voice mapping + playerVoices = new HashMap<>(); + List roles = new ArrayList<>(); for (int i = 0; i < gameConfig.getVillagerCount(); i++) roles.add(Role.VILLAGER); for (int i = 0; i < gameConfig.getWerewolfCount(); i++) roles.add(Role.WEREWOLF); @@ -305,6 +313,15 @@ private GameState initializeGame() { teammates); } + // Assign random TTS voice to each player (independent of roles) + List voices = new ArrayList<>(List.of(Qwen3TTSFlashVoice.values())); + Collections.shuffle(voices); + for (int i = 0; i < players.size(); i++) { + Player player = players.get(i); + Qwen3TTSFlashVoice voice = voices.get(i % voices.size()); + playerVoices.put(player.getName(), voice); + } + return new GameState(players); } @@ -455,7 +472,7 @@ private Player werewolvesKill() { try { VoteModel voteData = vote.getStructuredData(VoteModel.class); emitter.emitPlayerVote( - vote.getName(), + werewolf.getName(), voteData.targetPlayer, voteData.reason, EventVisibility.WEREWOLF_ONLY); @@ -872,6 +889,9 @@ private void discussionPhase() { Msg response = player.getAgent().call().block(); String content = utils.extractTextContent(response); emitter.emitPlayerSpeak(player.getName(), content, "day_discussion"); + + // Generate TTS for AI speech (only during day discussion) + generateTTSForSpeech(player.getName(), content); } } } @@ -946,7 +966,7 @@ private Player votingPhase() { try { VoteModel voteData = vote.getStructuredData(VoteModel.class); emitter.emitPlayerVote( - vote.getName(), + player.getName(), voteData.targetPlayer, voteData.reason, EventVisibility.PUBLIC); @@ -1135,4 +1155,73 @@ private void emitStatsUpdate() { gameState.getAliveWerewolves().size(), gameState.getAliveVillagers().size()); } + + /** + * Generate TTS audio for a player's speech and emit audio chunks to frontend. + * Only called during day discussion phase to avoid generating TTS for votes/actions. + * + * @param playerName The name of the speaking player + * @param text The text content to convert to speech + */ + private void generateTTSForSpeech(String playerName, String text) { + if (text == null || text.trim().isEmpty()) { + return; + } + + String apiKey = System.getenv("DASHSCOPE_API_KEY"); + if (apiKey == null || apiKey.isEmpty()) { + // Skip TTS if no API key + return; + } + + // Resolve voice for this player (fallback to a default if not assigned) + Qwen3TTSFlashVoice voice = playerVoices != null ? playerVoices.get(playerName) : null; + if (voice == null) { + voice = Qwen3TTSFlashVoice.CHERRY; + } + + // Create TTS model for this specific speech + DashScopeRealtimeTTSModel ttsModel = null; + try { + ttsModel = + DashScopeRealtimeTTSModel.builder() + .apiKey(apiKey) + .modelName("qwen3-tts-flash-realtime") + .voice(voice.getVoiceId()) + .sampleRate(24000) + .format("pcm") + .build(); + + // Start session + ttsModel.startSession(); + + // Subscribe to audio stream and emit chunks + ttsModel.getAudioStream() + .doOnNext( + audio -> { + if (audio.getSource() instanceof Base64Source src) { + emitter.emitAudioChunk(playerName, src.getData()); + } + }) + .subscribe(); + + // Push text to TTS + ttsModel.push(text); + + // Finish and wait for all audio + ttsModel.finish().blockLast(); + } catch (Exception e) { + // Log error but don't fail the game + System.err.println("TTS generation error for " + playerName + ": " + e.getMessage()); + } finally { + // Clean up TTS resources + if (ttsModel != null) { + try { + ttsModel.close(); + } catch (Exception e) { + // Ignore cleanup errors + } + } + } + } } diff --git a/agentscope-examples/werewolf-hitl/src/main/resources/static/js/app.js b/agentscope-examples/werewolf-hitl/src/main/resources/static/js/app.js index 7b49ad93b..959e7fabd 100644 --- a/agentscope-examples/werewolf-hitl/src/main/resources/static/js/app.js +++ b/agentscope-examples/werewolf-hitl/src/main/resources/static/js/app.js @@ -24,6 +24,13 @@ let currentInputType = null; let selectedRole = 'RANDOM'; let isSpectatorMode = false; +// Audio state +let audioContext = null; +const playerAudioPlayers = new Map(); // Map +// Global audio playback coordination (single speaker at a time) +let currentSpeakingPlayer = null; +const pendingSpeakingPlayers = []; // Queue of player names waiting to speak + // Role icons mapping const roleIcons = { 'VILLAGER': '👤', @@ -342,6 +349,9 @@ function handleEvent(event) { case 'USER_INPUT_RECEIVED': handleUserInputReceived(data.inputType, data.content); break; + case 'AUDIO_CHUNK': + handleAudioChunk(data.player, data.audio); + break; } } @@ -786,3 +796,220 @@ document.addEventListener('DOMContentLoaded', () => { })); renderPlayers(); }); + +// ==================== Audio Functions ==================== +/** + * Initialize audio context on first user interaction. + */ +function initAudio() { + if (!audioContext) { + audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 24000 }); + } +} + +/** + * Handle audio chunk event from backend. + * + * @param {string} playerName - The name of the speaking player + * @param {string} audioBase64 - Base64 encoded audio data + */ +function handleAudioChunk(playerName, audioBase64) { + if (!audioBase64) return; + + // Initialize audio context + initAudio(); + + // Get or create audio player for this player + let audioPlayer = playerAudioPlayers.get(playerName); + if (!audioPlayer) { + audioPlayer = createAudioPlayerForPlayer(playerName); + playerAudioPlayers.set(playerName, audioPlayer); + } + + // Decode and add to playback queue + const audioData = base64ToArrayBuffer(audioBase64); + addAudioChunk(audioPlayer, audioData); + + // Global coordination: only one player speaks at a time. + if (!currentSpeakingPlayer) { + // No one is speaking, start this player immediately + currentSpeakingPlayer = playerName; + if (!audioPlayer.isPlaying) { + playAudio(audioPlayer, playerName); + } + } else if (currentSpeakingPlayer === playerName) { + // Same player is already speaking, its queue will continue in playAudio + } else { + // Another player is speaking, enqueue this player if not already queued + if (!pendingSpeakingPlayers.includes(playerName)) { + pendingSpeakingPlayers.push(playerName); + } + } +} + +/** + * Create an audio player for a specific player. + * + * @param {string} playerName - Player name + * @returns {object} Audio player object + */ +function createAudioPlayerForPlayer(playerName) { + return { + chunks: [], // Queue of audio chunks + sources: [], // Active audio sources + isPlaying: false, + currentIndex: 0 // Current playback position + }; +} + +/** + * Add audio chunk to player's queue. + * + * @param {object} audioPlayer - Audio player object + * @param {ArrayBuffer} audioData - Audio data + */ +function addAudioChunk(audioPlayer, audioData) { + audioPlayer.chunks.push(audioData); +} + +/** + * Play audio from queue. + * + * @param {object} audioPlayer - Audio player object + * @param {string} playerName - Player name for visual feedback + */ +async function playAudio(audioPlayer, playerName) { + if (audioPlayer.isPlaying || audioPlayer.chunks.length === 0) { + return; + } + + audioPlayer.isPlaying = true; + highlightPlayer(playerName); + + // Play chunks from current index to end + while (audioPlayer.currentIndex < audioPlayer.chunks.length && audioPlayer.isPlaying) { + const chunk = audioPlayer.chunks[audioPlayer.currentIndex]; + audioPlayer.currentIndex++; + await playAudioChunk(chunk, audioPlayer); + + if (!audioPlayer.isPlaying) { + break; + } + } + + // Playback completed + audioPlayer.isPlaying = false; + audioPlayer.currentIndex = 0; // Reset index + audioPlayer.chunks = []; // Clear processed chunks + unhighlightPlayer(playerName); + + // Mark current speaker finished + if (currentSpeakingPlayer === playerName) { + currentSpeakingPlayer = null; + } + + // Start next waiting player if any + while (pendingSpeakingPlayers.length > 0) { + const nextPlayerName = pendingSpeakingPlayers.shift(); + const nextAudioPlayer = playerAudioPlayers.get(nextPlayerName); + if (nextAudioPlayer && nextAudioPlayer.chunks.length > 0) { + currentSpeakingPlayer = nextPlayerName; + if (!nextAudioPlayer.isPlaying) { + // Fire-and-forget, chaining will continue when this playback finishes + playAudio(nextAudioPlayer, nextPlayerName); + } + break; + } + } +} + +/** + * Play a single audio chunk. + * + * @param {ArrayBuffer} audioData - Audio data + * @param {object} audioPlayer - Audio player object + * @returns {Promise} Promise that resolves when chunk finishes playing + */ +async function playAudioChunk(audioData, audioPlayer) { + return new Promise((resolve, reject) => { + if (!audioPlayer.isPlaying) { + resolve(); + return; + } + + try { + // Try to decode as PCM + playRawPCM(audioData, audioPlayer).then(resolve).catch(reject); + } catch (e) { + reject(e); + } + }); +} + +/** + * Play raw PCM audio data. + * + * @param {ArrayBuffer} data - PCM audio data + * @param {object} audioPlayer - Audio player object + * @returns {Promise} Promise that resolves when playback finishes + */ +async function playRawPCM(data, audioPlayer) { + return new Promise((resolve, reject) => { + if (!audioPlayer.isPlaying) { + resolve(); + return; + } + + try { + const pcmData = new Int16Array(data); + const floatData = new Float32Array(pcmData.length); + for (let i = 0; i < pcmData.length; i++) { + floatData[i] = pcmData[i] / 32768.0; + } + + const audioBuffer = audioContext.createBuffer(1, floatData.length, 24000); + audioBuffer.getChannelData(0).set(floatData); + + if (!audioPlayer.isPlaying) { + resolve(); + return; + } + + const source = audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + audioPlayer.sources.push(source); + + source.onended = () => { + const index = audioPlayer.sources.indexOf(source); + if (index > -1) { + audioPlayer.sources.splice(index, 1); + } + resolve(); + }; + + if (audioPlayer.isPlaying) { + source.start(); + } else { + resolve(); + } + } catch (e) { + reject(e); + } + }); +} + +/** + * Convert base64 string to ArrayBuffer. + * + * @param {string} base64 - Base64 encoded string + * @returns {ArrayBuffer} Decoded array buffer + */ +function base64ToArrayBuffer(base64) { + const binaryString = atob(base64); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + return bytes.buffer; +}