From 039083d7022c347fb4ba88c0e599505f0ba8b381 Mon Sep 17 00:00:00 2001 From: liweiguang Date: Tue, 10 Feb 2026 13:23:04 +0800 Subject: [PATCH 1/3] fix(voice): handle odd-length audio buffers safely --- src/agents/voice/result.py | 7 ++++++- tests/voice/test_pipeline.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/agents/voice/result.py b/src/agents/voice/result.py index fea79902ea..67fc079651 100644 --- a/src/agents/voice/result.py +++ b/src/agents/voice/result.py @@ -88,7 +88,12 @@ async def _add_error(self, error: Exception): def _transform_audio_buffer( self, buffer: list[bytes], output_dtype: npt.DTypeLike ) -> npt.NDArray[np.int16 | np.float32]: - np_array = np.frombuffer(b"".join(buffer), dtype=np.int16) + combined_buffer = b"".join(buffer) + if len(combined_buffer) % 2 != 0: + # np.int16 needs 2-byte alignment; pad odd-length chunks safely. + combined_buffer += b"\x00" + + np_array = np.frombuffer(combined_buffer, dtype=np.int16) if output_dtype == np.int16: return np_array diff --git a/tests/voice/test_pipeline.py b/tests/voice/test_pipeline.py index 5190446879..7f1bd80163 100644 --- a/tests/voice/test_pipeline.py +++ b/tests/voice/test_pipeline.py @@ -5,7 +5,13 @@ import pytest try: - from agents.voice import AudioInput, TTSModelSettings, VoicePipeline, VoicePipelineConfig + from agents.voice import ( + AudioInput, + StreamedAudioResult, + TTSModelSettings, + VoicePipeline, + VoicePipelineConfig, + ) from .fake_models import FakeStreamedAudioInput, FakeSTT, FakeTTS, FakeWorkflow from .helpers import extract_events @@ -13,6 +19,33 @@ pass +def test_streamed_audio_result_odd_length_buffer_int16() -> None: + result = StreamedAudioResult( + FakeTTS(), + TTSModelSettings(dtype=np.int16), + VoicePipelineConfig(), + ) + + transformed = result._transform_audio_buffer([b"\x01"], np.int16) + + assert transformed.dtype == np.int16 + assert transformed.tolist() == [1] + + +def test_streamed_audio_result_odd_length_buffer_float32() -> None: + result = StreamedAudioResult( + FakeTTS(), + TTSModelSettings(dtype=np.float32), + VoicePipelineConfig(), + ) + + transformed = result._transform_audio_buffer([b"\x01"], np.float32) + + assert transformed.dtype == np.float32 + assert transformed.shape == (1, 1) + assert transformed[0, 0] == pytest.approx(1 / 32767.0) + + @pytest.mark.asyncio async def test_voicepipeline_run_single_turn() -> None: # Single turn. Should produce a single audio output, which is the TTS output for "out_1". From 6b1036277536a6325d8f7259406ee58c33e2c409 Mon Sep 17 00:00:00 2001 From: liweiguang Date: Tue, 10 Feb 2026 13:34:06 +0800 Subject: [PATCH 2/3] fix(voice): preserve split PCM sample boundaries across chunks --- src/agents/voice/result.py | 33 ++++++++++++++++++++++++++------- tests/voice/test_pipeline.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/agents/voice/result.py b/src/agents/voice/result.py index 67fc079651..9dfad571bc 100644 --- a/src/agents/voice/result.py +++ b/src/agents/voice/result.py @@ -123,6 +123,7 @@ async def _stream_audio( first_byte_received = False buffer: list[bytes] = [] full_audio_data: list[bytes] = [] + pending_byte = b"" async for chunk in self.tts_model.run(text, self.tts_settings): if not first_byte_received: @@ -133,15 +134,33 @@ async def _stream_audio( buffer.append(chunk) full_audio_data.append(chunk) if len(buffer) >= self._buffer_size: - audio_np = self._transform_audio_buffer(buffer, self.tts_settings.dtype) - if self.tts_settings.transform_data: - audio_np = self.tts_settings.transform_data(audio_np) - await local_queue.put( - VoiceStreamEventAudio(data=audio_np) - ) # Use local queue + combined = pending_byte + b"".join(buffer) + if len(combined) % 2 != 0: + pending_byte = combined[-1:] + combined = combined[:-1] + else: + pending_byte = b"" + + if combined: + audio_np = self._transform_audio_buffer( + [combined], self.tts_settings.dtype + ) + if self.tts_settings.transform_data: + audio_np = self.tts_settings.transform_data(audio_np) + await local_queue.put( + VoiceStreamEventAudio(data=audio_np) + ) # Use local queue buffer = [] if buffer: - audio_np = self._transform_audio_buffer(buffer, self.tts_settings.dtype) + combined = pending_byte + b"".join(buffer) + else: + combined = pending_byte + + if combined: + # Final flush: pad the remaining half sample if needed. + if len(combined) % 2 != 0: + combined += b"\x00" + audio_np = self._transform_audio_buffer([combined], self.tts_settings.dtype) if self.tts_settings.transform_data: audio_np = self.tts_settings.transform_data(audio_np) await local_queue.put(VoiceStreamEventAudio(data=audio_np)) # Use local queue diff --git a/tests/voice/test_pipeline.py b/tests/voice/test_pipeline.py index 7f1bd80163..b9ad8a6777 100644 --- a/tests/voice/test_pipeline.py +++ b/tests/voice/test_pipeline.py @@ -1,5 +1,7 @@ from __future__ import annotations +import asyncio + import numpy as np import numpy.typing as npt import pytest @@ -11,6 +13,8 @@ TTSModelSettings, VoicePipeline, VoicePipelineConfig, + VoiceStreamEventAudio, + VoiceStreamEventLifecycle, ) from .fake_models import FakeStreamedAudioInput, FakeSTT, FakeTTS, FakeWorkflow @@ -46,6 +50,34 @@ def test_streamed_audio_result_odd_length_buffer_float32() -> None: assert transformed[0, 0] == pytest.approx(1 / 32767.0) +@pytest.mark.asyncio +async def test_streamed_audio_result_preserves_cross_chunk_sample_boundaries() -> None: + class SplitSampleTTS(FakeTTS): + async def run(self, text: str, settings: TTSModelSettings): + del text, settings + yield b"\x01" + yield b"\x00" + + result = StreamedAudioResult( + SplitSampleTTS(), + TTSModelSettings(buffer_size=1, dtype=np.int16), + VoicePipelineConfig(), + ) + local_queue: asyncio.Queue[object] = asyncio.Queue() + + await result._stream_audio("hello", local_queue, finish_turn=True) + + audio_chunks: list[bytes] = [] + while True: + event = await local_queue.get() + if isinstance(event, VoiceStreamEventAudio): + audio_chunks.append(event.data.tobytes()) + if isinstance(event, VoiceStreamEventLifecycle) and event.event == "turn_ended": + break + + assert audio_chunks == [np.array([1], dtype=np.int16).tobytes()] + + @pytest.mark.asyncio async def test_voicepipeline_run_single_turn() -> None: # Single turn. Should produce a single audio output, which is the TTS output for "out_1". From 2a0d9e361bc10ec0521a46a8da81ea8078db2142 Mon Sep 17 00:00:00 2001 From: liweiguang Date: Tue, 10 Feb 2026 14:12:20 +0800 Subject: [PATCH 3/3] test(voice): fix typing in boundary regression test --- tests/voice/test_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/voice/test_pipeline.py b/tests/voice/test_pipeline.py index b9ad8a6777..3134f04d2c 100644 --- a/tests/voice/test_pipeline.py +++ b/tests/voice/test_pipeline.py @@ -13,6 +13,7 @@ TTSModelSettings, VoicePipeline, VoicePipelineConfig, + VoiceStreamEvent, VoiceStreamEventAudio, VoiceStreamEventLifecycle, ) @@ -63,14 +64,14 @@ async def run(self, text: str, settings: TTSModelSettings): TTSModelSettings(buffer_size=1, dtype=np.int16), VoicePipelineConfig(), ) - local_queue: asyncio.Queue[object] = asyncio.Queue() + local_queue: asyncio.Queue[VoiceStreamEvent | None] = asyncio.Queue() await result._stream_audio("hello", local_queue, finish_turn=True) audio_chunks: list[bytes] = [] while True: event = await local_queue.get() - if isinstance(event, VoiceStreamEventAudio): + if isinstance(event, VoiceStreamEventAudio) and event.data is not None: audio_chunks.append(event.data.tobytes()) if isinstance(event, VoiceStreamEventLifecycle) and event.event == "turn_ended": break