Striveworks · ajsalow · Aug 20, 2025 · Aug 14, 2025
diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py b/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py
@@ -1,4 +1,5 @@
 from .base import MultiInputRequestCodec, HuggingfaceRequestCodec
+from .audio import AudioBytesCodec
 from .image import PILImageCodec
 from .json import HuggingfaceSingleJSONCodec
 from .jsonlist import HuggingfaceListJSONCodec
@@ -10,6 +11,7 @@
 __all__ = [
     "MultiInputRequestCodec",
     "HuggingfaceRequestCodec",
+    "AudioBytesCodec",
     "PILImageCodec",
     "HuggingfaceSingleJSONCodec",
     "HuggingfaceListJSONCodec",

diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/audio.py b/runtimes/huggingface/mlserver_huggingface/codecs/audio.py
@@ -0,0 +1,74 @@
+import base64
+from typing import List, Any, Union
+from mlserver.codecs.base import InputCodec, register_input_codec
+from mlserver.codecs.lists import as_list, is_list_of
+from mlserver.types import RequestInput, ResponseOutput, Parameters
+
+
+def _audio_base64encode(audio_bytes: bytes) -> str:
+    """Return the base64 text representation of the given audio bytes."""
+    return str(base64.b64encode(audio_bytes), "utf-8")
+
+
+def _audio_base64decode(audio_b64: Union[bytes, str]) -> bytes:
+    """Decode a base64 payload (str or UTF-8 bytes) into raw audio bytes."""
+    is_raw = isinstance(audio_b64, bytes)
+    text = audio_b64.decode() if is_raw else audio_b64
+    return base64.b64decode(text)
+
+
+@register_input_codec
+class AudioBytesCodec(InputCodec):
+    """
+    Codec that converts to / from raw audio bytes input.
+    This codec handles raw audio bytes that can be passed directly
+    to HuggingFace pipelines for automatic speech recognition.
+
+    Each audio clip travels over the wire as one base64 string inside a
+    ``BYTES`` tensor tagged with the ``audio_bytes`` content type.
+    """
+
+    ContentType = "audio_bytes"
+    TypeHint = List[bytes]
+
+    @classmethod
+    def can_encode(cls, payload: Any) -> bool:
+        """Accept payloads for which ``is_list_of(payload, bytes)`` holds."""
+        return is_list_of(payload, bytes)
+
+    @classmethod
+    def encode_output(cls, name: str, payload: List[bytes], **kwargs) -> ResponseOutput:
+        """Base64-encode each clip into a ``BYTES`` output of shape ``[n, 1]``."""
+        packed = [_audio_base64encode(clip) for clip in payload]
+        return ResponseOutput(
+            name=name,
+            parameters=Parameters(content_type=cls.ContentType),
+            datatype="BYTES",
+            shape=[len(payload), 1],
+            data=packed,
+        )
+
+    @classmethod
+    def decode_output(cls, response_output: ResponseOutput) -> List[bytes]:
+        """Base64-decode every element of the output's data into ``bytes``."""
+        encoded = response_output.data.root
+        return [_audio_base64decode(clip) for clip in as_list(encoded)]
+
+    @classmethod
+    def encode_input(cls, name: str, payload: List[bytes], **kwargs) -> RequestInput:
+        """Build a request input carrying the same fields as ``encode_output``."""
+        # Reuse the output encoding so both directions share one wire format.
+        output = cls.encode_output(name, payload)
+        return RequestInput(
+            name=output.name,
+            parameters=Parameters(content_type=cls.ContentType),
+            datatype=output.datatype,
+            shape=output.shape,
+            data=output.data,
+        )
+
+    @classmethod
+    def decode_input(cls, request_input: RequestInput) -> List[bytes]:
+        """Base64-decode every element of the input's data into ``bytes``."""
+        encoded = request_input.data.root
+        return [_audio_base64decode(clip) for clip in as_list(encoded)]
diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/base.py b/runtimes/huggingface/mlserver_huggingface/codecs/base.py
@@ -22,6 +22,7 @@
     RequestInput,
     ResponseOutput,
 )
+from .audio import AudioBytesCodec
 from .image import PILImageCodec
 from .json import HuggingfaceSingleJSONCodec
 from .jsonlist import HuggingfaceListJSONCodec
@@ -203,6 +204,7 @@ def decode_request(cls, request: InferenceRequest) -> Dict[str, Any]:
 @register_request_codec
 class HuggingfaceRequestCodec(MultiInputRequestCodec):
     InputCodecsWithPriority = [
+        AudioBytesCodec,
         PILImageCodec,
         HuggingfaceSingleJSONCodec,
         HuggingfaceListJSONCodec,

diff --git a/runtimes/huggingface/mlserver_huggingface/metadata.py b/runtimes/huggingface/mlserver_huggingface/metadata.py
@@ -18,6 +18,13 @@
                 datatype="BYTES",
                 parameters=dict(content_type="base64"),
             ),
+            # raw audio bytes inputs
+            MetadataTensor(
+                name="inputs",
+                shape=[-1],
+                datatype="BYTES",
+                parameters=dict(content_type="audio_bytes"),
+            ),
             # numpy.ndarray inputs
             MetadataTensor(
                 name="inputs",
@@ -51,6 +58,13 @@
                 datatype="BYTES",
                 parameters=dict(content_type="base64"),
             ),
+            # raw audio bytes inputs
+            MetadataTensor(
+                name="inputs",
+                shape=[-1],
+                datatype="BYTES",
+                parameters=dict(content_type="audio_bytes"),
+            ),
             # numpy.ndarray inputs
             MetadataTensor(
                 name="inputs",