diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py b/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py
index 9036d4089..31faf7791 100644
--- a/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py
+++ b/runtimes/huggingface/mlserver_huggingface/codecs/__init__.py
@@ -1,4 +1,5 @@
 from .base import MultiInputRequestCodec, HuggingfaceRequestCodec
+from .audio import AudioBytesCodec
 from .image import PILImageCodec
 from .json import HuggingfaceSingleJSONCodec
 from .jsonlist import HuggingfaceListJSONCodec
@@ -10,6 +11,7 @@
 __all__ = [
     "MultiInputRequestCodec",
     "HuggingfaceRequestCodec",
+    "AudioBytesCodec",
     "PILImageCodec",
     "HuggingfaceSingleJSONCodec",
     "HuggingfaceListJSONCodec",
diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/audio.py b/runtimes/huggingface/mlserver_huggingface/codecs/audio.py
new file mode 100644
index 000000000..b5d6c3fe5
--- /dev/null
+++ b/runtimes/huggingface/mlserver_huggingface/codecs/audio.py
@@ -0,0 +1,90 @@
+import base64
+from typing import List, Any, Union
+from mlserver.codecs.base import InputCodec, register_input_codec
+from mlserver.codecs.lists import as_list, is_list_of
+from mlserver.types import RequestInput, ResponseOutput, Parameters
+
+
+def _audio_base64encode(audio_bytes: bytes) -> str:
+    """Encode raw audio bytes as an ASCII base64 string."""
+    return base64.b64encode(audio_bytes).decode()
+
+
+def _audio_base64decode(audio_b64: Union[bytes, str]) -> bytes:
+    """Decode a base64 payload, given as ``str`` or ``bytes``, to audio bytes."""
+    if isinstance(audio_b64, bytes):
+        audio_b64 = audio_b64.decode()
+    return base64.b64decode(audio_b64)
+
+
+def _decode_audio_elements(packed: Any) -> List[bytes]:
+    """Base64-decode every element that ``as_list`` yields from ``packed``."""
+    return [_audio_base64decode(audio) for audio in as_list(packed)]
+
+
+@register_input_codec
+class AudioBytesCodec(InputCodec):
+    """
+    Codec that converts to / from raw audio bytes input.
+
+    Each payload element is a ``bytes`` object of raw audio. Encoding turns
+    every element into a base64 string inside a ``BYTES`` tensor of shape
+    ``[len(payload), 1]``; decoding reverses that. The decoded bytes can be
+    passed directly to HuggingFace pipelines for automatic speech recognition.
+    """
+
+    ContentType = "audio_bytes"
+    TypeHint = List[bytes]
+
+    @classmethod
+    def can_encode(cls, payload: Any) -> bool:
+        """Return whether ``payload`` is a list whose elements are ``bytes``."""
+        # NOTE(review): HuggingfaceRequestCodec lists this codec first, so any
+        # payload accepted here never reaches the other codecs. Confirm how
+        # ``is_list_of`` treats an empty list.
+        return is_list_of(payload, bytes)
+
+    @classmethod
+    def encode_output(
+        cls, name: str, payload: List[bytes], **kwargs
+    ) -> ResponseOutput:
+        """
+        Build a ``BYTES`` output holding one base64 string per payload element.
+
+        Extra keyword arguments are accepted but not used.
+        """
+        packed = [_audio_base64encode(audio) for audio in payload]
+        return ResponseOutput(
+            name=name,
+            parameters=Parameters(content_type=cls.ContentType),
+            datatype="BYTES",
+            shape=[len(packed), 1],
+            data=packed,
+        )
+
+    @classmethod
+    def decode_output(cls, response_output: ResponseOutput) -> List[bytes]:
+        """Return the raw audio bytes of every element in ``response_output``."""
+        return _decode_audio_elements(response_output.data.root)
+
+    @classmethod
+    def encode_input(
+        cls, name: str, payload: List[bytes], **kwargs
+    ) -> RequestInput:
+        """
+        Build a ``BYTES`` input with the same wire format as ``encode_output``.
+        """
+        # Forward kwargs instead of dropping them so both paths stay aligned.
+        output = cls.encode_output(name, payload, **kwargs)
+        return RequestInput(
+            name=output.name,
+            parameters=Parameters(content_type=cls.ContentType),
+            datatype=output.datatype,
+            shape=output.shape,
+            data=output.data,
+        )
+
+    @classmethod
+    def decode_input(cls, request_input: RequestInput) -> List[bytes]:
+        """Return the raw audio bytes of every element in ``request_input``."""
+        return _decode_audio_elements(request_input.data.root)
diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/base.py b/runtimes/huggingface/mlserver_huggingface/codecs/base.py
index 4e654bd0e..22a6a287d 100644
--- a/runtimes/huggingface/mlserver_huggingface/codecs/base.py
+++ b/runtimes/huggingface/mlserver_huggingface/codecs/base.py
@@ -22,6 +22,7 @@
     RequestInput,
     ResponseOutput,
 )
+from .audio import AudioBytesCodec
 from .image import PILImageCodec
 from .json import HuggingfaceSingleJSONCodec
 from .jsonlist import HuggingfaceListJSONCodec
@@ -203,6 +204,7 @@ def decode_request(cls, request: InferenceRequest) -> Dict[str, Any]:
 @register_request_codec
 class HuggingfaceRequestCodec(MultiInputRequestCodec):
     InputCodecsWithPriority = [
+        AudioBytesCodec,
         PILImageCodec,
         HuggingfaceSingleJSONCodec,
         HuggingfaceListJSONCodec,
diff --git a/runtimes/huggingface/mlserver_huggingface/metadata.py b/runtimes/huggingface/mlserver_huggingface/metadata.py
index f4f960b16..185884bfd 100644
--- a/runtimes/huggingface/mlserver_huggingface/metadata.py
+++ b/runtimes/huggingface/mlserver_huggingface/metadata.py
@@ -18,6 +18,13 @@
             datatype="BYTES",
             parameters=dict(content_type="base64"),
         ),
+        # raw audio bytes inputs
+        MetadataTensor(
+            name="inputs",
+            shape=[-1],
+            datatype="BYTES",
+            parameters=dict(content_type="audio_bytes"),
+        ),
         # numpy.ndarray inputs
         MetadataTensor(
             name="inputs",
@@ -51,6 +58,13 @@
             datatype="BYTES",
             parameters=dict(content_type="base64"),
         ),
+        # raw audio bytes inputs
+        MetadataTensor(
+            name="inputs",
+            shape=[-1],
+            datatype="BYTES",
+            parameters=dict(content_type="audio_bytes"),
+        ),
         # numpy.ndarray inputs
         MetadataTensor(
             name="inputs",