Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions runtimes/huggingface/mlserver_huggingface/codecs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .base import MultiInputRequestCodec, HuggingfaceRequestCodec
from .audio import AudioBytesCodec
from .image import PILImageCodec
from .json import HuggingfaceSingleJSONCodec
from .jsonlist import HuggingfaceListJSONCodec
Expand All @@ -10,6 +11,7 @@
__all__ = [
"MultiInputRequestCodec",
"HuggingfaceRequestCodec",
"AudioBytesCodec",
"PILImageCodec",
"HuggingfaceSingleJSONCodec",
"HuggingfaceListJSONCodec",
Expand Down
74 changes: 74 additions & 0 deletions runtimes/huggingface/mlserver_huggingface/codecs/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import base64
from typing import List, Any, Union
from mlserver.codecs.base import InputCodec, register_input_codec
from mlserver.codecs.lists import as_list, is_list_of
from mlserver.types import RequestInput, ResponseOutput, Parameters


def _audio_base64encode(audio_bytes: bytes) -> str:
"""Encode audio bytes to base64 string"""
return base64.b64encode(audio_bytes).decode()


def _audio_base64decode(audio_b64: Union[bytes, str]) -> bytes:
"""Decode base64 string to audio bytes"""
if isinstance(audio_b64, bytes):
audio_b64 = audio_b64.decode()
return base64.b64decode(audio_b64)


@register_input_codec
class AudioBytesCodec(InputCodec):
    """
    Input codec for lists of raw audio ``bytes`` (content type ``audio_bytes``).

    Each audio item is carried as a base64 string in a ``BYTES`` tensor, and
    decoding restores the original bytes, which can be passed directly to
    HuggingFace pipelines for automatic speech recognition.
    """

    ContentType = "audio_bytes"
    TypeHint = List[bytes]

    @classmethod
    def can_encode(cls, payload: Any) -> bool:
        """Return the result of ``is_list_of(payload, bytes)``."""
        return is_list_of(payload, bytes)

    @classmethod
    def encode_output(
        cls, name: str, payload: List[bytes], **kwargs
    ) -> ResponseOutput:
        """
        Build a ``BYTES`` response output with ``payload`` as base64 strings.

        The reported shape is ``[len(payload), 1]``. Extra keyword arguments
        are accepted but not used.
        """
        encoded_items = list(map(_audio_base64encode, payload))
        batch_shape = [len(payload), 1]
        content_params = Parameters(content_type=cls.ContentType)
        return ResponseOutput(
            name=name,
            parameters=content_params,
            datatype="BYTES",
            shape=batch_shape,
            data=encoded_items,
        )

    @classmethod
    def decode_output(cls, response_output: ResponseOutput) -> List[bytes]:
        """Base64-decode every item found in ``response_output.data.root``."""
        items = as_list(response_output.data.root)
        return list(map(_audio_base64decode, items))

    @classmethod
    def encode_input(
        cls, name: str, payload: List[bytes], **kwargs
    ) -> RequestInput:
        """
        Build a request input with the same fields ``encode_output`` produces.

        Extra keyword arguments are accepted but not forwarded.
        """
        # Reuse the output encoding so both directions share one wire format.
        encoded = cls.encode_output(name, payload)
        content_params = Parameters(content_type=cls.ContentType)
        return RequestInput(
            name=encoded.name,
            parameters=content_params,
            datatype=encoded.datatype,
            shape=encoded.shape,
            data=encoded.data,
        )

    @classmethod
    def decode_input(cls, request_input: RequestInput) -> List[bytes]:
        """Base64-decode every item found in ``request_input.data.root``."""
        items = as_list(request_input.data.root)
        return list(map(_audio_base64decode, items))
2 changes: 2 additions & 0 deletions runtimes/huggingface/mlserver_huggingface/codecs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
RequestInput,
ResponseOutput,
)
from .audio import AudioBytesCodec
from .image import PILImageCodec
from .json import HuggingfaceSingleJSONCodec
from .jsonlist import HuggingfaceListJSONCodec
Expand Down Expand Up @@ -203,6 +204,7 @@ def decode_request(cls, request: InferenceRequest) -> Dict[str, Any]:
@register_request_codec
class HuggingfaceRequestCodec(MultiInputRequestCodec):
InputCodecsWithPriority = [
AudioBytesCodec,
PILImageCodec,
HuggingfaceSingleJSONCodec,
HuggingfaceListJSONCodec,
Expand Down
14 changes: 14 additions & 0 deletions runtimes/huggingface/mlserver_huggingface/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
datatype="BYTES",
parameters=dict(content_type="base64"),
),
# raw audio bytes inputs
MetadataTensor(
name="inputs",
shape=[-1],
datatype="BYTES",
parameters=dict(content_type="audio_bytes"),
),
# numpy.ndarray inputs
MetadataTensor(
name="inputs",
Expand Down Expand Up @@ -51,6 +58,13 @@
datatype="BYTES",
parameters=dict(content_type="base64"),
),
# raw audio bytes inputs
MetadataTensor(
name="inputs",
shape=[-1],
datatype="BYTES",
parameters=dict(content_type="audio_bytes"),
),
# numpy.ndarray inputs
MetadataTensor(
name="inputs",
Expand Down
Loading