Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* :sparkles: add support for data schema replace on v2
* :coffin: remove unused feedback calls from CLI


## v4.31.0 - 2025-11-04
### Changes
* :label: better field typing for v2
Expand Down
14 changes: 10 additions & 4 deletions mindee/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from mindee import product
from mindee.client import Client
from mindee.client_v2 import ClientV2
from mindee.input.inference_parameters import InferenceParameters
from mindee.input.local_response import LocalResponse
from mindee.input.page_options import PageOptions
from mindee.input.polling_options import PollingOptions
from mindee.input.inference_parameters import (
InferenceParameters,
DataSchemaField,
DataSchema,
DataSchemaReplace,
)
from mindee.input import LocalResponse, PageOptions, PollingOptions
from mindee.input.sources import (
Base64Input,
BytesInput,
Expand All @@ -23,6 +26,9 @@
__all__ = [
"Client",
"ClientV2",
"DataSchema",
"DataSchemaField",
"DataSchemaReplace",
"InferenceParameters",
"FileInput",
"PathInput",
Expand Down
103 changes: 75 additions & 28 deletions mindee/input/inference_parameters.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,80 @@
import json
from dataclasses import dataclass
from dataclasses import dataclass, asdict
from typing import List, Optional, Union

from mindee.input.polling_options import PollingOptions


class DataSchema:
"""Modify the Data Schema."""
@dataclass
class StringDataClass:
"""Base class for dataclasses that can be serialized to JSON."""

_replace: Optional[dict] = None

def __init__(self, replace: Optional[dict] = None):
self._replace = replace

@property
def replace(self):
"""If set, completely replaces the data schema of the model."""
return self._replace

@replace.setter
def replace(self, value: Optional[Union[dict, str]]) -> None:
if value is None:
_replace = None
elif isinstance(value, str):
_replace = json.loads(value)
elif isinstance(value, dict):
_replace = value
else:
raise TypeError("Invalid type for data schema")
if _replace is not None and _replace == {}:
raise ValueError("Empty override provided")
self._replace = _replace
@staticmethod
def _no_none_values(x) -> dict:
"""Don't include None values in the JSON output."""
return {k: v for (k, v) in x if v is not None}

def __str__(self) -> str:
return json.dumps({"replace": self.replace})
return json.dumps(
asdict(self, dict_factory=self._no_none_values), indent=None, sort_keys=True
)


@dataclass
class DataSchemaField(StringDataClass):
    """Definition of a single field inside a data schema."""

    title: str
    """Human-readable display name; it also influences inference results."""
    name: str
    """Identifier of the field within the data schema."""
    is_array: bool
    """True when the field may hold several values."""
    type: str
    """The field's data type."""
    classification_values: Optional[List[str]] = None
    """
    Values permitted when `type` is `classification`.
    Leave empty for every other type.
    """
    unique_values: Optional[bool] = None
    """
    Whether duplicate values are removed from the array.
    Only applicable if `is_array` is True.
    """
    description: Optional[str] = None
    """In-depth description of what the field represents."""
    guidelines: Optional[str] = None
    """Optional guidelines for extraction."""
    nested_fields: Optional[dict] = None
    """
    Subfields used when `type` is `nested_object`.
    Leave empty for every other type.
    """


@dataclass
class DataSchemaReplace(StringDataClass):
    """The structure to completely replace the data schema of the model."""

    fields: List[Union[DataSchemaField, dict]]
    """
    Field definitions of the replacement schema.
    Plain dicts are converted to `DataSchemaField` instances on creation.
    """

    def __post_init__(self) -> None:
        """
        Validate ``fields`` and normalize every entry to a ``DataSchemaField``.

        :raises ValueError: if ``fields`` is empty or None.
        """
        if not self.fields:
            raise ValueError("Data schema replacement fields cannot be empty.")
        # Convert entry by entry rather than judging the whole list by its
        # first element: a list mixing dicts and DataSchemaField objects must
        # neither keep raw dicts around nor try to unpack an object with `**`.
        self.fields = [
            (
                DataSchemaField(**field)  # type: ignore[arg-type]
                if isinstance(field, dict)
                else field
            )
            for field in self.fields
        ]


@dataclass
class DataSchema(StringDataClass):
    """Describe modifications to apply to the Data Schema."""

    replace: Optional[Union[DataSchemaReplace, dict, str]] = None
    """When set, the model's data schema is replaced entirely by this value."""

    def __post_init__(self) -> None:
        """Turn a dict or JSON string ``replace`` into a ``DataSchemaReplace``."""
        raw_replace = self.replace
        if isinstance(raw_replace, str):
            # JSON text: decode it, then build the object from the result.
            self.replace = DataSchemaReplace(**json.loads(raw_replace))
            return
        if isinstance(raw_replace, dict):
            self.replace = DataSchemaReplace(**raw_replace)


@dataclass
Expand Down Expand Up @@ -66,8 +107,14 @@ class InferenceParameters:
Additional text context used by the model during inference.
Not recommended, for specific use only.
"""
data_schema: Optional[DataSchema] = None
data_schema: Optional[Union[DataSchema, str, dict]] = None
"""
Dynamic changes to the data schema of the model for this inference.
Not recommended, for specific use only.
"""

def __post_init__(self):
    """Coerce a dict or JSON string ``data_schema`` into a ``DataSchema``."""
    schema = self.data_schema
    if isinstance(schema, dict):
        self.data_schema = DataSchema(**schema)
    elif isinstance(schema, str):
        # JSON text is decoded first, then unpacked like a dict.
        self.data_schema = DataSchema(**json.loads(schema))
1 change: 1 addition & 0 deletions mindee/parsing/v2/inference_active_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,5 @@ def __str__(self) -> str:
f"\n:Confidence: {self.confidence}"
f"\n:RAG: {self.rag}"
f"\n:Text Context: {self.text_context}"
f"\n\n{self.data_schema}"
)
85 changes: 85 additions & 0 deletions tests/v2/input/test_inference_parameters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import json

import pytest

from mindee import InferenceParameters
from mindee.input.inference_parameters import (
DataSchema,
DataSchemaReplace,
DataSchemaField,
)
from tests.utils import V2_DATA_DIR

# Reference payload used by the tests below, kept both as a parsed dict and
# as its compact, key-sorted JSON serialization.
_data_schema_replace_path = (
    V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
)
expected_data_schema_dict = json.loads(_data_schema_replace_path.read_text())
expected_data_schema_str = json.dumps(
    expected_data_schema_dict,
    sort_keys=True,
    indent=None,
)


def test_data_schema_replace_none():
    """When no data schema is passed, the parameter is None."""
    inference_params = InferenceParameters(model_id="test-id")
    assert inference_params.data_schema is None


def test_data_schema_replace_str():
    """A data schema given as a JSON string serializes back to the expected JSON."""
    inference_params = InferenceParameters(
        data_schema=expected_data_schema_str,
        model_id="test-id",
    )
    serialized = str(inference_params.data_schema)
    assert serialized == expected_data_schema_str


def test_data_schema_replace_dict():
    """A data schema given as a dict serializes to the expected JSON."""
    inference_params = InferenceParameters(
        data_schema=expected_data_schema_dict,
        model_id="test-id",
    )
    serialized = str(inference_params.data_schema)
    assert serialized == expected_data_schema_str


def test_data_schema_replace_obj_top():
    """A DataSchema built from the raw `replace` dict serializes as expected."""
    replace_payload = expected_data_schema_dict["replace"]
    schema = DataSchema(replace=replace_payload)
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    assert str(inference_params.data_schema) == expected_data_schema_str


def test_data_schema_replace_obj_fields():
    """A DataSchemaReplace built from the raw field dicts serializes as expected."""
    raw_fields = expected_data_schema_dict["replace"]["fields"]
    replacement = DataSchemaReplace(fields=raw_fields)
    schema = DataSchema(replace=replacement)
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    assert str(inference_params.data_schema) == expected_data_schema_str


def test_data_schema_replace_empty_fields():
    """An empty `fields` list must be rejected with a ValueError."""
    empty_schema = {"replace": {"fields": []}}
    expected_message = "Data schema replacement fields cannot be empty"
    with pytest.raises(ValueError, match=expected_message):
        InferenceParameters(model_id="test-id", data_schema=empty_schema)


def test_data_schema_replace_obj_full():
    """A schema assembled entirely from typed objects serializes as expected."""
    field = DataSchemaField(
        title="Test Replace",
        name="test_replace",
        is_array=False,
        type="string",
        description="A static value for testing.",
        guidelines="IMPORTANT: always return this exact string: 'a test value'",
    )
    replacement = DataSchemaReplace(fields=[field])
    schema = DataSchema(replace=replacement)
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    assert str(inference_params.data_schema) == expected_data_schema_str
7 changes: 5 additions & 2 deletions tests/v2/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from mindee.error.mindee_error import MindeeApiV2Error, MindeeError
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
from mindee.input import LocalInputSource, PathInput
from mindee.input.inference_parameters import DataSchema
from mindee.mindee_http.base_settings import USER_AGENT
from mindee.parsing.v2.inference import Inference
from mindee.parsing.v2.job import Job
Expand Down Expand Up @@ -141,7 +140,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
InferenceParameters(
"dummy-model",
text_context="ignore this message",
data_schema=DataSchema(replace={"test_field": {}}),
data_schema=json.loads(
(
V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
).read_text()
),
),
)

Expand Down
25 changes: 8 additions & 17 deletions tests/v2/test_client_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
from mindee.parsing.v2 import InferenceActiveOptions
from mindee.input.inference_parameters import DataSchema
from mindee.parsing.v2.inference_response import InferenceResponse
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR, V2_DATA_DIR


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -285,6 +284,9 @@ def test_data_schema_must_succeed(
Load a blank PDF from a local path, send it with a data schema replacement, and make sure the inference call completes without raising any errors.
"""
input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
data_schema_replace_path = (
V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
)

input_source = PathInput(input_path)
params = InferenceParameters(
Expand All @@ -294,24 +296,13 @@ def test_data_schema_must_succeed(
polygon=False,
confidence=False,
webhook_ids=[],
data_schema=DataSchema(
replace={
"fields": [
{
"name": "test",
"title": "Test",
"is_array": False,
"type": "string",
"description": "A test field",
}
]
}
),
alias="py_integration_data_schema_override",
data_schema=data_schema_replace_path.read_text(),
alias="py_integration_data_schema_replace",
)
response: InferenceResponse = v2_client.enqueue_and_get_inference(
input_source, params
)
_basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)
assert response.inference.active_options.data_schema.replace is True
assert response.inference.result.fields["test"] is not None
assert response.inference.result.fields["test_replace"] is not None
assert response.inference.result.fields["test_replace"].value == "a test value"