From f3337cf67ae17e05900721472142b4561721f494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Wed, 17 Dec 2025 10:25:56 +0100 Subject: [PATCH] :white_check_mark: better tests of data schema --- CHANGELOG.md | 1 + mindee/__init__.py | 14 ++- mindee/input/inference_parameters.py | 103 +++++++++++++----- mindee/parsing/v2/inference_active_options.py | 1 + tests/data | 2 +- tests/v2/input/test_inference_parameters.py | 85 +++++++++++++++ tests/v2/test_client.py | 7 +- tests/v2/test_client_integration.py | 25 ++--- 8 files changed, 186 insertions(+), 52 deletions(-) create mode 100644 tests/v2/input/test_inference_parameters.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e042f1f9..b4ec3260 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * :sparkles: add support for data schema replace on v2 * :coffin: remove unused feedback calls from CLI + ## v4.31.0 - 2025-11-04 ### Changes * :label: better field typing for v2 diff --git a/mindee/__init__.py b/mindee/__init__.py index bd6f0e89..ff5ca240 100644 --- a/mindee/__init__.py +++ b/mindee/__init__.py @@ -1,10 +1,13 @@ from mindee import product from mindee.client import Client from mindee.client_v2 import ClientV2 -from mindee.input.inference_parameters import InferenceParameters -from mindee.input.local_response import LocalResponse -from mindee.input.page_options import PageOptions -from mindee.input.polling_options import PollingOptions +from mindee.input.inference_parameters import ( + InferenceParameters, + DataSchemaField, + DataSchema, + DataSchemaReplace, +) +from mindee.input import LocalResponse, PageOptions, PollingOptions from mindee.input.sources import ( Base64Input, BytesInput, @@ -23,6 +26,9 @@ __all__ = [ "Client", "ClientV2", + "DataSchema", + "DataSchemaField", + "DataSchemaReplace", "InferenceParameters", "FileInput", "PathInput", diff --git a/mindee/input/inference_parameters.py b/mindee/input/inference_parameters.py index dd57c38a..6d4e01fa 100644 --- a/mindee/input/inference_parameters.py +++ b/mindee/input/inference_parameters.py @@ -1,39 +1,80 @@ import json -from dataclasses import dataclass +from dataclasses import dataclass, asdict from typing import List, Optional, Union from mindee.input.polling_options import PollingOptions -class DataSchema: - """Modify the Data Schema.""" +@dataclass +class StringDataClass: + """Base class for dataclasses that can be serialized to JSON.""" - _replace: Optional[dict] = None - - def __init__(self, replace: Optional[dict] = None): - self._replace = replace - - @property - def replace(self): - """If set, completely replaces the data schema of the model.""" - return self._replace - - @replace.setter - def replace(self, value: Optional[Union[dict, str]]) -> None: - if value is None: - _replace = None - elif isinstance(value, str): - _replace = json.loads(value) - elif isinstance(value, dict): - _replace = value - else: - raise TypeError("Invalid type for data schema") - if _replace is not None and _replace == {}: - raise ValueError("Empty override provided") - self._replace = _replace + @staticmethod + def _no_none_values(x) -> dict: + """Don't include None values in the JSON output.""" + return {k: v for (k, v) in x if v is not None} def __str__(self) -> str: - return json.dumps({"replace": self.replace}) + return json.dumps( + asdict(self, dict_factory=self._no_none_values), indent=None, sort_keys=True + ) + + +@dataclass +class DataSchemaField(StringDataClass): + """A field in the data schema.""" + + title: str + """Display name for the field, also impacts inference results.""" + name: str + """Name of the field in the data schema.""" + is_array: bool + """Whether this field can contain multiple values.""" + type: str + """Data type of the field.""" + classification_values: Optional[List[str]] = None + """Allowed values when type is `classification`. Leave empty for other types.""" + unique_values: Optional[bool] = None + """ + Whether to remove duplicate values in the array. + Only applicable if `is_array` is True. + """ + description: Optional[str] = None + """Detailed description of what this field represents.""" + guidelines: Optional[str] = None + """Optional extraction guidelines.""" + nested_fields: Optional[dict] = None + """Subfields when type is `nested_object`. Leave empty for other types""" + + +@dataclass +class DataSchemaReplace(StringDataClass): + """The structure to completely replace the data schema of the model.""" + + fields: List[Union[DataSchemaField, dict]] + + def __post_init__(self) -> None: + if not self.fields: + raise ValueError("Data schema replacement fields cannot be empty.") + if isinstance(self.fields[0], dict): + self.fields = [ + DataSchemaField(**field) # type: ignore[arg-type] + for field in self.fields + ] + + +@dataclass +class DataSchema(StringDataClass): + """Modify the Data Schema.""" + + replace: Optional[Union[DataSchemaReplace, dict, str]] = None + """If set, completely replaces the data schema of the model.""" + + def __post_init__(self) -> None: + if isinstance(self.replace, dict): + self.replace = DataSchemaReplace(**self.replace) + elif isinstance(self.replace, str): + self.replace = DataSchemaReplace(**json.loads(self.replace)) @dataclass @@ -66,8 +107,14 @@ class InferenceParameters: Additional text context used by the model during inference. Not recommended, for specific use only. """ - data_schema: Optional[DataSchema] = None + data_schema: Optional[Union[DataSchema, str, dict]] = None """ Dynamic changes to the data schema of the model for this inference. Not recommended, for specific use only. """ + + def __post_init__(self): + if isinstance(self.data_schema, str): + self.data_schema = DataSchema(**json.loads(self.data_schema)) + elif isinstance(self.data_schema, dict): + self.data_schema = DataSchema(**self.data_schema) diff --git a/mindee/parsing/v2/inference_active_options.py b/mindee/parsing/v2/inference_active_options.py index d94749a2..4c894aef 100644 --- a/mindee/parsing/v2/inference_active_options.py +++ b/mindee/parsing/v2/inference_active_options.py @@ -60,4 +60,5 @@ def __str__(self) -> str: f"\n:Confidence: {self.confidence}" f"\n:RAG: {self.rag}" f"\n:Text Context: {self.text_context}" + f"\n\n{self.data_schema}" ) diff --git a/tests/data b/tests/data index 7560dd55..0c51e1d3 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 7560dd5532c10b4d3fb85991f386e9809dd2750a +Subproject commit 0c51e1d3e2258404c44280f25f4951ba6fe27324 diff --git a/tests/v2/input/test_inference_parameters.py b/tests/v2/input/test_inference_parameters.py new file mode 100644 index 00000000..4def853e --- /dev/null +++ b/tests/v2/input/test_inference_parameters.py @@ -0,0 +1,85 @@ +import json + +import pytest + +from mindee import InferenceParameters +from mindee.input.inference_parameters import ( + DataSchema, + DataSchemaReplace, + DataSchemaField, +) +from tests.utils import V2_DATA_DIR + +expected_data_schema_dict = json.loads( + (V2_DATA_DIR / "inference" / "data_schema_replace_param.json").read_text() +) +expected_data_schema_str = json.dumps( + expected_data_schema_dict, indent=None, sort_keys=True +) + + +def test_data_schema_replace_none(): + params = InferenceParameters(model_id="test-id") + assert params.data_schema is None + + +def test_data_schema_replace_str(): + params = InferenceParameters( + model_id="test-id", data_schema=expected_data_schema_str + ) + assert str(params.data_schema) == expected_data_schema_str + + +def test_data_schema_replace_dict(): + params = InferenceParameters( + model_id="test-id", data_schema=expected_data_schema_dict + ) + assert str(params.data_schema) == expected_data_schema_str + + +def test_data_schema_replace_obj_top(): + params = InferenceParameters( + model_id="test-id", + data_schema=DataSchema(replace=expected_data_schema_dict["replace"]), + ) + assert str(params.data_schema) == expected_data_schema_str + + +def test_data_schema_replace_obj_fields(): + params = InferenceParameters( + model_id="test-id", + data_schema=DataSchema( + replace=DataSchemaReplace( + fields=expected_data_schema_dict["replace"]["fields"] + ) + ), + ) + assert str(params.data_schema) == expected_data_schema_str + + +def test_data_schema_replace_empty_fields(): + with pytest.raises( + ValueError, match="Data schema replacement fields cannot be empty" + ): + InferenceParameters(model_id="test-id", data_schema={"replace": {"fields": []}}) + + +def test_data_schema_replace_obj_full(): + params = InferenceParameters( + model_id="test-id", + data_schema=DataSchema( + replace=DataSchemaReplace( + fields=[ + DataSchemaField( + name="test_replace", + title="Test Replace", + type="string", + is_array=False, + description="A static value for testing.", + guidelines="IMPORTANT: always return this exact string: 'a test value'", + ) + ] + ) + ), + ) + assert str(params.data_schema) == expected_data_schema_str diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py index 7d6453e7..866242a4 100644 --- a/tests/v2/test_client.py +++ b/tests/v2/test_client.py @@ -7,7 +7,6 @@ from mindee.error.mindee_error import MindeeApiV2Error, MindeeError from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 from mindee.input import LocalInputSource, PathInput -from mindee.input.inference_parameters import DataSchema from mindee.mindee_http.base_settings import USER_AGENT from mindee.parsing.v2.inference import Inference from mindee.parsing.v2.job import Job @@ -141,7 +140,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client): InferenceParameters( "dummy-model", text_context="ignore this message", - data_schema=DataSchema(replace={"test_field": {}}), + data_schema=json.loads( + ( + V2_DATA_DIR / "inference" / "data_schema_replace_param.json" + ).read_text() + ), ), ) diff --git a/tests/v2/test_client_integration.py b/tests/v2/test_client_integration.py index e1d45bbf..c7d7a5f6 100644 --- a/tests/v2/test_client_integration.py +++ b/tests/v2/test_client_integration.py @@ -6,9 +6,8 @@ from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 from mindee.parsing.v2 import InferenceActiveOptions -from mindee.input.inference_parameters import DataSchema from mindee.parsing.v2.inference_response import InferenceResponse -from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR +from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR, V2_DATA_DIR @pytest.fixture(scope="session") @@ -285,6 +284,9 @@ def test_data_schema_must_succeed( Load a blank PDF from an HTTPS URL and make sure the inference call completes without raising any errors. """ input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf" + data_schema_replace_path = ( + V2_DATA_DIR / "inference" / "data_schema_replace_param.json" + ) input_source = PathInput(input_path) params = InferenceParameters( @@ -294,24 +296,13 @@ def test_data_schema_must_succeed( polygon=False, confidence=False, webhook_ids=[], - data_schema=DataSchema( - replace={ - "fields": [ - { - "name": "test", - "title": "Test", - "is_array": False, - "type": "string", - "description": "A test field", - } - ] - } - ), - alias="py_integration_data_schema_override", + data_schema=data_schema_replace_path.read_text(), + alias="py_integration_data_schema_replace", ) response: InferenceResponse = v2_client.enqueue_and_get_inference( input_source, params ) _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id) assert response.inference.active_options.data_schema.replace is True - assert response.inference.result.fields["test"] is not None + assert response.inference.result.fields["test_replace"] is not None + assert response.inference.result.fields["test_replace"].value == "a test value"