✅ better tests of data schema (#378)

ianardee · web-flow · commit 20bfee62623a · 2025-12-17T14:58:36.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 * :sparkles: add support for data schema replace on v2
 * :coffin: remove unused feedback calls from CLI
 
+
 ## v4.31.0 - 2025-11-04
 ### Changes
 * :label: better field typing for v2
diff --git a/mindee/__init__.py b/mindee/__init__.py
@@ -1,10 +1,13 @@
 from mindee import product
 from mindee.client import Client
 from mindee.client_v2 import ClientV2
-from mindee.input.inference_parameters import InferenceParameters
-from mindee.input.local_response import LocalResponse
-from mindee.input.page_options import PageOptions
-from mindee.input.polling_options import PollingOptions
+from mindee.input.inference_parameters import (
+    InferenceParameters,
+    DataSchemaField,
+    DataSchema,
+    DataSchemaReplace,
+)
+from mindee.input import LocalResponse, PageOptions, PollingOptions
 from mindee.input.sources import (
     Base64Input,
     BytesInput,
@@ -23,6 +26,9 @@
 __all__ = [
     "Client",
     "ClientV2",
+    "DataSchema",
+    "DataSchemaField",
+    "DataSchemaReplace",
     "InferenceParameters",
     "FileInput",
     "PathInput",
diff --git a/mindee/input/inference_parameters.py b/mindee/input/inference_parameters.py
@@ -1,39 +1,80 @@
 import json
-from dataclasses import dataclass
+from dataclasses import dataclass, asdict
 from typing import List, Optional, Union
 
 from mindee.input.polling_options import PollingOptions
 
 
-class DataSchema:
-    """Modify the Data Schema."""
+@dataclass
+class StringDataClass:
+    """Base class for dataclasses that can be serialized to JSON."""
 
-    _replace: Optional[dict] = None
-
-    def __init__(self, replace: Optional[dict] = None):
-        self._replace = replace
-
-    @property
-    def replace(self):
-        """If set, completely replaces the data schema of the model."""
-        return self._replace
-
-    @replace.setter
-    def replace(self, value: Optional[Union[dict, str]]) -> None:
-        if value is None:
-            _replace = None
-        elif isinstance(value, str):
-            _replace = json.loads(value)
-        elif isinstance(value, dict):
-            _replace = value
-        else:
-            raise TypeError("Invalid type for data schema")
-        if _replace is not None and _replace == {}:
-            raise ValueError("Empty override provided")
-        self._replace = _replace
+    @staticmethod
+    def _no_none_values(x) -> dict:
+        """Don't include None values in the JSON output."""
+        return {k: v for (k, v) in x if v is not None}
 
     def __str__(self) -> str:
-        return json.dumps({"replace": self.replace})
+        return json.dumps(
+            asdict(self, dict_factory=self._no_none_values), indent=None, sort_keys=True
+        )
+
+
+@dataclass
+class DataSchemaField(StringDataClass):
+    """A field in the data schema."""
+
+    title: str
+    """Display name for the field, also impacts inference results."""
+    name: str
+    """Name of the field in the data schema."""
+    is_array: bool
+    """Whether this field can contain multiple values."""
+    type: str
+    """Data type of the field."""
+    classification_values: Optional[List[str]] = None
+    """Allowed values when type is `classification`. Leave empty for other types."""
+    unique_values: Optional[bool] = None
+    """
+    Whether to remove duplicate values in the array.
+    Only applicable if `is_array` is True.
+    """
+    description: Optional[str] = None
+    """Detailed description of what this field represents."""
+    guidelines: Optional[str] = None
+    """Optional extraction guidelines."""
+    nested_fields: Optional[dict] = None
+    """Subfields when type is `nested_object`. Leave empty for other types"""
+
+
+@dataclass
+class DataSchemaReplace(StringDataClass):
+    """The structure to completely replace the data schema of the model."""
+
+    fields: List[Union[DataSchemaField, dict]]
+
+    def __post_init__(self) -> None:
+        """Raise ValueError if empty; convert each dict entry to a DataSchemaField."""
+        if not self.fields:
+            raise ValueError("Data schema replacement fields cannot be empty.")
+        self.fields = [
+            DataSchemaField(**field) if isinstance(field, dict) else field
+            for field in self.fields
+        ]
+
+
+@dataclass
+class DataSchema(StringDataClass):
+    """Modify the Data Schema."""
+
+    replace: Optional[Union[DataSchemaReplace, dict, str]] = None
+    """If set, completely replaces the data schema of the model."""
+
+    def __post_init__(self) -> None:
+        if isinstance(self.replace, dict):
+            self.replace = DataSchemaReplace(**self.replace)
+        elif isinstance(self.replace, str):
+            self.replace = DataSchemaReplace(**json.loads(self.replace))
 
 
 @dataclass
@@ -66,8 +107,14 @@ class InferenceParameters:
     Additional text context used by the model during inference.
     Not recommended, for specific use only.
     """
-    data_schema: Optional[DataSchema] = None
+    data_schema: Optional[Union[DataSchema, str, dict]] = None
     """
     Dynamic changes to the data schema of the model for this inference.
     Not recommended, for specific use only.
     """
+
+    def __post_init__(self):
+        if isinstance(self.data_schema, str):
+            self.data_schema = DataSchema(**json.loads(self.data_schema))
+        elif isinstance(self.data_schema, dict):
+            self.data_schema = DataSchema(**self.data_schema)
diff --git a/mindee/parsing/v2/inference_active_options.py b/mindee/parsing/v2/inference_active_options.py
@@ -60,4 +60,5 @@ def __str__(self) -> str:
             f"\n:Confidence: {self.confidence}"
             f"\n:RAG: {self.rag}"
             f"\n:Text Context: {self.text_context}"
+            f"\n\n{self.data_schema}"
         )
diff --git a/tests/data b/tests/data
@@ -1 +1 @@
-Subproject commit 7560dd5532c10b4d3fb85991f386e9809dd2750a
+Subproject commit 0c51e1d3e2258404c44280f25f4951ba6fe27324
diff --git a/tests/v2/input/test_inference_parameters.py b/tests/v2/input/test_inference_parameters.py
@@ -0,0 +1,85 @@
+import json
+
+import pytest
+
+from mindee import InferenceParameters
+from mindee.input.inference_parameters import (
+    DataSchema,
+    DataSchemaReplace,
+    DataSchemaField,
+)
+from tests.utils import V2_DATA_DIR
+
+expected_data_schema_dict = json.loads(
+    (V2_DATA_DIR / "inference" / "data_schema_replace_param.json").read_text()
+)
+expected_data_schema_str = json.dumps(
+    expected_data_schema_dict, indent=None, sort_keys=True
+)
+
+
+def test_data_schema_replace_none():
+    params = InferenceParameters(model_id="test-id")
+    assert params.data_schema is None
+
+
+def test_data_schema_replace_str():
+    params = InferenceParameters(
+        model_id="test-id", data_schema=expected_data_schema_str
+    )
+    assert str(params.data_schema) == expected_data_schema_str
+
+
+def test_data_schema_replace_dict():
+    params = InferenceParameters(
+        model_id="test-id", data_schema=expected_data_schema_dict
+    )
+    assert str(params.data_schema) == expected_data_schema_str
+
+
+def test_data_schema_replace_obj_top():
+    params = InferenceParameters(
+        model_id="test-id",
+        data_schema=DataSchema(replace=expected_data_schema_dict["replace"]),
+    )
+    assert str(params.data_schema) == expected_data_schema_str
+
+
+def test_data_schema_replace_obj_fields():
+    params = InferenceParameters(
+        model_id="test-id",
+        data_schema=DataSchema(
+            replace=DataSchemaReplace(
+                fields=expected_data_schema_dict["replace"]["fields"]
+            )
+        ),
+    )
+    assert str(params.data_schema) == expected_data_schema_str
+
+
+def test_data_schema_replace_empty_fields():
+    with pytest.raises(
+        ValueError, match="Data schema replacement fields cannot be empty"
+    ):
+        InferenceParameters(model_id="test-id", data_schema={"replace": {"fields": []}})
+
+
+def test_data_schema_replace_obj_full():
+    params = InferenceParameters(
+        model_id="test-id",
+        data_schema=DataSchema(
+            replace=DataSchemaReplace(
+                fields=[
+                    DataSchemaField(
+                        name="test_replace",
+                        title="Test Replace",
+                        type="string",
+                        is_array=False,
+                        description="A static value for testing.",
+                        guidelines="IMPORTANT: always return this exact string: 'a test value'",
+                    )
+                ]
+            )
+        ),
+    )
+    assert str(params.data_schema) == expected_data_schema_str
diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py
@@ -7,7 +7,6 @@
 from mindee.error.mindee_error import MindeeApiV2Error, MindeeError
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
 from mindee.input import LocalInputSource, PathInput
-from mindee.input.inference_parameters import DataSchema
 from mindee.mindee_http.base_settings import USER_AGENT
 from mindee.parsing.v2.inference import Inference
 from mindee.parsing.v2.job import Job
@@ -141,7 +140,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
             InferenceParameters(
                 "dummy-model",
                 text_context="ignore this message",
-                data_schema=DataSchema(replace={"test_field": {}}),
+                data_schema=json.loads(
+                    (
+                        V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
+                    ).read_text()
+                ),
             ),
         )
 
diff --git a/tests/v2/test_client_integration.py b/tests/v2/test_client_integration.py
@@ -6,9 +6,8 @@
 from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
 from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
 from mindee.parsing.v2 import InferenceActiveOptions
-from mindee.input.inference_parameters import DataSchema
 from mindee.parsing.v2.inference_response import InferenceResponse
-from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR
+from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR, V2_DATA_DIR
 
 
 @pytest.fixture(scope="session")
@@ -285,6 +284,9 @@ def test_data_schema_must_succeed(
     Load a blank PDF from a local file path and make sure the inference call completes without raising any errors.
     """
     input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
+    data_schema_replace_path = (
+        V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
+    )
 
     input_source = PathInput(input_path)
     params = InferenceParameters(
@@ -294,24 +296,13 @@ def test_data_schema_must_succeed(
         polygon=False,
         confidence=False,
         webhook_ids=[],
-        data_schema=DataSchema(
-            replace={
-                "fields": [
-                    {
-                        "name": "test",
-                        "title": "Test",
-                        "is_array": False,
-                        "type": "string",
-                        "description": "A test field",
-                    }
-                ]
-            }
-        ),
-        alias="py_integration_data_schema_override",
+        data_schema=data_schema_replace_path.read_text(),
+        alias="py_integration_data_schema_replace",
     )
     response: InferenceResponse = v2_client.enqueue_and_get_inference(
         input_source, params
     )
     _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)
     assert response.inference.active_options.data_schema.replace is True
-    assert response.inference.result.fields["test"] is not None
+    assert response.inference.result.fields["test_replace"] is not None
+    assert response.inference.result.fields["test_replace"].value == "a test value"

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -60,4 +60,5 @@ def __str__(self) -> str:` |
| `60` | `60` | `f"\n:Confidence: {self.confidence}"` |
| `61` | `61` | `f"\n:RAG: {self.rag}"` |
| `62` | `62` | `f"\n:Text Context: {self.text_context}"` |
| | `63` | `+ f"\n\n{self.data_schema}"` |
| `63` | `64` | `)` |