Skip to content

Commit 6c7b829

Browse files
committed
✅ better tests of data schema
1 parent bd72dc6 commit 6c7b829

File tree

8 files changed

+169
-51
lines changed

8 files changed

+169
-51
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* :sparkles: add support for data schema replace on v2
77
* :coffin: remove unused feedback calls from CLI
88

9+
910
## v4.31.0 - 2025-11-04
1011
### Changes
1112
* :label: better field typing for v2

mindee/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from mindee import product
22
from mindee.client import Client
33
from mindee.client_v2 import ClientV2
4-
from mindee.input.inference_parameters import InferenceParameters
5-
from mindee.input.local_response import LocalResponse
6-
from mindee.input.page_options import PageOptions
7-
from mindee.input.polling_options import PollingOptions
4+
from mindee.input.inference_parameters import (
5+
InferenceParameters,
6+
DataSchemaField,
7+
DataSchema,
8+
DataSchemaReplace,
9+
)
10+
from mindee.input import LocalResponse, PageOptions, PollingOptions
811
from mindee.input.sources import (
912
Base64Input,
1013
BytesInput,
@@ -23,6 +26,9 @@
2326
__all__ = [
2427
"Client",
2528
"ClientV2",
29+
"DataSchema",
30+
"DataSchemaField",
31+
"DataSchemaReplace",
2632
"InferenceParameters",
2733
"FileInput",
2834
"PathInput",
mindee/input/inference_parameters.py

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,76 @@
11
import json
2-
from dataclasses import dataclass
2+
from dataclasses import dataclass, asdict
33
from typing import List, Optional, Union
44

55
from mindee.input.polling_options import PollingOptions
66

77

8-
class DataSchema:
9-
"""Modify the Data Schema."""
8+
@dataclass
9+
class StringDataClass:
10+
"""Base class for dataclasses that can be serialized to JSON."""
1011

11-
_replace: Optional[dict] = None
12-
13-
def __init__(self, replace: Optional[dict] = None):
14-
self._replace = replace
15-
16-
@property
17-
def replace(self):
18-
"""If set, completely replaces the data schema of the model."""
19-
return self._replace
20-
21-
@replace.setter
22-
def replace(self, value: Optional[Union[dict, str]]) -> None:
23-
if value is None:
24-
_replace = None
25-
elif isinstance(value, str):
26-
_replace = json.loads(value)
27-
elif isinstance(value, dict):
28-
_replace = value
29-
else:
30-
raise TypeError("Invalid type for data schema")
31-
if _replace is not None and _replace == {}:
32-
raise ValueError("Empty override provided")
33-
self._replace = _replace
12+
@staticmethod
13+
def _no_none_values(x) -> dict:
14+
"""Don't include None values in the JSON output."""
15+
return {k: v for (k, v) in x if v is not None}
3416

3517
def __str__(self) -> str:
36-
return json.dumps({"replace": self.replace})
18+
return json.dumps(
19+
asdict(self, dict_factory=self._no_none_values), indent=None, sort_keys=True
20+
)
21+
22+
23+
@dataclass
class DataSchemaField(StringDataClass):
    """Describe one field of a data schema."""

    # Required attributes: their order defines the positional constructor.
    title: str
    """Human-readable label for the field, shown in UI and documentation."""
    name: str
    """Identifier of the field inside the data schema."""
    is_array: bool
    """True when the field may hold several values."""
    type: str
    """Data type of the field."""

    # Optional attributes: all default to None.
    unique_values: Optional[bool] = None
    """
    Whether duplicate values are dropped from the array.
    Only meaningful when `is_array` is True.
    """
    description: Optional[str] = None
    """Longer explanation of what the field represents."""
    guidelines: Optional[str] = None
    """Optional guidelines for the extraction."""
    nested_fields: Optional[dict] = None
    """Subfields when type is `nested_object`, empty for other types."""
46+
47+
48+
@dataclass
class DataSchemaReplace(StringDataClass):
    """The structure to completely replace the data schema of the model.

    ``fields`` may contain ``DataSchemaField`` instances, plain dicts, or a
    mix of both. Every dict entry is converted to a ``DataSchemaField``
    when the instance is created.
    """

    fields: List[Union[DataSchemaField, dict]]
    """Fields making up the replacement schema."""

    def __post_init__(self) -> None:
        """Convert dict entries of ``fields`` into ``DataSchemaField`` objects.

        Each element is checked on its own rather than inferring the type of
        the whole list from its first element, so mixed lists are handled.

        :raises TypeError: if a dict entry has keys that ``DataSchemaField``
            does not accept, or lacks a required one.
        """
        if not self.fields:
            return
        # Only rebuild the list when there is something to convert, so a
        # list that already holds only DataSchemaField objects is left as is.
        if any(isinstance(field, dict) for field in self.fields):
            self.fields = [
                (
                    DataSchemaField(**field)  # type: ignore[arg-type]
                    if isinstance(field, dict)
                    else field
                )
                for field in self.fields
            ]
60+
61+
62+
@dataclass
class DataSchema(StringDataClass):
    """Modify the Data Schema."""

    replace: Optional[Union[DataSchemaReplace, dict, str]] = None
    """When set, the model's data schema is replaced entirely by this value."""

    def __post_init__(self) -> None:
        """Coerce a dict or JSON string ``replace`` into ``DataSchemaReplace``."""
        payload = self.replace
        if isinstance(payload, str):
            # A string is taken to be the JSON encoding of the keyword arguments.
            payload = json.loads(payload)
        elif not isinstance(payload, dict):
            # None or an already-built object: nothing to convert.
            return
        self.replace = DataSchemaReplace(**payload)
3774

3875

3976
@dataclass
@@ -66,8 +103,14 @@ class InferenceParameters:
66103
Additional text context used by the model during inference.
67104
Not recommended, for specific use only.
68105
"""
69-
data_schema: Optional[DataSchema] = None
106+
data_schema: Optional[Union[DataSchema, str, dict]] = None
70107
"""
71108
Dynamic changes to the data schema of the model for this inference.
72109
Not recommended, for specific use only.
73110
"""
111+
112+
def __post_init__(self):
    """Turn a JSON string or dict ``data_schema`` into a ``DataSchema``."""
    schema = self.data_schema
    if isinstance(schema, str):
        # Decode first; the decoded mapping supplies the keyword arguments.
        schema = json.loads(schema)
    elif not isinstance(schema, dict):
        # None or an existing object is kept untouched.
        return
    self.data_schema = DataSchema(**schema)

mindee/parsing/v2/inference_active_options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ def __str__(self) -> str:
6060
f"\n:Confidence: {self.confidence}"
6161
f"\n:RAG: {self.rag}"
6262
f"\n:Text Context: {self.text_context}"
63+
f"\n\n{self.data_schema}"
6364
)
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import json
2+
3+
from mindee import InferenceParameters
4+
from mindee.input.inference_parameters import (
5+
DataSchema,
6+
DataSchemaReplace,
7+
DataSchemaField,
8+
)
9+
from tests.utils import V2_DATA_DIR
10+
11+
# Reference payload, loaded once from the JSON fixture file.
expected_data_schema_dict = json.loads(
    (V2_DATA_DIR / "inference" / "data_schema_replace_param.json").read_text()
)
# The same payload re-serialized with sorted keys and no indentation, used as
# the expected string in the comparisons below.
expected_data_schema_str = json.dumps(
    expected_data_schema_dict,
    sort_keys=True,
    indent=None,
)
17+
18+
19+
def test_data_schema_replace_none():
    """With no data schema given, the attribute stays ``None``."""
    parameters = InferenceParameters(model_id="test-id")
    assert parameters.data_schema is None
22+
23+
24+
def test_data_schema_replace_str():
    """A JSON string is accepted and serializes back to the expected string."""
    parameters = InferenceParameters(
        model_id="test-id",
        data_schema=expected_data_schema_str,
    )
    rendered = str(parameters.data_schema)
    assert rendered == expected_data_schema_str
29+
30+
31+
def test_data_schema_replace_dict():
    """A plain dict is accepted and serializes to the expected string."""
    parameters = InferenceParameters(
        model_id="test-id",
        data_schema=expected_data_schema_dict,
    )
    rendered = str(parameters.data_schema)
    assert rendered == expected_data_schema_str
36+
37+
38+
def test_data_schema_replace_obj_top():
    """A ``DataSchema`` built from the ``replace`` dict serializes as expected."""
    schema = DataSchema(replace=expected_data_schema_dict["replace"])
    parameters = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(parameters.data_schema)
    assert rendered == expected_data_schema_str
44+
45+
46+
def test_data_schema_replace_obj_fields():
    """A ``DataSchemaReplace`` built from field dicts serializes as expected."""
    replacement = DataSchemaReplace(
        fields=expected_data_schema_dict["replace"]["fields"]
    )
    schema = DataSchema(replace=replacement)
    parameters = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(parameters.data_schema)
    assert rendered == expected_data_schema_str
56+
57+
58+
def test_data_schema_replace_obj_full():
    """A schema assembled entirely from objects serializes as expected."""
    field = DataSchemaField(
        name="test_replace",
        title="Test Replace",
        type="string",
        is_array=False,
        description="A static value for testing.",
        guidelines="IMPORTANT: always return this exact string: 'a test value'",
    )
    replacement = DataSchemaReplace(fields=[field])
    schema = DataSchema(replace=replacement)
    parameters = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(parameters.data_schema)
    assert rendered == expected_data_schema_str

tests/v2/test_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
141141
InferenceParameters(
142142
"dummy-model",
143143
text_context="ignore this message",
144-
data_schema=DataSchema(replace={"test_field": {}}),
144+
data_schema=DataSchema(replace={"fields": []}),
145145
),
146146
)
147147

tests/v2/test_client_integration.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
77
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
88
from mindee.parsing.v2 import InferenceActiveOptions
9-
from mindee.input.inference_parameters import DataSchema
109
from mindee.parsing.v2.inference_response import InferenceResponse
11-
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR
10+
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR, V2_DATA_DIR
1211

1312

1413
@pytest.fixture(scope="session")
@@ -285,6 +284,9 @@ def test_data_schema_must_succeed(
285284
Load a blank PDF from a local path, pass a data schema replacement, and make sure the inference call completes without raising any errors.
286285
"""
287286
input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
287+
data_schema_replace_path = (
288+
V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
289+
)
288290

289291
input_source = PathInput(input_path)
290292
params = InferenceParameters(
@@ -294,24 +296,13 @@ def test_data_schema_must_succeed(
294296
polygon=False,
295297
confidence=False,
296298
webhook_ids=[],
297-
data_schema=DataSchema(
298-
replace={
299-
"fields": [
300-
{
301-
"name": "test",
302-
"title": "Test",
303-
"is_array": False,
304-
"type": "string",
305-
"description": "A test field",
306-
}
307-
]
308-
}
309-
),
310-
alias="py_integration_data_schema_override",
299+
data_schema=data_schema_replace_path.read_text(),
300+
alias="py_integration_data_schema_replace",
311301
)
312302
response: InferenceResponse = v2_client.enqueue_and_get_inference(
313303
input_source, params
314304
)
315305
_basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)
316306
assert response.inference.active_options.data_schema.replace is True
317-
assert response.inference.result.fields["test"] is not None
307+
assert response.inference.result.fields["test_replace"] is not None
308+
assert response.inference.result.fields["test_replace"].value == "a test value"

0 commit comments

Comments
 (0)