Skip to content

Commit 20bfee6

Browse files
authored
✅ better tests of data schema (#378)
1 parent bd72dc6 commit 20bfee6

File tree

8 files changed

+186
-52
lines changed

8 files changed

+186
-52
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* :sparkles: add support for data schema replace on v2
77
* :coffin: remove unused feedback calls from CLI
88

9+
910
## v4.31.0 - 2025-11-04
1011
### Changes
1112
* :label: better field typing for v2

mindee/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from mindee import product
22
from mindee.client import Client
33
from mindee.client_v2 import ClientV2
4-
from mindee.input.inference_parameters import InferenceParameters
5-
from mindee.input.local_response import LocalResponse
6-
from mindee.input.page_options import PageOptions
7-
from mindee.input.polling_options import PollingOptions
4+
from mindee.input.inference_parameters import (
5+
InferenceParameters,
6+
DataSchemaField,
7+
DataSchema,
8+
DataSchemaReplace,
9+
)
10+
from mindee.input import LocalResponse, PageOptions, PollingOptions
811
from mindee.input.sources import (
912
Base64Input,
1013
BytesInput,
@@ -23,6 +26,9 @@
2326
__all__ = [
2427
"Client",
2528
"ClientV2",
29+
"DataSchema",
30+
"DataSchemaField",
31+
"DataSchemaReplace",
2632
"InferenceParameters",
2733
"FileInput",
2834
"PathInput",
mindee/input/inference_parameters.py

Lines changed: 75 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,80 @@
11
import json
2-
from dataclasses import dataclass
2+
from dataclasses import dataclass, asdict
33
from typing import List, Optional, Union
44

55
from mindee.input.polling_options import PollingOptions
66

77

8-
class DataSchema:
9-
"""Modify the Data Schema."""
8+
@dataclass
class StringDataClass:
    """Dataclass base whose ``str()`` form is a compact JSON document."""

    @staticmethod
    def _no_none_values(x) -> dict:
        """Build a dict from key/value pairs, dropping pairs whose value is None."""
        return dict(pair for pair in x if pair[1] is not None)

    def __str__(self) -> str:
        # asdict() applies the factory at every nesting level, so None-valued
        # attributes of nested dataclasses are omitted as well.
        payload = asdict(self, dict_factory=self._no_none_values)
        return json.dumps(payload, indent=None, sort_keys=True)
21+
22+
23+
@dataclass
class DataSchemaField(StringDataClass):
    """Definition of a single field within a data schema."""

    title: str
    """Human-readable name of the field; it also influences inference results."""
    name: str
    """Identifier of the field inside the data schema."""
    is_array: bool
    """True when the field may hold several values."""
    type: str
    """The field's data type."""
    classification_values: Optional[List[str]] = None
    """Permitted values for the `classification` type. Leave unset for other types."""
    unique_values: Optional[bool] = None
    """
    Whether duplicate values are removed from the array.
    Only meaningful when `is_array` is True.
    """
    description: Optional[str] = None
    """Longer explanation of what the field represents."""
    guidelines: Optional[str] = None
    """Extraction guidelines, if any."""
    nested_fields: Optional[dict] = None
    """Sub-fields for the `nested_object` type. Leave unset for other types."""
48+
49+
50+
@dataclass
class DataSchemaReplace(StringDataClass):
    """
    The structure to completely replace the data schema of the model.

    ``fields`` may mix ``DataSchemaField`` instances and plain dicts; every
    dict is converted to a ``DataSchemaField`` on construction.

    :raises ValueError: if ``fields`` is empty.
    :raises TypeError: if a dict entry has keys ``DataSchemaField`` does not accept.
    """

    fields: List[Union[DataSchemaField, dict]]

    def __post_init__(self) -> None:
        if not self.fields:
            raise ValueError("Data schema replacement fields cannot be empty.")
        # Convert element by element: inspecting only the first entry would
        # crash on (or silently skip) later entries of a mixed list.
        self.fields = [
            DataSchemaField(**field)  # type: ignore[arg-type]
            if isinstance(field, dict)
            else field
            for field in self.fields
        ]
64+
65+
66+
@dataclass
class DataSchema(StringDataClass):
    """Dynamic modifications applied to a model's data schema."""

    replace: Optional[Union[DataSchemaReplace, dict, str]] = None
    """When provided, the model's data schema is replaced entirely by this one."""

    def __post_init__(self) -> None:
        # Accept a JSON string or a plain dict and normalise either one to a
        # DataSchemaReplace instance; any other value is left untouched.
        raw = self.replace
        if isinstance(raw, str):
            self.replace = DataSchemaReplace(**json.loads(raw))
        elif isinstance(raw, dict):
            self.replace = DataSchemaReplace(**raw)
3778

3879

3980
@dataclass
@@ -66,8 +107,14 @@ class InferenceParameters:
66107
Additional text context used by the model during inference.
67108
Not recommended, for specific use only.
68109
"""
69-
data_schema: Optional[DataSchema] = None
110+
data_schema: Optional[Union[DataSchema, str, dict]] = None
70111
"""
71112
Dynamic changes to the data schema of the model for this inference.
72113
Not recommended, for specific use only.
73114
"""
115+
116+
def __post_init__(self):
    # Normalise a JSON string or a plain dict into a DataSchema instance;
    # any other value (including None) is kept as given.
    schema = self.data_schema
    if isinstance(schema, dict):
        self.data_schema = DataSchema(**schema)
    elif isinstance(schema, str):
        self.data_schema = DataSchema(**json.loads(schema))

mindee/parsing/v2/inference_active_options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ def __str__(self) -> str:
6060
f"\n:Confidence: {self.confidence}"
6161
f"\n:RAG: {self.rag}"
6262
f"\n:Text Context: {self.text_context}"
63+
f"\n\n{self.data_schema}"
6364
)
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import json
2+
3+
import pytest
4+
5+
from mindee import InferenceParameters
6+
from mindee.input.inference_parameters import (
7+
DataSchema,
8+
DataSchemaReplace,
9+
DataSchemaField,
10+
)
11+
from tests.utils import V2_DATA_DIR
12+
13+
# Reference payload used by the tests in this module, loaded once at import.
# The encoding is explicit so the JSON fixture is decoded identically on every
# platform instead of depending on the locale's default encoding.
expected_data_schema_dict = json.loads(
    (V2_DATA_DIR / "inference" / "data_schema_replace_param.json").read_text(
        encoding="utf-8"
    )
)
# Compact rendering with sorted keys, compared against str(params.data_schema).
expected_data_schema_str = json.dumps(
    expected_data_schema_dict, indent=None, sort_keys=True
)
19+
20+
21+
def test_data_schema_replace_none():
    """Omitting ``data_schema`` leaves the parameter unset."""
    inference_params = InferenceParameters(model_id="test-id")
    assert inference_params.data_schema is None
24+
25+
26+
def test_data_schema_replace_str():
    """A JSON string round-trips to the same compact JSON."""
    inference_params = InferenceParameters(
        model_id="test-id",
        data_schema=expected_data_schema_str,
    )
    rendered = str(inference_params.data_schema)
    assert rendered == expected_data_schema_str
31+
32+
33+
def test_data_schema_replace_dict():
    """A plain dict is serialized to the expected compact JSON."""
    inference_params = InferenceParameters(
        model_id="test-id",
        data_schema=expected_data_schema_dict,
    )
    rendered = str(inference_params.data_schema)
    assert rendered == expected_data_schema_str
38+
39+
40+
def test_data_schema_replace_obj_top():
    """A ``DataSchema`` built from the raw ``replace`` dict serializes as expected."""
    schema = DataSchema(replace=expected_data_schema_dict["replace"])
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(inference_params.data_schema)
    assert rendered == expected_data_schema_str
46+
47+
48+
def test_data_schema_replace_obj_fields():
    """A ``DataSchemaReplace`` built from raw field dicts serializes as expected."""
    replacement = DataSchemaReplace(
        fields=expected_data_schema_dict["replace"]["fields"]
    )
    schema = DataSchema(replace=replacement)
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(inference_params.data_schema)
    assert rendered == expected_data_schema_str
58+
59+
60+
def test_data_schema_replace_empty_fields():
    """An empty ``fields`` list is rejected with a ValueError."""
    empty_schema = {"replace": {"fields": []}}
    expected_error = "Data schema replacement fields cannot be empty"
    with pytest.raises(ValueError, match=expected_error):
        InferenceParameters(model_id="test-id", data_schema=empty_schema)
65+
66+
67+
def test_data_schema_replace_obj_full():
    """A schema assembled entirely from typed objects serializes as expected."""
    field = DataSchemaField(
        name="test_replace",
        title="Test Replace",
        type="string",
        is_array=False,
        description="A static value for testing.",
        guidelines="IMPORTANT: always return this exact string: 'a test value'",
    )
    replacement = DataSchemaReplace(fields=[field])
    schema = DataSchema(replace=replacement)
    inference_params = InferenceParameters(model_id="test-id", data_schema=schema)
    rendered = str(inference_params.data_schema)
    assert rendered == expected_data_schema_str

tests/v2/test_client.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from mindee.error.mindee_error import MindeeApiV2Error, MindeeError
88
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
99
from mindee.input import LocalInputSource, PathInput
10-
from mindee.input.inference_parameters import DataSchema
1110
from mindee.mindee_http.base_settings import USER_AGENT
1211
from mindee.parsing.v2.inference import Inference
1312
from mindee.parsing.v2.job import Job
@@ -141,7 +140,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
141140
InferenceParameters(
142141
"dummy-model",
143142
text_context="ignore this message",
144-
data_schema=DataSchema(replace={"test_field": {}}),
143+
data_schema=json.loads(
144+
(
145+
V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
146+
).read_text()
147+
),
145148
),
146149
)
147150

tests/v2/test_client_integration.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource
77
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
88
from mindee.parsing.v2 import InferenceActiveOptions
9-
from mindee.input.inference_parameters import DataSchema
109
from mindee.parsing.v2.inference_response import InferenceResponse
11-
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR
10+
from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR, V2_DATA_DIR
1211

1312

1413
@pytest.fixture(scope="session")
@@ -285,6 +284,9 @@ def test_data_schema_must_succeed(
285284
Load a blank PDF from an HTTPS URL and make sure the inference call completes without raising any errors.
286285
"""
287286
input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf"
287+
data_schema_replace_path = (
288+
V2_DATA_DIR / "inference" / "data_schema_replace_param.json"
289+
)
288290

289291
input_source = PathInput(input_path)
290292
params = InferenceParameters(
@@ -294,24 +296,13 @@ def test_data_schema_must_succeed(
294296
polygon=False,
295297
confidence=False,
296298
webhook_ids=[],
297-
data_schema=DataSchema(
298-
replace={
299-
"fields": [
300-
{
301-
"name": "test",
302-
"title": "Test",
303-
"is_array": False,
304-
"type": "string",
305-
"description": "A test field",
306-
}
307-
]
308-
}
309-
),
310-
alias="py_integration_data_schema_override",
299+
data_schema=data_schema_replace_path.read_text(),
300+
alias="py_integration_data_schema_replace",
311301
)
312302
response: InferenceResponse = v2_client.enqueue_and_get_inference(
313303
input_source, params
314304
)
315305
_basic_assert_success(response=response, page_count=1, model_id=findoc_model_id)
316306
assert response.inference.active_options.data_schema.replace is True
317-
assert response.inference.result.fields["test"] is not None
307+
assert response.inference.result.fields["test_replace"] is not None
308+
assert response.inference.result.fields["test_replace"].value == "a test value"

0 commit comments

Comments
 (0)