Commit 3d51161

[MODEL-20346] Add optional inputSchema to metadata returned by /info/ route (#1661)

* add inputSchema to info/ route
* add docs
* add validation and more test cases
* add more annotations
* add more annotations

1 parent 43b2005

File tree

5 files changed: +223 additions, -3 deletions
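
Per the commit title, the /info/ route now returns the optional inputSchema as part of the model metadata. Below is a minimal client-side sketch of checking for it; the server address and the exact response layout are illustrative assumptions, since this diff only covers the metadata-parsing side:

```python
# Hypothetical check of the /info/ response; assumes a DRUM server running
# locally (e.g. started with `drum server ... --address localhost:6789`).
# The exact JSON layout is an assumption, not confirmed by this diff.
import requests

resp = requests.get("http://localhost:6789/info/")
resp.raise_for_status()
info = resp.json()

# inputSchema is optional: present only if model-metadata.yaml declared one.
print(info.get("inputSchema"))
```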

MODEL-METADATA.md

Lines changed: 6 additions & 0 deletions

```diff
@@ -28,6 +28,12 @@ ignored if modelID is set.
 * majorVersion (optional, default: True): Whether the model version you are creating should be a
 major version update or a minor version update. If the previous model version is 2.3, a major version
 update would create the version 3.0, and a minor version update would create the version 2.4.
+* inputSchema (optional): A schema defining the format of the input data for your model. This is
+required when building unstructured models to serve as tools within MCP servers. The schema follows
+JSON Schema format with three key components: `type` (typically "object" for structured data),
+`properties` (a dictionary defining each field with its type, constraints, and optional default values),
+and `required` (an array listing mandatory fields). This type of schema can be generated by serializing
+the schema of a pydantic model.
 
 ## Options specific to inference models
 NOTE: All options specific to inference models or tasks are ignored if modelID is set- they
```
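
The last added sentence, generating the schema by serializing a pydantic model, corresponds to pydantic v2's model_json_schema(). A minimal sketch with a hypothetical ToolInput model (the field names are illustrative):

```python
# Illustrative: build an inputSchema for model-metadata.yaml from a pydantic model.
from typing import Optional

from pydantic import BaseModel, Field


class ToolInput(BaseModel):  # hypothetical tool input definition
    dataset_id: str = Field(..., description="ID of the dataset to fetch")
    limit: Optional[int] = Field(None, description="Maximum number of rows to return")


schema = ToolInput.model_json_schema()
# schema now has the documented shape:
#   schema["type"] == "object"
#   schema["properties"] holds per-field types, constraints, and defaults
#   schema["required"] == ["dataset_id"]
# Serialize it under the inputSchema key of model-metadata.yaml.
```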

custom_model_runner/datarobot_drum/drum/enum.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -442,6 +442,7 @@ class ModelMetadataKeys(object):
     TRAINING_MODEL = "trainingModel"
     HYPERPARAMETERS = "hyperparameters"
     VALIDATION_SCHEMA = "typeSchema"
+    INPUT_SCHEMA = "inputSchema"
     # customPredictor section is not used by DRUM,
     # it is a place holder if user wants to add some fields and read them on his own
     CUSTOM_PREDICTOR = "customPredictor"
```

custom_model_runner/datarobot_drum/drum/model_metadata.py

Lines changed: 69 additions & 2 deletions

```diff
@@ -12,6 +12,8 @@
 import trafaret as t
 
 from pathlib import Path
+
+from pydantic import create_model
 from ruamel.yaml import YAMLError
 from strictyaml import (
     load,
@@ -25,7 +27,7 @@
     StrictYAMLError,
     YAMLValidationError,
 )
-from typing import Optional as PythonTypingOptional, List, Dict
+from typing import Optional as PythonTypingOptional, List, Dict, Union
 
 from datarobot_drum.drum.common import get_drum_logger
 from datarobot_drum.drum.enum import (
@@ -187,7 +189,7 @@ def read_model_metadata_yaml(code_dir) -> PythonTypingOptional[dict]:
         validate_config_fields(model_config, ModelMetadataKeys.INFERENCE_MODEL)
         validate_config_fields(
             model_config[ModelMetadataKeys.INFERENCE_MODEL],
-            *["positiveClassLabel", "negativeClassLabel"]
+            *["positiveClassLabel", "negativeClassLabel"],
         )
 
     if model_config[ModelMetadataKeys.TARGET_TYPE] == TargetType.MULTICLASS.value:
@@ -230,10 +232,67 @@ def read_model_metadata_yaml(code_dir) -> PythonTypingOptional[dict]:
         if hyper_params:
             validate_model_metadata_hyperparameter(hyper_params)
 
+        input_schema = model_config.get(ModelMetadataKeys.INPUT_SCHEMA)
+        if input_schema:
+            try:
+                model = create_model_from_schema(input_schema)
+            except Exception as e:
+                raise DrumCommonException(
+                    "Error creating pydantic model from input schema: {}".format(e)
+                )
+
         return model_config
     return None
 
 
+def convert_json_type_to_python(prop_def: dict):
+    """Convert JSON Schema type to Python type."""
+
+    # Handle anyOf for union types
+    if "anyOf" in prop_def:
+        types = []
+        for schema in prop_def["anyOf"]:
+            types.append(convert_json_type_to_python(schema))
+        return Union[tuple(types)]
+
+    # Handle regular `type` field of the property
+    json_type = prop_def.get("type", "string")
+
+    type_mapping = {
+        "string": str,
+        "integer": int,
+        "number": float,
+        "boolean": bool,
+        "array": list,
+        "object": dict,
+        "null": type(None),
+    }
+
+    return type_mapping.get(json_type, str)
+
+
+def create_model_from_schema(schema_dict: dict):
+    """Create a Pydantic model from a JSON Schema dictionary."""
+    schema_type = schema_dict.get("type")
+    if schema_type != "object":
+        raise ValueError(f"Only 'object' type schemas are supported, got '{schema_type}'")
+
+    properties = schema_dict.get("properties", {})
+
+    properties_type = type(properties)
+    if properties_type is not dict:
+        raise ValueError(f"'properties' must be a dictionary, got '{properties_type}'")
+
+    fields, required_fields = {}, set(schema_dict.get("required", []))
+
+    for prop_name, prop_def in properties.items():
+        py_type = convert_json_type_to_python(prop_def)
+        default_value = ... if prop_name in required_fields else prop_def.get("default")
+        fields[prop_name] = (py_type, default_value)
+
+    return create_model("InputSchema", **fields)
+
+
 def read_default_model_metadata_yaml() -> PythonTypingOptional[dict]:
     default_type_schema_path = os.path.abspath(
         os.path.join(os.path.dirname(__file__), "..", "resource", "default_typeschema")
@@ -358,5 +417,13 @@ def _validate_multi_parameter(multi_params: Dict):
             Map({"key": Str(), "valueFrom": Str(), Optional("reminder"): Str()})
         ),
         Optional(ModelMetadataKeys.LAZY_LOADING): Any(),
+        Optional(ModelMetadataKeys.INPUT_SCHEMA): Map(
+            {
+                Optional("title"): Str(),
+                "type": Str(),
+                "properties": Any(),
+                Optional("required"): Seq(Str()),
+            }
+        ),
     }
 )
```
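
To make the two new helpers concrete, here is a short walk-through under the diff's own definitions; the example schema dict is illustrative:

```python
# Walk-through of the helpers added above; the schema itself is made up.
from typing import Union

from datarobot_drum.drum.model_metadata import (
    convert_json_type_to_python,
    create_model_from_schema,
)

schema = {
    "type": "object",
    "properties": {
        "dataset_id": {"type": "string"},
        "limit": {"anyOf": [{"type": "integer"}, {"type": "null"}], "default": None},
    },
    "required": ["dataset_id"],
}

# anyOf members are converted recursively and joined into a typing.Union,
# so `limit` becomes Union[int, None], i.e. Optional[int].
assert convert_json_type_to_python(schema["properties"]["limit"]) == Union[int, type(None)]

# Required fields get `...` (no default); the rest pick up their schema default.
InputSchema = create_model_from_schema(schema)
instance = InputSchema(dataset_id="abc123")
assert instance.limit is None
```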

tests/unit/datarobot_drum/conftest.py

Lines changed: 87 additions & 0 deletions

```diff
@@ -210,6 +210,93 @@ def custom_predictor_metadata_yaml():
     )
 
 
+@pytest.fixture
+def custom_unstructured_tool_with_schema_in_yaml():
+    return dedent(
+        """
+        name: "[Tool] Get Data Registry Dataset"
+        description: |
+          Fetches a dataset from the DataRobot Data Registry.
+
+        type: inference
+        environmentID: 64d2ba178dd3f0b1fa2162f0
+        targetType: unstructured
+        inferenceModel:
+          targetName: target
+        inputSchema:
+          type: object
+          properties:
+            dataset_id:
+              title: Dataset ID
+              description: The ID of the dataset to fetch from the Data Registry.
+              type: string
+            offset:
+              title: Offset
+              description: The number of rows to skip before starting to return rows. Default is 0.
+              type: integer
+              default: 0
+            limit:
+              title: Limit of rows
+              description: The maximum number of rows to return. If not specified, all rows will be returned.
+              anyOf:
+                - type: integer
+                - type: null
+              default: null
+          required:
+            - dataset_id
+        """
+    )
+
+
+@pytest.fixture
+def custom_unstructured_tool_with_invalid_schema1():
+    return dedent(
+        """
+        name: "[Tool] Get Data Registry Dataset"
+        description: |
+          Fetches a dataset from the DataRobot Data Registry.
+
+        type: inference
+        environmentID: 64d2ba178dd3f0b1fa2162f0
+        targetType: unstructured
+        inferenceModel:
+          targetName: target
+        inputSchema:
+          type: unexpected
+          properties:
+            dataset_id:
+              title: Dataset ID
+              type: string
+          required:
+            - dataset_id
+        """
+    )
+
+
+@pytest.fixture
+def custom_unstructured_tool_with_invalid_schema2():
+    return dedent(
+        """
+        name: "[Tool] Get Data Registry Dataset"
+        description: |
+          Fetches a dataset from the DataRobot Data Registry.
+
+        type: inference
+        environmentID: 64d2ba178dd3f0b1fa2162f0
+        targetType: unstructured
+        inferenceModel:
+          targetName: target
+        inputSchema:
+          type: object
+          properties:
+            - list-instead-of-dict
+            - another-item
+          required:
+            - dataset_id
+        """
+    )
+
+
 ###############################################################################
 # HELPER FUNCS
```
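
The two invalid fixtures line up one-to-one with the ValueError branches in create_model_from_schema above: `type: unexpected` trips the object-type check, and the list under `properties` trips the dictionary check. A sketch with hand-built dicts equivalent to what the YAML parses to:

```python
# Both calls raise inside create_model_from_schema; read_model_metadata_yaml
# wraps the errors in a DrumCommonException (see the tests below).
import pytest

from datarobot_drum.drum.model_metadata import create_model_from_schema

bad_type = {"type": "unexpected", "properties": {"dataset_id": {"type": "string"}}}
with pytest.raises(ValueError, match="Only 'object' type schemas are supported"):
    create_model_from_schema(bad_type)

bad_properties = {"type": "object", "properties": ["list-instead-of-dict", "another-item"]}
with pytest.raises(ValueError, match="'properties' must be a dictionary"):
    create_model_from_schema(bad_properties)
```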

tests/unit/datarobot_drum/model_metadata/test_model_metadata.py

Lines changed: 60 additions & 1 deletion

```diff
@@ -11,11 +11,12 @@
 from random import sample
 from tempfile import TemporaryDirectory
 from textwrap import dedent
-from typing import List, Union
+from typing import List, Union, Optional
 
 import pytest
 import numpy as np
 import pandas as pd
+from pydantic import BaseModel, Field
 from scipy import sparse
 import yaml
 from strictyaml import load, YAMLValidationError
@@ -1001,6 +1002,16 @@ def _inner(input_dict):
     yield _inner
 
 
+def normalize_schema(obj):
+    if isinstance(obj, dict):
+        return {k: normalize_schema(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [normalize_schema(v) for v in obj]
+    if obj == "null" or obj == "None":
+        return None
+    return obj
+
+
 class TestReadModelMetadata:
     @pytest.fixture
     def minimal_training_metadata(self, environment_id):
@@ -1010,13 +1021,46 @@ def minimal_training_metadata(self, environment_id):
             "targetType": "regression",
             "environmentID": environment_id,
             "validation": {"input": "hello"},
+            "inputSchema": {
+                "type": "object",
+                "properties": {
+                    "foo": {"type": "integer", "default": "42"},
+                    "bar": {"type": "string"},
+                    "baz": {"type": "boolean"},
+                },
+                "required": ["foo", "bar"],
+            },
         }
 
+    @pytest.fixture
+    def metadata_with_pydantic_schema(self, minimal_training_metadata):
+        """An example of using a pydantic model to generate the input schema
+        with rich annotations, which is common practice when defining pydantic
+        models for automated processing using LLMs.
+        """
+
+        class ExampleSchema(BaseModel):
+            foo: int = Field(..., description="A foo field")
+            bar: str = Field(
+                "bar-value", title="The bar field", description="The bar field with a default value"
+            )
+            baz: Optional[bool] = None
+
+        minimal_training_metadata["inputSchema"] = ExampleSchema.model_json_schema()
+        return minimal_training_metadata
+
     def test_minimal_data(self, model_metadata_file_factory, minimal_training_metadata):
         code_dir = model_metadata_file_factory(minimal_training_metadata)
         result = read_model_metadata_yaml(code_dir)
         assert result == minimal_training_metadata
 
+    def test_metadata_with_pydantic_schema(
+        self, model_metadata_file_factory, metadata_with_pydantic_schema
+    ):
+        code_dir = model_metadata_file_factory(metadata_with_pydantic_schema)
+        result = read_model_metadata_yaml(code_dir)
+        assert normalize_schema(result) == normalize_schema(metadata_with_pydantic_schema)
+
     def test_user_credential_specs(self, model_metadata_file_factory, minimal_training_metadata):
         credential_specs = [
             {"key": "HI", "valueFrom": "65170a6bc4b7f4bec89db932", "reminder": "remember"},
@@ -1176,8 +1220,11 @@ def test_validate_model_metadata_output_requirements_r():
         ("inference_binary_metadata_no_label", 2),
         ("inference_multiclass_metadata_yaml_no_labels", 3),
         ("inference_multiclass_metadata_yaml_labels_and_label_file", 4),
+        ("custom_unstructured_tool_with_invalid_schema1", 5),
+        ("custom_unstructured_tool_with_invalid_schema2", 6),
         ("inference_multiclass_metadata_yaml", 100),
         ("inference_multiclass_metadata_yaml_label_file", 100),
+        ("custom_unstructured_tool_with_schema_in_yaml", 100),
     ],
 )
 def test_yaml_metadata_missing_fields(tmp_path, config_yaml, request, test_case_number):
@@ -1211,5 +1258,17 @@ def test_yaml_metadata_missing_fields(tmp_path, config_yaml, request, test_case_number):
             match="Error - for multiclass classification, either the class labels or a class labels file should be provided in model-metadata.yaml file, but not both",
         ):
             read_model_metadata_yaml(tmp_path)
+    elif test_case_number == 5:
+        with pytest.raises(
+            DrumCommonException,
+            match="Error creating pydantic model from input schema: Only 'object' type schemas are supported, got ",
+        ):
+            read_model_metadata_yaml(tmp_path)
+    elif test_case_number == 6:
+        with pytest.raises(
+            DrumCommonException,
+            match="Error creating pydantic model from input schema: 'properties' must be a dictionary, got ",
+        ):
+            read_model_metadata_yaml(tmp_path)
     elif test_case_number == 100:
         read_model_metadata_yaml(tmp_path)
```
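
A note on normalize_schema used by test_metadata_with_pydantic_schema: metadata written to model-metadata.yaml and read back does not round-trip null-ish scalars byte-for-byte (pydantic emits real None defaults and the type name "null", while the strictyaml-based loader presumably hands scalars back as strings), so both sides are normalized before comparison. Small checks of the helper as defined above:

```python
# normalize_schema folds the string spellings of null back to None so a schema
# that went through a YAML round trip compares equal to the in-memory original.
assert normalize_schema({"default": "None"}) == {"default": None}
assert normalize_schema({"anyOf": [{"type": "null"}]}) == {"anyOf": [{"type": None}]}
# Everything else passes through untouched.
assert normalize_schema({"required": ["foo", "bar"]}) == {"required": ["foo", "bar"]}
```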
