From c7ba4b13808f53a2acb04c6cc42336057abd6120 Mon Sep 17 00:00:00 2001 From: Somasundaram Sekar Date: Mon, 8 Dec 2025 11:13:34 +0100 Subject: [PATCH] fix: Allow writing data with missing optional map fields When writing data to a table with an optional map field, the schema compatibility check incorrectly failed if the data was missing that field. This happened because the validator descended into the map's internal key field (which is always required) even when the parent map field was optional and missing. The fix modifies _SchemaCompatibilityVisitor.field() to skip child validation when the parent field is missing from the provided schema: the result is then decided by the parent field alone, so a missing optional field is accepted while a missing required field is still rejected. The same applies to optional list and struct fields. Closes #2684 --- pyiceberg/schema.py | 6 ++- tests/test_schema.py | 105 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 57ef915c04..5896e7e1eb 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1783,7 +1783,11 @@ def struct(self, struct: StructType, field_results: builtins.list[Callable[[], b return all(results) def field(self, field: NestedField, field_result: Callable[[], bool]) -> bool: - return self._is_field_compatible(field) and field_result() + # Skip child validation for missing optional fields (#2797) + is_compatible = self._is_field_compatible(field) + if field.field_id not in self.provided_schema._lazy_id_to_field: + return is_compatible + return is_compatible and field_result() def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool: return self._is_field_compatible(list_type.element_field) and element_result() diff --git a/tests/test_schema.py b/tests/test_schema.py index 589a45c3b4..0c006879ea 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -25,6 +25,7 @@ from pyiceberg.schema import ( Accessor, Schema, + _check_schema_compatible, build_position_accessors, index_by_id, index_by_name, @@ -1687,3 +1688,107 @@ def test_arrow_schema() -> None: ) assert
base_schema.as_arrow() == expected_schema + + +def test_check_schema_compatible_optional_map_field_missing() -> None: + """Test that optional map field missing from provided schema is compatible (issue #2684).""" + requested_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="data", + field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()), + required=False, # Optional map field + ), + ) + # Provided schema is missing the optional map field + provided_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + ) + # Should not raise - optional field can be missing + _check_schema_compatible(requested_schema, provided_schema) + + +def test_check_schema_compatible_required_map_field_missing() -> None: + """Test that required map field missing from provided schema raises error.""" + requested_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="data", + field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()), + required=True, # Required map field + ), + ) + # Provided schema is missing the required map field + provided_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + ) + # Should raise - required field cannot be missing + with pytest.raises(ValueError, match="Mismatch in fields"): + _check_schema_compatible(requested_schema, provided_schema) + + +def test_check_schema_compatible_optional_list_field_missing() -> None: + """Test that optional list field missing from provided schema is compatible.""" + requested_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="items", + field_type=ListType(element_id=3, element_type=StringType(), element_required=True), + required=False, # Optional list field + 
), + ) + # Provided schema is missing the optional list field + provided_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + ) + # Should not raise - optional field can be missing + _check_schema_compatible(requested_schema, provided_schema) + + +def test_check_schema_compatible_optional_struct_field_missing() -> None: + """Test that optional struct field missing from provided schema is compatible.""" + requested_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="details", + field_type=StructType( + NestedField(field_id=3, name="name", field_type=StringType(), required=True), + NestedField(field_id=4, name="count", field_type=IntegerType(), required=True), + ), + required=False, # Optional struct field + ), + ) + # Provided schema is missing the optional struct field + provided_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + ) + # Should not raise - optional field can be missing + _check_schema_compatible(requested_schema, provided_schema) + + +def test_check_schema_compatible_optional_map_field_present() -> None: + """Test that optional map field present in provided schema is compatible.""" + requested_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="data", + field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()), + required=False, + ), + ) + provided_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="data", + field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()), + required=False, + ), + ) + # Should not raise - schemas match + _check_schema_compatible(requested_schema, provided_schema)