diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index a52f418e4..c0f33d375 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -227,6 +227,9 @@ nlohmann::json ToJson(const SchemaField& field) { json[kName] = field.name(); json[kRequired] = !field.optional(); json[kType] = ToJson(*field.type()); + if (!field.doc().empty()) { + json[kDoc] = field.doc(); + } return json; } @@ -463,9 +466,10 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue(json, kId)); ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); + ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); return std::make_unique(field_id, std::move(name), std::move(type), - !required); + !required, doc); } Result> SchemaFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 04b55a025..206915ec2 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -20,28 +20,29 @@ #include "iceberg/schema_field.h" #include +#include #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { -SchemaField::SchemaField(int32_t field_id, std::string name, std::shared_ptr type, - bool optional, std::string doc) +SchemaField::SchemaField(int32_t field_id, std::string_view name, + std::shared_ptr type, bool optional, std::string_view doc) : field_id_(field_id), - name_(std::move(name)), + name_(name), type_(std::move(type)), optional_(optional), - doc_(std::move(doc)) {} + doc_(doc) {} -SchemaField SchemaField::MakeOptional(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc) { - return {field_id, std::move(name), std::move(type), true, std::move(doc)}; +SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc) { + return {field_id, name, std::move(type), true, doc}; } -SchemaField SchemaField::MakeRequired(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc) { - return {field_id, std::move(name), std::move(type), false, std::move(doc)}; +SchemaField SchemaField::MakeRequired(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc) { + return {field_id, name, std::move(type), false, doc}; } int32_t SchemaField::field_id() const { return field_id_; } diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index c7c826d87..fd20226a5 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -46,15 +46,15 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. - SchemaField(int32_t field_id, std::string name, std::shared_ptr type, - bool optional, std::string doc = {}); + SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, + bool optional, std::string_view doc = {}); /// \brief Construct an optional (nullable) field. - static SchemaField MakeOptional(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc = {}); + static SchemaField MakeOptional(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc = {}); /// \brief Construct a required (non-null) field. - static SchemaField MakeRequired(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc = {}); + static SchemaField MakeRequired(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc = {}); /// \brief Get the field ID. [[nodiscard]] int32_t field_id() const; diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index a32bbe4de..c7b64df2b 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -165,6 +165,7 @@ if(ICEBERG_BUILD_BUNDLE) transaction_test.cc update_partition_spec_test.cc update_properties_test.cc + update_schema_test.cc update_sort_order_test.cc) add_iceberg_test(data_writer_test USE_BUNDLE SOURCES data_writer_test.cc) diff --git a/src/iceberg/test/update_schema_test.cc b/src/iceberg/test/update_schema_test.cc new file mode 100644 index 000000000..ce3c77ccd --- /dev/null +++ b/src/iceberg/test/update_schema_test.cc @@ -0,0 +1,739 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/update_schema.h" + +#include + +#include + +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/update_test_base.h" +#include "iceberg/type.h" + +namespace iceberg { + +class UpdateSchemaTest : public UpdateTestBase {}; + +// Test adding a simple optional column +TEST_F(UpdateSchemaTest, AddOptionalColumn) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "A new integer column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the new column was added + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + EXPECT_EQ(new_field.name(), "new_col"); + EXPECT_EQ(new_field.type(), int32()); + EXPECT_TRUE(new_field.optional()); + EXPECT_EQ(new_field.doc(), "A new integer column"); +} + +// Test adding a required column (should fail without AllowIncompatibleChanges) +TEST_F(UpdateSchemaTest, AddRequiredColumnFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddRequiredColumn("required_col", string(), "A required string column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Incompatible change")); +} + +// Test adding a required column with AllowIncompatibleChanges +TEST_F(UpdateSchemaTest, AddRequiredColumnWithAllowIncompatible) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AllowIncompatibleChanges().AddRequiredColumn("required_col", string(), + "A required string column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the new column was added + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, + result.schema->FindFieldByName("required_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + EXPECT_EQ(new_field.name(), "required_col"); + EXPECT_EQ(new_field.type(), string()); + EXPECT_FALSE(new_field.optional()); + EXPECT_EQ(new_field.doc(), "A required string column"); +} + +// Test adding multiple columns +TEST_F(UpdateSchemaTest, AddMultipleColumns) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("col1", int32(), "First column") + .AddColumn("col2", string(), "Second column") + .AddColumn("col3", boolean(), "Third column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify all columns were added + ICEBERG_UNWRAP_OR_FAIL(auto col1_opt, result.schema->FindFieldByName("col1")); + ICEBERG_UNWRAP_OR_FAIL(auto col2_opt, result.schema->FindFieldByName("col2")); + ICEBERG_UNWRAP_OR_FAIL(auto col3_opt, result.schema->FindFieldByName("col3")); + + ASSERT_TRUE(col1_opt.has_value()); + ASSERT_TRUE(col2_opt.has_value()); + ASSERT_TRUE(col3_opt.has_value()); + + EXPECT_EQ(col1_opt->get().type(), int32()); + EXPECT_EQ(col2_opt->get().type(), string()); + EXPECT_EQ(col3_opt->get().type(), boolean()); +} + +// Test adding column with dot in name should fail for top-level +TEST_F(UpdateSchemaTest, AddColumnWithDotInNameFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("col.with.dots", int32(), "Column with dots"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add column with ambiguous name")); +} + +// Test adding column to nested struct +TEST_F(UpdateSchemaTest, AddColumnToNestedStruct) { + // First, add a struct column to the table + auto struct_type = std::make_shared(std::vector{ + SchemaField(100, "nested_field", int32(), true, "Nested field")}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type, "A struct column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Reload table and add column to the nested struct + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "new_nested_field", string(), "New nested field"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the nested field was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + ASSERT_TRUE(struct_field.type()->is_nested()); + + const auto& nested_struct = static_cast(*struct_field.type()); + ICEBERG_UNWRAP_OR_FAIL(auto nested_field_opt, + nested_struct.GetFieldByName("new_nested_field")); + ASSERT_TRUE(nested_field_opt.has_value()); + + const auto& nested_field = nested_field_opt->get(); + EXPECT_EQ(nested_field.name(), "new_nested_field"); + EXPECT_EQ(nested_field.type(), string()); +} + +// Test adding column to non-existent parent fails +TEST_F(UpdateSchemaTest, AddColumnToNonExistentParentFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("non_existent_parent", "new_field", int32(), "New field"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot find parent struct")); +} + +// Test adding column to non-struct parent fails +TEST_F(UpdateSchemaTest, AddColumnToNonStructParentFails) { + // First, add a primitive column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("primitive_col", int32(), "A primitive column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add column to the primitive column (should fail) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("primitive_col", "nested_field", string(), "Should fail"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add to non-struct column")); +} + +// Test adding duplicate column name fails +TEST_F(UpdateSchemaTest, AddDuplicateColumnNameFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("duplicate_col", int32(), "First column") + .AddColumn("duplicate_col", string(), "Duplicate column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("Duplicate path found")); +} + +// Test column ID assignment +TEST_F(UpdateSchemaTest, ColumnIdAssignment) { + ICEBERG_UNWRAP_OR_FAIL(auto original_schema, table_->schema()); + int32_t original_last_id = table_->metadata()->last_column_id; + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col1", int32(), "First new column") + .AddColumn("new_col2", string(), "Second new column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify new last column ID is incremented + EXPECT_EQ(result.new_last_column_id, original_last_id + 2); + + // Verify new columns have sequential IDs + ICEBERG_UNWRAP_OR_FAIL(auto col1_opt, result.schema->FindFieldByName("new_col1")); + ICEBERG_UNWRAP_OR_FAIL(auto col2_opt, result.schema->FindFieldByName("new_col2")); + + ASSERT_TRUE(col1_opt.has_value()); + ASSERT_TRUE(col2_opt.has_value()); + + EXPECT_EQ(col1_opt->get().field_id(), original_last_id + 1); + EXPECT_EQ(col2_opt->get().field_id(), original_last_id + 2); +} + +// Test adding nested struct with multiple fields +TEST_F(UpdateSchemaTest, AddNestedStructColumn) { + auto nested_struct = std::make_shared(std::vector{ + SchemaField(100, "field1", int32(), true, "First nested field"), + SchemaField(101, "field2", string(), false, "Second nested field")}); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("complex_struct", nested_struct, "A complex struct column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the struct column was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("complex_struct")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + EXPECT_EQ(struct_field.name(), "complex_struct"); + EXPECT_TRUE(struct_field.type()->is_nested()); + EXPECT_TRUE(struct_field.optional()); + + // Verify nested fields exist + const auto& nested_type = static_cast(*struct_field.type()); + EXPECT_EQ(nested_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_type.GetFieldByName("field1")); + ICEBERG_UNWRAP_OR_FAIL(auto field2_opt, nested_type.GetFieldByName("field2")); + + ASSERT_TRUE(field1_opt.has_value()); + ASSERT_TRUE(field2_opt.has_value()); + + EXPECT_EQ(field1_opt->get().type(), int32()); + EXPECT_EQ(field2_opt->get().type(), string()); + EXPECT_TRUE(field1_opt->get().optional()); + EXPECT_FALSE(field2_opt->get().optional()); +} + +// Test case sensitivity +TEST_F(UpdateSchemaTest, CaseSensitiveColumnNames) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->CaseSensitive(true) + .AddColumn("Column", int32(), "Uppercase column") + .AddColumn("column", string(), "Lowercase column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Both columns should exist with case-sensitive search + ICEBERG_UNWRAP_OR_FAIL(auto upper_opt, result.schema->FindFieldByName("Column", true)); + ICEBERG_UNWRAP_OR_FAIL(auto lower_opt, result.schema->FindFieldByName("column", true)); + + ASSERT_TRUE(upper_opt.has_value()); + ASSERT_TRUE(lower_opt.has_value()); + + EXPECT_EQ(upper_opt->get().type(), int32()); + EXPECT_EQ(lower_opt->get().type(), string()); +} + +// Test case insensitive duplicate detection +TEST_F(UpdateSchemaTest, CaseInsensitiveDuplicateDetection) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->CaseSensitive(false) + .AddColumn("Column", int32(), "First column") + .AddColumn("COLUMN", string(), "Duplicate column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("Duplicate path found")); +} + +// Test empty update +TEST_F(UpdateSchemaTest, EmptyUpdate) { + ICEBERG_UNWRAP_OR_FAIL(auto original_schema, table_->schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Schema should be unchanged + EXPECT_EQ(*result.schema, *original_schema); + EXPECT_EQ(result.new_last_column_id, table_->metadata()->last_column_id); +} + +// Test commit success +TEST_F(UpdateSchemaTest, CommitSuccess) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("committed_col", int64(), "A committed column"); + + EXPECT_THAT(update->Commit(), IsOk()); + + // Reload table and verify column exists + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto schema, reloaded->schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, schema->FindFieldByName("committed_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.name(), "committed_col"); + EXPECT_EQ(*field.type(), *int64()); + EXPECT_TRUE(field.optional()); + EXPECT_EQ(field.doc(), "A committed column"); +} + +// Test adding fields to map value and list element structs +TEST_F(UpdateSchemaTest, AddFieldsToMapAndList) { + // Create a schema with map and list of structs + auto map_key_struct = std::make_shared( + std::vector{SchemaField(20, "address", string(), false), + SchemaField(21, "city", string(), false)}); + + auto map_value_struct = std::make_shared( + std::vector{SchemaField(12, "lat", float32(), false), + SchemaField(13, "long", float32(), false)}); + + auto map_type = + std::make_shared(SchemaField(10, "key", map_key_struct, false), + SchemaField(11, "value", map_value_struct, false)); + + auto list_element_struct = std::make_shared(std::vector{ + SchemaField(15, "x", int64(), false), SchemaField(16, "y", int64(), false)}); + + auto list_type = + std::make_shared(SchemaField(14, "element", list_element_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("locations", map_type, "map of address to coordinate") + .AddColumn("points", list_type, "2-D cartesian points"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Reload and add fields to nested structs + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update + ->AddColumn("locations", "alt", float32(), "altitude") // add to map value + .AddColumn("points", "z", int64(), "z coordinate"); // add to list element + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify map value has new field + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kMap); + + const auto& map = static_cast(*locations_field.type()); + const auto& value_struct = static_cast(*map.value().type()); + ICEBERG_UNWRAP_OR_FAIL(auto alt_opt, value_struct.GetFieldByName("alt")); + ASSERT_TRUE(alt_opt.has_value()); + EXPECT_EQ(alt_opt->get().type(), float32()); + + // Verify list element has new field + ICEBERG_UNWRAP_OR_FAIL(auto points_opt, result.schema->FindFieldByName("points")); + ASSERT_TRUE(points_opt.has_value()); + const auto& points_field = points_opt->get(); + ASSERT_EQ(points_field.type()->type_id(), TypeId::kList); + + const auto& list = static_cast(*points_field.type()); + const auto& element_struct = static_cast(*list.element().type()); + ICEBERG_UNWRAP_OR_FAIL(auto z_opt, element_struct.GetFieldByName("z")); + ASSERT_TRUE(z_opt.has_value()); + EXPECT_EQ(z_opt->get().type(), int64()); +} + +// Test adding nested struct with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedStructWithIdReassignment) { + // Create a struct with conflicting IDs (will be reassigned) + auto nested_struct = std::make_shared(std::vector{ + SchemaField(1, "lat", int32(), false), // ID 1 conflicts with existing schema + SchemaField(2, "long", int32(), true)}); // ID 2 conflicts with existing schema + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("location", nested_struct); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the struct was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto location_opt, result.schema->FindFieldByName("location")); + ASSERT_TRUE(location_opt.has_value()); + + const auto& location_field = location_opt->get(); + ASSERT_TRUE(location_field.type()->is_nested()); + + const auto& struct_type = static_cast(*location_field.type()); + ASSERT_EQ(struct_type.fields().size(), 2); + + // IDs should be reassigned to avoid conflicts + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); + + // IDs should be > 1 (reassigned) + EXPECT_GT(lat_opt->get().field_id(), 1); + EXPECT_GT(long_opt->get().field_id(), 1); +} + +// Test adding nested map of structs with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedMapOfStructs) { + auto key_struct = std::make_shared(std::vector{ + SchemaField(20, "address", string(), false), + SchemaField(21, "city", string(), false), SchemaField(22, "state", string(), false), + SchemaField(23, "zip", int32(), false)}); + + auto value_struct = std::make_shared(std::vector{ + SchemaField(9, "lat", int32(), false), SchemaField(8, "long", int32(), true)}); + + auto map_type = std::make_shared(SchemaField(1, "key", key_struct, false), + SchemaField(2, "value", value_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("locations", map_type); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the map was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kMap); + + const auto& map = static_cast(*locations_field.type()); + + // Verify key struct fields + const auto& key_struct_type = static_cast(*map.key().type()); + EXPECT_EQ(key_struct_type.fields().size(), 4); + + // Verify value struct fields + const auto& value_struct_type = static_cast(*map.value().type()); + EXPECT_EQ(value_struct_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, value_struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, value_struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); +} + +// Test adding nested list of structs with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedListOfStructs) { + auto element_struct = std::make_shared(std::vector{ + SchemaField(9, "lat", int32(), false), SchemaField(8, "long", int32(), true)}); + + auto list_type = + std::make_shared(SchemaField(1, "element", element_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("locations", list_type); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the list was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kList); + + const auto& list = static_cast(*locations_field.type()); + const auto& element_struct_type = + static_cast(*list.element().type()); + + EXPECT_EQ(element_struct_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, element_struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, element_struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); +} + +// Test adding field with dots in name to nested struct +TEST_F(UpdateSchemaTest, AddFieldWithDotsInName) { + // First add a struct column + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Add a field with dots in its name to the nested struct + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "field.with.dots", int64(), "Field with dots in name"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the field with dots was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto dotted_field_opt, + nested_struct.GetFieldByName("field.with.dots")); + ASSERT_TRUE(dotted_field_opt.has_value()); + EXPECT_EQ(dotted_field_opt->get().name(), "field.with.dots"); + EXPECT_EQ(dotted_field_opt->get().type(), int64()); +} + +// Test adding field to map key should fail +TEST_F(UpdateSchemaTest, AddFieldToMapKeyFails) { + // Create a map with struct key + auto key_struct = std::make_shared( + std::vector{SchemaField(20, "address", string(), false)}); + + auto value_struct = std::make_shared( + std::vector{SchemaField(12, "lat", float32(), false)}); + + auto map_type = + std::make_shared(SchemaField(10, "key", key_struct, false), + SchemaField(11, "value", value_struct, false)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("locations", map_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add field to map key (should fail) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("locations.key", "city", string(), "Should fail"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add fields to map keys")); +} + +// Test deleting a column +TEST_F(UpdateSchemaTest, DeleteColumn) { + // First add a column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("to_delete", string(), "A column to delete"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete the column + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("to_delete"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was deleted + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("to_delete")); + EXPECT_FALSE(field_opt.has_value()); +} + +// Test deleting a nested column +TEST_F(UpdateSchemaTest, DeleteNestedColumn) { + // First add a struct with nested fields + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true), + SchemaField(101, "field2", string(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete one of the nested fields + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("struct_col.field1"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify field1 was deleted but field2 still exists + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_struct.GetFieldByName("field1")); + ICEBERG_UNWRAP_OR_FAIL(auto field2_opt, nested_struct.GetFieldByName("field2")); + + EXPECT_FALSE(field1_opt.has_value()); + EXPECT_TRUE(field2_opt.has_value()); +} + +// Test deleting missing column fails +TEST_F(UpdateSchemaTest, DeleteMissingColumnFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->DeleteColumn("non_existent"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete missing column")); +} + +// Test delete then add same column +TEST_F(UpdateSchemaTest, DeleteThenAdd) { + // First add a required column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AllowIncompatibleChanges().AddRequiredColumn("col", int32(), + "Required column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete then add with different type and optional + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("col").AddColumn("col", string(), "Now optional string"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was re-added with new properties + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.type(), string()); + EXPECT_TRUE(field.optional()); +} + +// Test delete then add nested field +TEST_F(UpdateSchemaTest, DeleteThenAddNested) { + // First add a struct with a field + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", boolean(), false)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete then re-add the nested field with different type + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("struct_col.field1") + .AddColumn("struct_col", "field1", int32(), "Re-added field"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the field was re-added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_struct.GetFieldByName("field1")); + ASSERT_TRUE(field1_opt.has_value()); + EXPECT_EQ(field1_opt->get().type(), int32()); +} + +// Test add-delete conflict +TEST_F(UpdateSchemaTest, AddDeleteConflict) { + // Try to delete a newly added column (should fail - column doesn't exist in schema) + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32()).DeleteColumn("new_col"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete missing column")); +} + +// Test delete column that has additions fails +TEST_F(UpdateSchemaTest, DeleteColumnWithAdditionsFails) { + // First add a struct + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add a field to the struct and delete it in the same update + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "field2", string()).DeleteColumn("struct_col"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete a column that has additions")); +} + +// Test delete map key fails +TEST_F(UpdateSchemaTest, DeleteMapKeyFails) { + // Create a map + auto map_type = std::make_shared(SchemaField(10, "key", string(), false), + SchemaField(11, "value", int32(), true)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("map_col", map_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to delete the map key (should fail in Apply) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("map_col.key"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete map keys")); +} + +// Test case insensitive delete +TEST_F(UpdateSchemaTest, DeleteColumnCaseInsensitive) { + // First add a column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("MyColumn", string(), "A column with mixed case"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete using different case + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->CaseSensitive(false).DeleteColumn("mycolumn"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was deleted + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, + result.schema->FindFieldByName("MyColumn", false)); + EXPECT_FALSE(field_opt.has_value()); +} + +} // namespace iceberg diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index ed10e127e..b4e876780 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -27,6 +27,7 @@ #include "iceberg/exception.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" @@ -151,7 +152,12 @@ ListType::ListType(int32_t field_id, std::shared_ptr type, bool optional) : element_(field_id, std::string(kElementName), std::move(type), optional) {} TypeId ListType::type_id() const { return kTypeId; } -std::string ListType::ToString() const { return std::format("list<{}>", element_); } + +const SchemaField& ListType::element() const { return element_; } + +std::string ListType::ToString() const { + return std::vformat("list<{}>", std::make_format_args(element_)); +} std::span ListType::fields() const { return {&element_, 1}; } Result> ListType::GetFieldById( @@ -207,7 +213,7 @@ const SchemaField& MapType::value() const { return fields_[1]; } TypeId MapType::type_id() const { return kTypeId; } std::string MapType::ToString() const { - return std::format("map<{}: {}>", key(), value()); + return std::vformat("map<{}: {}>", std::make_format_args(key(), value())); } std::span MapType::fields() const { return fields_; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 7e17b78d4..1c50135dc 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -155,6 +155,7 @@ class ICEBERG_EXPORT ListType : public NestedType { ~ListType() override = default; TypeId type_id() const override; + const SchemaField& element() const; std::string ToString() const override; std::span fields() const override; diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 14b962bd8..0e81c4ad7 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -19,6 +19,7 @@ #include "iceberg/update/update_schema.h" +#include #include #include #include @@ -26,16 +27,220 @@ #include #include #include +#include #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" #include "iceberg/transaction.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/error_collector.h" #include "iceberg/util/macros.h" +#include "iceberg/util/type_util.h" +#include "iceberg/util/visit_type.h" namespace iceberg { +namespace { +constexpr int32_t kTableRootId = -1; + +/// \brief Visitor for applying schema changes recursively to nested types +class ApplyChangesVisitor { + public: + ApplyChangesVisitor( + const std::unordered_set& deletes, + const std::unordered_map>& updates, + const std::unordered_map>& parent_to_added_ids) + : deletes_(deletes), updates_(updates), parent_to_added_ids_(parent_to_added_ids) {} + + /// \brief Apply changes to a type using schema visitor pattern + Result> ApplyChanges(const std::shared_ptr& type, + int32_t parent_id) { + return VisitTypeCategory(*type, this, type, parent_id); + } + + /// \brief Apply changes to a struct type + Result> VisitStruct(const StructType& struct_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + std::vector new_fields; + bool has_changes = false; + + // Process existing fields + for (const auto& field : struct_type.fields()) { + // Recursively process the field's type first + ICEBERG_ASSIGN_OR_RAISE(auto field_type_result, + ApplyChanges(field.type(), field.field_id())); + + // Process field-level changes (deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_field, + ProcessField(field, field_type_result)); + + if (processed_field.has_value()) { + const auto& new_field = processed_field.value(); + new_fields.push_back(new_field); + + // Check if this field changed + if (new_field != field) { + has_changes = true; + } + } else { + // Field was deleted + has_changes = true; + } + } + + // Add new fields for this struct + auto adds_it = parent_to_added_ids_.find(parent_id); + if (adds_it != parent_to_added_ids_.end() && !adds_it->second.empty()) { + has_changes = true; + for (int32_t added_id : adds_it->second) { + auto added_field_it = updates_.find(added_id); + if (added_field_it != updates_.end()) { + new_fields.push_back(*added_field_it->second); + } + } + } + + // Return original type if nothing changed + if (!has_changes) { + return base_type; + } + + return std::make_shared(std::move(new_fields)); + } + + /// \brief Apply changes to a list type + Result> VisitList(const ListType& list_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + const auto& element = list_type.element(); + + // Recursively process element type + ICEBERG_ASSIGN_OR_RAISE(auto element_type_result, + ApplyChanges(element.type(), element.field_id())); + + // Process element field (handles deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_element, + ProcessField(element, element_type_result)); + + ICEBERG_CHECK(processed_element.has_value(), + "Cannot delete element field from list: {}", list_type.ToString()); + + const auto& new_element = processed_element.value(); + + // Return unchanged if element didn't change + if (element == new_element) { + return base_type; + } + + return std::make_shared(new_element); + } + + /// \brief Apply changes to a map type + Result> VisitMap(const MapType& map_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + const auto& key = map_type.key(); + const auto& value = map_type.value(); + + // Check for key modifications (not allowed in Iceberg) + int32_t key_id = key.field_id(); + ICEBERG_CHECK(!deletes_.contains(key_id), "Cannot delete map keys"); + ICEBERG_CHECK(!updates_.contains(key_id), "Cannot update map keys"); + ICEBERG_CHECK(!parent_to_added_ids_.contains(key_id), + "Cannot add fields to map keys"); + + // Recursively process key and value types + ICEBERG_ASSIGN_OR_RAISE(auto key_type_result, ApplyChanges(key.type(), key_id)); + ICEBERG_ASSIGN_OR_RAISE(auto value_type_result, + ApplyChanges(value.type(), value.field_id())); + + // Key type must not change + ICEBERG_CHECK(*key_type_result == *key.type(), "Cannot alter map keys"); + + // Process value field (handles deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_value, ProcessField(value, value_type_result)); + + ICEBERG_CHECK(processed_value.has_value(), "Cannot delete value field from map: {}", + map_type.ToString()); + + const auto& new_value = processed_value.value(); + + // Return unchanged if nothing changed + if (key == map_type.key() && value == new_value) { + return base_type; + } + + return std::make_shared(key, new_value); + } + + /// \brief Handle primitive types - return unchanged + Result> VisitPrimitive(const PrimitiveType& primitive_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + // Primitive types are returned as-is + return base_type; + } + + private: + /// \brief Process a field: handle deletes, updates, and nested additions + /// + /// It processes field-level operations after the field's type has been recursively + /// processed. + Result> ProcessField( + const SchemaField& field, const std::shared_ptr& field_type_result) { + int32_t field_id = field.field_id(); + + // 1. Handle deletes + if (deletes_.contains(field_id)) { + // Field is deleted + return std::nullopt; + } + + // 2. Start with the recursively processed type + std::shared_ptr result_type = field_type_result; + + // 3. Handle type updates (e.g., type widening) + // Note: We check the update against the ORIGINAL field type, not the recursively + // processed type, because we want to preserve nested changes from recursion + auto update_it = updates_.find(field_id); + if (update_it != updates_.end()) { + const auto& update_field = update_it->second; + // If the update specifies a type change, use the new type + // Otherwise keep the recursively processed type + if (update_field->type() != field.type()) { + result_type = update_field->type(); + } + } + + // Note: Nested field additions are handled in VisitStruct, not here + // to avoid duplication + + // 4. Build the result field + if (update_it != updates_.end()) { + // Use update field metadata but with the processed type + const auto& update_field = update_it->second; + return SchemaField(field_id, update_field->name(), std::move(result_type), + update_field->optional(), update_field->doc()); + } else if (result_type != field.type()) { + // Type changed but no field-level update + return SchemaField(field_id, field.name(), std::move(result_type), field.optional(), + field.doc()); + } else { + // No changes + return field; + } + } + + const std::unordered_set& deletes_; + const std::unordered_map>& updates_; + const std::unordered_map>& parent_to_added_ids_; +}; + +} // namespace + Result> UpdateSchema::Make( std::shared_ptr transaction) { ICEBERG_PRECHECK(transaction != nullptr, @@ -64,8 +269,10 @@ UpdateSchema::UpdateSchema(std::shared_ptr transaction) AddError(identifier_names_result.error()); return; } - identifier_field_names_ = identifier_names_result.value() | - std::ranges::to>(); + identifier_field_names_ = std::move(identifier_names_result.value()); + + // Initialize id_to_parent map from the schema + id_to_parent_ = IndexParents(*schema_); } UpdateSchema::~UpdateSchema() = default; @@ -132,16 +339,6 @@ UpdateSchema& UpdateSchema::UpdateColumnDoc(std::string_view name, return *this; } -UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, - std::string_view name, bool is_optional, - std::shared_ptr type, - std::string_view doc) { - // TODO(Guotao Yu): Implement AddColumnInternal logic - // This is where the real work happens - finding parent, validating, etc. - AddError(NotImplemented("UpdateSchema::AddColumnInternal not implemented")); - return *this; -} - UpdateSchema& UpdateSchema::RenameColumn(std::string_view name, std::string_view new_name) { // TODO(Guotao Yu): Implement RenameColumn @@ -162,8 +359,20 @@ UpdateSchema& UpdateSchema::RequireColumn(std::string_view name) { } UpdateSchema& UpdateSchema::DeleteColumn(std::string_view name) { - // TODO(Guotao Yu): Implement DeleteColumn - AddError(NotImplemented("UpdateSchema::DeleteColumn not implemented")); + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto field_opt, FindField(name)); + ICEBERG_BUILDER_CHECK(field_opt.has_value(), "Cannot delete missing column: {}", name); + + const auto& field = field_opt->get(); + int32_t field_id = field.field_id(); + + ICEBERG_BUILDER_CHECK(!parent_to_added_ids_.contains(field_id), + "Cannot delete a column that has additions: {}", name); + ICEBERG_BUILDER_CHECK(!updates_.contains(field_id), + "Cannot delete a column that has updates: {}", name); + + // Add to deletes set + deletes_.insert(field_id); + return *this; } @@ -195,13 +404,172 @@ UpdateSchema& UpdateSchema::UnionByNameWith(std::shared_ptr new_schema) UpdateSchema& UpdateSchema::SetIdentifierFields( const std::span& names) { - identifier_field_names_ = names | std::ranges::to>(); + identifier_field_names_ = names | std::ranges::to>(); return *this; } Result UpdateSchema::Apply() { - // TODO(Guotao Yu): Implement Apply - return NotImplemented("UpdateSchema::Apply not implemented"); + ICEBERG_RETURN_UNEXPECTED(CheckErrors()); + + // Validate existing identifier fields are not deleted + for (const auto& name : identifier_field_names_) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, FindField(name)); + if (field_opt.has_value()) { + const auto& field = field_opt->get(); + auto field_id = field.field_id(); + + ICEBERG_CHECK(!deletes_.contains(field_id), + "Cannot delete identifier field {}. To force deletion, also call " + "SetIdentifierFields to update identifier fields.", + name); + + // Check no parent of this field is deleted + auto parent_it = id_to_parent_.find(field_id); + while (parent_it != id_to_parent_.end()) { + int32_t parent_id = parent_it->second; + ICEBERG_CHECK( + !deletes_.contains(parent_id), + "Cannot delete field with id {} as it will delete nested identifier field {}", + parent_id, name); + parent_it = id_to_parent_.find(parent_id); + } + } + } + + // Apply changes recursively using the visitor + ApplyChangesVisitor visitor(deletes_, updates_, parent_to_added_ids_); + ICEBERG_ASSIGN_OR_RAISE(auto new_type, visitor.ApplyChanges(schema_, kTableRootId)); + + // Cast result back to StructType and extract fields + auto new_struct_type = internal::checked_pointer_cast(new_type); + + // Convert identifier field names to IDs + auto temp_schema = new_struct_type->ToSchema(); + std::vector fresh_identifier_ids; + for (const auto& name : identifier_field_names_) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, + temp_schema->FindFieldByName(name, case_sensitive_)); + ICEBERG_CHECK(field_opt.has_value(), + "Cannot add field {} as an identifier field: not found in current " + "schema or added columns", + name); + fresh_identifier_ids.push_back(field_opt->get().field_id()); + } + + // Create the new schema + auto new_fields = temp_schema->fields() | std::ranges::to>(); + ICEBERG_ASSIGN_OR_RAISE( + auto new_schema, + Schema::Make(std::move(new_fields), schema_->schema_id(), fresh_identifier_ids)); + + return ApplyResult{.schema = std::move(new_schema), + .new_last_column_id = last_column_id_}; +} + +// TODO(Guotao Yu): v3 default value is not yet supported +UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, + std::string_view name, bool is_optional, + std::shared_ptr type, + std::string_view doc) { + int32_t parent_id = kTableRootId; + std::string full_name; + + // Handle parent field + if (parent.has_value()) { + ICEBERG_BUILDER_CHECK(!parent->empty(), "Parent name cannot be empty"); + // Find parent field + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_field_opt, FindField(*parent)); + ICEBERG_BUILDER_CHECK(parent_field_opt.has_value(), "Cannot find parent struct: {}", + *parent); + + const SchemaField& parent_field = parent_field_opt->get(); + const auto& parent_type = parent_field.type(); + + // Get the actual field to add to (handle map/list) + const SchemaField* target_field = &parent_field; + + if (parent_type->type_id() == TypeId::kMap) { + // For maps, add to value field + const auto& map_type = internal::checked_cast(*parent_type); + target_field = &map_type.value(); + } else if (parent_type->type_id() == TypeId::kList) { + // For lists, add to element field + const auto& list_type = internal::checked_cast(*parent_type); + target_field = &list_type.element(); + } + + // Validate target is a struct + ICEBERG_BUILDER_CHECK(target_field->type()->type_id() == TypeId::kStruct, + "Cannot add to non-struct column: {}: {}", *parent, + target_field->type()->ToString()); + + parent_id = target_field->field_id(); + + // Check parent is not being deleted + ICEBERG_BUILDER_CHECK(!deletes_.contains(parent_id), + "Cannot add to a column that will be deleted: {}", *parent); + + // Check field doesn't already exist (unless it's being deleted) + std::string nested_name = std::format("{}.{}", *parent, name); + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field, FindField(nested_name)); + ICEBERG_BUILDER_CHECK( + !current_field.has_value() || deletes_.contains(current_field->get().field_id()), + "Cannot add column, name already exists: {}.{}", *parent, name); + + // Build full name using canonical name of parent + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_name_opt, + schema_->FindColumnNameById(parent_id)); + ICEBERG_BUILDER_CHECK(parent_name_opt.has_value(), + "Cannot find column name for parent id: {}", parent_id); + + full_name = std::format("{}.{}", *parent_name_opt, name); + } else { + // Top-level field + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field, FindField(name)); + ICEBERG_BUILDER_CHECK( + !current_field.has_value() || deletes_.contains(current_field->get().field_id()), + "Cannot add column, name already exists: {}", name); + + full_name = std::string(name); + } + + // V3 supports default values, but this implementation doesn't support them yet + // Check for incompatible change: adding required column without default + ICEBERG_BUILDER_CHECK( + is_optional || allow_incompatible_changes_, + "Incompatible change: cannot add required column without a default value: {}", + full_name); + + // Assign new column ID + int32_t new_id = AssignNewColumnId(); + + // Update tracking for moves + added_name_to_id_[full_name] = new_id; + if (parent_id != kTableRootId) { + id_to_parent_[new_id] = parent_id; + } + + // Assign fresh IDs to nested types + AssignFreshIdVisitor id_assigner([this]() { return AssignNewColumnId(); }); + auto type_with_fresh_ids = id_assigner.Visit(type); + + // Create new field + auto new_field = std::make_shared(new_id, std::string(name), + std::move(type_with_fresh_ids), + is_optional, std::string(doc)); + + // Record the update + updates_[new_id] = std::move(new_field); + parent_to_added_ids_[parent_id].push_back(new_id); + + return *this; +} + +int32_t UpdateSchema::AssignNewColumnId() { return ++last_column_id_; } + +Result>> UpdateSchema::FindField( + std::string_view name) const { + return schema_->FindFieldByName(name, case_sensitive_); } } // namespace iceberg diff --git a/src/iceberg/update/update_schema.h b/src/iceberg/update/update_schema.h index bed2bfeb2..d68c291d8 100644 --- a/src/iceberg/update/update_schema.h +++ b/src/iceberg/update/update_schema.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "iceberg/iceberg_export.h" @@ -345,12 +346,31 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { std::string_view name, bool is_optional, std::shared_ptr type, std::string_view doc); + /// \brief Assign a new column ID and increment the counter. + int32_t AssignNewColumnId(); + + /// \brief Find a field by name using case-sensitive or case-insensitive search. + Result>> FindField( + std::string_view name) const; + // Internal state std::shared_ptr schema_; int32_t last_column_id_; bool allow_incompatible_changes_{false}; bool case_sensitive_{true}; - std::unordered_set identifier_field_names_; + std::vector identifier_field_names_; + + // Tracking changes + // field ID -> parent field ID + std::unordered_map id_to_parent_; + // field IDs to delete + std::unordered_set deletes_; + // field ID -> updated field + std::unordered_map> updates_; + // parent ID -> added child IDs + std::unordered_map> parent_to_added_ids_; + // full name -> field ID for added fields + std::unordered_map added_name_to_id_; }; } // namespace iceberg diff --git a/src/iceberg/util/visit_type.h b/src/iceberg/util/visit_type.h index ce734e8f0..bf52d2e9a 100644 --- a/src/iceberg/util/visit_type.h +++ b/src/iceberg/util/visit_type.h @@ -124,4 +124,42 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { #undef TYPE_ID_VISIT_INLINE +/// \brief Visit a type using a categorical visitor pattern +/// +/// This function provides a simplified visitor interface that groups Iceberg types into +/// four categories based on their structural properties: +/// +/// - **Struct types**: Complex types with named fields (StructType) +/// - **List types**: Sequential container types (ListType) +/// - **Map types**: Key-value container types (MapType) +/// - **Primitive types**: All leaf types without nested structure (14 primitive types) +/// +/// This grouping is useful for algorithms that need to distinguish between container +/// types and leaf types, but don't require separate handling for each primitive type +/// variant (e.g., Int vs Long vs String). +/// +/// \tparam VISITOR Visitor class that must implement four Visit methods: +/// - `VisitStruct(const StructType&, ARGS...)` for struct types +/// - `VisitList(const ListType&, ARGS...)` for list types +/// - `VisitMap(const MapType&, ARGS...)` for map types +/// - `VisitPrimitive(const PrimitiveType&, ARGS...)` for all primitive types +/// \tparam ARGS Additional argument types forwarded to Visit methods +/// \param type The type to visit +/// \param visitor Pointer to the visitor instance +/// \param args Additional arguments forwarded to the Visit methods +/// \return The return value from the invoked Visit method +template +inline auto VisitTypeCategory(const Type& type, VISITOR* visitor, ARGS&&... args) { +#define SCHEMA_VISIT_ACTION(TYPE_CLASS) \ + return visitor->Visit##TYPE_CLASS( \ + internal::checked_cast(type), \ + std::forward(args)...); + + switch (type.type_id()) { + ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(SCHEMA_VISIT_ACTION) + } + +#undef SCHEMA_VISIT_ACTION +} + } // namespace iceberg diff --git a/src/iceberg/util/visitor_generate.h b/src/iceberg/util/visitor_generate.h index 2ea8282cb..053371d41 100644 --- a/src/iceberg/util/visitor_generate.h +++ b/src/iceberg/util/visitor_generate.h @@ -40,4 +40,21 @@ namespace iceberg { ACTION(List); \ ACTION(Map); +/// \brief Generate switch-case for schema visitor pattern +/// +/// This macro generates switch cases that dispatch to visitor methods based on type: +/// - Struct types -> calls ACTION with Struct +/// - List types -> calls ACTION with List +/// - Map types -> calls ACTION with Map +/// - All primitive types (default) -> calls ACTION with Primitive +#define ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(ACTION) \ + case ::iceberg::TypeId::kStruct: \ + ACTION(Struct) \ + case ::iceberg::TypeId::kList: \ + ACTION(List) \ + case ::iceberg::TypeId::kMap: \ + ACTION(Map) \ + default: \ + ACTION(Primitive) + } // namespace iceberg