Skip to content

Commit ff8eea9

Browse files
authored
feat: implement ValidateRedundantPartitions for PartitionSpec (#442)
- Add ValidateRedundantPartitions method to detect redundant partition fields - Use deduplication key (source_id + transform.DedupName()) to identify conflicts - Integrate validation into PartitionSpec::Validate method
1 parent 4657dcd commit ff8eea9

File tree

7 files changed

+440
-26
lines changed

7 files changed

+440
-26
lines changed

src/iceberg/partition_spec.cc

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@
2323
#include <cstddef>
2424
#include <cstdint>
2525
#include <format>
26+
#include <map>
2627
#include <memory>
2728
#include <ranges>
2829
#include <unordered_map>
30+
#include <utility>
2931

3032
#include "iceberg/result.h"
3133
#include "iceberg/schema.h"
@@ -132,6 +134,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {
132134

133135
Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const {
134136
ICEBERG_RETURN_UNEXPECTED(ValidatePartitionName(schema, *this));
137+
ICEBERG_RETURN_UNEXPECTED(ValidateRedundantPartitions(*this));
135138

136139
std::unordered_map<int32_t, int32_t> parents = IndexParents(schema);
137140
for (const auto& partition_field : fields_) {
@@ -184,11 +187,10 @@ Status PartitionSpec::ValidatePartitionName(const Schema& schema,
184187
std::unordered_set<std::string> partition_names;
185188
for (const auto& partition_field : spec.fields()) {
186189
auto name = std::string(partition_field.name());
187-
ICEBERG_PRECHECK(!name.empty(), "Cannot use empty partition name: {}", name);
190+
ICEBERG_CHECK(!name.empty(), "Cannot use empty partition name: {}", name);
188191

189-
if (partition_names.contains(name)) {
190-
return InvalidArgument("Cannot use partition name more than once: {}", name);
191-
}
192+
ICEBERG_CHECK(!partition_names.contains(name),
193+
"Cannot use partition name more than once: {}", name);
192194
partition_names.insert(name);
193195

194196
ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema.FindFieldByName(name));
@@ -199,17 +201,15 @@ Status PartitionSpec::ValidatePartitionName(const Schema& schema,
199201
// field name as long as they are sourced from the same schema field
200202
if (schema_field.has_value() &&
201203
schema_field.value().get().field_id() != partition_field.source_id()) {
202-
return InvalidArgument(
204+
return ValidationFailed(
203205
"Cannot create identity partition sourced from different field in schema: {}",
204206
name);
205207
}
206208
} else {
207209
// for all other transforms we don't allow conflicts between partition name and
208210
// schema field name
209-
if (schema_field.has_value()) {
210-
return InvalidArgument(
211-
"Cannot create partition from name that exists in schema: {}", name);
212-
}
211+
ICEBERG_CHECK(!schema_field.has_value(),
212+
"Cannot create partition from name that exists in schema: {}", name);
213213
}
214214
}
215215

@@ -261,4 +261,27 @@ bool PartitionSpec::HasSequentialFieldIds(const PartitionSpec& spec) {
261261
return true;
262262
}
263263

264+
Status PartitionSpec::ValidateRedundantPartitions(const PartitionSpec& spec) {
265+
// Use a map to track deduplication keys (source_id + transform dedup name)
266+
std::map<std::pair<int32_t, std::string>, const PartitionField*> dedup_fields;
267+
268+
for (const auto& field : spec.fields()) {
269+
// The dedup name is provided by the transform's DedupName() method
270+
// which typically returns the transform's string representation
271+
auto dedup_key = std::make_pair(field.source_id(), field.transform()->DedupName());
272+
273+
// Check if this dedup key already exists
274+
// If it does, we have found a redundant partition field
275+
auto existing_field_iter = dedup_fields.find(dedup_key);
276+
ICEBERG_CHECK(existing_field_iter == dedup_fields.end(),
277+
"Cannot add redundant partition: {} conflicts with {}",
278+
field.ToString(), existing_field_iter->second->ToString());
279+
280+
// Add this field to the dedup map for future conflict detection
281+
dedup_fields[dedup_key] = &field;
282+
}
283+
284+
return {};
285+
}
286+
264287
} // namespace iceberg

src/iceberg/partition_spec.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable {
135135
/// \brief Compare two partition specs for equality.
136136
bool Equals(const PartitionSpec& other) const;
137137

138+
/// \brief Validates that there are no redundant partition fields in the spec.
139+
static Status ValidateRedundantPartitions(const PartitionSpec& spec);
140+
138141
using SourceIdToFieldsMap = std::unordered_map<int32_t, std::vector<PartitionFieldRef>>;
139142
static Result<SourceIdToFieldsMap> InitSourceIdToFieldsMap(const PartitionSpec&);
140143

src/iceberg/test/partition_spec_test.cc

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,4 +303,126 @@ TEST(PartitionSpecTest, PartitionFieldInStructInMap) {
303303
EXPECT_THAT(result_value, HasErrorMessage("Invalid partition field parent"));
304304
}
305305

306+
TEST(PartitionSpecTest, ValidateRedundantPartitionsExactDuplicates) {
307+
// Create a schema with different field types
308+
auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp());
309+
auto id_field = SchemaField::MakeRequired(2, "id", int64());
310+
Schema schema({ts_field, id_field}, Schema::kInitialSchemaId);
311+
312+
// Test: exact duplicate transforms on same field (same dedup name)
313+
{
314+
PartitionField field1(1, 1000, "ts_day_1_0", Transform::Day());
315+
PartitionField field2(1, 1001, "ts_day_1_1", Transform::Day());
316+
317+
auto result = PartitionSpec::Make(schema, 1, {field1, field2}, false);
318+
EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed));
319+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
320+
EXPECT_THAT(result, HasErrorMessage("conflicts with"));
321+
}
322+
323+
// Test: same bucket size on same field (redundant)
324+
{
325+
PartitionField bucket1(2, 1000, "id_bucket_16_2_0", Transform::Bucket(16));
326+
PartitionField bucket2(2, 1001, "id_bucket_16_2_1", Transform::Bucket(16));
327+
328+
auto result = PartitionSpec::Make(schema, 1, {bucket1, bucket2}, false);
329+
EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed));
330+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
331+
}
332+
333+
// Test: same truncate width on same field (redundant)
334+
{
335+
auto name_field = SchemaField::MakeRequired(3, "name", string());
336+
Schema schema_with_string({name_field}, Schema::kInitialSchemaId);
337+
338+
PartitionField truncate1(3, 1000, "name_trunc_4_3_1", Transform::Truncate(4));
339+
PartitionField truncate2(3, 1001, "name_trunc_4_3_2", Transform::Truncate(4));
340+
341+
auto result =
342+
PartitionSpec::Make(schema_with_string, 1, {truncate1, truncate2}, false);
343+
EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed));
344+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
345+
}
346+
}
347+
348+
TEST(PartitionSpecTest, ValidateRedundantPartitionsAllowedCases) {
349+
// Create a schema with different field types
350+
auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp());
351+
auto id_field = SchemaField::MakeRequired(2, "id", int64());
352+
auto name_field = SchemaField::MakeRequired(3, "name", string());
353+
Schema schema({ts_field, id_field, name_field}, Schema::kInitialSchemaId);
354+
355+
// Test: different bucket sizes on same field (allowed - different dedup names)
356+
{
357+
PartitionField bucket16(2, 1000, "id_bucket_16_2", Transform::Bucket(16));
358+
PartitionField bucket32(2, 1001, "id_bucket_32_2", Transform::Bucket(32));
359+
360+
auto result = PartitionSpec::Make(schema, 1, {bucket16, bucket32}, false);
361+
EXPECT_THAT(result, IsOk());
362+
}
363+
364+
// Test: different truncate widths on same field (allowed - different dedup names)
365+
{
366+
PartitionField truncate4(3, 1000, "name_trunc_4_3", Transform::Truncate(4));
367+
PartitionField truncate8(3, 1001, "name_trunc_8_3", Transform::Truncate(8));
368+
369+
auto result = PartitionSpec::Make(schema, 1, {truncate4, truncate8}, false);
370+
EXPECT_THAT(result, IsOk());
371+
}
372+
373+
// Test: same transforms on different fields (allowed)
374+
{
375+
PartitionField ts_day(1, 1000, "ts_day_1", Transform::Day());
376+
PartitionField id_bucket(2, 1001, "id_bucket_2", Transform::Bucket(16));
377+
378+
auto result = PartitionSpec::Make(schema, 1, {ts_day, id_bucket}, false);
379+
EXPECT_THAT(result, IsOk());
380+
}
381+
382+
// Test: different transforms on same field (allowed if dedup names differ)
383+
{
384+
PartitionField ts_day(1, 1000, "ts_day_1", Transform::Day());
385+
PartitionField ts_month(1, 1001, "ts_month_1", Transform::Month());
386+
387+
// This should be allowed since Day and Month have different dedup names
388+
// The Java logic only checks for exact dedup name matches
389+
auto result = PartitionSpec::Make(schema, 1, {ts_day, ts_month}, false);
390+
EXPECT_THAT(result, IsOk());
391+
}
392+
393+
// Test: single partition field (no redundancy possible)
394+
{
395+
PartitionField single_field(1, 1000, "ts_year_1", Transform::Year());
396+
397+
auto result = PartitionSpec::Make(schema, 1, {single_field}, false);
398+
EXPECT_THAT(result, IsOk());
399+
}
400+
}
401+
402+
TEST(PartitionSpecTest, ValidateRedundantPartitionsIdentityTransforms) {
403+
// Create a schema with different field types
404+
auto id_field = SchemaField::MakeRequired(1, "id", int64());
405+
auto name_field = SchemaField::MakeRequired(2, "name", string());
406+
Schema schema({id_field, name_field}, Schema::kInitialSchemaId);
407+
408+
// Test: multiple identity transforms on same field (redundant)
409+
{
410+
PartitionField identity1(1, 1000, "id_1_0", Transform::Identity());
411+
PartitionField identity2(1, 1001, "id_1_1", Transform::Identity());
412+
413+
auto result = PartitionSpec::Make(schema, 1, {identity1, identity2}, false);
414+
EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed));
415+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
416+
}
417+
418+
// Test: identity transforms on different fields (allowed)
419+
{
420+
PartitionField id_identity(1, 1000, "id_1", Transform::Identity());
421+
PartitionField name_identity(2, 1001, "name_2", Transform::Identity());
422+
423+
auto result = PartitionSpec::Make(schema, 1, {id_identity, name_identity}, false);
424+
EXPECT_THAT(result, IsOk());
425+
}
426+
}
427+
306428
} // namespace iceberg

0 commit comments

Comments
 (0)