diff --git a/cpp/.gitignore b/cpp/.gitignore
index 134bd4f3ee9..64c147d2868 100644
--- a/cpp/.gitignore
+++ b/cpp/.gitignore
@@ -27,6 +27,7 @@ build/
 Testing/
 build-support/boost_*
 vcpkg_installed/
+_deps/
 
 # Build directories created by Clion
 cmake-build-*/
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index 5f75089170d..5474cf12523 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -185,6 +185,7 @@ set(PARQUET_SRCS
     platform.cc
     printer.cc
     properties.cc
+    row_selection.cc
     schema.cc
     size_statistics.cc
     statistics.cc
@@ -393,7 +394,8 @@ add_parquet_test(reader-test
                  level_conversion_test.cc
                  column_scanner_test.cc
                  reader_test.cc
-                 stream_reader_test.cc)
+                 stream_reader_test.cc
+                 row_selection_test.cc)
 
 add_parquet_test(writer-test
                  SOURCES
diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc
new file mode 100644
index 00000000000..692942c10e1
--- /dev/null
+++ b/cpp/src/parquet/row_selection.cc
@@ -0,0 +1,226 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/row_selection.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/unreachable.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+
+namespace {
+
+using IntervalRange = RowSelection::IntervalRange;
+
+/// \brief Return the last row (inclusive) covered by `range`.
+int64_t LastRow(const IntervalRange& range) { return range.start + range.length - 1; }
+
+/// \brief Single-range view over a RowSelection::Iterator.
+///
+/// Intersect() and Union() consume ranges one at a time; this helper hides the
+/// batch boundaries of the underlying iterator so the merge loops do not have to
+/// repeat the "refill the batch when it runs out" bookkeeping.
+class RangeCursor {
+ public:
+  explicit RangeCursor(std::unique_ptr<RowSelection::Iterator> iter)
+      : iter_(std::move(iter)), batch_(iter_->NextRanges()) {}
+
+  /// \brief True once every range has been consumed.
+  bool exhausted() const { return batch_.empty(); }
+
+  /// \brief The range the cursor points at; requires !exhausted().
+  const IntervalRange& current() const { return batch_[index_]; }
+
+  /// \brief Move to the next range, fetching a new batch when needed.
+  void Advance() {
+    if (++index_ >= batch_.size()) {
+      batch_ = iter_->NextRanges();
+      index_ = 0;
+    }
+  }
+
+ private:
+  // Declared before batch_ because batch_ is initialized from it.
+  std::unique_ptr<RowSelection::Iterator> iter_;
+  ::arrow::util::span<const IntervalRange> batch_;
+  size_t index_ = 0;
+};
+
+}  // namespace
+
+/// Default RowSelection::Iterator: hands out the stored ranges in order, at most
+/// `batch_size` per call.  It only references the ranges, so the RowSelection
+/// that created it must outlive it.
+class IteratorImpl final : public RowSelection::Iterator {
+ public:
+  explicit IteratorImpl(const RowSelection& ranges, size_t batch_size = 1)
+      : ranges_(ranges.ranges_),
+        index_(0),
+        // A zero batch size would make NextRanges() report exhaustion immediately.
+        batch_size_(std::max<size_t>(batch_size, 1)) {}
+
+  ~IteratorImpl() override = default;
+
+  ::arrow::util::span<const RowSelection::IntervalRange> NextRanges() override {
+    if (index_ >= ranges_.size()) {
+      return {};
+    }
+    // Return up to batch_size_ ranges
+    const size_t count = std::min(batch_size_, ranges_.size() - index_);
+    ::arrow::util::span<const RowSelection::IntervalRange> result(
+        ranges_.data() + index_, count);
+    index_ += count;
+    return result;
+  }
+
+ private:
+  const std::vector<RowSelection::IntervalRange>& ranges_;
+  size_t index_;
+  size_t batch_size_;
+};
+
+std::unique_ptr<RowSelection::Iterator> RowSelection::NewIterator() const {
+  return std::make_unique<IteratorImpl>(*this);
+}
+
+void RowSelection::Validate() const {
+  constexpr int64_t kMaxRow = std::numeric_limits<int64_t>::max();
+  int64_t last_end = -1;
+  for (const auto& interval : ranges_) {
+    if (interval.start < 0) {
+      throw ParquetException("Invalid interval range: start must be non-negative");
+    }
+    if (interval.start <= last_end) {
+      throw ParquetException("Row ranges are not in ascending order");
+    }
+    if (interval.length <= 0) {
+      throw ParquetException("Invalid interval range: length must be positive");
+    }
+    // Guard the end-row computation below against signed overflow.
+    if (interval.length - 1 > kMaxRow - interval.start) {
+      throw ParquetException("Invalid interval range: end row overflows int64");
+    }
+    last_end = interval.start + interval.length - 1;
+  }
+}
+
+int64_t RowSelection::row_count() const {
+  int64_t count = 0;
+  for (const auto& interval : ranges_) {
+    count += interval.length;
+  }
+  return count;
+}
+
+RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection& rhs) {
+  RowSelection result;
+  RangeCursor left(lhs.NewIterator());
+  RangeCursor right(rhs.NewIterator());
+
+  while (!left.exhausted() && !right.exhausted()) {
+    const int64_t left_end = LastRow(left.current());
+    const int64_t right_end = LastRow(right.current());
+
+    // Find overlapping region
+    const int64_t start = std::max(left.current().start, right.current().start);
+    const int64_t end = std::min(left_end, right_end);
+
+    // If there is an overlap, add it to results
+    if (start <= end) {
+      result.ranges_.push_back(IntervalRange{start, end - start + 1});
+    }
+
+    // Advance the side whose range ends first.  On a tie rhs moves first and
+    // lhs follows on a later iteration.
+    if (left_end < right_end) {
+      left.Advance();
+    } else {
+      right.Advance();
+    }
+  }
+
+  return result;
+}
+
+RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rhs) {
+  if (lhs.ranges_.empty()) {
+    return rhs;
+  }
+  if (rhs.ranges_.empty()) {
+    return lhs;
+  }
+
+  RowSelection result;
+  RangeCursor left(lhs.NewIterator());
+  RangeCursor right(rhs.NewIterator());
+
+  // Pops the pending range with the smallest start; lhs wins ties.
+  auto take_next = [&left, &right]() {
+    const bool take_left =
+        right.exhausted() ||
+        (!left.exhausted() && left.current().start <= right.current().start);
+    RangeCursor& source = take_left ? left : right;
+    IntervalRange next = source.current();
+    source.Advance();
+    return next;
+  };
+
+  IntervalRange current = take_next();
+  while (!left.exhausted() || !right.exhausted()) {
+    const IntervalRange next = take_next();
+    const int64_t current_end = LastRow(current);
+    if (current_end + 1 >= next.start) {
+      // Concatenate overlapping or adjacent ranges
+      current.length = std::max(current_end, LastRow(next)) - current.start + 1;
+    } else {
+      // Gap between current and next range
+      result.ranges_.push_back(current);
+      current = next;
+    }
+  }
+
+  result.ranges_.push_back(current);
+  return result;
+}
+
+RowSelection RowSelection::MakeSingle(int64_t start, int64_t end) {
+  RowSelection selection;
+  selection.ranges_.push_back(IntervalRange{start, end - start + 1});
+  return selection;
+}
+
+RowSelection RowSelection::FromIntervals(
+    ::arrow::util::span<const IntervalRange> intervals) {
+  RowSelection selection;
+  selection.ranges_.assign(intervals.begin(), intervals.end());
+  return selection;
+}
+
+RowSelection RowSelection::FromIntervals(const std::vector<IntervalRange>& intervals) {
+  return FromIntervals(::arrow::util::span<const IntervalRange>(intervals));
+}
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h
new file mode 100644
index 00000000000..14df8b97071
--- /dev/null
+++ b/cpp/src/parquet/row_selection.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/span.h"
+#include "parquet/platform.h"
+
+namespace parquet {
+
+/// RowSelection is a collection of non-overlapping and ascendingly ordered row ranges.
+///
+/// The factory functions do not check that invariant; call Validate() on
+/// selections built from untrusted intervals before combining them.
+class PARQUET_EXPORT RowSelection {
+ public:
+  /// \brief EXPERIMENTAL: A range of contiguous rows represented by an interval.
+  struct IntervalRange {
+    /// Start row of the range (inclusive).
+    int64_t start;
+    /// Number of rows in the range.
+    int64_t length;
+  };
+
+  /// \brief EXPERIMENTAL: An iterator for accessing row ranges in batches.
+  class Iterator {
+   public:
+    virtual ~Iterator() = default;
+    /// \brief Get the next batch of ranges.
+    /// Returns an empty span when exhausted.
+    virtual ::arrow::util::span<const IntervalRange> NextRanges() = 0;
+  };
+
+  /// \brief EXPERIMENTAL: Create a new iterator for accessing row ranges in order.
+  ///
+  /// The iterator refers to the ranges stored in this object, so it (and any span
+  /// it returned) must not be used after this RowSelection is destroyed.
+  std::unique_ptr<Iterator> NewIterator() const;
+
+  /// \brief EXPERIMENTAL: Validate the row ranges.
+  /// \throws ParquetException if a range has a negative start or a non-positive
+  /// length, or if the row ranges are not in ascending order or overlapped.
+  void Validate() const;
+
+  /// \brief EXPERIMENTAL: Get the total number of rows in the row ranges.
+  int64_t row_count() const;
+
+  /// \brief EXPERIMENTAL: Compute the intersection of two row ranges.
+  static RowSelection Intersect(const RowSelection& lhs, const RowSelection& rhs);
+
+  /// \brief EXPERIMENTAL: Compute the union of two row ranges.
+  /// Overlapping and adjacent ranges are merged into a single range.
+  static RowSelection Union(const RowSelection& lhs, const RowSelection& rhs);
+
+  /// \brief EXPERIMENTAL: Make a single row range of [start, end].
+  /// Both bounds are inclusive; `end` is expected to be >= `start`.
+  static RowSelection MakeSingle(int64_t start, int64_t end);
+
+  /// \brief EXPERIMENTAL: Make a row range from a list of intervals.
+  static RowSelection FromIntervals(::arrow::util::span<const IntervalRange> intervals);
+
+  /// \brief EXPERIMENTAL: Make a row range from a vector of intervals.
+  static RowSelection FromIntervals(const std::vector<IntervalRange>& intervals);
+
+ private:
+  friend class IteratorImpl;
+  std::vector<IntervalRange> ranges_;
+};
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc
new file mode 100644
index 00000000000..ecfb4b5ccb8
--- /dev/null
+++ b/cpp/src/parquet/row_selection_test.cc
@@ -0,0 +1,515 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/row_selection.h"
+
+namespace parquet {
+
+// Test factory methods
+TEST(RowSelection, MakeSingleWithCount) {
+  auto ranges = RowSelection::MakeSingle(0, 99);
+  ASSERT_EQ(ranges.row_count(), 100);
+
+  auto iter = ranges.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 99);
+
+  // Should be exhausted
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, MakeSingleWithStartEnd) {
+  auto ranges = RowSelection::MakeSingle(10, 20);
+  ASSERT_EQ(ranges.row_count(), 11);
+
+  auto iter = ranges.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 10);
+  EXPECT_EQ((interval.start + interval.length - 1), 20);
+
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, FromIntervals) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 11},
+      {20, 11},
+      {40, 11}
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  ASSERT_EQ(ranges.row_count(), 33);  // 11 + 11 + 11
+
+  auto iter = ranges.NewIterator();
+
+  // First interval
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 10);
+
+  // Second interval
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 20);
+  EXPECT_EQ((interval.start + interval.length - 1), 30);
+
+  // Third interval
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 40);
+  EXPECT_EQ((interval.start + interval.length - 1), 50);
+
+  // Exhausted
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, EmptyRanges) {
+  std::vector<RowSelection::IntervalRange> intervals;
+  auto ranges = RowSelection::FromIntervals(intervals);
+  ASSERT_EQ(ranges.row_count(), 0);
+
+  auto iter = ranges.NewIterator();
+  auto batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+// Test validation
+TEST(RowSelection, ValidateValidRanges) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 11},
+      {15, 6},
+      {25, 6}
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_NO_THROW(ranges.Validate());
+}
+
+TEST(RowSelection, ValidateSingleRange) {
+  auto ranges = RowSelection::MakeSingle(0, 99);
+  EXPECT_NO_THROW(ranges.Validate());
+}
+
+TEST(RowSelection, ValidateOverlappingRanges) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 11},
+      {5, 11}  // Overlaps with previous
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_THROW(ranges.Validate(), ParquetException);
+}
+
+TEST(RowSelection, ValidateAdjacentRanges) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 11},
+      {11, 10}  // Adjacent but not overlapping
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_NO_THROW(ranges.Validate());
+}
+
+TEST(RowSelection, ValidateInvalidRangeTouching) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 11},
+      {10, 11}  // Touches at end/start (overlaps at 10)
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_THROW(ranges.Validate(), ParquetException);
+}
+
+TEST(RowSelection, ValidateNotAscendingOrder) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {20, 11},
+      {0, 11}  // Not in ascending order
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_THROW(ranges.Validate(), ParquetException);
+}
+
+TEST(RowSelection, ValidateInvalidInterval) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {10, -4}  // Negative length
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_THROW(ranges.Validate(), ParquetException);
+}
+
+// Test row_count
+TEST(RowSelection, RowCountSingle) {
+  auto ranges = RowSelection::MakeSingle(0, 49);
+  EXPECT_EQ(ranges.row_count(), 50);
+}
+
+TEST(RowSelection, RowCountMultiple) {
+  std::vector<RowSelection::IntervalRange> intervals = {
+      {0, 10},   // 10 rows
+      {20, 10},  // 10 rows
+      {50, 5}    // 5 rows
+  };
+
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_EQ(ranges.row_count(), 25);
+}
+
+TEST(RowSelection, RowCountEmpty) {
+  std::vector<RowSelection::IntervalRange> intervals;
+  auto ranges = RowSelection::FromIntervals(intervals);
+  EXPECT_EQ(ranges.row_count(), 0);
+}
+
+TEST(RowSelection, RowCountSingleRow) {
+  auto ranges = RowSelection::MakeSingle(5, 5);
+  EXPECT_EQ(ranges.row_count(), 1);
+}
+
+// Test Intersect
+TEST(RowSelection, IntersectNoOverlap) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+  auto rhs = RowSelection::FromIntervals({{40, 11}, {60, 11}});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 0);
+}
+
+TEST(RowSelection, IntersectCompleteOverlap) {
+  auto lhs = RowSelection::FromIntervals({{0, 101}});
+  auto rhs = RowSelection::FromIntervals({{20, 11}, {40, 11}});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 22);  // (30-20+1) + (50-40+1)
+
+  auto iter = result.NewIterator();
+
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 20);
+  EXPECT_EQ((interval.start + interval.length - 1), 30);
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 40);
+  EXPECT_EQ((interval.start + interval.length - 1), 50);
+}
+
+TEST(RowSelection, IntersectPartialOverlap) {
+  auto lhs = RowSelection::FromIntervals({{0, 16}, {20, 16}});
+  auto rhs = RowSelection::FromIntervals({{10, 16}, {40, 11}});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 12);  // (15-10+1) + (25-20+1)
+
+  auto iter = result.NewIterator();
+
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 10);
+  EXPECT_EQ((interval.start + interval.length - 1), 15);
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 20);
+  EXPECT_EQ((interval.start + interval.length - 1), 25);
+}
+
+TEST(RowSelection, IntersectIdentical) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+  auto rhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 22);
+}
+
+TEST(RowSelection, IntersectWithEmpty) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}});
+  auto rhs = RowSelection::FromIntervals(std::vector<RowSelection::IntervalRange>{});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 0);
+}
+
+TEST(RowSelection, IntersectComplex) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {15, 11}, {30, 11}, {50, 11}});
+  auto rhs = RowSelection::FromIntervals({{5, 8}, {20, 16}, {55, 16}});
+
+  auto result = RowSelection::Intersect(lhs, rhs);
+
+  // Expected intersections:
+  // [5, 10] from [0,10] ∩ [5,12] = 6 rows
+  // [20, 25] from [15,25] ∩ [20,35] = 6 rows
+  // [30, 35] from [30,40] ∩ [20,35] = 6 rows
+  // [55, 60] from [50,60] ∩ [55,70] = 6 rows
+  EXPECT_EQ(result.row_count(), 24);
+}
+
+// Test Union
+TEST(RowSelection, UnionNoOverlap) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+  auto rhs = RowSelection::FromIntervals({{40, 11}, {60, 11}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 44);  // 11+11+11+11
+
+  auto iter = result.NewIterator();
+
+  // Should have 4 separate ranges
+  for (int i = 0; i < 4; ++i) {
+    auto batch = iter->NextRanges();
+    EXPECT_TRUE(batch.size() > 0);
+  }
+
+  auto batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, UnionWithOverlap) {
+  auto lhs = RowSelection::FromIntervals({{0, 16}});
+  auto rhs = RowSelection::FromIntervals({{10, 16}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 26);  // [0, 25] = 26 rows
+
+  auto iter = result.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 25);
+}
+
+TEST(RowSelection, UnionAdjacent) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}});
+  auto rhs = RowSelection::FromIntervals({{11, 10}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 21);  // [0, 20] = 21 rows
+
+  // Should merge adjacent ranges
+  auto iter = result.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 20);
+
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, UnionWithGap) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}});
+  auto rhs = RowSelection::FromIntervals({{20, 11}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 22);
+
+  // Should have 2 ranges
+  auto iter = result.NewIterator();
+
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 10);
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 20);
+  EXPECT_EQ((interval.start + interval.length - 1), 30);
+}
+
+TEST(RowSelection, UnionWithEmpty) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}});
+  auto rhs = RowSelection::FromIntervals(std::vector<RowSelection::IntervalRange>{});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 11);
+}
+
+TEST(RowSelection, UnionEmptyWithNonEmpty) {
+  auto lhs = RowSelection::FromIntervals(std::vector<RowSelection::IntervalRange>{});
+  auto rhs = RowSelection::FromIntervals({{0, 11}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 11);
+}
+
+TEST(RowSelection, UnionIdentical) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+  auto rhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 22);
+
+  // Should still have 2 ranges (merged)
+  auto iter = result.NewIterator();
+
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+TEST(RowSelection, UnionComplex) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}, {50, 11}});
+  auto rhs = RowSelection::FromIntervals({{5, 11}, {25, 11}, {45, 11}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+
+  // Expected: [0,15], [20,35], [45,60]
+  EXPECT_EQ(result.row_count(), 48);  // 16 + 16 + 16
+
+  auto iter = result.NewIterator();
+
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 15);
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 20);
+  EXPECT_EQ((interval.start + interval.length - 1), 35);
+
+  batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  interval = batch[0];
+  EXPECT_EQ(interval.start, 45);
+  EXPECT_EQ((interval.start + interval.length - 1), 60);
+}
+
+TEST(RowSelection, UnionManyOverlapping) {
+  auto lhs = RowSelection::FromIntervals({{0, 101}});
+  auto rhs = RowSelection::FromIntervals({{50, 101}});
+
+  auto result = RowSelection::Union(lhs, rhs);
+  EXPECT_EQ(result.row_count(), 151);  // [0, 150]
+
+  auto iter = result.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 150);
+}
+
+// Test iterator behavior
+TEST(RowSelection, IteratorMultipleIterators) {
+  auto ranges = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+
+  auto iter1 = ranges.NewIterator();
+  auto iter2 = ranges.NewIterator();
+
+  // Both iterators should work independently
+  auto batch1 = iter1->NextRanges();
+  auto batch2 = iter2->NextRanges();
+
+  ASSERT_FALSE(batch1.empty());
+  ASSERT_FALSE(batch2.empty());
+
+  auto interval1 = batch1[0];
+  auto interval2 = batch2[0];
+
+  EXPECT_EQ(interval1.start, interval2.start);
+  EXPECT_EQ((interval1.start + interval1.length - 1), (interval2.start + interval2.length - 1));
+}
+
+TEST(RowSelection, IteratorExhaustion) {
+  auto ranges = RowSelection::MakeSingle(0, 9);
+  auto iter = ranges.NewIterator();
+
+  // First call returns the range
+  auto batch = iter->NextRanges();
+  EXPECT_TRUE(batch.size() > 0);
+
+  // Subsequent calls should return an empty batch
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+
+  batch = iter->NextRanges();
+  EXPECT_TRUE(batch.empty());
+}
+
+// Test edge cases
+TEST(RowSelection, LargeRanges) {
+  auto ranges = RowSelection::MakeSingle(0, 1000000);
+  EXPECT_EQ(ranges.row_count(), 1000001);
+  EXPECT_NO_THROW(ranges.Validate());
+}
+
+TEST(RowSelection, ZeroStartRange) {
+  auto ranges = RowSelection::MakeSingle(0, 0);
+  EXPECT_EQ(ranges.row_count(), 1);
+
+  auto iter = ranges.NewIterator();
+  auto batch = iter->NextRanges();
+  ASSERT_FALSE(batch.empty());
+  auto interval = batch[0];
+  EXPECT_EQ(interval.start, 0);
+  EXPECT_EQ((interval.start + interval.length - 1), 0);
+}
+
+TEST(RowSelection, IntersectAndUnionCommutative) {
+  auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}});
+  auto rhs = RowSelection::FromIntervals({{5, 11}, {25, 11}});
+
+  // Intersect should be commutative
+  auto intersect1 = RowSelection::Intersect(lhs, rhs);
+  auto intersect2 = RowSelection::Intersect(rhs, lhs);
+  EXPECT_EQ(intersect1.row_count(), intersect2.row_count());
+
+  // Union should be commutative
+  auto union1 = RowSelection::Union(lhs, rhs);
+  auto union2 = RowSelection::Union(rhs, lhs);
+  EXPECT_EQ(union1.row_count(), union2.row_count());
+}
+
+}  // namespace parquet