5 changes: 5 additions & 0 deletions docs/content/pypaimon/data-evolution.md
@@ -196,3 +196,8 @@ commit.close()
- **Row order matters**: the batches you write must have the **same number of rows** as the batches you read, in the
same order for that shard.
- **Parallelism**: run multiple shards by calling `new_shard_updator(shard_idx, num_shards)` for each shard.
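A minimal parallelism sketch: only `new_shard_updator(shard_idx, num_shards)` comes from the API described above; the `updator_owner` parameter, the thread-pool wiring, and the per-shard body are placeholders to adapt to the write example earlier on this page.

```python
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def update_one_shard(updator_owner, num_shards: int, shard_idx: int) -> None:
    """Rewrite the new column for one shard; `updator_owner` stands in for
    whatever object exposes new_shard_updator in your setup (a placeholder)."""
    updator = updator_owner.new_shard_updator(shard_idx, num_shards)
    # Read the shard's batches, attach the new column, write them back with
    # the same row counts and order, then commit as shown above.
    ...

def update_all_shards(updator_owner, num_shards: int = 4) -> None:
    # One worker per shard; each shard is updated independently.
    with ThreadPoolExecutor(max_workers=num_shards) as pool:
        list(pool.map(partial(update_one_shard, updator_owner, num_shards),
                      range(num_shards)))
```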

## Read After Partial Shard Update

- **Full table read**: rows from updated shards have the new column; rows from other shards have null for that column.
- **Per-shard read** (`with_shard(shard_idx, num_shards)`): read only the shard(s) you need; the new column is populated where it was written and null elsewhere.
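A read sketch after updating only one of two shards. `with_shard(shard_idx, num_shards)` is from this section; the `table` object and the builder/scan/read calls follow the usual pypaimon read path and are assumptions that may differ in your version.

```python
# Full table read: rows from the updated shard carry the new column,
# rows from the other shard show null for it.
read_builder = table.new_read_builder()
splits = read_builder.new_scan().plan().splits()
full_table = read_builder.new_read().to_arrow(splits)

# Per-shard read: only shard 0 of 2 is returned, with the new column
# populated where it was written.
shard_builder = table.new_read_builder().with_shard(0, 2)
shard_splits = shard_builder.new_scan().plan().splits()
shard_table = shard_builder.new_read().to_arrow(shard_splits)
```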
69 changes: 69 additions & 0 deletions paimon-python/pypaimon/globalindex/data_evolution_batch_scan.py
@@ -0,0 +1,69 @@
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################


from typing import Optional

from pypaimon.common.predicate import Predicate
from pypaimon.table.special_fields import SpecialFields


class DataEvolutionBatchScan:
@staticmethod
def remove_row_id_filter(predicate: Optional[Predicate]) -> Optional[Predicate]:
if predicate is None:
return None
return DataEvolutionBatchScan._remove(predicate)

@staticmethod
def _remove(predicate: Predicate) -> Optional[Predicate]:
if predicate.method == 'and':
new_children = []
for p in predicate.literals:
sub = DataEvolutionBatchScan._remove(p)
if sub is not None:
new_children.append(sub)
if not new_children:
return None
if len(new_children) == 1:
return new_children[0]
return Predicate(
method='and',
index=predicate.index,
field=predicate.field,
literals=new_children
)
if predicate.method == 'or':
new_children = []
for p in predicate.literals:
sub = DataEvolutionBatchScan._remove(p)
if sub is None:
return None
new_children.append(sub)
if len(new_children) == 1:
return new_children[0]
return Predicate(
method='or',
index=predicate.index,
field=predicate.field,
literals=new_children
)
# Leaf: remove if _ROW_ID
if predicate.field == SpecialFields.ROW_ID.name:
return None
return predicate
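To illustrate the intent of the helper above: in an AND, dropping the `_ROW_ID` leaf only loosens the pushed-down filter, which is safe; in an OR, dropping a branch would tighten it, so the whole OR is discarded. A rough sketch, where the leaf `method` and `index` values are illustrative assumptions rather than the library's real names:

```python
from pypaimon.common.predicate import Predicate
from pypaimon.globalindex.data_evolution_batch_scan import DataEvolutionBatchScan
from pypaimon.table.special_fields import SpecialFields

# Hypothetical leaves: one on _ROW_ID, one on an ordinary column.
row_id_leaf = Predicate(method='equal', index=0,
                        field=SpecialFields.ROW_ID.name, literals=[42])
price_leaf = Predicate(method='greaterThan', index=1,
                       field='price', literals=[100])

and_pred = Predicate(method='and', index=None, field=None,
                     literals=[row_id_leaf, price_leaf])
or_pred = Predicate(method='or', index=None, field=None,
                    literals=[row_id_leaf, price_leaf])

# AND: the _ROW_ID leaf is dropped and the remaining child is returned as-is.
assert DataEvolutionBatchScan.remove_row_id_filter(and_pred) is price_leaf
# OR: removing one branch would narrow the filter, so the whole OR is dropped.
assert DataEvolutionBatchScan.remove_row_id_filter(or_pred) is None
```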
8 changes: 8 additions & 0 deletions paimon-python/pypaimon/globalindex/range.py
@@ -153,6 +153,14 @@ def merge_sorted_as_possible(ranges: List['Range']) -> List['Range']:

return result

@staticmethod
def to_ranges(ids: List[int]) -> List['Range']:
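        """Collapse row ids into sorted ranges, deduplicating and merging
        adjacent ids (e.g. ids 1, 2, 3 collapse into a single range)."""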
if not ids:
return []
sorted_ids = sorted(set(ids))
ranges = [Range(i, i) for i in sorted_ids]
return Range.sort_and_merge_overlap(ranges, merge=True, adjacent=True)

@staticmethod
def sort_and_merge_overlap(ranges: List['Range'], merge: bool = True, adjacent: bool = True) -> List['Range']:
"""
3 changes: 2 additions & 1 deletion paimon-python/pypaimon/read/read_builder.py
@@ -67,7 +67,8 @@ def new_read(self) -> TableRead:
return TableRead(
table=self.table,
predicate=self._predicate,
read_type=self.read_type()
read_type=self.read_type(),
projection=self._projection,
)

def new_predicate_builder(self) -> PredicateBuilder:
30 changes: 27 additions & 3 deletions paimon-python/pypaimon/read/reader/concat_batch_reader.py
@@ -141,6 +141,8 @@ class DataEvolutionMergeReader(RecordBatchReader):
- The fourth field comes from batch1, and it is at offset 1 in batch1.
- The fifth field comes from batch2, and it is at offset 1 in batch2.
- The sixth field comes from batch1, and it is at offset 0 in batch1.

When row_offsets[i] == -1 (no file provides that field), the output column is filled with nulls, typed from the target schema.
"""

def __init__(
@@ -207,14 +209,36 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
for i in range(len(self.row_offsets)):
batch_index = self.row_offsets[i]
field_index = self.field_offsets[i]
field_name = self.schema.field(i).name if self.schema else None
column = None

if batch_index >= 0 and batches[batch_index] is not None:
columns.append(batches[batch_index].column(field_index).slice(0, min_rows))
else:
src_batch = batches[batch_index]
if field_name is not None and field_name in src_batch.schema.names:
column = src_batch.column(
src_batch.schema.get_field_index(field_name)
).slice(0, min_rows)
elif field_index < src_batch.num_columns:
column = src_batch.column(field_index).slice(0, min_rows)

if column is None and field_name is not None:
for b in batches:
if b is not None and field_name in b.schema.names:
column = b.column(b.schema.get_field_index(field_name)).slice(
0, min_rows
)
break

if column is not None:
columns.append(column)
elif self.schema is not None and i < len(self.schema):
columns.append(pa.nulls(min_rows, type=self.schema.field(i).type))

for i in range(len(self.readers)):
if batches[i] is not None and batches[i].num_rows > min_rows:
self._buffers[i] = batches[i].slice(min_rows, batches[i].num_rows - min_rows)
self._buffers[i] = batches[i].slice(
min_rows, batches[i].num_rows - min_rows
)

return pa.RecordBatch.from_arrays(columns, schema=self.schema)
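As a standalone pyarrow sketch of the stitching described in the docstring above (not the reader's own code): each output field either comes from one source batch, or, when no file provides it, becomes a null column typed from the target schema.

```python
import pyarrow as pa

# Two source batches, as if produced by two underlying file readers.
batch0 = pa.RecordBatch.from_arrays([pa.array([1, 2]), pa.array(["x", "y"])],
                                    names=["a", "b"])
batch1 = pa.RecordBatch.from_arrays([pa.array([True, False])], names=["c"])
batches = [batch0, batch1]

# Target schema: field "d" exists in no source file, so it maps to -1.
schema = pa.schema([("a", pa.int64()), ("c", pa.bool_()), ("d", pa.float64())])
row_offsets = [0, 1, -1]    # which batch each output field comes from (-1: none)
field_offsets = [0, 0, -1]  # column position inside that batch

columns = []
for i, (bi, fi) in enumerate(zip(row_offsets, field_offsets)):
    if bi >= 0:
        columns.append(batches[bi].column(fi))
    else:
        columns.append(pa.nulls(2, type=schema.field(i).type))

merged = pa.RecordBatch.from_arrays(columns, schema=schema)
```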
