
Commit afb244c

sumedhsakdeo and claude committed
refactor: replace streaming param with order=ScanOrder in benchmarks and docs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b2ae725 commit afb244c

2 files changed: +17, -17 lines changed

mkdocs/docs/api.md

Lines changed: 3 additions & 3 deletions
@@ -394,11 +394,11 @@ Within each file, batch ordering always follows row order. The `limit` parameter
 | Use case | Recommended config |
 |---|---|
 | Small tables, simple queries | Default — no extra args needed |
-| Large tables, memory-constrained | `streaming=True` — one file at a time, minimal memory |
-| Maximum throughput with bounded memory | `streaming=True, concurrent_files=N` — tune N to balance throughput vs memory |
+| Large tables, memory-constrained | `order=ScanOrder.ARRIVAL` — one file at a time, minimal memory |
+| Maximum throughput with bounded memory | `order=ScanOrder.ARRIVAL, concurrent_files=N` — tune N to balance throughput vs memory |
 | Fine-grained batch control | Add `batch_size=N` to any of the above |
 
-**Note:** `streaming=True` yields batches in arrival order (interleaved across files when `concurrent_files > 1`). For deterministic file ordering, use the default non-streaming mode. `batch_size` is usually an advanced tuning knob — the PyArrow default of 131,072 rows works well for most workloads.
+**Note:** `ScanOrder.ARRIVAL` yields batches in arrival order (interleaved across files when `concurrent_files > 1`). For deterministic file ordering, use the default `ScanOrder.TASK` mode. `batch_size` is usually an advanced tuning knob — the PyArrow default of 131,072 rows works well for most workloads.
 
 To avoid any type inconsistencies during writing, you can convert the Iceberg table schema to Arrow:
 
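For readers skimming the doc change, here is a minimal usage sketch of the parameters this commit documents. It assumes a `Table` already loaded from a catalog (`tbl`), and `process` is a stand-in for whatever consumes each batch; only `ScanOrder`, `order`, `concurrent_files`, and `batch_size` come from the diff itself.

```python
# Hedged sketch of the API documented above. `tbl` (an already-loaded pyiceberg
# Table) and `process` are placeholders; ScanOrder, order, concurrent_files, and
# batch_size are the knobs this commit documents.
from pyiceberg.table import ScanOrder

# Default: deterministic file ordering (ScanOrder.TASK), no extra arguments needed.
for batch in tbl.scan().to_arrow_batch_reader():
    process(batch)

# Memory-bounded reading: batches are yielded in arrival order, interleaved
# across up to `concurrent_files` files being decoded at a time.
for batch in tbl.scan().to_arrow_batch_reader(
    order=ScanOrder.ARRIVAL,
    concurrent_files=4,
    batch_size=131_072,  # PyArrow's default batch size, per the note above
):
    process(batch)
```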

tests/benchmark/test_read_benchmark.py

Lines changed: 14 additions & 14 deletions
@@ -16,7 +16,7 @@
 # under the License.
 """Read throughput micro-benchmark for ArrowScan configurations.
 
-Measures records/sec and peak Arrow memory across streaming, concurrent_files,
+Measures records/sec and peak Arrow memory across ScanOrder, concurrent_files,
 and batch_size configurations introduced for issue #3036.
 
 Memory is measured using pa.total_allocated_bytes() which tracks PyArrow's C++
@@ -35,7 +35,7 @@
 import pytest
 
 from pyiceberg.catalog.sql import SqlCatalog
-from pyiceberg.table import Table
+from pyiceberg.table import ScanOrder, Table
 
 NUM_FILES = 32
 ROWS_PER_FILE = 500_000
@@ -85,28 +85,28 @@ def benchmark_table(tmp_path_factory: pytest.TempPathFactory) -> Table:
 
 
 @pytest.mark.parametrize(
-    "streaming,concurrent_files,batch_size",
+    "order,concurrent_files,batch_size",
     [
-        pytest.param(False, 1, None, id="default"),
-        pytest.param(True, 1, None, id="streaming-cf1"),
-        pytest.param(True, 2, None, id="streaming-cf2"),
-        pytest.param(True, 4, None, id="streaming-cf4"),
-        pytest.param(True, 8, None, id="streaming-cf8"),
-        pytest.param(True, 16, None, id="streaming-cf16"),
+        pytest.param(ScanOrder.TASK, 1, None, id="default"),
+        pytest.param(ScanOrder.ARRIVAL, 1, None, id="arrival-cf1"),
+        pytest.param(ScanOrder.ARRIVAL, 2, None, id="arrival-cf2"),
+        pytest.param(ScanOrder.ARRIVAL, 4, None, id="arrival-cf4"),
+        pytest.param(ScanOrder.ARRIVAL, 8, None, id="arrival-cf8"),
+        pytest.param(ScanOrder.ARRIVAL, 16, None, id="arrival-cf16"),
     ],
 )
 def test_read_throughput(
     benchmark_table: Table,
-    streaming: bool,
+    order: ScanOrder,
     concurrent_files: int,
     batch_size: int | None,
 ) -> None:
     """Measure records/sec, time to first record, and peak Arrow memory for a scan configuration."""
     effective_batch_size = batch_size or 131_072  # PyArrow default
-    if streaming:
-        config_str = f"streaming=True, concurrent_files={concurrent_files}, batch_size={effective_batch_size}"
+    if order == ScanOrder.ARRIVAL:
+        config_str = f"order=ARRIVAL, concurrent_files={concurrent_files}, batch_size={effective_batch_size}"
     else:
-        config_str = f"streaming=False (executor.map, all files parallel), batch_size={effective_batch_size}"
+        config_str = f"order=TASK (executor.map, all files parallel), batch_size={effective_batch_size}"
     print("\n--- ArrowScan Read Throughput Benchmark ---")
     print(f"Config: {config_str}")
     print(f"  Files: {NUM_FILES}, Rows per file: {ROWS_PER_FILE}, Total rows: {TOTAL_ROWS}")
@@ -128,7 +128,7 @@ def test_read_throughput(
     first_batch_time = None
     for batch in benchmark_table.scan().to_arrow_batch_reader(
         batch_size=batch_size,
-        streaming=streaming,
+        order=order,
         concurrent_files=concurrent_files,
     ):
         if first_batch_time is None:
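The module docstring above notes that memory is tracked with `pa.total_allocated_bytes()`, but the bookkeeping code itself is outside this diff. The following is only a sketch of that measurement pattern: `reader` stands in for the batch reader constructed in the hunk above, and the printed metrics mirror the ones the benchmark names (records/sec, time to first record, peak Arrow memory).

```python
# Illustrative measurement loop; `reader` is a placeholder for the
# to_arrow_batch_reader(...) iterator created above. The real benchmark's
# bookkeeping may differ; only pa.total_allocated_bytes() is taken from the diff.
import time

import pyarrow as pa

start = time.perf_counter()
baseline = pa.total_allocated_bytes()  # bytes currently held by PyArrow's C++ allocator
peak = baseline
rows = 0
first_batch_time = None

for batch in reader:
    if first_batch_time is None:
        first_batch_time = time.perf_counter() - start  # time to first record
    rows += batch.num_rows
    # Sample allocator usage after each batch; the running max approximates peak memory.
    peak = max(peak, pa.total_allocated_bytes())

elapsed = time.perf_counter() - start
print(f"records/sec: {rows / elapsed:,.0f}")
print(f"time to first record: {first_batch_time:.3f}s")
print(f"peak Arrow memory: {(peak - baseline) / 1024**2:.1f} MiB above baseline")
```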
