Skip to content

Commit 6f92b3d

Browse files
committed
Add fixture for multi-batch DataFrame and test early stream termination with memory limits
1 parent 1085992 commit 6f92b3d

File tree

1 file changed

+89
-0
lines changed

1 file changed

+89
-0
lines changed

python/tests/test_dataframe.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,39 @@ def large_df():
9191
return ctx.from_arrow(batch)
9292

9393

94+
@pytest.fixture
def large_multi_batch_df():
    """Create a DataFrame spanning multiple record batches for stream tests.

    Builds 10 batches of 10,000 rows each (100,000 rows total), ensuring the
    DataFrame spans multiple batches. This is essential for testing that
    memory limits actually cause early stream termination rather than
    truncating all collected data.
    """
    ctx = SessionContext()

    rows_per_batch = 10000
    num_batches = 10

    # Create multiple batches, each with 10,000 rows of contiguous ids.
    batches = []
    for batch_idx in range(num_batches):
        start_row = batch_idx * rows_per_batch
        end_row = start_row + rows_per_batch
        # Build the id range once and reuse it for every column.
        row_ids = range(start_row, end_row)
        data = {
            "a": list(row_ids),
            "b": [f"s-{i}" for i in row_ids],
            # i + 0.1 already yields a float; no explicit conversion needed.
            "c": [i + 0.1 for i in row_ids],
        }
        batches.append(pa.record_batch(data))

    # register_record_batches takes a list of batch lists (one per partition),
    # so the batches list is wrapped in another list; this preserves the
    # multi-batch structure of the registered table.
    ctx.register_record_batches("large_multi_batch_data", [batches])
    return ctx.table("large_multi_batch_data")
125+
126+
94127
@pytest.fixture
95128
def struct_df():
96129
ctx = SessionContext()
@@ -1523,6 +1556,62 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta
15231556
assert tr_count < unrestricted_rows
15241557

15251558

1559+
def test_html_formatter_stream_early_termination(
    large_multi_batch_df, clean_formatter_state
):
    """Test that memory limits cause early stream termination with multi-batch data.

    Validates that the formatter stops consuming the record-batch stream once
    the configured memory limit is reached, instead of collecting everything
    and truncating afterwards. The large_multi_batch_df fixture provides 10
    record batches, so a hit memory limit should leave some batches unread.

    Key difference from test_html_formatter_memory_boundary_conditions:
    - Uses a multi-batch DataFrame to observe stream termination behavior
    - Exercises limits exceeded after a few batches but not after one
    - Checks partial output + truncation message + min_rows guarantee
    """

    def _render_and_count():
        # Render the fixture DataFrame to HTML and count its table rows.
        rendered = large_multi_batch_df._repr_html_()
        return rendered, count_table_rows(rendered)

    # Baseline: row count when the memory budget is effectively unlimited.
    configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000)
    _, unrestricted_rows = _render_and_count()

    # Case 1: limit sized (~3MB) so roughly one batch fits before the limit
    # trips; each batch holds ~10k rows (~1-2MB).
    configure_formatter(max_memory_bytes=3 * MB, min_rows=1, max_rows=200000)
    html_output, tr_count = _render_and_count()

    # Expect heavy truncation — nowhere near the full 100k rows.
    assert tr_count < unrestricted_rows, "Should be truncated by memory limit"
    assert tr_count >= 2, "Should respect min_rows"
    assert "data truncated" in html_output.lower(), "Should indicate truncation"

    # Case 2: an extremely tight limit (10 bytes) must still honor min_rows,
    # so at least min_rows data rows appear regardless of the budget.
    configure_formatter(max_memory_bytes=10, min_rows=5, max_rows=200000)
    html_output, tr_count = _render_and_count()

    assert tr_count >= 6, "Should show header + at least min_rows (5)"
    assert "data truncated" in html_output.lower(), "Should indicate truncation"

    # Case 3: with max_rows=100 and a small memory budget, early termination
    # is driven by the memory limit rather than by the row cap.
    configure_formatter(max_memory_bytes=2 * MB, min_rows=1, max_rows=100)
    html_output, tr_count = _render_and_count()

    # Output stays partial (more than max_rows might suggest, but still short
    # of the unrestricted baseline) and explains why it was cut off.
    assert tr_count >= 2, "Should respect min_rows"
    assert tr_count < unrestricted_rows, "Should be truncated"
    assert "data truncated" in html_output.lower()
1613+
1614+
15261615
def test_html_formatter_max_rows(df, clean_formatter_state):
15271616
configure_formatter(min_rows=2, max_rows=2)
15281617
html_output = df._repr_html_()

0 commit comments

Comments
 (0)