@@ -91,6 +91,39 @@ def large_df():
9191 return ctx .from_arrow (batch )
9292
9393
@pytest.fixture
def large_multi_batch_df():
    """Create a DataFrame spanning multiple record batches.

    Builds 10 batches of 10,000 rows each (100,000 rows total) so the
    resulting DataFrame is guaranteed to consist of several record
    batches. This lets tests verify that memory limits cause early
    stream termination rather than truncation of fully collected data.
    """
    ctx = SessionContext()

    rows_per_batch = 10000
    num_batches = 10

    # Build each 10k-row batch from its own half-open row-id range so the
    # multi-batch structure is explicit.
    batches = []
    for idx in range(num_batches):
        lo = idx * rows_per_batch
        hi = lo + rows_per_batch
        row_ids = range(lo, hi)
        batches.append(
            pa.record_batch(
                {
                    "a": list(row_ids),
                    "b": [f"s-{i}" for i in row_ids],
                    "c": [float(i + 0.1) for i in row_ids],
                }
            )
        )

    # register_record_batches takes a list of batch lists, hence [batches];
    # registering this way preserves the multi-batch layout.
    ctx.register_record_batches("large_multi_batch_data", [batches])
    return ctx.table("large_multi_batch_data")
126+
94127@pytest .fixture
95128def struct_df ():
96129 ctx = SessionContext ()
@@ -1523,6 +1556,62 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta
15231556 assert tr_count < unrestricted_rows
15241557
15251558
def test_html_formatter_stream_early_termination(
    large_multi_batch_df, clean_formatter_state
):
    """Verify memory limits stop batch collection early, not post-hoc.

    The formatter should stop pulling record batches once the configured
    memory budget is exhausted, instead of materializing all data and
    trimming afterwards. The large_multi_batch_df fixture supplies ten
    record batches, which makes partial consumption observable.

    Compared with test_html_formatter_memory_boundary_conditions, this
    test:
    - runs against a multi-batch DataFrame so stream termination matters
    - exercises limits crossed after a few (but not all) batches
    - checks partial output, the truncation notice, and min_rows handling
    """

    # Baseline: row count rendered when memory is effectively unlimited.
    configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000)
    baseline_rows = count_table_rows(large_multi_batch_df._repr_html_())

    # Case 1: a ~3MB budget should be exhausted after roughly one batch
    # (each batch is ~1-2MB), so only a fraction of the 100k rows appears.
    configure_formatter(max_memory_bytes=3 * MB, min_rows=1, max_rows=200000)
    html = large_multi_batch_df._repr_html_()
    rows = count_table_rows(html)

    assert rows < baseline_rows, "Should be truncated by memory limit"
    assert rows >= 2, "Should respect min_rows"
    assert "data truncated" in html.lower(), "Should indicate truncation"

    # Case 2: a budget far below one row (10 bytes) must still honor
    # min_rows=5 — header plus at least five data rows.
    configure_formatter(max_memory_bytes=10, min_rows=5, max_rows=200000)
    html = large_multi_batch_df._repr_html_()
    rows = count_table_rows(html)

    assert rows >= 6, "Should show header + at least min_rows (5)"
    assert "data truncated" in html.lower(), "Should indicate truncation"

    # Case 3: a small memory budget combined with max_rows=100 should
    # still terminate the stream early and flag why output was cut short.
    configure_formatter(max_memory_bytes=2 * MB, min_rows=1, max_rows=100)
    html = large_multi_batch_df._repr_html_()
    rows = count_table_rows(html)

    assert rows >= 2, "Should respect min_rows"
    assert rows < baseline_rows, "Should be truncated"
    assert "data truncated" in html.lower()
1614+
15261615def test_html_formatter_max_rows (df , clean_formatter_state ):
15271616 configure_formatter (min_rows = 2 , max_rows = 2 )
15281617 html_output = df ._repr_html_ ()
0 commit comments