feat: Add configurable display options for PyDataFrame

kosiew · kosiew · commit 41e6ad290d7d · 2025-03-28T12:45:14.000+08:00
- Introduced DisplayConfig struct to manage display settings such as max_table_bytes, min_table_rows, and max_cell_length.
- Updated PyDataFrame to utilize DisplayConfig for rendering and displaying DataFrames.
- Added methods to configure and reset display settings, allowing users to customize their DataFrame presentation in Python.
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -72,9 +72,27 @@ impl PyTableProvider {
         PyTable::new(table_provider)
     }
 }
-const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
-const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
-const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
+
+/// Configuration for DataFrame display in Python environment
+#[derive(Debug, Clone)]
+pub struct DisplayConfig {
+    /// Maximum bytes to display for table presentation (default: 2MB)
+    pub max_table_bytes: usize,
+    /// Minimum number of table rows to display (default: 20)
+    pub min_table_rows: usize,
+    /// Maximum length of a cell before it gets minimized (default: 25)
+    pub max_cell_length: usize,
+}
+
+impl Default for DisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+        }
+    }
+}
 
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -83,12 +101,16 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
 #[derive(Clone)]
 pub struct PyDataFrame {
     df: Arc<DataFrame>,
+    config: Arc<DisplayConfig>,
 }
 
 impl PyDataFrame {
     /// creates a new PyDataFrame
     pub fn new(df: DataFrame) -> Self {
-        Self { df: Arc::new(df) }
+        Self {
+            df: Arc::new(df),
+            config: Arc::new(DisplayConfig::default()),
+        }
     }
 }
 
@@ -118,7 +140,7 @@ impl PyDataFrame {
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
         let (batches, has_more) = wait_for_future(
             py,
-            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
+            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10, &self.config),
         )?;
         if batches.is_empty() {
             // This should not be reached, but do it for safety since we index into the vector below
@@ -141,8 +163,9 @@ impl PyDataFrame {
             py,
             collect_record_batches_to_display(
                 self.df.as_ref().clone(),
-                MIN_TABLE_ROWS_TO_DISPLAY,
+                self.config.min_table_rows,
                 usize::MAX,
+                &self.config,
             ),
         )?;
         if batches.is_empty() {
@@ -218,8 +241,8 @@ impl PyDataFrame {
                 for (col, formatter) in batch_formatter.iter().enumerate() {
                     let cell_data = formatter.value(batch_row).to_string();
                     // From testing, primitive data types do not typically get larger than 21 characters
-                    if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE {
-                        let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE];
+                    if cell_data.len() > self.config.max_cell_length {
+                        let short_cell_data = &cell_data[0..self.config.max_cell_length];
                         cells.push(format!("
                             <td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>
                                 <div class=\"expandable-container\">
@@ -797,6 +820,42 @@ impl PyDataFrame {
     fn count(&self, py: Python) -> PyDataFusionResult<usize> {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
+
+    /// Get the current display configuration
+    #[getter]
+    fn display_config(&self) -> DisplayConfig {
+        (*self.config).clone()
+    }
+
+    /// Update display configuration
+    #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))]
+    fn configure_display(
+        &mut self,
+        max_table_bytes: Option<usize>,
+        min_table_rows: Option<usize>,
+        max_cell_length: Option<usize>,
+    ) {
+        let mut new_config = (*self.config).clone();
+
+        if let Some(bytes) = max_table_bytes {
+            new_config.max_table_bytes = bytes;
+        }
+
+        if let Some(rows) = min_table_rows {
+            new_config.min_table_rows = rows;
+        }
+
+        if let Some(length) = max_cell_length {
+            new_config.max_cell_length = length;
+        }
+
+        self.config = Arc::new(new_config);
+    }
+
+    /// Reset display configuration to default values
+    fn reset_display_config(&mut self) {
+        self.config = Arc::new(DisplayConfig::default());
+    }
 }
 
 /// Print DataFrame
@@ -886,6 +945,7 @@ async fn collect_record_batches_to_display(
     df: DataFrame,
     min_rows: usize,
     max_rows: usize,
+    config: &DisplayConfig,
 ) -> Result<(Vec<RecordBatch>, bool), DataFusionError> {
     let partitioned_stream = df.execute_stream_partitioned().await?;
     let mut stream = futures::stream::iter(partitioned_stream).flatten();
@@ -894,7 +954,7 @@ async fn collect_record_batches_to_display(
     let mut record_batches = Vec::default();
     let mut has_more = false;
 
-    while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows)
+    while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows)
         || rows_so_far < min_rows
     {
         let mut rb = match stream.next().await {
@@ -909,8 +969,8 @@ async fn collect_record_batches_to_display(
         if rows_in_rb > 0 {
             size_estimate_so_far += rb.get_array_memory_size();
 
-            if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY {
-                let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32;
+            if size_estimate_so_far > config.max_table_bytes {
+                let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32;
                 let total_rows = rows_in_rb + rows_so_far;
 
                 let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize;