Skip to content

Commit f9ca741

Browse files
djouallah authored and claude committed
feat: Add truncated_rows parameter to register_csv and read_csv
Exposes the truncated_rows parameter from Rust DataFusion to Python bindings. This enables reading CSV files with inconsistent column counts by creating a union schema and filling missing columns with nulls. The parameter was added to DataFusion Rust in PR apache/datafusion#17553 and is now available in datafusion 51.0.0.

Changes:
- Add truncated_rows parameter to SessionContext.register_csv()
- Add truncated_rows parameter to SessionContext.read_csv()
- Add comprehensive tests for both methods
- Update docstrings with parameter documentation

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent eaa3f79 commit f9ca741

File tree

4 files changed

+143
-4
lines changed

4 files changed

+143
-4
lines changed

python/datafusion/context.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -927,6 +927,7 @@ def register_csv(
927927
schema_infer_max_records: int = 1000,
928928
file_extension: str = ".csv",
929929
file_compression_type: str | None = None,
930+
truncated_rows: bool = False,
930931
) -> None:
931932
"""Register a CSV file as a table.
932933
@@ -946,6 +947,10 @@ def register_csv(
946947
file_extension: File extension; only files with this extension are
947948
selected for data input.
948949
file_compression_type: File compression type.
950+
truncated_rows: Allow reading CSV files with inconsistent column
951+
counts by creating a union schema. Missing columns are filled
952+
with nulls. Default is False. Useful for evolving datasets
953+
where newer files have additional columns.
949954
"""
950955
path = [str(p) for p in path] if isinstance(path, list) else str(path)
951956

@@ -958,6 +963,7 @@ def register_csv(
958963
schema_infer_max_records,
959964
file_extension,
960965
file_compression_type,
966+
truncated_rows,
961967
)
962968

963969
def register_json(
@@ -1123,6 +1129,7 @@ def read_csv(
11231129
file_extension: str = ".csv",
11241130
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
11251131
file_compression_type: str | None = None,
1132+
truncated_rows: bool = False,
11261133
) -> DataFrame:
11271134
"""Read a CSV data source.
11281135
@@ -1140,6 +1147,10 @@ def read_csv(
11401147
selected for data input.
11411148
table_partition_cols: Partition columns.
11421149
file_compression_type: File compression type.
1150+
truncated_rows: Allow reading CSV files with inconsistent column
1151+
counts by creating a union schema. Missing columns are filled
1152+
with nulls. Default is False. Useful for evolving datasets
1153+
where newer files have additional columns.
11431154
11441155
Returns:
11451156
DataFrame representation of the read CSV files
@@ -1160,6 +1171,7 @@ def read_csv(
11601171
file_extension,
11611172
table_partition_cols,
11621173
file_compression_type,
1174+
truncated_rows,
11631175
)
11641176
)
11651177

python/tests/test_context.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import pyarrow as pa
2222
import pyarrow.dataset as ds
2323
import pytest
24+
from pyarrow.csv import write_csv
2425
from datafusion import (
2526
DataFrame,
2627
RuntimeEnvBuilder,
@@ -639,6 +640,65 @@ def test_read_csv_compressed(ctx, tmp_path):
639640
csv_df.select(column("c1")).show()
640641

641642

643+
def test_read_csv_truncated_rows(ctx, tmp_path):
644+
# Create CSV file with 3 columns
645+
path1 = tmp_path / "file1.csv"
646+
table1 = pa.Table.from_arrays(
647+
[
648+
[1, 2],
649+
["a", "b"],
650+
[1.1, 2.2],
651+
],
652+
names=["int", "str", "float"],
653+
)
654+
write_csv(table1, path1)
655+
656+
# Create CSV file with 5 columns
657+
path2 = tmp_path / "file2.csv"
658+
table2 = pa.Table.from_arrays(
659+
[
660+
[3, 4],
661+
["c", "d"],
662+
[3.3, 4.4],
663+
["x", "y"],
664+
[10, 20],
665+
],
666+
names=["int", "str", "float", "extra1", "extra2"],
667+
)
668+
write_csv(table2, path2)
669+
670+
# Read with truncated_rows=True to handle mismatched columns
671+
df = ctx.read_csv([path1, path2], truncated_rows=True)
672+
result = df.collect()
673+
result_table = pa.Table.from_batches(result)
674+
675+
# Should have 5 columns (union schema)
676+
assert len(result_table.schema) == 5
677+
assert result_table.schema.names == ["int", "str", "float", "extra1", "extra2"]
678+
679+
# Should have 4 rows total (2 from each file)
680+
assert result_table.num_rows == 4
681+
682+
# Convert to dict for easier validation
683+
result_dict = result_table.to_pydict()
684+
685+
# Check that rows from file1 have nulls for extra1 and extra2
686+
assert result_dict["int"] == [1, 2, 3, 4]
687+
assert result_dict["str"] == ["a", "b", "c", "d"]
688+
assert result_dict["float"] == [1.1, 2.2, 3.3, 4.4]
689+
690+
# First two rows should have None for extra1 and extra2
691+
assert result_dict["extra1"][0] is None
692+
assert result_dict["extra1"][1] is None
693+
assert result_dict["extra1"][2] == "x"
694+
assert result_dict["extra1"][3] == "y"
695+
696+
assert result_dict["extra2"][0] is None
697+
assert result_dict["extra2"][1] is None
698+
assert result_dict["extra2"][2] == 10
699+
assert result_dict["extra2"][3] == 20
700+
701+
642702
def test_read_parquet(ctx):
643703
parquet_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet")
644704
parquet_df.show()

python/tests/test_sql.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,67 @@ def test_register_csv_list(ctx, tmp_path):
137137
assert int_sum == 2 * sum(int_values)
138138

139139

140+
def test_register_csv_truncated_rows(ctx, tmp_path):
141+
# Create CSV file with 3 columns
142+
path1 = tmp_path / "file1.csv"
143+
table1 = pa.Table.from_arrays(
144+
[
145+
[1, 2],
146+
["a", "b"],
147+
[1.1, 2.2],
148+
],
149+
names=["int", "str", "float"],
150+
)
151+
write_csv(table1, path1)
152+
153+
# Create CSV file with 5 columns
154+
path2 = tmp_path / "file2.csv"
155+
table2 = pa.Table.from_arrays(
156+
[
157+
[3, 4],
158+
["c", "d"],
159+
[3.3, 4.4],
160+
["x", "y"],
161+
[10, 20],
162+
],
163+
names=["int", "str", "float", "extra1", "extra2"],
164+
)
165+
write_csv(table2, path2)
166+
167+
# Register with truncated_rows=True to handle mismatched columns
168+
ctx.register_csv("mixed", [path1, path2], truncated_rows=True)
169+
170+
# Verify the table exists and has correct schema
171+
result = ctx.sql("SELECT * FROM mixed").collect()
172+
result_table = pa.Table.from_batches(result)
173+
174+
# Should have 5 columns (union schema)
175+
assert len(result_table.schema) == 5
176+
assert result_table.schema.names == ["int", "str", "float", "extra1", "extra2"]
177+
178+
# Should have 4 rows total (2 from each file)
179+
assert result_table.num_rows == 4
180+
181+
# Convert to dict for easier validation
182+
result_dict = result_table.to_pydict()
183+
184+
# Check that rows from file1 have nulls for extra1 and extra2
185+
assert result_dict["int"] == [1, 2, 3, 4]
186+
assert result_dict["str"] == ["a", "b", "c", "d"]
187+
assert result_dict["float"] == [1.1, 2.2, 3.3, 4.4]
188+
189+
# First two rows should have None for extra1 and extra2
190+
assert result_dict["extra1"][0] is None
191+
assert result_dict["extra1"][1] is None
192+
assert result_dict["extra1"][2] == "x"
193+
assert result_dict["extra1"][3] == "y"
194+
195+
assert result_dict["extra2"][0] is None
196+
assert result_dict["extra2"][1] is None
197+
assert result_dict["extra2"][2] == 10
198+
assert result_dict["extra2"][3] == 20
199+
200+
140201
def test_register_http_csv(ctx):
141202
url = "https://raw.githubusercontent.com/ibis-project/testing-data/refs/heads/master/csv/diamonds.csv"
142203
ctx.register_object_store("", Http(url))

src/context.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,8 @@ impl PySessionContext {
715715
delimiter=",",
716716
schema_infer_max_records=1000,
717717
file_extension=".csv",
718-
file_compression_type=None))]
718+
file_compression_type=None,
719+
truncated_rows=false))]
719720
pub fn register_csv(
720721
&self,
721722
name: &str,
@@ -726,6 +727,7 @@ impl PySessionContext {
726727
schema_infer_max_records: usize,
727728
file_extension: &str,
728729
file_compression_type: Option<String>,
730+
truncated_rows: bool,
729731
py: Python,
730732
) -> PyDataFusionResult<()> {
731733
let delimiter = delimiter.as_bytes();
@@ -740,7 +742,8 @@ impl PySessionContext {
740742
.delimiter(delimiter[0])
741743
.schema_infer_max_records(schema_infer_max_records)
742744
.file_extension(file_extension)
743-
.file_compression_type(parse_file_compression_type(file_compression_type)?);
745+
.file_compression_type(parse_file_compression_type(file_compression_type)?)
746+
.truncated_rows(truncated_rows);
744747
options.schema = schema.as_ref().map(|x| &x.0);
745748

746749
if path.is_instance_of::<PyList>() {
@@ -969,7 +972,8 @@ impl PySessionContext {
969972
schema_infer_max_records=1000,
970973
file_extension=".csv",
971974
table_partition_cols=vec![],
972-
file_compression_type=None))]
975+
file_compression_type=None,
976+
truncated_rows=false))]
973977
pub fn read_csv(
974978
&self,
975979
path: &Bound<'_, PyAny>,
@@ -980,6 +984,7 @@ impl PySessionContext {
980984
file_extension: &str,
981985
table_partition_cols: Vec<(String, PyArrowType<DataType>)>,
982986
file_compression_type: Option<String>,
987+
truncated_rows: bool,
983988
py: Python,
984989
) -> PyDataFusionResult<PyDataFrame> {
985990
let delimiter = delimiter.as_bytes();
@@ -1000,7 +1005,8 @@ impl PySessionContext {
10001005
.map(|(name, ty)| (name, ty.0))
10011006
.collect::<Vec<(String, DataType)>>(),
10021007
)
1003-
.file_compression_type(parse_file_compression_type(file_compression_type)?);
1008+
.file_compression_type(parse_file_compression_type(file_compression_type)?)
1009+
.truncated_rows(truncated_rows);
10041010
options.schema = schema.as_ref().map(|x| &x.0);
10051011

10061012
if path.is_instance_of::<PyList>() {

0 commit comments

Comments (0)