Skip to content

Commit c680d60

Browse files
committed
Implement all CSV options with a builder pattern
1 parent eaa3f79 commit c680d60

File tree

11 files changed

+712
-134
lines changed

11 files changed

+712
-134
lines changed

docs/source/user-guide/io/csv.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,22 @@ An alternative is to use :py:func:`~datafusion.context.SessionContext.register_c
3636
3737
ctx.register_csv("file", "file.csv")
3838
df = ctx.table("file")
39+
40+
If you need finer control over how the CSV file is read, you can use
41+
:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options.
42+
43+
.. code-block:: python
44+
45+
from datafusion import CsvReadOptions
46+
options = (
47+
CsvReadOptions()
48+
.with_has_header(True) # File contains a header row
49+
.with_delimiter(";") # Use ; as the delimiter instead of ,
50+
.with_comment("#") # Skip lines starting with #
51+
.with_escape("\\") # Escape character
52+
.with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
53+
.with_truncated_rows(True) # Allow rows with fewer columns than expected
54+
.with_file_compression_type("gzip") # Read gzipped CSV
55+
.with_file_extension(".gz") # File extension other than .csv
56+
)
57+
df = ctx.read_csv("data.csv.gz", options=options)

examples/csv-read-options.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""Example demonstrating CsvReadOptions usage."""
19+
20+
from datafusion import CsvReadOptions, SessionContext
21+
22+
# Create a SessionContext
23+
ctx = SessionContext()
24+
25+
# Example 1: Using CsvReadOptions with default values
26+
print("Example 1: Default CsvReadOptions")
27+
options = CsvReadOptions()
28+
df = ctx.read_csv("data.csv", options=options)
29+
30+
# Example 2: Using CsvReadOptions with custom parameters
31+
print("\nExample 2: Custom CsvReadOptions")
32+
options = CsvReadOptions(
33+
has_header=True,
34+
delimiter=",",
35+
quote='"',
36+
schema_infer_max_records=1000,
37+
file_extension=".csv",
38+
)
39+
df = ctx.read_csv("data.csv", options=options)
40+
41+
# Example 3: Using the builder pattern (recommended for readability)
42+
print("\nExample 3: Builder pattern")
43+
options = (
44+
CsvReadOptions()
45+
.with_has_header(True) # noqa: FBT003
46+
.with_delimiter("|")
47+
.with_quote("'")
48+
.with_schema_infer_max_records(500)
49+
.with_truncated_rows(False) # noqa: FBT003
50+
.with_newlines_in_values(True) # noqa: FBT003
51+
)
52+
df = ctx.read_csv("data.csv", options=options)
53+
54+
# Example 4: Advanced options
55+
print("\nExample 4: Advanced options")
56+
options = (
57+
CsvReadOptions()
58+
.with_has_header(True) # noqa: FBT003
59+
.with_delimiter(",")
60+
.with_comment("#") # Skip lines starting with #
61+
.with_escape("\\") # Escape character
62+
.with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
63+
.with_truncated_rows(True) # noqa: FBT003
64+
.with_file_compression_type("gzip") # Read gzipped CSV
65+
.with_file_extension(".gz")
66+
)
67+
df = ctx.read_csv("data.csv.gz", options=options)
68+
69+
# Example 5: Register CSV table with options
70+
print("\nExample 5: Register CSV table")
71+
options = CsvReadOptions().with_has_header(True).with_delimiter(",") # noqa: FBT003
72+
ctx.register_csv("my_table", "data.csv", options=options)
73+
df = ctx.sql("SELECT * FROM my_table")
74+
75+
# Example 6: Backward compatibility (without options)
76+
print("\nExample 6: Backward compatibility")
77+
# Still works the old way!
78+
df = ctx.read_csv("data.csv", has_header=True, delimiter=",")
79+
80+
print("\nAll examples completed!")
81+
print("\nFor all available options, see the CsvReadOptions documentation:")
82+
print(" - has_header: bool")
83+
print(" - delimiter: str")
84+
print(" - quote: str")
85+
print(" - terminator: str | None")
86+
print(" - escape: str | None")
87+
print(" - comment: str | None")
88+
print(" - newlines_in_values: bool")
89+
print(" - schema: pa.Schema | None")
90+
print(" - schema_infer_max_records: int")
91+
print(" - file_extension: str")
92+
print(" - table_partition_cols: list[tuple[str, pa.DataType]]")
93+
print(" - file_compression_type: str")
94+
print(" - file_sort_order: list[list[SortExpr]]")
95+
print(" - null_regex: str | None")
96+
print(" - truncated_rows: bool")

python/datafusion/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from .dataframe_formatter import configure_formatter
5555
from .expr import Expr, WindowFrame
5656
from .io import read_avro, read_csv, read_json, read_parquet
57+
from .options import CsvReadOptions
5758
from .plan import ExecutionPlan, LogicalPlan
5859
from .record_batch import RecordBatch, RecordBatchStream
5960
from .user_defined import (
@@ -75,6 +76,7 @@
7576
"AggregateUDF",
7677
"Catalog",
7778
"Config",
79+
"CsvReadOptions",
7880
"DFSchema",
7981
"DataFrame",
8082
"DataFrameWriteOptions",
@@ -106,6 +108,7 @@
106108
"lit",
107109
"literal",
108110
"object_store",
111+
"options",
109112
"read_avro",
110113
"read_csv",
111114
"read_json",

0 commit comments

Comments
 (0)