Skip to content

Commit 5a30e03

Browse files
committed
ci: update pre-commit hooks and fix linting issues
* Update Ruff version in pre-commit configuration to v0.15.1.
* Add noqa comments to suppress specific linting warnings in various files.
* Update regex patterns in test cases for better matching.
1 parent 4a75b0f commit 5a30e03

File tree

12 files changed

+31
-30
lines changed

12 files changed

+31
-30
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ repos:
2222
- id: actionlint-docker
2323
- repo: https://github.com/astral-sh/ruff-pre-commit
2424
# Ruff version.
25-
rev: v0.9.10
25+
rev: v0.15.1
2626
hooks:
2727
# Run the linter.
2828
- id: ruff

python/datafusion/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ def into_view(self, temporary: bool = False) -> Table:
330330
>>> df.collect() # The DataFrame is still usable
331331
>>> ctx.sql("SELECT value FROM values_view").collect()
332332
"""
333-
from datafusion.catalog import Table as _Table
333+
from datafusion.catalog import Table as _Table # noqa: PLC0415
334334

335335
return _Table(self.df.into_view(temporary))
336336

python/datafusion/expr.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
See :ref:`Expressions` in the online documentation for more details.
2121
"""
2222

23+
# ruff: noqa: PLC0415
24+
2325
from __future__ import annotations
2426

2527
from collections.abc import Iterable, Sequence
@@ -340,7 +342,7 @@ def sort_list_to_raw_sort_list(
340342
return raw_sort_list
341343

342344

343-
class Expr:
345+
class Expr: # noqa: PLW1641
344346
"""Expression object.
345347
346348
Expressions are one of the core concepts in DataFusion. See

python/datafusion/input/location.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def build_table(
4646
num_rows = 0 # Total number of rows in the file. Used for statistics
4747
columns = []
4848
if file_format == "parquet":
49-
import pyarrow.parquet as pq
49+
import pyarrow.parquet as pq # noqa: PLC0415
5050

5151
# Read the Parquet metadata
5252
metadata = pq.read_metadata(input_item)
@@ -61,7 +61,7 @@ def build_table(
6161
]
6262

6363
elif format == "csv":
64-
import csv
64+
import csv # noqa: PLC0415
6565

6666
# Consume header row and count number of rows for statistics.
6767
# TODO: Possibly makes sense to have the eager number of rows

python/datafusion/plan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
]
3333

3434

35-
class LogicalPlan:
35+
class LogicalPlan: # noqa: PLW1641
3636
"""Logical Plan.
3737
3838
A `LogicalPlan` is a node in a tree of relational operators (such as

python/datafusion/user_defined.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,11 +583,11 @@ def from_pycapsule(func: AggregateUDFExportable | _PyCapsule) -> AggregateUDF:
583583
AggregateUDF that is exported via the FFI bindings.
584584
"""
585585
if _is_pycapsule(func):
586-
aggregate = cast(AggregateUDF, object.__new__(AggregateUDF))
586+
aggregate = cast("AggregateUDF", object.__new__(AggregateUDF))
587587
aggregate._udaf = df_internal.AggregateUDF.from_pycapsule(func)
588588
return aggregate
589589

590-
capsule = cast(AggregateUDFExportable, func)
590+
capsule = cast("AggregateUDFExportable", func)
591591
name = str(capsule.__class__)
592592
return AggregateUDF(
593593
name=name,

python/tests/test_catalog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def test_exception_not_mangled(ctx: SessionContext):
248248

249249
schema.register_table("test_table", create_dataset())
250250

251-
with pytest.raises(ValueError, match="^test_table is not an acceptable name$"):
251+
with pytest.raises(ValueError, match=r"^test_table is not an acceptable name$"):
252252
ctx.sql(f"select * from {catalog_name}.{schema_name}.test_table")
253253

254254

python/tests/test_dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2790,7 +2790,7 @@ def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, res
27902790
def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding):
27912791
"""Test that unsupported Parquet encodings do not work."""
27922792
# BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
2793-
with pytest.raises(BaseException, match="Encoding .*? is not supported"):
2793+
with pytest.raises(BaseException, match=r"Encoding .*? is not supported"):
27942794
df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding))
27952795

27962796

python/tests/test_functions.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -303,31 +303,31 @@ def py_flatten(arr):
303303
lambda data: [np.concatenate([arr, arr]) for arr in data],
304304
),
305305
(
306-
lambda col: f.array_dims(col),
306+
f.array_dims,
307307
lambda data: [[len(r)] for r in data],
308308
),
309309
(
310-
lambda col: f.array_distinct(col),
310+
f.array_distinct,
311311
lambda data: [list(set(r)) for r in data],
312312
),
313313
(
314-
lambda col: f.list_distinct(col),
314+
f.list_distinct,
315315
lambda data: [list(set(r)) for r in data],
316316
),
317317
(
318-
lambda col: f.list_dims(col),
318+
f.list_dims,
319319
lambda data: [[len(r)] for r in data],
320320
),
321321
(
322322
lambda col: f.array_element(col, literal(1)),
323323
lambda data: [r[0] for r in data],
324324
),
325325
(
326-
lambda col: f.array_empty(col),
326+
f.array_empty,
327327
lambda data: [len(r) == 0 for r in data],
328328
),
329329
(
330-
lambda col: f.empty(col),
330+
f.empty,
331331
lambda data: [len(r) == 0 for r in data],
332332
),
333333
(
@@ -343,11 +343,11 @@ def py_flatten(arr):
343343
lambda data: [r[0] for r in data],
344344
),
345345
(
346-
lambda col: f.array_length(col),
346+
f.array_length,
347347
lambda data: [len(r) for r in data],
348348
),
349349
(
350-
lambda col: f.list_length(col),
350+
f.list_length,
351351
lambda data: [len(r) for r in data],
352352
),
353353
(
@@ -391,11 +391,11 @@ def py_flatten(arr):
391391
lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data],
392392
),
393393
(
394-
lambda col: f.array_ndims(col),
394+
f.array_ndims,
395395
lambda data: [np.array(r).ndim for r in data],
396396
),
397397
(
398-
lambda col: f.list_ndims(col),
398+
f.list_ndims,
399399
lambda data: [np.array(r).ndim for r in data],
400400
),
401401
(
@@ -415,11 +415,11 @@ def py_flatten(arr):
415415
lambda data: [np.insert(arr, 0, 99.0) for arr in data],
416416
),
417417
(
418-
lambda col: f.array_pop_back(col),
418+
f.array_pop_back,
419419
lambda data: [arr[:-1] for arr in data],
420420
),
421421
(
422-
lambda col: f.array_pop_front(col),
422+
f.array_pop_front,
423423
lambda data: [arr[1:] for arr in data],
424424
),
425425
(

python/tests/test_sql.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
def test_no_table(ctx):
3232
with pytest.raises(
3333
ValueError,
34-
match="^Error during planning: table 'datafusion.public.b' not found$",
34+
match=r"^Error during planning: table 'datafusion.public.b' not found$",
3535
):
3636
ctx.sql("SELECT a FROM b").collect()
3737

@@ -188,7 +188,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_ty
188188
partition_data_type = "string" if legacy_data_type else pa.string()
189189

190190
if legacy_data_type:
191-
with pytest.warns(DeprecationWarning):
191+
with pytest.warns(DeprecationWarning): # noqa: PT030
192192
ctx.register_parquet(
193193
"datapp",
194194
dir_root,

0 commit comments

Comments (0)