diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 55248b6bf..4cad8db24 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies uses: astral-sh/setup-uv@v7 with: - enable-cache: true + enable-cache: true # Download the Linux wheel built in the build workflow - name: Download pre-built Linux wheel diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bcefa405d..8ae6a4e32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.9.10 + rev: v0.15.1 hooks: # Run the linter. - id: ruff diff --git a/Cargo.lock b/Cargo.lock index cd853a03f..d6fd05a77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -593,9 +593,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.0" +version = "3.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c81d250916401487680ed13b8b675660281dcfc3ab0121fe44c94bcab9eae2fb" +checksum = "5c6f81257d10a0f602a294ae4182251151ff97dbb504ef9afcdda4a64b24d9b4" [[package]] name = "byteorder" diff --git a/Cargo.toml b/Cargo.toml index 3e632bafc..371554021 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,7 +42,7 @@ protoc = ["datafusion-substrait/protoc"] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.47", features = [ +tokio = { version = "1.49", features = [ "macros", "rt", "rt-multi-thread", @@ -54,16 +54,16 @@ pyo3 = { version = "0.26", features = [ "abi3-py310", ] } pyo3-async-runtimes = { version = "0.26", features = ["tokio-runtime"] } -pyo3-log = "0.13.2" +pyo3-log = "0.13.3" arrow = { version = "57", features = ["pyarrow"] } arrow-select = { version = "57" } datafusion = { version = "52", features = ["avro", "unicode_expressions"] } datafusion-substrait = { version = "52", optional = true } datafusion-proto = { version = "52" } datafusion-ffi = { version = "52" } -prost = "0.14.1" # keep in line with `datafusion-substrait` +prost = "0.14.3" # keep in line with `datafusion-substrait` serde_json = "1" -uuid = { version = "1.18", features = ["v4"] } +uuid = { version = "1.21", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = [ "local_dynamic_tls", ] } @@ -77,11 +77,11 @@ object_store = { version = "0.12.4", features = [ "http", ] } url = "2" -log = "0.4.27" +log = "0.4.29" parking_lot = "0.12" [build-dependencies] -prost-types = "0.14.1" # keep in line with `datafusion-substrait` +prost-types = "0.14.3" # keep in line with `datafusion-substrait` pyo3-build-config = "0.26" [lib] diff --git a/pyproject.toml b/pyproject.toml index 5a5128a2f..08d64eca0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } requires-python = ">=3.10" -keywords = ["datafusion", "dataframe", "rust", "query-engine"] +keywords = ["dataframe", "datafusion", "query-engine", "rust"] classifiers = [ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", @@ -62,7 +62,7 @@ profile = "black" python-source = "python" module-name = "datafusion._internal" include = [{ path = "Cargo.lock", format = "sdist" }] -exclude = [".github/**", "ci/**", ".asf.yaml"] +exclude = [".asf.yaml", ".github/**", "ci/**"] # Require Cargo.lock is up to date locked = true features = ["substrait"] @@ -77,19 +77,19 @@ select = ["ALL"] ignore = [ "A001", # Allow using words like min as variable names "A002", # Allow using words like filter as variable names + "A005", # Allow module named io "ANN401", # Allow Any for wrapper classes "COM812", # Recommended to ignore these rules when using with ruff-format - "FIX002", # Allow TODO lines - consider removing at some point "FBT001", # Allow boolean positional args "FBT002", # Allow boolean positional args + "FIX002", # Allow TODO lines - consider removing at some point "ISC001", # Recommended to ignore these rules when using with ruff-format + "N812", # Allow importing functions as `F` + "PD901", # Allow variable name df + "PLR0913", # Allow many arguments in function definition "SLF001", # Allow accessing private members "TD002", # Do not require author names in TODO statements "TD003", # Allow TODO lines - "PLR0913", # Allow many arguments in function definition - "PD901", # Allow variable name df - "N812", # Allow importing functions as `F` - "A005", # Allow module named io ] [tool.ruff.lint.pydocstyle] @@ -99,7 +99,7 @@ convention = "google" max-doc-length = 88 [tool.ruff.lint.flake8-boolean-trap] -extend-allowed-calls = ["lit", "datafusion.lit"] +extend-allowed-calls = ["datafusion.lit", "lit"] # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] @@ -108,68 +108,69 @@ extend-allowed-calls = ["lit", "datafusion.lit"] "ARG", "BLE001", "D", - "S101", - "SLF", "PD", + "PLC0415", + "PLR0913", "PLR2004", + "PT004", "PT011", "RUF015", + "S101", "S608", - "PLR0913", - "PT004", + "SLF", ] "examples/*" = [ - "D", - "W505", - "E501", - "T201", - "S101", - "PLR2004", "ANN001", "ANN202", - "INP001", + "D", "DTZ007", + "E501", + "INP001", + "PLR2004", "RUF015", + "S101", + "T201", + "W505", ] "dev/*" = [ + "ANN001", + "C", "D", "E", - "T", - "S", + "ERA001", + "EXE", + "N817", "PLR", - "C", + "S", "SIM", + "T", "UP", - "EXE", - "N817", - "ERA001", - "ANN001", ] "benchmarks/*" = [ + "ANN001", + "BLE", "D", + "E", + "ERA001", + "EXE", "F", - "T", - "BLE", "FURB", + "INP001", "PLR", - "E", - "TD", - "TRY", "S", "SIM", - "EXE", + "T", + "TD", + "TRY", "UP", - "ERA001", - "ANN001", - "INP001", ] "docs/*" = ["D"] -"docs/source/conf.py" = ["ERA001", "ANN001", "INP001"] +"docs/source/conf.py" = ["ANN001", "ERA001", "INP001"] [tool.codespell] -skip = ["./target", "uv.lock", "./python/tests/test_functions.py"] +skip = ["./python/tests/test_functions.py", "./target", "uv.lock"] count = true -ignore-words-list = ["ans", "IST"] +ignore-words-list = ["IST", "ans"] [dependency-groups] dev = [ @@ -182,8 +183,8 @@ dev = [ "pre-commit>=4.3.0", "pyarrow>=19.0.0", "pygithub==2.5.0", - "pytest>=7.4.4", "pytest-asyncio>=0.23.3", + "pytest>=7.4.4", "pyyaml>=6.0.3", "ruff>=0.9.1", "toml>=0.10.2", @@ -196,6 +197,6 @@ docs = [ "pickleshare>=0.7.5", "pydata-sphinx-theme==0.8.0", "setuptools>=75.3.0", - "sphinx>=7.1.2", "sphinx-autoapi>=3.4.0", + "sphinx>=7.1.2", ] diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 9df58f52a..5760b8948 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -20,6 +20,8 @@ See :ref:`Expressions` in the online documentation for more details. """ +# ruff: noqa: PLC0415 + from __future__ import annotations from collections.abc import Iterable, Sequence diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index d4e5302b5..eef23e741 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -583,11 +583,11 @@ def from_pycapsule(func: AggregateUDFExportable | _PyCapsule) -> AggregateUDF: AggregateUDF that is exported via the FFI bindings. """ if _is_pycapsule(func): - aggregate = cast(AggregateUDF, object.__new__(AggregateUDF)) + aggregate = cast("AggregateUDF", object.__new__(AggregateUDF)) aggregate._udaf = df_internal.AggregateUDF.from_pycapsule(func) return aggregate - capsule = cast(AggregateUDFExportable, func) + capsule = cast("AggregateUDFExportable", func) name = str(capsule.__class__) return AggregateUDF( name=name, diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py index 71c08da26..9310da506 100644 --- a/python/tests/test_catalog.py +++ b/python/tests/test_catalog.py @@ -248,7 +248,7 @@ def test_exception_not_mangled(ctx: SessionContext): schema.register_table("test_table", create_dataset()) - with pytest.raises(ValueError, match="^test_table is not an acceptable name$"): + with pytest.raises(ValueError, match=r"^test_table is not an acceptable name$"): ctx.sql(f"select * from {catalog_name}.{schema_name}.test_table") diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 71abe2925..de6b00acf 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -2790,7 +2790,7 @@ def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, res def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding): """Test that unsupported Parquet encodings do not work.""" # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 - with pytest.raises(BaseException, match="Encoding .*? is not supported"): + with pytest.raises(BaseException, match=r"Encoding .*? is not supported"): df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 7b3332ed7..5a61a2dd1 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -303,19 +303,19 @@ def py_flatten(arr): lambda data: [np.concatenate([arr, arr]) for arr in data], ), ( - lambda col: f.array_dims(col), + f.array_dims, lambda data: [[len(r)] for r in data], ), ( - lambda col: f.array_distinct(col), + f.array_distinct, lambda data: [list(set(r)) for r in data], ), ( - lambda col: f.list_distinct(col), + f.list_distinct, lambda data: [list(set(r)) for r in data], ), ( - lambda col: f.list_dims(col), + f.list_dims, lambda data: [[len(r)] for r in data], ), ( @@ -323,11 +323,11 @@ def py_flatten(arr): lambda data: [r[0] for r in data], ), ( - lambda col: f.array_empty(col), + f.array_empty, lambda data: [len(r) == 0 for r in data], ), ( - lambda col: f.empty(col), + f.empty, lambda data: [len(r) == 0 for r in data], ), ( @@ -343,11 +343,11 @@ def py_flatten(arr): lambda data: [r[0] for r in data], ), ( - lambda col: f.array_length(col), + f.array_length, lambda data: [len(r) for r in data], ), ( - lambda col: f.list_length(col), + f.list_length, lambda data: [len(r) for r in data], ), ( @@ -391,11 +391,11 @@ def py_flatten(arr): lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], ), ( - lambda col: f.array_ndims(col), + f.array_ndims, lambda data: [np.array(r).ndim for r in data], ), ( - lambda col: f.list_ndims(col), + f.list_ndims, lambda data: [np.array(r).ndim for r in data], ), ( @@ -415,11 +415,11 @@ def py_flatten(arr): lambda data: [np.insert(arr, 0, 99.0) for arr in data], ), ( - lambda col: f.array_pop_back(col), + f.array_pop_back, lambda data: [arr[:-1] for arr in data], ), ( - lambda col: f.array_pop_front(col), + f.array_pop_front, lambda data: [arr[1:] for arr in data], ), ( diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 12710cf08..92c311930 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -31,7 +31,7 @@ def test_no_table(ctx): with pytest.raises( ValueError, - match="^Error during planning: table 'datafusion.public.b' not found$", + match=r"^Error during planning: table 'datafusion.public.b' not found$", ): ctx.sql("SELECT a FROM b").collect() diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py index c0ba1d831..b2540fb57 100644 --- a/python/tests/test_udf.py +++ b/python/tests/test_udf.py @@ -15,7 +15,10 @@ # specific language governing permissions and limitations # under the License. +from uuid import UUID + import pyarrow as pa +import pyarrow.compute as pc import pytest from datafusion import SessionContext, column, udf from datafusion import functions as f @@ -128,8 +131,6 @@ def udf_with_param(values: pa.Array) -> pa.Array: def test_udf_with_metadata(ctx) -> None: - from uuid import UUID - @udf([pa.string()], pa.uuid(), "stable") def uuid_from_string(uuid_string): return pa.array((UUID(s).bytes for s in uuid_string.to_pylist()), pa.uuid()) @@ -151,8 +152,6 @@ def uuid_version(uuid): def test_udf_with_nullability(ctx: SessionContext) -> None: - import pyarrow.compute as pc - field_nullable_i64 = pa.field("with_nulls", type=pa.int64(), nullable=True) field_non_nullable_i64 = pa.field("no_nulls", type=pa.int64(), nullable=False) diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 5aaf00664..38b935b7e 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -433,8 +433,8 @@ def test_udwf_functions(complex_window_df, name, expr, expected): [ udwf(SimpleWindowCount, pa.int64(), pa.int64(), "immutable"), udwf(SimpleWindowCount, [pa.int64()], pa.int64(), "immutable"), - udwf([pa.int64()], pa.int64(), "immutable")(lambda: SimpleWindowCount()), - udwf(pa.int64(), pa.int64(), "immutable")(lambda: SimpleWindowCount()), + udwf([pa.int64()], pa.int64(), "immutable")(SimpleWindowCount), + udwf(pa.int64(), pa.int64(), "immutable")(SimpleWindowCount), ], ) def test_udwf_overloads(udwf_func, count_window_df): diff --git a/src/expr/indexed_field.rs b/src/expr/indexed_field.rs index 1dfa0ed2f..79f528179 100644 --- a/src/expr/indexed_field.rs +++ b/src/expr/indexed_field.rs @@ -15,12 +15,13 @@ // specific language governing permissions and limitations // under the License. -use crate::expr::PyExpr; +use std::fmt::{Display, Formatter}; + use datafusion::logical_expr::expr::{GetFieldAccess, GetIndexedField}; use pyo3::prelude::*; -use std::fmt::{Display, Formatter}; use super::literal::PyLiteral; +use crate::expr::PyExpr; #[pyclass(frozen, name = "GetIndexedField", module = "datafusion.expr", subclass)] #[derive(Clone)]