diff --git a/Cargo.lock b/Cargo.lock index 22ec58253606..bf2d5ad877d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -450,7 +450,7 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27609cd7dd45f006abae27995c2729ef6f4b9361cde1ddd019dc31a5aa017e0" dependencies = [ - "bitflags 2.9.4", + "bitflags", "serde", "serde_core", "serde_json", @@ -1038,7 +1038,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cexpr", "clang-sys", "itertools 0.13.0", @@ -1052,12 +1052,6 @@ dependencies = [ "syn 2.0.111", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.4" @@ -1115,7 +1109,7 @@ checksum = "ec7646ee90964aa59e9f832a67182791396a19a5b1d76eb17599a8310a7e2e09" dependencies = [ "async-stream", "base64 0.22.1", - "bitflags 2.9.4", + "bitflags", "bollard-buildkit-proto", "bollard-stubs", "bytes", @@ -1423,17 +1417,6 @@ dependencies = [ "libloading 0.8.9", ] -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "bitflags 1.3.2", - "textwrap", - "unicode-width 0.1.14", -] - [[package]] name = "clap" version = "4.5.53" @@ -1462,7 +1445,7 @@ version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.111", @@ -1659,7 +1642,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.53", + "clap", "criterion-plot", "futures", "itertools 0.13.0", @@ -1905,6 +1888,7 @@ name = "datafusion-benchmarks" version = "51.0.0" dependencies = [ "arrow", + "clap", "datafusion", "datafusion-common", "datafusion-proto", @@ -1920,7 +1904,6 @@ dependencies = [ "serde", "serde_json", "snmalloc-rs", - "structopt", "tokio", "tokio-util", ] @@ -1979,7 +1962,7 @@ dependencies = [ "aws-config", "aws-credential-types", "chrono", - "clap 4.5.53", + "clap", "ctor", "datafusion", "datafusion-common", @@ -2725,7 +2708,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.53", + "clap", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -3091,7 +3074,7 @@ version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 2.9.4", + "bitflags", "rustc_version", ] @@ -3391,15 +3374,6 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - [[package]] name = "heck" version = "0.5.0" @@ -4045,7 +4019,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.4", + "bitflags", "libc", "redox_syscall", ] @@ -4058,7 +4032,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.53", + "clap", "escape8259", ] @@ -4209,7 +4183,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "cfg_aliases", "libc", @@ -4330,7 +4304,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ - "bitflags 2.9.4", + "bitflags", ] [[package]] @@ -4549,7 +4523,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ - "heck 0.5.0", + "heck", "itertools 0.14.0", "prost", "prost-types", @@ -4722,7 +4696,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.111", @@ -4812,30 +4786,6 @@ dependencies = [ "toml_edit", ] -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro2" version = "1.0.101" @@ -4861,7 +4811,7 @@ version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac6c3320f9abac597dcbc668774ef006702672474aad53c6d596b62e487b40b1" dependencies = [ - "heck 0.5.0", + "heck", "itertools 0.14.0", "log", "multimap", @@ -5152,7 +5102,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags 2.9.4", + "bitflags", ] [[package]] @@ -5418,7 +5368,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -5496,7 +5446,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "clipboard-win", "fd-lock", @@ -5602,7 +5552,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" dependencies = [ - "bitflags 2.9.4", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -6001,30 +5951,6 @@ dependencies = [ "syn 2.0.111", ] -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap 2.34.0", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "strum" version = "0.26.3" @@ -6043,7 +5969,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", @@ -6056,7 +5982,7 @@ version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.111", @@ -6078,7 +6004,7 @@ version = "0.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21f1cb6d0bcd097a39fc25f7236236be29881fe122e282e4173d6d007a929927" dependencies = [ - "heck 0.5.0", + "heck", "pbjson", "pbjson-build", "pbjson-types", @@ -6228,15 +6154,6 @@ dependencies = [ "testcontainers", ] -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" version = "2.0.17" @@ -6535,7 +6452,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.4", + "bitflags", "bytes", "futures-util", "http 1.3.1", @@ -6677,7 +6594,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ - "heck 0.5.0", + "heck", "log", "proc-macro2", "quote", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 5f91175ca8ba..df04f56235ec 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -40,6 +40,7 @@ mimalloc_extended = ["libmimalloc-sys/extended"] [dependencies] arrow = { workspace = true } +clap = { version = "4.5.53", features = ["derive"] } datafusion = { workspace = true, default-features = true } datafusion-common = { workspace = true, default-features = true } env_logger = { workspace = true } @@ -54,7 +55,6 @@ regex.workspace = true serde = { version = "1.0.228", features = ["derive"] } serde_json = { workspace = true } snmalloc-rs = { version = "0.3", optional = true } -structopt = { version = "0.3", default-features = false } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tokio-util = { version = "0.7.17" } diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index d842d306c1f6..7e21890519fd 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -18,7 +18,7 @@ //! DataFusion benchmark runner use datafusion::error::Result; -use structopt::StructOpt; +use clap::{Parser, Subcommand}; #[cfg(all(feature = "snmalloc", feature = "mimalloc"))] compile_error!( @@ -37,8 +37,14 @@ use datafusion_benchmarks::{ cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch, }; -#[derive(Debug, StructOpt)] -#[structopt(about = "benchmark command")] +#[derive(Debug, Parser)] +#[command(about = "benchmark command")] +struct Cli { + #[command(subcommand)] + command: Options, +} + +#[derive(Debug, Subcommand)] enum Options { Cancellation(cancellation::RunOpt), Clickbench(clickbench::RunOpt), @@ -57,7 +63,8 @@ enum Options { pub async fn main() -> Result<()> { env_logger::init(); - match Options::from_args() { + let cli = Cli::parse(); + match cli.command { Options::Cancellation(opt) => opt.run().await, Options::Clickbench(opt) => opt.run().await, Options::H2o(opt) => opt.run().await, diff --git a/benchmarks/src/bin/external_aggr.rs b/benchmarks/src/bin/external_aggr.rs index 2bc2bd4458a5..ee604ec7365a 100644 --- a/benchmarks/src/bin/external_aggr.rs +++ b/benchmarks/src/bin/external_aggr.rs @@ -17,13 +17,13 @@ //! external_aggr binary entrypoint +use clap::{Args, Parser, Subcommand}; use datafusion::execution::memory_pool::GreedyMemoryPool; use datafusion::execution::memory_pool::MemoryPool; use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; use std::sync::LazyLock; -use structopt::StructOpt; use arrow::record_batch::RecordBatch; use arrow::util::pretty; @@ -45,35 +45,41 @@ use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_PARQUET_EXTENSION, exec_err}; use datafusion_common::{human_readable_size, units}; -#[derive(Debug, StructOpt)] -#[structopt( +#[derive(Debug, Parser)] +#[command( name = "datafusion-external-aggregation", about = "DataFusion external aggregation benchmark" )] +struct Cli { + #[command(subcommand)] + command: ExternalAggrOpt, +} + +#[derive(Debug, Subcommand)] enum ExternalAggrOpt { Benchmark(ExternalAggrConfig), } -#[derive(Debug, StructOpt)] +#[derive(Debug, Args)] struct ExternalAggrConfig { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files (lineitem). Only parquet format is supported - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to JSON benchmark result to be compare using `compare.py` - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } @@ -338,7 +344,8 @@ impl ExternalAggrConfig { pub async fn main() -> Result<()> { env_logger::init(); - match ExternalAggrOpt::from_args() { + let cli = Cli::parse(); + match cli.command { ExternalAggrOpt::Benchmark(opt) => opt.run().await?, } diff --git a/benchmarks/src/bin/imdb.rs b/benchmarks/src/bin/imdb.rs index 5ce99928df66..df02f60476b5 100644 --- a/benchmarks/src/bin/imdb.rs +++ b/benchmarks/src/bin/imdb.rs @@ -17,9 +17,9 @@ //! IMDB binary entrypoint +use clap::{Parser, Subcommand}; use datafusion::error::Result; use datafusion_benchmarks::imdb; -use structopt::StructOpt; #[cfg(all(feature = "snmalloc", feature = "mimalloc"))] compile_error!( @@ -34,16 +34,16 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -#[derive(Debug, StructOpt)] -#[structopt(about = "benchmark command")] +#[derive(Debug, Subcommand)] enum BenchmarkSubCommandOpt { - #[structopt(name = "datafusion")] + #[command(name = "datafusion")] DataFusionBenchmark(imdb::RunOpt), } -#[derive(Debug, StructOpt)] -#[structopt(name = "IMDB", about = "IMDB Dataset Processing.")] +#[derive(Debug, Parser)] +#[command(name = "IMDB", about = "IMDB Dataset Processing.")] enum ImdbOpt { + #[command(subcommand)] Benchmark(BenchmarkSubCommandOpt), Convert(imdb::ConvertOpt), } @@ -51,7 +51,7 @@ enum ImdbOpt { #[tokio::main] pub async fn main() -> Result<()> { env_logger::init(); - match ImdbOpt::from_args() { + match ImdbOpt::parse() { ImdbOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => { Box::pin(opt.run()).await } diff --git a/benchmarks/src/bin/mem_profile.rs b/benchmarks/src/bin/mem_profile.rs index 025efefe062e..ab61ad86b474 100644 --- a/benchmarks/src/bin/mem_profile.rs +++ b/benchmarks/src/bin/mem_profile.rs @@ -16,6 +16,7 @@ // under the License. //! mem_profile binary entrypoint +use clap::{Parser, Subcommand}; use datafusion::error::Result; use std::{ env, @@ -23,7 +24,6 @@ use std::{ path::Path, process::{Command, Stdio}, }; -use structopt::StructOpt; use datafusion_benchmarks::{ clickbench, @@ -31,19 +31,19 @@ use datafusion_benchmarks::{ imdb, sort_tpch, tpch, }; -#[derive(Debug, StructOpt)] -#[structopt(name = "Memory Profiling Utility")] +#[derive(Debug, Parser)] +#[command(name = "Memory Profiling Utility")] struct MemProfileOpt { /// Cargo profile to use in dfbench (e.g. release, release-nonlto) - #[structopt(long, default_value = "release")] + #[arg(long, default_value = "release")] bench_profile: String, - #[structopt(subcommand)] + #[command(subcommand)] command: Options, } -#[derive(Debug, StructOpt)] -#[structopt(about = "Benchmark command")] +#[derive(Debug, Subcommand)] +#[command(about = "Benchmark command")] enum Options { Clickbench(clickbench::RunOpt), H2o(h2o::RunOpt), @@ -55,7 +55,7 @@ enum Options { #[tokio::main] pub async fn main() -> Result<()> { // 1. Parse args and check which benchmarks should be run - let mem_profile_opt = MemProfileOpt::from_args(); + let mem_profile_opt = MemProfileOpt::parse(); let profile = mem_profile_opt.bench_profile; let query_range = match mem_profile_opt.command { Options::Clickbench(opt) => { diff --git a/benchmarks/src/cancellation.rs b/benchmarks/src/cancellation.rs index 1b4c04b409cc..d3da1b0e8362 100644 --- a/benchmarks/src/cancellation.rs +++ b/benchmarks/src/cancellation.rs @@ -24,6 +24,7 @@ use crate::util::{BenchmarkRun, CommonOpt}; use arrow::array::Array; use arrow::datatypes::DataType; use arrow::record_batch::RecordBatch; +use clap::Args; use datafusion::common::{Result, ScalarValue}; use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::file_format::parquet::ParquetFormat; @@ -41,7 +42,6 @@ use parquet::arrow::async_writer::ParquetObjectWriter; use rand::Rng; use rand::distr::Alphanumeric; use rand::rngs::ThreadRng; -use structopt::StructOpt; use tokio::runtime::Runtime; use tokio_util::sync::CancellationToken; @@ -57,31 +57,31 @@ use tokio_util::sync::CancellationToken; /// The query is an anonymized version of a real-world query, and the /// test starts the query then cancels it and reports how long it takes /// for the runtime to fully exit. -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to folder where data will be generated - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Number of files to generate - #[structopt(long = "num-files", default_value = "7")] + #[arg(long = "num-files", default_value = "7")] num_files: usize, /// Number of rows per file to generate - #[structopt(long = "num-rows-per-file", default_value = "5000000")] + #[arg(long = "num-rows-per-file", default_value = "5000000")] num_rows_per_file: usize, /// How long to wait, in milliseconds, before attempting to cancel - #[structopt(long = "wait-time", default_value = "100")] + #[arg(long = "wait-time", default_value = "100")] wait_time: u64, } diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index 9036e7d9501e..a9da57b02ae3 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -20,6 +20,7 @@ use std::io::ErrorKind; use std::path::{Path, PathBuf}; use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; +use clap::Args; use datafusion::logical_expr::{ExplainFormat, ExplainOption}; use datafusion::{ error::{DataFusionError, Result}, @@ -27,7 +28,6 @@ use datafusion::{ }; use datafusion_common::exec_datafusion_err; use datafusion_common::instant::Instant; -use structopt::StructOpt; /// Driver program to run the ClickBench benchmark /// @@ -37,11 +37,11 @@ use structopt::StructOpt; /// /// [1]: https://github.com/ClickHouse/ClickBench /// [2]: https://github.com/ClickHouse/ClickBench/tree/main/datafusion -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 0 and 42). If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// If specified, enables Parquet Filter Pushdown. @@ -49,34 +49,32 @@ pub struct RunOpt { /// Specifically, it enables: /// * `pushdown_filters = true` /// * `reorder_filters = true` - #[structopt(long = "pushdown")] + #[arg(long = "pushdown")] pushdown: bool, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to hits.parquet (single file) or `hits_partitioned` /// (partitioned, 100 files) - #[structopt( - parse(from_os_str), - short = "p", + #[arg( + short = 'p', long = "path", default_value = "benchmarks/data/hits.parquet" )] path: PathBuf, /// Path to queries directory - #[structopt( - parse(from_os_str), - short = "r", + #[arg( + short = 'r', long = "queries-path", default_value = "benchmarks/queries/clickbench/queries" )] pub queries_path: PathBuf, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Column name that the data is sorted by (e.g., "EventTime") @@ -86,18 +84,18 @@ pub struct RunOpt { /// Recommended to use with: -c datafusion.optimizer.prefer_existing_sort=true /// This allows DataFusion to optimize away redundant sorts while maintaining /// multi-core parallelism for other operations. - #[structopt(long = "sorted-by")] + #[arg(long = "sorted-by")] sorted_by: Option, /// Sort order: ASC or DESC (default: ASC) - #[structopt(long = "sort-order", default_value = "ASC")] + #[arg(long = "sort-order", default_value = "ASC")] sort_order: String, /// Configuration options in the format key=value /// Can be specified multiple times. /// /// Example: -c datafusion.optimizer.prefer_existing_sort=true - #[structopt(short = "c", long = "config")] + #[arg(short = 'c', long = "config")] config_options: Vec, } diff --git a/benchmarks/src/h2o.rs b/benchmarks/src/h2o.rs index 07a40447d414..f55dad27cc63 100644 --- a/benchmarks/src/h2o.rs +++ b/benchmarks/src/h2o.rs @@ -21,30 +21,29 @@ //! - [Extended window function benchmark](https://duckdb.org/2024/06/26/benchmarks-over-time.html#window-functions-benchmark) use crate::util::{BenchmarkRun, CommonOpt, print_memory_stats}; +use clap::Args; use datafusion::logical_expr::{ExplainFormat, ExplainOption}; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::{ DataFusionError, TableReference, exec_datafusion_err, instant::Instant, internal_err, }; use std::path::{Path, PathBuf}; -use structopt::StructOpt; /// Run the H2O benchmark -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to queries.sql (single file) /// default value is the groupby.sql file in the h2o benchmark - #[structopt( - parse(from_os_str), - short = "r", + #[arg( + short = 'r', long = "queries-path", default_value = "benchmarks/queries/h2o/groupby.sql" )] @@ -53,9 +52,8 @@ pub struct RunOpt { /// Path to data file (parquet or csv) /// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark /// This is the small csv file with 10^7 rows - #[structopt( - parse(from_os_str), - short = "p", + #[arg( + short = 'p', long = "path", default_value = "benchmarks/data/h2o/G1_1e7_1e7_100_0.csv" )] @@ -64,15 +62,15 @@ pub struct RunOpt { /// Path to data files (parquet or csv), using , to separate the paths /// Default value is the small files for join x table, small table, medium table, big table files in the h2o benchmark /// This is the small csv file case - #[structopt( - short = "join-paths", + #[arg( + short = 'j', long = "join-paths", default_value = "benchmarks/data/h2o/J1_1e7_NA_0.csv,benchmarks/data/h2o/J1_1e7_1e1_0.csv,benchmarks/data/h2o/J1_1e7_1e4_0.csv,benchmarks/data/h2o/J1_1e7_1e7_NA.csv" )] join_paths: String, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs index 562047f615bc..ddb2d268e601 100644 --- a/benchmarks/src/hj.rs +++ b/benchmarks/src/hj.rs @@ -16,11 +16,11 @@ // under the License. use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; use datafusion::physical_plan::execute_stream; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::instant::Instant; use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; -use structopt::StructOpt; use futures::StreamExt; @@ -32,19 +32,19 @@ use futures::StreamExt; /// It uses simple equality predicates to ensure a hash join is selected. /// Where we vary selectivity, we do so with additional cheap predicates that /// do not change the join key (so the physical operator remains HashJoin). -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 1 and 12). If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options (iterations, batch size, target_partitions, etc.) - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs index 2c4e1270255b..aaed186da490 100644 --- a/benchmarks/src/imdb/convert.rs +++ b/benchmarks/src/imdb/convert.rs @@ -20,31 +20,31 @@ use datafusion::logical_expr::select_expr::SelectExpr; use datafusion_common::instant::Instant; use std::path::PathBuf; +use clap::Args; use datafusion::error::Result; use datafusion::prelude::*; -use structopt::StructOpt; use datafusion::common::not_impl_err; use super::IMDB_TABLES; use super::get_imdb_table_schema; -#[derive(Debug, StructOpt)] +#[derive(Debug, Args)] pub struct ConvertOpt { /// Path to csv files - #[structopt(parse(from_os_str), required = true, short = "i", long = "input")] + #[arg(required = true, short = 'i', long = "input")] input_path: PathBuf, /// Output path - #[structopt(parse(from_os_str), required = true, short = "o", long = "output")] + #[arg(required = true, short = 'o', long = "output")] output_path: PathBuf, /// Output file format: `csv` or `parquet` - #[structopt(short = "f", long = "format")] + #[arg(short = 'f', long = "format")] file_format: String, /// Batch size when reading CSV or Parquet files - #[structopt(short = "s", long = "batch-size", default_value = "8192")] + #[arg(short = 's', long = "batch-size", default_value = "8192")] batch_size: usize, } diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs index 05f1870c5d45..9ddea67148ef 100644 --- a/benchmarks/src/imdb/run.rs +++ b/benchmarks/src/imdb/run.rs @@ -41,8 +41,8 @@ use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; +use clap::Args; use log::info; -use structopt::StructOpt; // hack to avoid `default_value is meaningless for bool` errors type BoolDefaultTrue = bool; @@ -57,40 +57,40 @@ type BoolDefaultTrue = bool; /// [2]: https://event.cwi.nl/da/job/imdb.tgz /// [3]: https://db.in.tum.de/~leis/qo/job.tgz -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// File format: `csv` or `parquet` - #[structopt(short = "f", long = "format", default_value = "csv")] + #[arg(short = 'f', long = "format", default_value = "csv")] file_format: String, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "S", long = "disable-statistics")] + #[arg(short = 'S', long = "disable-statistics")] disable_statistics: bool, /// If true then hash join used, if false then sort merge join /// True by default. - #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, } diff --git a/benchmarks/src/nlj.rs b/benchmarks/src/nlj.rs index cbf5a03fbf93..ade8c0f7789b 100644 --- a/benchmarks/src/nlj.rs +++ b/benchmarks/src/nlj.rs @@ -16,11 +16,11 @@ // under the License. use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; use datafusion::physical_plan::execute_stream; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::instant::Instant; use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; -use structopt::StructOpt; use futures::StreamExt; @@ -40,19 +40,19 @@ use futures::StreamExt; /// - Input size: Different combinations of left (build) side and right (probe) /// side sizes /// - Selectivity of join filters -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 1 and 10). If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } diff --git a/benchmarks/src/smj.rs b/benchmarks/src/smj.rs index 53902e09302c..b420ef1d64c6 100644 --- a/benchmarks/src/smj.rs +++ b/benchmarks/src/smj.rs @@ -16,11 +16,11 @@ // under the License. use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; use datafusion::physical_plan::execute_stream; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::instant::Instant; use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; -use structopt::StructOpt; use futures::StreamExt; @@ -36,19 +36,19 @@ use futures::StreamExt; /// /// All inputs are pre-sorted in CTEs before the join to isolate join /// performance from sort overhead. -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 1 and 20). If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs index 2f3be76f050b..806f1f6c33d0 100644 --- a/benchmarks/src/sort_tpch.rs +++ b/benchmarks/src/sort_tpch.rs @@ -21,10 +21,10 @@ //! Another `Sort` benchmark focus on single core execution. This benchmark //! runs end-to-end sort queries and test the performance on multiple CPU cores. +use clap::Args; use futures::StreamExt; use std::path::PathBuf; use std::sync::Arc; -use structopt::StructOpt; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ @@ -42,35 +42,35 @@ use datafusion_common::utils::get_available_parallelism; use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; -#[derive(Debug, StructOpt)] +#[derive(Debug, Args)] pub struct RunOpt { /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Sort query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Path to data files (lineitem). Only parquet format is supported - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Path to JSON benchmark result to be compare using `compare.py` - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. - #[structopt(short = "t", long = "sorted")] + #[arg(short = 't', long = "sorted")] sorted: bool, /// Append a `LIMIT n` clause to the query - #[structopt(short = "l", long = "limit")] + #[arg(short = 'l', long = "limit")] limit: Option, } diff --git a/benchmarks/src/tpcds/run.rs b/benchmarks/src/tpcds/run.rs index 3f579024ba51..586ee754d211 100644 --- a/benchmarks/src/tpcds/run.rs +++ b/benchmarks/src/tpcds/run.rs @@ -36,8 +36,8 @@ use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_PARQUET_EXTENSION, plan_err}; +use clap::Args; use log::info; -use structopt::StructOpt; // hack to avoid `default_value is meaningless for bool` errors type BoolDefaultTrue = bool; @@ -95,46 +95,46 @@ pub fn get_query_sql(base_query_path: &str, query: usize) -> Result> } /// Run the tpcds benchmark. -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Path to query files - #[structopt(parse(from_os_str), required = true, short = "Q", long = "query_path")] + #[arg(required = true, short = 'Q', long = "query_path")] query_path: PathBuf, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "S", long = "disable-statistics")] + #[arg(short = 'S', long = "disable-statistics")] disable_statistics: bool, /// If true then hash join used, if false then sort merge join /// True by default. - #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join /// False by default. - #[structopt( - short = "w", + #[arg( + short = 'w', long = "enable_piecewise_merge_join", default_value = "false" )] @@ -142,7 +142,7 @@ pub struct RunOpt { /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. - #[structopt(short = "t", long = "sorted")] + #[arg(short = 't', long = "sorted")] sorted: bool, } diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 65bb9594f00a..9706296feae6 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -41,8 +41,8 @@ use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; +use clap::Args; use log::info; -use structopt::StructOpt; // hack to avoid `default_value is meaningless for bool` errors type BoolDefaultTrue = bool; @@ -56,46 +56,46 @@ type BoolDefaultTrue = bool; /// [1]: http://www.tpc.org/tpch/ /// [2]: https://github.com/databricks/tpch-dbgen.git /// [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// File format: `csv` or `parquet` - #[structopt(short = "f", long = "format", default_value = "csv")] + #[arg(short = 'f', long = "format", default_value = "csv")] file_format: String, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "S", long = "disable-statistics")] + #[arg(short = 'S', long = "disable-statistics")] disable_statistics: bool, /// If true then hash join used, if false then sort merge join /// True by default. - #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join /// False by default. - #[structopt( - short = "w", + #[arg( + short = 'w', long = "enable_piecewise_merge_join", default_value = "false" )] @@ -103,7 +103,7 @@ pub struct RunOpt { /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. - #[structopt(short = "t", long = "sorted")] + #[arg(short = 't', long = "sorted")] sorted: bool, } diff --git a/benchmarks/src/util/options.rs b/benchmarks/src/util/options.rs index b1d5bc99fb40..6f7267eabb83 100644 --- a/benchmarks/src/util/options.rs +++ b/benchmarks/src/util/options.rs @@ -17,6 +17,7 @@ use std::{num::NonZeroUsize, sync::Arc}; +use clap::Args; use datafusion::{ execution::{ disk_manager::DiskManagerBuilder, @@ -26,40 +27,39 @@ use datafusion::{ prelude::SessionConfig, }; use datafusion_common::{DataFusionError, Result}; -use structopt::StructOpt; // Common benchmark options (don't use doc comments otherwise this doc // shows up in help files) -#[derive(Debug, StructOpt, Clone)] +#[derive(Debug, Args, Clone)] pub struct CommonOpt { /// Number of iterations of each test run - #[structopt(short = "i", long = "iterations", default_value = "3")] + #[arg(short = 'i', long = "iterations", default_value = "3")] pub iterations: usize, /// Number of partitions to process in parallel. Defaults to number of available cores. - #[structopt(short = "n", long = "partitions")] + #[arg(short = 'n', long = "partitions")] pub partitions: Option, /// Batch size when reading CSV or Parquet files - #[structopt(short = "s", long = "batch-size")] + #[arg(short = 's', long = "batch-size")] pub batch_size: Option, /// The memory pool type to use, should be one of "fair" or "greedy" - #[structopt(long = "mem-pool-type", default_value = "fair")] + #[arg(long = "mem-pool-type", default_value = "fair")] pub mem_pool_type: String, /// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query /// if there's any, otherwise run with no memory limit. - #[structopt(long = "memory-limit", parse(try_from_str = parse_memory_limit))] + #[arg(long = "memory-limit", value_parser = parse_memory_limit)] pub memory_limit: Option, /// The amount of memory to reserve for sort spill operations. DataFusion's default value will be used /// if not specified. - #[structopt(long = "sort-spill-reservation-bytes", parse(try_from_str = parse_memory_limit))] + #[arg(long = "sort-spill-reservation-bytes", value_parser = parse_memory_limit)] pub sort_spill_reservation_bytes: Option, /// Activate debug mode to see more details - #[structopt(short, long)] + #[arg(short, long)] pub debug: bool, }