From df8084312bad64c5a7e3cef2a2f1b0a1b66acf0f Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 23 Dec 2025 13:04:16 -0700
Subject: [PATCH 1/3] Add microbenchmark for casting string to temporal types

---
 .../CometCastStringToTemporalBenchmark.scala  | 112 ++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
new file mode 100644
index 0000000000..595f661a21
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+import org.apache.spark.sql.internal.SQLConf
+
+case class CastStringToTemporalConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast from String to temporal types. To run this
+ * benchmark: `SPARK_GENERATE_BENCHMARK_FILES=1 make
+ * benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark` Results will be
+ * written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
+ */
+object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
+
+  /**
+   * Generic method to run a cast benchmark with the given configuration.
+   */
+  def runCastBenchmark(config: CastStringToTemporalConfig, values: Int): Unit = {
+    withTempPath { dir =>
+      withTempTable("parquetV1Table") {
+        // Generate date strings like "2020-01-01", "2020-01-02", etc.
+        // This covers the full range for date parsing
+        prepareTable(
+          dir,
+          spark.sql(
+            s"SELECT CAST(DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) AS STRING) AS c1 FROM $tbl"))
+
+        runExpressionBenchmark(config.name, values, config.query, config.extraCometConfigs)
+      }
+    }
+  }
+
+  /**
+   * Benchmark for String to Timestamp with timestamp-formatted strings.
+   */
+  def runTimestampCastBenchmark(config: CastStringToTemporalConfig, values: Int): Unit = {
+    withTempPath { dir =>
+      withTempTable("parquetV1Table") {
+        // Generate timestamp strings like "2020-01-01 12:34:56", etc.
+        prepareTable(
+          dir,
+          spark.sql(
+            s"SELECT CAST(TIMESTAMP_MICROS(value % 9999999999) AS STRING) AS c1 FROM $tbl"))
+
+        runExpressionBenchmark(config.name, values, config.query, config.extraCometConfigs)
+      }
+    }
+  }
+
+  // Configuration for String to temporal cast benchmarks
+  private val castConfigs = List(
+    // Date
+    CastStringToTemporalConfig(
+      "Cast String to Date (LEGACY)",
+      "SELECT CAST(c1 AS DATE) FROM parquetV1Table",
+      Map(SQLConf.ANSI_ENABLED.key -> "false")),
+    CastStringToTemporalConfig(
+      "Cast String to Date (ANSI)",
+      "SELECT CAST(c1 AS DATE) FROM parquetV1Table",
+      Map(SQLConf.ANSI_ENABLED.key -> "true")))
+
+  private val timestampCastConfigs = List(
+    // Timestamp (only UTC timezone is compatible)
+    CastStringToTemporalConfig(
+      "Cast String to Timestamp (LEGACY, UTC)",
+      "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table",
+      Map(SQLConf.ANSI_ENABLED.key -> "false")),
+    CastStringToTemporalConfig(
+      "Cast String to Timestamp (ANSI, UTC)",
+      "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table",
+      Map(SQLConf.ANSI_ENABLED.key -> "true")))
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 * 10 // 10M rows
+
+    // Run date casts
+    castConfigs.foreach { config =>
+      runBenchmarkWithTable(config.name, values) { v =>
+        runCastBenchmark(config, v)
+      }
+    }
+
+    // Run timestamp casts
+    timestampCastConfigs.foreach { config =>
+      runBenchmarkWithTable(config.name, values) { v =>
+        runTimestampCastBenchmark(config, v)
+      }
+    }
+  }
+}

From 360edb8e36a400b61ee84a1a081a6d92be12fff5 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 23 Dec 2025 14:38:44 -0700
Subject: [PATCH 2/3] fix

---
 .../CometCastStringToTemporalBenchmark.scala  | 113 ++++++++----------
 1 file changed, 52 insertions(+), 61 deletions(-)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 595f661a21..3924805884 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -26,86 +26,77 @@ case class CastStringToTemporalConfig(
     query: String,
     extraCometConfigs: Map[String, String] = Map.empty)
 
+// spotless:off
 /**
  * Benchmark to measure performance of Comet cast from String to temporal types. To run this
- * benchmark: `SPARK_GENERATE_BENCHMARK_FILES=1 make
- * benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark` Results will be
- * written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
+ * benchmark:
+ * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark`
+ * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
  */
+// spotless:on
 object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
 
-  /**
-   * Generic method to run a cast benchmark with the given configuration.
-   */
-  def runCastBenchmark(config: CastStringToTemporalConfig, values: Int): Unit = {
-    withTempPath { dir =>
-      withTempTable("parquetV1Table") {
-        // Generate date strings like "2020-01-01", "2020-01-02", etc.
-        // This covers the full range for date parsing
-        prepareTable(
-          dir,
-          spark.sql(
-            s"SELECT CAST(DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) AS STRING) AS c1 FROM $tbl"))
-
-        runExpressionBenchmark(config.name, values, config.query, config.extraCometConfigs)
-      }
-    }
-  }
-
-  /**
-   * Benchmark for String to Timestamp with timestamp-formatted strings.
-   */
-  def runTimestampCastBenchmark(config: CastStringToTemporalConfig, values: Int): Unit = {
-    withTempPath { dir =>
-      withTempTable("parquetV1Table") {
-        // Generate timestamp strings like "2020-01-01 12:34:56", etc.
-        prepareTable(
-          dir,
-          spark.sql(
-            s"SELECT CAST(TIMESTAMP_MICROS(value % 9999999999) AS STRING) AS c1 FROM $tbl"))
-
-        runExpressionBenchmark(config.name, values, config.query, config.extraCometConfigs)
-      }
-    }
-  }
-
   // Configuration for String to temporal cast benchmarks
-  private val castConfigs = List(
-    // Date
+  private val dateCastConfigs = List(
     CastStringToTemporalConfig(
-      "Cast String to Date (LEGACY)",
-      "SELECT CAST(c1 AS DATE) FROM parquetV1Table",
-      Map(SQLConf.ANSI_ENABLED.key -> "false")),
+      "Cast String to Date",
+      "SELECT CAST(c1 AS DATE) FROM parquetV1Table"),
     CastStringToTemporalConfig(
-      "Cast String to Date (ANSI)",
-      "SELECT CAST(c1 AS DATE) FROM parquetV1Table",
-      Map(SQLConf.ANSI_ENABLED.key -> "true")))
+      "Try_Cast String to Date",
+      "SELECT TRY_CAST(c1 AS DATE) FROM parquetV1Table"))
 
   private val timestampCastConfigs = List(
-    // Timestamp (only UTC timezone is compatible)
     CastStringToTemporalConfig(
-      "Cast String to Timestamp (LEGACY, UTC)",
-      "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table",
-      Map(SQLConf.ANSI_ENABLED.key -> "false")),
+      "Cast String to Timestamp",
+      "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table"),
     CastStringToTemporalConfig(
-      "Cast String to Timestamp (ANSI, UTC)",
-      "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table",
-      Map(SQLConf.ANSI_ENABLED.key -> "true")))
+      "Try_Cast String to Timestamp",
+      "SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table"))
 
   override def runCometBenchmark(mainArgs: Array[String]): Unit = {
     val values = 1024 * 1024 * 10 // 10M rows
 
-    // Run date casts
-    castConfigs.foreach { config =>
-      runBenchmarkWithTable(config.name, values) { v =>
-        runCastBenchmark(config, v)
+    // Generate date data once with ~10% invalid values
+    runBenchmarkWithTable("date data generation", values) { v =>
+      withTempPath { dateDir =>
+        withTempTable("parquetV1Table") {
+          prepareTable(
+            dateDir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 10 = 0 THEN 'invalid-date'
+                ELSE CAST(DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) AS STRING)
+              END AS c1
+              FROM $tbl
+              """))
+
+          // Run date cast benchmarks with the same data
+          dateCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
       }
     }
 
-    // Run timestamp casts
-    timestampCastConfigs.foreach { config =>
-      runBenchmarkWithTable(config.name, values) { v =>
-        runTimestampCastBenchmark(config, v)
+    // Generate timestamp data once with ~10% invalid values
+    runBenchmarkWithTable("timestamp data generation", values) { v =>
+      withTempPath { timestampDir =>
+        withTempTable("parquetV1Table") {
+          prepareTable(
+            timestampDir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 10 = 0 THEN 'not-a-timestamp'
+                ELSE CAST(TIMESTAMP_MICROS(value % 9999999999) AS STRING)
+              END AS c1
+              FROM $tbl
+              """))
+
+          // Run timestamp cast benchmarks with the same data
+          timestampCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
       }
     }
   }
 }

From 6bc01dabdcd58be717f3977f6033f10d34d2bef6 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 23 Dec 2025 15:27:06 -0700
Subject: [PATCH 3/3] remove unused import

---
 .../sql/benchmark/CometCastStringToTemporalBenchmark.scala | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 3924805884..39337be5c8 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -19,8 +19,6 @@
 
 package org.apache.spark.sql.benchmark
 
-import org.apache.spark.sql.internal.SQLConf
-
 case class CastStringToTemporalConfig(
     name: String,
     query: String,