From 08b338b01643122517f726429981141ec7f0eea2 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 28 Sep 2025 01:14:03 -0700 Subject: [PATCH 1/4] use spark4 --- dev/spark/Dockerfile | 14 +++++++------- dev/spark/spark-defaults.conf | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile index d0fc6a4fdd..754085d687 100644 --- a/dev/spark/Dockerfile +++ b/dev/spark/Dockerfile @@ -13,17 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASE_IMAGE_SPARK_VERSION=3.5.6 +ARG BASE_IMAGE_SPARK_VERSION=4.0.1 FROM apache/spark:${BASE_IMAGE_SPARK_VERSION} # Dependency versions - keep these compatible ARG ICEBERG_VERSION=1.10.0 -ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 -ARG SPARK_VERSION=3.5.6 -ARG SCALA_VERSION=2.12 -ARG HADOOP_VERSION=3.3.4 -ARG AWS_SDK_VERSION=1.12.753 +ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13 +ARG SPARK_VERSION=4.0.1 +ARG HADOOP_VERSION=3.4.1 +ARG SCALA_VERSION=2.13 +ARG AWS_SDK_VERSION=2.34.4 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 USER root @@ -47,7 +47,7 @@ ENV JARS_TO_DOWNLOAD="\ org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \ - com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" + software/amazon/awssdk/s3-transfer-manager/${AWS_SDK_VERSION}/s3-transfer-manager-${AWS_SDK_VERSION}.jar" # Download JARs with retry logic RUN set -e && \ diff --git a/dev/spark/spark-defaults.conf b/dev/spark/spark-defaults.conf index 3a12e25818..4e50f590c7 100644 --- a/dev/spark/spark-defaults.conf +++ b/dev/spark/spark-defaults.conf @@ -48,3 +48,5 @@ spark.sql.defaultCatalog rest spark.ui.enabled true spark.eventLog.enabled true spark.eventLog.dir /home/iceberg/spark-events + +spark.sql.ansi.enabled false From 33779ba2d6d60d6eb0784979bc4e9b71dad38aaf Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 28 Sep 2025 01:27:38 -0700 Subject: [PATCH 2/4] https://hadoop.apache.org/docs/r3.4.1/hadoop-aws/dependency-analysis.html --- dev/spark/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile index 754085d687..cd35671c4b 100644 --- a/dev/spark/Dockerfile +++ b/dev/spark/Dockerfile @@ -23,7 +23,7 @@ ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13 ARG SPARK_VERSION=4.0.1 ARG HADOOP_VERSION=3.4.1 ARG SCALA_VERSION=2.13 -ARG AWS_SDK_VERSION=2.34.4 +ARG AWS_SDK_VERSION=2.24.6 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 USER root @@ -47,7 +47,7 @@ ENV JARS_TO_DOWNLOAD="\ org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \ - software/amazon/awssdk/s3-transfer-manager/${AWS_SDK_VERSION}/s3-transfer-manager-${AWS_SDK_VERSION}.jar" + software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar" # Download JARs with retry logic RUN set -e && \ From 3a0a3665877cbc518ee8863c51b820c5ff7f3c5f Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 Oct 2025 08:55:50 -0700 Subject: [PATCH 3/4] use pyspark 4.0.1 --- poetry.lock | 100 ++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/poetry.lock b/poetry.lock index c49944ab00..a877dfe8a6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "adlfs" @@ -59,7 +59,7 @@ description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -72,7 +72,7 @@ description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiohttp-3.12.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6f25e9d274d6abbb15254f76f100c3984d6b9ad6e66263cc60a465dd5c7e48f5"}, {file = "aiohttp-3.12.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b8ec3c1a1c13d24941b5b913607e57b9364e4c0ea69d5363181467492c4b2ba6"}, @@ -202,7 +202,7 @@ description = "aiosignal: a list of registered asynchronous callbacks" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"ray\" or extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, @@ -254,7 +254,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\") and python_version <= \"3.10\"" +markers = "(extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\") and python_version <= \"3.10\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -271,7 +271,7 @@ files = [ {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, ] -markers = {main = "extra == \"ray\" or extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\""} +markers = {main = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\""} [package.extras] benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] @@ -530,7 +530,7 @@ files = [ {file = "boto3-1.40.18-py3-none-any.whl", hash = "sha256:daa776ba1251a7458c9d6c7627873d0c2460c8e8272d35759065580e9193700a"}, {file = "boto3-1.40.18.tar.gz", hash = "sha256:64301d39adecc154e3e595eaf0d4f28998ef0a5551f1d033aeac51a9e1a688e5"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} [package.dependencies] botocore = ">=1.40.18,<1.41.0" @@ -551,7 +551,7 @@ files = [ {file = "botocore-1.40.18-py3-none-any.whl", hash = "sha256:57025c46ca00cf8cec25de07a759521bfbfb3036a0f69b272654a354615dc45f"}, {file = "botocore-1.40.18.tar.gz", hash = "sha256:afd69bdadd8c55cc89d69de0799829e555193a352d87867f746e19020271cc0f"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} [package.dependencies] jmespath = ">=0.7.1,<2.0.0" @@ -1617,7 +1617,7 @@ description = "A list-like structure which implements collections.abc.MutableSeq optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"ray\" or extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"ray\" or extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cc4df77d638aa2ed703b878dd093725b72a824c3c546c076e8fdf276f78ee84a"}, {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:716a9973a2cc963160394f701964fe25012600f3d311f60c790400b00e568b61"}, @@ -1816,7 +1816,7 @@ description = "Google API client core library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_api_core-2.25.0-py3-none-any.whl", hash = "sha256:1db79d1281dcf9f3d10023283299ba38f3dc9f639ec41085968fd23e5bcf512e"}, {file = "google_api_core-2.25.0.tar.gz", hash = "sha256:9b548e688702f82a34ed8409fb8a6961166f0b7795032f0be8f48308dff4333a"}, @@ -1853,7 +1853,7 @@ description = "Google Authentication Library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_auth-2.41.0-py2.py3-none-any.whl", hash = "sha256:d8bed9b53ab63b7b0374656b8e1bef051f95bb14ecc0cf21ba49de7911d62e09"}, {file = "google_auth-2.41.0.tar.gz", hash = "sha256:c9d7b534ea4a5d9813c552846797fafb080312263cd4994d6622dd50992ae101"}, @@ -1935,7 +1935,7 @@ description = "Google Cloud API client core library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e"}, {file = "google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53"}, @@ -1980,7 +1980,7 @@ description = "A python wrapper of the C library 'Google CRC32C'" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76"}, {file = "google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d"}, @@ -2028,7 +2028,7 @@ description = "Utilities for Google Media Downloads and Resumable Uploads" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, @@ -2052,7 +2052,7 @@ files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, ] -markers = {main = "python_version >= \"3.11\" and (extra == \"bigquery\" or extra == \"gcsfs\") or extra == \"gcsfs\" or extra == \"bigquery\""} +markers = {main = "extra == \"bigquery\" or extra == \"gcsfs\""} [package.dependencies] protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" @@ -2491,7 +2491,7 @@ files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} [[package]] name = "joserfc" @@ -3391,7 +3391,7 @@ description = "multidict implementation" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8adee3ac041145ffe4488ea73fa0a622b464cc25340d98be76924d0cda8545ff"}, {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b61e98c3e2a861035aaccd207da585bdcacef65fe01d7a0d07478efac005e028"}, @@ -3740,7 +3740,7 @@ files = [ {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] -markers = {main = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\""} +markers = {main = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\""} [[package]] name = "oauthlib" @@ -3875,7 +3875,7 @@ files = [ {file = "pandas-2.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:a9d7ec92d71a420185dec44909c32e9a362248c4ae2238234b76d5be37f208cc"}, {file = "pandas-2.3.2.tar.gz", hash = "sha256:ab7b58f8f82706890924ccdfb5f48002b83d2b5a3845976a9fb705d36c34dcdb"}, ] -markers = {main = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\""} +markers = {main = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\""} [package.dependencies] numpy = [ @@ -4052,7 +4052,7 @@ description = "Accelerated property cache" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:22d9962a358aedbb7a2e36187ff273adeaab9743373a272976d2e348d08c7770"}, {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d0fda578d1dc3f77b6b5a5dce3b9ad69a8250a891760a548df850a5e8da87f3"}, @@ -4161,7 +4161,7 @@ description = "Beautiful, Pythonic protocol buffers" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "python_version >= \"3.13\" and (extra == \"gcsfs\" or extra == \"bigquery\") or extra == \"bigquery\" or extra == \"gcsfs\"" +markers = "extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66"}, {file = "proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012"}, @@ -4191,7 +4191,7 @@ files = [ {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, ] -markers = {main = "python_version >= \"3.11\" and (extra == \"ray\" or extra == \"bigquery\" or extra == \"gcsfs\") or extra == \"ray\" or extra == \"gcsfs\" or extra == \"bigquery\""} +markers = {main = "extra == \"ray\" or extra == \"bigquery\" or extra == \"gcsfs\""} [[package]] name = "psutil" @@ -4330,14 +4330,14 @@ dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] [[package]] name = "py4j" -version = "0.10.9.7" +version = "0.10.9.9" description = "Enables Python programs to dynamically access arbitrary Java objects" optional = false python-versions = "*" groups = ["dev"] files = [ - {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, - {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, + {file = "py4j-0.10.9.9-py2.py3-none-any.whl", hash = "sha256:c7c26e4158defb37b0bb124933163641a2ff6e3a3913f7811b0ddbe07ed61533"}, + {file = "py4j-0.10.9.9.tar.gz", hash = "sha256:f694cad19efa5bd1dee4f3e5270eb406613c974394035e5bfc4ec1aba870b879"}, ] [[package]] @@ -4391,7 +4391,7 @@ files = [ {file = "pyarrow-19.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8464c9fbe6d94a7fe1599e7e8965f350fd233532868232ab2596a71586c5a429"}, {file = "pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e"}, ] -markers = {main = "extra == \"bodo\" or extra == \"daft\" or extra == \"datafusion\" or extra == \"duckdb\" or extra == \"pandas\" or extra == \"pyarrow\" or extra == \"ray\""} +markers = {main = "extra == \"pyarrow\" or extra == \"pandas\" or extra == \"duckdb\" or extra == \"ray\" or extra == \"bodo\" or extra == \"daft\" or extra == \"datafusion\""} [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] @@ -4403,7 +4403,7 @@ description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, @@ -4416,7 +4416,7 @@ description = "A collection of ASN.1-based protocols modules" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, @@ -4768,30 +4768,30 @@ files = [ [[package]] name = "pyspark" -version = "3.5.6" +version = "4.0.1" description = "Apache Spark Python API" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pyspark-3.5.6.tar.gz", hash = "sha256:f8b1c4360e41ab398c64904fae08740503bcb6bd389457d659fa6d9f2952cc48"}, + {file = "pyspark-4.0.1.tar.gz", hash = "sha256:9d1f22d994f60369228397e3479003ffe2dd736ba79165003246ff7bd48e2c73"}, ] [package.dependencies] -googleapis-common-protos = {version = ">=1.56.4", optional = true, markers = "extra == \"connect\""} -grpcio = {version = ">=1.56.0", optional = true, markers = "extra == \"connect\""} -grpcio-status = {version = ">=1.56.0", optional = true, markers = "extra == \"connect\""} -numpy = {version = ">=1.15,<2", optional = true, markers = "extra == \"connect\""} -pandas = {version = ">=1.0.5", optional = true, markers = "extra == \"connect\""} -py4j = "0.10.9.7" -pyarrow = {version = ">=4.0.0", optional = true, markers = "extra == \"connect\""} +googleapis-common-protos = {version = ">=1.65.0", optional = true, markers = "extra == \"connect\""} +grpcio = {version = ">=1.67.0", optional = true, markers = "extra == \"connect\""} +grpcio-status = {version = ">=1.67.0", optional = true, markers = "extra == \"connect\""} +numpy = {version = ">=1.21", optional = true, markers = "extra == \"connect\""} +pandas = {version = ">=2.0.0", optional = true, markers = "extra == \"connect\""} +py4j = "0.10.9.9" +pyarrow = {version = ">=11.0.0", optional = true, markers = "extra == \"connect\""} [package.extras] -connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] -ml = ["numpy (>=1.15,<2)"] -mllib = ["numpy (>=1.15,<2)"] -pandas-on-spark = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] -sql = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +connect = ["googleapis-common-protos (>=1.65.0)", "grpcio (>=1.67.0)", "grpcio-status (>=1.67.0)", "numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] +ml = ["numpy (>=1.21)"] +mllib = ["numpy (>=1.21)"] +pandas-on-spark = ["numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] +sql = ["numpy (>=1.21)", "pandas (>=2.0.0)", "pyarrow (>=11.0.0)"] [[package]] name = "pytest" @@ -4911,7 +4911,7 @@ files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, ] -markers = {main = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\""} +markers = {main = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\""} [[package]] name = "pywin32" @@ -5466,7 +5466,7 @@ description = "Pure-Python RSA implementation" optional = true python-versions = "<4,>=3.6" groups = ["main"] -markers = "extra == \"gcsfs\" or extra == \"bigquery\" or extra == \"gcp-auth\"" +markers = "extra == \"gcp-auth\" or extra == \"bigquery\" or extra == \"gcsfs\"" files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, @@ -5508,7 +5508,7 @@ files = [ {file = "s3transfer-0.13.1-py3-none-any.whl", hash = "sha256:a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724"}, {file = "s3transfer-0.13.1.tar.gz", hash = "sha256:c3fdba22ba1bd367922f27ec8032d6a1cf5f10c934fb5d68cf60fd5a23d936cf"}, ] -markers = {main = "extra == \"dynamodb\" or extra == \"glue\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} [package.dependencies] botocore = ">=1.37.4,<2.0a.0" @@ -5940,7 +5940,7 @@ description = "Fast, Extensible Progress Meter" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"daft\" or extra == \"hf\"" +markers = "extra == \"hf\" or extra == \"daft\"" files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -5995,7 +5995,7 @@ files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, ] -markers = {main = "extra == \"bodo\" or extra == \"pandas\" or extra == \"ray\""} +markers = {main = "extra == \"pandas\" or extra == \"ray\" or extra == \"bodo\""} [[package]] name = "urllib3" @@ -6225,7 +6225,7 @@ description = "Yet another URL library" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"adlfs\" or extra == \"gcsfs\" or extra == \"s3fs\"" +markers = "extra == \"s3fs\" or extra == \"adlfs\" or extra == \"gcsfs\"" files = [ {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6032e6da6abd41e4acda34d75a816012717000fa6839f37124a47fcefc49bec4"}, {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2c7b34d804b8cf9b214f05015c4fee2ebe7ed05cf581e7192c06555c71f4446a"}, @@ -6501,4 +6501,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.1" python-versions = "^3.9.2, !=3.9.7" -content-hash = "aff36ee1d48cca375367caaac63badd2691cb72abd7b23ae458eb6048e2b89ea" +content-hash = "98d52283213deb95a118948bdc0e1fc0262571193a2114cb1364b74dd2108977" diff --git a/pyproject.toml b/pyproject.toml index 1d94838081..bc6354e847 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ requests-mock = "1.12.1" moto = { version = "^5.0.2", extras = ["server"] } typing-extensions = "4.15.0" pytest-mock = "3.15.1" -pyspark = { version = "3.5.6", extras = ["connect"] } +pyspark = { version = "4.0.1", extras = ["connect"] } cython = "3.1.4" deptry = ">=0.14,<0.24" docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520 From e3a67ca70ae73267170cd9b363467bbc1222a2bc Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 Oct 2025 09:07:58 -0700 Subject: [PATCH 4/4] pin protobuf version in dev dep to match spark connect https://github.com/apache/spark/blob/v4.0.1/dev/requirements.txt#L64 --- poetry.lock | 37 +++++++++++++++++++------------------ pyproject.toml | 1 + 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index a877dfe8a6..343cc4c722 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2226,21 +2226,21 @@ protobuf = ["grpcio-tools (>=1.73.0)"] [[package]] name = "grpcio-status" -version = "1.73.0" +version = "1.71.2" description = "Status proto mapping for gRPC" optional = false python-versions = ">=3.9" groups = ["main", "dev"] files = [ - {file = "grpcio_status-1.73.0-py3-none-any.whl", hash = "sha256:a3f3a9994b44c364f014e806114ba44cc52e50c426779f958c8b22f14ff0d892"}, - {file = "grpcio_status-1.73.0.tar.gz", hash = "sha256:a2b7f430568217f884fe52a5a0133b6f4c9338beae33fb5370134a8eaf58f974"}, + {file = "grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3"}, + {file = "grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50"}, ] markers = {main = "extra == \"bigquery\""} [package.dependencies] googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.73.0" -protobuf = ">=6.30.0,<7.0.0" +grpcio = ">=1.71.2" +protobuf = ">=5.26.1,<6.0dev" [[package]] name = "hf-xet" @@ -4175,23 +4175,24 @@ testing = ["google-api-core (>=1.31.5)"] [[package]] name = "protobuf" -version = "6.31.1" +version = "5.29.1" description = "" optional = false -python-versions = ">=3.9" +python-versions = ">=3.8" groups = ["main", "dev"] files = [ - {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, - {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, - {file = "protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402"}, - {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39"}, - {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6"}, - {file = "protobuf-6.31.1-cp39-cp39-win32.whl", hash = "sha256:0414e3aa5a5f3ff423828e1e6a6e907d6c65c1d5b7e6e975793d5590bdeecc16"}, - {file = "protobuf-6.31.1-cp39-cp39-win_amd64.whl", hash = "sha256:8764cf4587791e7564051b35524b72844f845ad0bb011704c3736cce762d8fe9"}, - {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, - {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, + {file = "protobuf-5.29.1-cp310-abi3-win32.whl", hash = "sha256:22c1f539024241ee545cbcb00ee160ad1877975690b16656ff87dde107b5f110"}, + {file = "protobuf-5.29.1-cp310-abi3-win_amd64.whl", hash = "sha256:1fc55267f086dd4050d18ef839d7bd69300d0d08c2a53ca7df3920cc271a3c34"}, + {file = "protobuf-5.29.1-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:d473655e29c0c4bbf8b69e9a8fb54645bc289dead6d753b952e7aa660254ae18"}, + {file = "protobuf-5.29.1-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:b5ba1d0e4c8a40ae0496d0e2ecfdbb82e1776928a205106d14ad6985a09ec155"}, + {file = "protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:8ee1461b3af56145aca2800e6a3e2f928108c749ba8feccc6f5dd0062c410c0d"}, + {file = "protobuf-5.29.1-cp38-cp38-win32.whl", hash = "sha256:50879eb0eb1246e3a5eabbbe566b44b10348939b7cc1b267567e8c3d07213853"}, + {file = "protobuf-5.29.1-cp38-cp38-win_amd64.whl", hash = "sha256:027fbcc48cea65a6b17028510fdd054147057fa78f4772eb547b9274e5219331"}, + {file = "protobuf-5.29.1-cp39-cp39-win32.whl", hash = "sha256:5a41deccfa5e745cef5c65a560c76ec0ed8e70908a67cc8f4da5fce588b50d57"}, + {file = "protobuf-5.29.1-cp39-cp39-win_amd64.whl", hash = "sha256:012ce28d862ff417fd629285aca5d9772807f15ceb1a0dbd15b88f58c776c98c"}, + {file = "protobuf-5.29.1-py3-none-any.whl", hash = "sha256:32600ddb9c2a53dedc25b8581ea0f1fd8ea04956373c0c07577ce58d312522e0"}, + {file = "protobuf-5.29.1.tar.gz", hash = "sha256:683be02ca21a6ffe80db6dd02c0b5b2892322c59ca57fd6c872d652cb80549cb"}, ] -markers = {main = "extra == \"ray\" or extra == \"bigquery\" or extra == \"gcsfs\""} [[package]] name = "psutil" @@ -6501,4 +6502,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.1" python-versions = "^3.9.2, !=3.9.7" -content-hash = "98d52283213deb95a118948bdc0e1fc0262571193a2114cb1364b74dd2108977" +content-hash = "21456aae4eb5ae5bf02826b4513e03a74d3c95c293bfd14ea19cb17c15c3c9f5" diff --git a/pyproject.toml b/pyproject.toml index bc6354e847..d468b038fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,7 @@ moto = { version = "^5.0.2", extras = ["server"] } typing-extensions = "4.15.0" pytest-mock = "3.15.1" pyspark = { version = "4.0.1", extras = ["connect"] } +protobuf = "5.29.1" # match Spark Connect's gencode cython = "3.1.4" deptry = ">=0.14,<0.24" docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520