From fa9dcc5d6413185b13fe50b02e1ef169f40bd07c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 21:47:58 -0700 Subject: [PATCH 01/12] use spark connect for provision.py --- Makefile | 3 +-- dev/Dockerfile | 1 - dev/provision.py | 22 +++++++--------------- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index cbcc26dd21..756ecc0305 100644 --- a/Makefile +++ b/Makefile @@ -102,8 +102,7 @@ test-integration-setup: ## Start Docker services for integration tests docker compose -f dev/docker-compose-integration.yml rm -f docker compose -f dev/docker-compose-integration.yml up -d sleep 10 - docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py - docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py + ${TEST_RUNNER} python dev/provision.py test-integration-exec: ## Run integration tests (excluding provision) $(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS) diff --git a/dev/Dockerfile b/dev/Dockerfile index 77ba154851..7194378963 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -92,7 +92,6 @@ RUN pip3 install -q ipython RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}" COPY entrypoint.sh . -COPY provision.py . ENTRYPOINT ["./entrypoint.sh"] CMD ["notebook"] diff --git a/dev/provision.py b/dev/provision.py index 71bbbd73c3..cdc9ead10c 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -23,25 +23,17 @@ from pyiceberg.schema import Schema from pyiceberg.types import FixedType, NestedField, UUIDType -# The configuration is important, otherwise we get many small -# parquet files with a single row. When a positional delete -# hits the Parquet file with one row, the parquet file gets -# dropped instead of having a merge-on-read delete file. -spark = ( - SparkSession - .builder - .config("spark.sql.shuffle.partitions", "1") - .config("spark.default.parallelism", "1") - .getOrCreate() -) +# Create SparkSession against the remote Spark Connect server +spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + catalogs = { 'rest': load_catalog( "rest", **{ "type": "rest", - "uri": "http://rest:8181", - "s3.endpoint": "http://minio:9000", + "uri": "http://localhost:8181", + "s3.endpoint": "http://localhost:9000", "s3.access-key-id": "admin", "s3.secret-access-key": "password", }, @@ -50,8 +42,8 @@ "hive", **{ "type": "hive", - "uri": "thrift://hive:9083", - "s3.endpoint": "http://minio:9000", + "uri": "thrift://localhost:9083", + "s3.endpoint": "http://localhost:9000", "s3.access-key-id": "admin", "s3.secret-access-key": "password", }, From 03c1da9d0e89b5b45663d59d08c9003a2fa7a690 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 21:58:03 -0700 Subject: [PATCH 02/12] remove python packages from docker container --- dev/Dockerfile | 4 ---- dev/docker-compose-integration.yml | 1 - 2 files changed, 5 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index 7194378963..faa271ead8 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -87,10 +87,6 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" RUN chmod u+x /opt/spark/sbin/* && \ chmod u+x /opt/spark/bin/* -RUN pip3 install -q ipython - -RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}" - COPY entrypoint.sh . ENTRYPOINT ["./entrypoint.sh"] diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml index ec4245bcbf..77360ed469 100644 --- a/dev/docker-compose-integration.yml +++ b/dev/docker-compose-integration.yml @@ -17,7 +17,6 @@ services: spark-iceberg: - image: python-integration container_name: pyiceberg-spark build: . networks: From 5e5d348b3a976117121d56e227effe307059f7c5 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:02:20 -0700 Subject: [PATCH 03/12] use spark base image --- dev/Dockerfile | 123 +++++++++++++---------------- dev/docker-compose-integration.yml | 2 +- dev/entrypoint.sh | 23 ------ 3 files changed, 55 insertions(+), 93 deletions(-) delete mode 100755 dev/entrypoint.sh diff --git a/dev/Dockerfile b/dev/Dockerfile index faa271ead8..99979cf91a 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -13,81 +13,66 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM python:3.12-bullseye +ARG IMAGE_SPARK_VERSION=3.5.6 -RUN apt-get -qq update && \ - apt-get -qq install -y --no-install-recommends \ - sudo \ - curl \ - vim \ - unzip \ - openjdk-11-jdk \ - build-essential \ - software-properties-common \ - ssh && \ - apt-get -qq clean && \ - rm -rf /var/lib/apt/lists/* +FROM apache/spark:${IMAGE_SPARK_VERSION} -# Optional env variables -ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} -ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} -ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH +ARG SPARK_VERSION=3.5.6 +ARG SCALA_VERSION=2.12 +ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 +ARG ICEBERG_VERSION=1.10.0 +ARG HADOOP_VERSION=3.3.4 +ARG AWS_SDK_VERSION=1.12.753 +ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 -RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events -WORKDIR ${SPARK_HOME} +# Install dependencies and download JARs in single layer +USER root +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + curl && \ + # Create temporary directory for downloads + mkdir -p /tmp/jars && \ + cd /tmp/jars && \ + # Download JARs with error handling + for url in \ + "${MAVEN_MIRROR}/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar" \ + "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \ + "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \ + "${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \ + "${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \ + ; do \ + echo "Downloading: ${url}" && \ + wget --progress=dot:giga --retry-connrefused --waitretry=1 --timeout=60 "${url}" || exit 1; \ + done && \ + # Move JARs to Spark directory + mv *.jar "${SPARK_HOME}/jars/" && \ + chown spark:spark "${SPARK_HOME}/jars"/*.jar && \ + # Create Spark events directory + mkdir -p "/home/iceberg/spark-events" && \ + chown spark:spark "/home/iceberg/spark-events" && \ + # Cleanup + cd / && \ + rm -rf /tmp/jars && \ + apt-get remove -y wget && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -ENV SPARK_VERSION=3.5.6 -ENV SCALA_VERSION=2.12 -ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION} -ENV ICEBERG_VERSION=1.10.0 -ENV PYICEBERG_VERSION=0.10.0 -ENV HADOOP_VERSION=3.3.4 -ENV AWS_SDK_VERSION=1.12.753 +# Switch back to spark user +USER spark -# Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive -RUN set -eux; \ - FILE=spark-${SPARK_VERSION}-bin-hadoop3.tgz; \ - URLS="https://downloads.apache.org/spark/spark-${SPARK_VERSION}/${FILE} https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${FILE}"; \ - for url in $URLS; do \ - echo "Attempting download: $url"; \ - if curl --retry 3 --retry-delay 5 -f -s -C - "$url" -o "$FILE"; then \ - echo "Downloaded from: $url"; \ - break; \ - else \ - echo "Failed to download from: $url"; \ - fi; \ - done; \ - if [ ! -f "$FILE" ]; then echo "Failed to download Spark from all mirrors" >&2; exit 1; fi; \ - tar xzf "$FILE" --directory /opt/spark --strip-components 1; \ - rm -rf "$FILE" +# Working directory +WORKDIR "${SPARK_HOME}" -# Download Spark Connect server JAR -RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \ - -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar +# Copy Spark configuration +COPY spark-defaults.conf "${SPARK_HOME}/conf/" -# Download iceberg spark runtime -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ - -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar +# Create healthcheck +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:15002/ || exit 1 -# Download AWS bundle -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ - -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar +# Expose Spark Connect port (default is 15002) +EXPOSE 15002 -# Download hadoop-aws (required for S3 support) -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \ - -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar - -# Download AWS SDK bundle -RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \ - -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar - -COPY spark-defaults.conf /opt/spark/conf -ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" - -RUN chmod u+x /opt/spark/sbin/* && \ - chmod u+x /opt/spark/bin/* - -COPY entrypoint.sh . - -ENTRYPOINT ["./entrypoint.sh"] -CMD ["notebook"] +CMD ["/bin/bash", "-c", "${SPARK_HOME}/sbin/start-connect-server.sh && tail -f /dev/null"] diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml index 77360ed469..f547d15884 100644 --- a/dev/docker-compose-integration.yml +++ b/dev/docker-compose-integration.yml @@ -86,7 +86,7 @@ services: " hive: build: hive/ - container_name: hive + container_name: pyiceberg-hive hostname: hive networks: iceberg_net: diff --git a/dev/entrypoint.sh b/dev/entrypoint.sh deleted file mode 100755 index 3912eb4b15..0000000000 --- a/dev/entrypoint.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -start-connect-server.sh - -tail -f /dev/null From 4cc22b5dced8a5ea695f68ed65c9bf10b4ab99de Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:32:22 -0700 Subject: [PATCH 04/12] refactor --- Makefile | 1 - dev/Dockerfile | 85 ++++++++++++++++++++++++-------------------------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index 756ecc0305..d1458649f7 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,6 @@ test-integration-setup: ## Start Docker services for integration tests docker compose -f dev/docker-compose-integration.yml kill docker compose -f dev/docker-compose-integration.yml rm -f docker compose -f dev/docker-compose-integration.yml up -d - sleep 10 ${TEST_RUNNER} python dev/provision.py test-integration-exec: ## Run integration tests (excluding provision) diff --git a/dev/Dockerfile b/dev/Dockerfile index 99979cf91a..2237cddb4b 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG IMAGE_SPARK_VERSION=3.5.6 +ARG BASE_IMAGE_SPARK_VERSION=3.5.6 -FROM apache/spark:${IMAGE_SPARK_VERSION} +FROM apache/spark:${BASE_IMAGE_SPARK_VERSION} +# Dependency versions - keep these compatible ARG SPARK_VERSION=3.5.6 ARG SCALA_VERSION=2.12 ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 @@ -25,54 +26,48 @@ ARG HADOOP_VERSION=3.3.4 ARG AWS_SDK_VERSION=1.12.753 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 -# Install dependencies and download JARs in single layer USER root +WORKDIR ${SPARK_HOME} + +# Install curl for JAR downloads RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - wget \ - curl && \ - # Create temporary directory for downloads - mkdir -p /tmp/jars && \ - cd /tmp/jars && \ - # Download JARs with error handling - for url in \ - "${MAVEN_MIRROR}/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar" \ - "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \ - "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \ - "${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \ - "${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \ - ; do \ - echo "Downloading: ${url}" && \ - wget --progress=dot:giga --retry-connrefused --waitretry=1 --timeout=60 "${url}" || exit 1; \ - done && \ - # Move JARs to Spark directory - mv *.jar "${SPARK_HOME}/jars/" && \ - chown spark:spark "${SPARK_HOME}/jars"/*.jar && \ - # Create Spark events directory - mkdir -p "/home/iceberg/spark-events" && \ - chown spark:spark "/home/iceberg/spark-events" && \ - # Cleanup - cd / && \ - rm -rf /tmp/jars && \ - apt-get remove -y wget && \ - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + apt-get install -y --no-install-recommends curl && \ + rm -rf /var/lib/apt/lists/* -# Switch back to spark user -USER spark +# Copy configuration (early for better caching) +COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/ -# Working directory -WORKDIR "${SPARK_HOME}" +# Create event log directory +RUN mkdir -p /home/iceberg/spark-events && \ + chown -R spark:spark /home/iceberg -# Copy Spark configuration -COPY spark-defaults.conf "${SPARK_HOME}/conf/" +# Required JAR dependencies +ENV JARS_TO_DOWNLOAD="\ + org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \ + org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ + org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ + org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \ + com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" -# Create healthcheck -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD curl -f http://localhost:15002/ || exit 1 +# Download JARs with retry logic +RUN set -e && \ + cd "${SPARK_HOME}/jars" && \ + for jar_path in ${JARS_TO_DOWNLOAD}; do \ + jar_name=$(basename "${jar_path}") && \ + echo "Downloading ${jar_name}..." && \ + curl -fsSL --retry 3 --retry-delay 5 \ + -o "${jar_name}" \ + "${MAVEN_MIRROR}/${jar_path}" && \ + echo "✓ Downloaded ${jar_name}"; \ + done && \ + chown -R spark:spark "${SPARK_HOME}/jars" + +USER spark +WORKDIR ${SPARK_HOME} -# Expose Spark Connect port (default is 15002) -EXPOSE 15002 +# Health check for Spark Connect server +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -sf http://localhost:15002/ || exit 1 -CMD ["/bin/bash", "-c", "${SPARK_HOME}/sbin/start-connect-server.sh && tail -f /dev/null"] +# Start Spark Connect server +CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"] From 13074c90468ed107c8699aaa31604af84577fd41 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:43:56 -0700 Subject: [PATCH 05/12] add healthcheck --- Makefile | 2 +- dev/Dockerfile | 4 ---- dev/docker-compose-integration.yml | 6 ++++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index d1458649f7..e980f62980 100644 --- a/Makefile +++ b/Makefile @@ -100,7 +100,7 @@ test-integration: test-integration-setup test-integration-exec test-integration- test-integration-setup: ## Start Docker services for integration tests docker compose -f dev/docker-compose-integration.yml kill docker compose -f dev/docker-compose-integration.yml rm -f - docker compose -f dev/docker-compose-integration.yml up -d + docker compose -f dev/docker-compose-integration.yml up -d --wait ${TEST_RUNNER} python dev/provision.py test-integration-exec: ## Run integration tests (excluding provision) diff --git a/dev/Dockerfile b/dev/Dockerfile index 2237cddb4b..af5bebac42 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -65,9 +65,5 @@ RUN set -e && \ USER spark WORKDIR ${SPARK_HOME} -# Health check for Spark Connect server -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD curl -sf http://localhost:15002/ || exit 1 - # Start Spark Connect server CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"] diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml index f547d15884..c0fac1bf73 100644 --- a/dev/docker-compose-integration.yml +++ b/dev/docker-compose-integration.yml @@ -36,6 +36,12 @@ services: - rest:rest - hive:hive - minio:minio + healthcheck: + test: ["CMD", "sh", "-c", "ss -tuln | grep :15002 || nc -z localhost 15002"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 90s rest: image: apache/iceberg-rest-fixture container_name: pyiceberg-rest From 9d876842a072f6fc1374da5c298de2cb8786f4e0 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:46:28 -0700 Subject: [PATCH 06/12] include python --- .github/workflows/python-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 9f8ebf7215..6a2b144f05 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -75,6 +75,9 @@ jobs: steps: - uses: actions/checkout@v5 + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python }} - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos - name: Install From a923880549fd6705823c54004a7e04e2c9795de7 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:54:35 -0700 Subject: [PATCH 07/12] remove reference --- mkdocs/docs/how-to-release.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md index a2fa3f7047..cefc982d54 100644 --- a/mkdocs/docs/how-to-release.md +++ b/mkdocs/docs/how-to-release.md @@ -389,10 +389,6 @@ Run the [`Release Docs` Github Action](https://github.com/apache/iceberg-python/ Make sure to create a PR to update the [GitHub issues template](https://github.com/apache/iceberg-python/blob/main/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml) with the latest version. -### Update the integration tests - -Ensure to update the `PYICEBERG_VERSION` in the [Dockerfile](https://github.com/apache/iceberg-python/blob/main/dev/Dockerfile). - ## Misc ### Set up GPG key and Upload to Apache Iceberg KEYS file From d66c40b7acbfa22ef2e229fd7a685035c327e749 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:54:45 -0700 Subject: [PATCH 08/12] make lint --- dev/provision.py | 9 ++++----- ruff.toml | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/dev/provision.py b/dev/provision.py index cdc9ead10c..d00feecc35 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import math from pyspark.sql import SparkSession from pyspark.sql.functions import current_date, date_add, expr @@ -28,7 +27,7 @@ catalogs = { - 'rest': load_catalog( + "rest": load_catalog( "rest", **{ "type": "rest", @@ -38,7 +37,7 @@ "s3.secret-access-key": "password", }, ), - 'hive': load_catalog( + "hive": load_catalog( "hive", **{ "type": "hive", @@ -111,7 +110,7 @@ # v3: Using deletion vectors for format_version in [2, 3]: - identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}' + identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}" spark.sql( f""" CREATE OR REPLACE TABLE {identifier} ( @@ -156,7 +155,7 @@ spark.sql(f"DELETE FROM {identifier} WHERE number = 9") - identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}' + identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}" spark.sql( f""" diff --git a/ruff.toml b/ruff.toml index 11fd2a957b..b7bc461cf6 100644 --- a/ruff.toml +++ b/ruff.toml @@ -16,7 +16,6 @@ # under the License. src = ['pyiceberg','tests'] -extend-exclude = ["dev/provision.py"] # Exclude a variety of commonly ignored directories. exclude = [ From 311931f8d15fd9328642e43b1b63f551d96bc3e0 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 27 Sep 2025 23:59:10 -0700 Subject: [PATCH 09/12] healthcheck --- dev/docker-compose-integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml index c0fac1bf73..9e983356f6 100644 --- a/dev/docker-compose-integration.yml +++ b/dev/docker-compose-integration.yml @@ -37,7 +37,7 @@ services: - hive:hive - minio:minio healthcheck: - test: ["CMD", "sh", "-c", "ss -tuln | grep :15002 || nc -z localhost 15002"] + test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"] interval: 30s timeout: 10s retries: 5 From fd0d3d8c83af156f07fd41db0cf80ac72c2ee065 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 28 Sep 2025 00:09:02 -0700 Subject: [PATCH 10/12] use the right command --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e980f62980..d142c5ad1c 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,7 @@ test-integration-setup: ## Start Docker services for integration tests docker compose -f dev/docker-compose-integration.yml kill docker compose -f dev/docker-compose-integration.yml rm -f docker compose -f dev/docker-compose-integration.yml up -d --wait - ${TEST_RUNNER} python dev/provision.py + $(POETRY) run python dev/provision.py test-integration-exec: ## Run integration tests (excluding provision) $(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS) From efa91e7bfa82b21d6ce4858c6e6b748da8791726 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 28 Sep 2025 00:47:09 -0700 Subject: [PATCH 11/12] reorder --- dev/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index af5bebac42..d0fc6a4fdd 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=3.5.6 FROM apache/spark:${BASE_IMAGE_SPARK_VERSION} # Dependency versions - keep these compatible +ARG ICEBERG_VERSION=1.10.0 +ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 ARG SPARK_VERSION=3.5.6 ARG SCALA_VERSION=2.12 -ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 -ARG ICEBERG_VERSION=1.10.0 ARG HADOOP_VERSION=3.3.4 ARG AWS_SDK_VERSION=1.12.753 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 From 8b198f591bf98a58717839567984e9d009ea5e88 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 1 Oct 2025 09:49:53 -0700 Subject: [PATCH 12/12] fix positional deletes --- dev/provision.py | 27 +++++++++++---------------- tests/integration/test_reads.py | 8 ++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/dev/provision.py b/dev/provision.py index d00feecc35..695ef9b1bf 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -25,7 +25,6 @@ # Create SparkSession against the remote Spark Connect server spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() - catalogs = { "rest": load_catalog( "rest", @@ -128,10 +127,8 @@ """ ) - spark.sql( - f""" - INSERT INTO {identifier} - VALUES + spark.sql(""" + SELECT * FROM VALUES (CAST('2023-03-01' AS date), 1, 'a'), (CAST('2023-03-02' AS date), 2, 'b'), (CAST('2023-03-03' AS date), 3, 'c'), @@ -143,9 +140,9 @@ (CAST('2023-03-09' AS date), 9, 'i'), (CAST('2023-03-10' AS date), 10, 'j'), (CAST('2023-03-11' AS date), 11, 'k'), - (CAST('2023-03-12' AS date), 12, 'l'); - """ - ) + (CAST('2023-03-12' AS date), 12, 'l') + AS t(dt, number, letter) + """).coalesce(1).writeTo(identifier).append() spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12") @@ -169,15 +166,13 @@ 'write.delete.mode'='merge-on-read', 'write.update.mode'='merge-on-read', 'write.merge.mode'='merge-on-read', - 'format-version'='2' + 'format-version'='{format_version}' ); """ ) - spark.sql( - f""" - INSERT INTO {identifier} - VALUES + spark.sql(""" + SELECT * FROM VALUES (CAST('2023-03-01' AS date), 1, 'a'), (CAST('2023-03-02' AS date), 2, 'b'), (CAST('2023-03-03' AS date), 3, 'c'), @@ -189,9 +184,9 @@ (CAST('2023-03-09' AS date), 9, 'i'), (CAST('2023-03-10' AS date), 10, 'j'), (CAST('2023-03-11' AS date), 11, 'k'), - (CAST('2023-03-12' AS date), 12, 'l'); - """ - ) + (CAST('2023-03-12' AS date), 12, 'l') + AS t(dt, number, letter) + """).coalesce(1).writeTo(identifier).append() # Perform two deletes, should produce: # v2: two positional delete files in v2 diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 375eb35b2b..99116ad16f 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -432,6 +432,8 @@ def test_pyarrow_deletes(catalog: Catalog, format_version: int) -> None: # (11, 'k'), # (12, 'l') test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}") + if format_version == 2: + assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files" arrow_table = test_positional_mor_deletes.scan().to_arrow() assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12] @@ -470,6 +472,8 @@ def test_pyarrow_deletes_double(catalog: Catalog, format_version: int) -> None: # (11, 'k'), # (12, 'l') test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}") + if format_version == 2: + assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files" arrow_table = test_positional_mor_double_deletes.scan().to_arrow() assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11, 12] @@ -508,6 +512,8 @@ def test_pyarrow_batches_deletes(catalog: Catalog, format_version: int) -> None: # (11, 'k'), # (12, 'l') test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}") + if format_version == 2: + assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files" arrow_table = test_positional_mor_deletes.scan().to_arrow_batch_reader().read_all() assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12] @@ -550,6 +556,8 @@ def test_pyarrow_batches_deletes_double(catalog: Catalog, format_version: int) - # (11, 'k'), # (12, 'l') test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}") + if format_version == 2: + assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files" arrow_table = test_positional_mor_double_deletes.scan().to_arrow_batch_reader().read_all() assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11, 12]