diff --git a/.env b/.env index b50a16eb01..5398249b24 100644 --- a/.env +++ b/.env @@ -40,6 +40,7 @@ ARCH_SHORT=amd64 # Default repository to pull and push images from REPO=ghcr.io/apache/arrow-java-dev +ARROW_REPO=apache/arrow-dev # The setup attempts to generate coredumps by default, in order to disable the # coredump generation set it to 0 @@ -48,3 +49,9 @@ ULIMIT_CORE=-1 # Default versions for various dependencies JDK=11 MAVEN=3.9.9 + +# Versions for various dependencies used to build artifacts +# Keep in sync with apache/arrow +ARROW_REPO_ROOT=./arrow +PYTHON=3.9 +VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release diff --git a/.github/workflows/test_jni.yml b/.github/workflows/test_jni.yml new file mode 100644 index 0000000000..31eb9b8743 --- /dev/null +++ b/.github/workflows/test_jni.yml @@ -0,0 +1,267 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Test (JNI) + +on: + push: + branches: + - '**' + - '!dependabot/**' + tags: + - '**' + pull_request: + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + contents: read + +env: + DOCKER_VOLUME_PREFIX: ".docker/" + +jobs: + cpp-ubuntu: + name: Build C++ libraries ${{ matrix.platform.runs_on }} ${{ matrix.platform.arch }} + runs-on: ${{ matrix.platform.runs_on }} + strategy: + fail-fast: false + matrix: + platform: + - runs_on: ubuntu-latest + arch: "x86_64" + archery_arch: "amd64" + env: + # architecture name used for archery build + ARCH: ${{ matrix.platform.archery_arch }} + permissions: + contents: read + packages: write + steps: + - name: Checkout apache/arrow-java + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + submodules: recursive + - name: Checkout apache/arrow + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: apache/arrow + fetch-depth: 0 + path: arrow + submodules: recursive + - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build C++ libraries + run: | + docker compose run vcpkg-jni + - name: Push Docker image + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow-java' && github.ref_name == 'main' + run: | + docker compose push vcpkg-jni + - name: Compress into single artifact to keep directory structure + run: tar -cvzf arrow-shared-libs-linux-${{ matrix.platform.arch }}.tar.gz dist/ + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: ubuntu-shared-lib-${{ matrix.platform.arch }} + path: arrow-shared-libs-linux-${{ matrix.platform.arch }}.tar.gz + + cpp-macos: + name: Build C++ libraries macOS ${{ matrix.platform.runs_on }} ${{ matrix.platform.arch }} + runs-on: ${{ matrix.platform.runs_on }} + strategy: + fail-fast: false + matrix: + platform: + - { runs_on: macos-13, arch: "x86_64"} + - { runs_on: macos-14, arch: "aarch_64" } + env: + MACOSX_DEPLOYMENT_TARGET: "14.0" + steps: + - name: Checkout apache/arrow-java + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + submodules: recursive + - name: Checkout apache/arrow + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: apache/arrow + fetch-depth: 0 + path: arrow + submodules: recursive + - name: Set up Python + uses: actions/setup-python@v4 + with: + cache: 'pip' + python-version: 3.12 + - name: Install Archery + run: pip install -e arrow/dev/archery[all] + - name: Install dependencies + run: | + # We want to use llvm@14 to avoid shared z3 + # dependency. llvm@14 doesn't depend on z3 and llvm depends + # on z3. And Homebrew's z3 provides only shared library. It + # doesn't provides static z3 because z3's CMake doesn't accept + # building both shared and static libraries at once. + # See also: Z3_BUILD_LIBZ3_SHARED in + # https://github.com/Z3Prover/z3/blob/master/README-CMake.md + # + # If llvm is installed, Apache Arrow C++ uses llvm rather than + # llvm@14 because llvm is newer than llvm@14. + brew uninstall llvm || : + + # Ensure updating python@XXX with the "--overwrite" option. + # If python@XXX is updated without "--overwrite", it causes + # a conflict error. Because Python 3 installed not by + # Homebrew exists in /usr/local on GitHub Actions. If + # Homebrew's python@XXX is updated without "--overwrite", it + # tries to replace /usr/local/bin/2to3 and so on and causes + # a conflict error. + brew update + for python_package in $(brew list | grep python@); do + brew install --overwrite ${python_package} + done + brew install --overwrite python + + if [ "$(uname -m)" = "arm64" ]; then + # pkg-config formula is deprecated but it's still installed + # in GitHub Actions runner now. We can remove this once + # pkg-config formula is removed from GitHub Actions runner. + brew uninstall pkg-config || : + brew uninstall pkg-config@0.29.2 || : + fi + + brew bundle --file=arrow/cpp/Brewfile + # We want to link aws-sdk-cpp statically but Homebrew's + # aws-sdk-cpp provides only shared library. If we have + # Homebrew's aws-sdk-cpp, our build mix Homebrew's + # aws-sdk-cpp and bundled aws-sdk-cpp. We uninstall Homebrew's + # aws-sdk-cpp to ensure using only bundled aws-sdk-cpp. + brew uninstall aws-sdk-cpp + # We want to use bundled RE2 for static linking. If + # Homebrew's RE2 is installed, its header file may be used. + # We uninstall Homebrew's RE2 to ensure using bundled RE2. + brew uninstall grpc || : # gRPC depends on RE2 + brew uninstall grpc@1.54 || : # gRPC 1.54 may be installed too + brew uninstall re2 + # We want to use bundled Protobuf for static linking. If + # Homebrew's Protobuf is installed, its library file may be + # used on test We uninstall Homebrew's Protobuf to ensure using + # bundled Protobuf. + brew uninstall protobuf + + brew bundle --file=Brewfile + - name: Build C++ libraries + run: | + set -e + # make brew Java available to CMake + export JAVA_HOME=$(brew --prefix openjdk@11)/libexec/openjdk.jdk/Contents/Home + ./ci/scripts/jni_macos_build.sh \ + $GITHUB_WORKSPACE \ + $GITHUB_WORKSPACE/arrow \ + $GITHUB_WORKSPACE/arrow-java/cpp-build \ + $GITHUB_WORKSPACE/dist + - name: Compress into single artifact to keep directory structure + run: tar -cvzf arrow-shared-libs-macos-${{ matrix.platform.arch }}.tar.gz dist/ + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: macos-shared-lib-${{ matrix.platform.arch }} + path: arrow-shared-libs-macos-${{ matrix.platform.arch }}.tar.gz + + java-jars: + name: Build JAR files + runs-on: ubuntu-latest + needs: + - cpp-ubuntu + - cpp-macos + steps: + - name: Checkout apache/arrow-java + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + submodules: recursive + - name: Checkout apache/arrow + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: apache/arrow + fetch-depth: 0 + path: arrow + submodules: recursive + - name: Download Libraries + uses: actions/download-artifact@v4 + with: + path: artifacts + - name: Decompress artifacts + run: | + mv artifacts/*/*.tar.gz . + tar -xvzf arrow-shared-libs-linux-x86_64.tar.gz + # tar -xvzf arrow-shared-libs-linux-aarch_64.tar.gz + tar -xvzf arrow-shared-libs-macos-x86_64.tar.gz + tar -xvzf arrow-shared-libs-macos-aarch_64.tar.gz + # tar -xvzf arrow-shared-libs-windows.tar.gz + - name: Test that shared libraries exist + run: | + set -x + + test -f dist/arrow_cdata_jni/x86_64/libarrow_cdata_jni.so + test -f dist/arrow_dataset_jni/x86_64/libarrow_dataset_jni.so + test -f dist/arrow_orc_jni/x86_64/libarrow_orc_jni.so + test -f dist/gandiva_jni/x86_64/libgandiva_jni.so + + # test -f dist/arrow_cdata_jni/aarch_64/libarrow_cdata_jni.so + # test -f dist/arrow_dataset_jni/aarch_64/libarrow_dataset_jni.so + # test -f dist/arrow_orc_jni/aarch_64/libarrow_orc_jni.so + # test -f dist/gandiva_jni/aarch_64/libgandiva_jni.so + + test -f dist/arrow_cdata_jni/x86_64/libarrow_cdata_jni.dylib + test -f dist/arrow_dataset_jni/x86_64/libarrow_dataset_jni.dylib + test -f dist/arrow_orc_jni/x86_64/libarrow_orc_jni.dylib + test -f dist/gandiva_jni/x86_64/libgandiva_jni.dylib + + test -f dist/arrow_cdata_jni/aarch_64/libarrow_cdata_jni.dylib + test -f dist/arrow_dataset_jni/aarch_64/libarrow_dataset_jni.dylib + test -f dist/arrow_orc_jni/aarch_64/libarrow_orc_jni.dylib + test -f dist/gandiva_jni/aarch_64/libgandiva_jni.dylib + + # test -f dist/arrow_cdata_jni/x86_64/arrow_cdata_jni.dll + # test -f dist/arrow_dataset_jni/x86_64/arrow_dataset_jni.dll + # test -f dist/arrow_orc_jni/x86_64/arrow_orc_jni.dll + - name: Build bundled jar + env: + MAVEN_ARGS: >- + --no-transfer-progress + run: | + set -e + # mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} + # mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} -f bom + ./ci/scripts/jni_full_build.sh \ + $GITHUB_WORKSPACE \ + $GITHUB_WORKSPACE/arrow \ + $GITHUB_WORKSPACE/dist + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: java-jars + path: ${{ github.workspace }}/arrow-java/java-dist diff --git a/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java b/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java index f8eb91a1cc..f48e6bb95e 100644 --- a/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java +++ b/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java @@ -38,6 +38,7 @@ import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -53,6 +54,7 @@ public static void beforeClass() { allocator = new RootAllocator(MAX_ALLOCATION); } + @Disabled("ORC is flaky: https://github.com/apache/arrow-java/pull/449") @Test public void testOrcJniReader() throws Exception { TypeDescription schema = TypeDescription.fromString("struct"); diff --git a/ci/docker/vcpkg-jni.dockerfile b/ci/docker/vcpkg-jni.dockerfile new file mode 100644 index 0000000000..55fa35e0d1 --- /dev/null +++ b/ci/docker/vcpkg-jni.dockerfile @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base +FROM ${base} + +# Install the libraries required by Gandiva to run +# Use enable llvm[enable-rtti] in the vcpkg.json to avoid link problems in Gandiva +RUN vcpkg install \ + --clean-after-build \ + --x-install-root=${VCPKG_ROOT}/installed \ + --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=dev \ + --x-feature=flight \ + --x-feature=gcs \ + --x-feature=json \ + --x-feature=parquet \ + --x-feature=gandiva \ + --x-feature=s3 + +# Install Java +# We need Java for JNI headers, but we don't invoke Maven in this build. +ARG java=11 +RUN yum install -y java-$java-openjdk-devel && yum clean all + +# For ci/scripts/{cpp,java}_*.sh +ENV ARROW_HOME=/tmp/local \ + ARROW_JAVA_CDATA=ON \ + ARROW_JAVA_JNI=ON \ + ARROW_USE_CCACHE=ON + +LABEL org.opencontainers.image.source https://github.com/apache/arrow-java diff --git a/ci/scripts/java_jni_build.sh b/ci/scripts/jni_build.sh similarity index 95% rename from ci/scripts/java_jni_build.sh rename to ci/scripts/jni_build.sh index 44388e33fe..4462646347 100755 --- a/ci/scripts/java_jni_build.sh +++ b/ci/scripts/jni_build.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -set -eo pipefail +set -exo pipefail arrow_dir=${1} arrow_install_dir=${2} @@ -47,6 +47,7 @@ esac : "${ARROW_JAVA_BUILD_TESTS:=${ARROW_BUILD_TESTS:-OFF}}" : "${CMAKE_BUILD_TYPE:=release}" +read -ra EXTRA_CMAKE_OPTIONS <<<"${JAVA_JNI_CMAKE_ARGS:-}" cmake \ -DARROW_JAVA_JNI_ENABLE_DATASET="${ARROW_DATASET:-OFF}" \ -DARROW_JAVA_JNI_ENABLE_GANDIVA="${ARROW_GANDIVA:-OFF}" \ @@ -58,7 +59,7 @@ cmake \ -DCMAKE_UNITY_BUILD="${CMAKE_UNITY_BUILD:-OFF}" \ -DProtobuf_USE_STATIC_LIBS=ON \ -GNinja \ - "${JAVA_JNI_CMAKE_ARGS:-}" \ + "${EXTRA_CMAKE_OPTIONS[@]}" \ "${arrow_dir}" export CMAKE_BUILD_PARALLEL_LEVEL=${n_jobs} cmake --build . --config "${CMAKE_BUILD_TYPE}" diff --git a/ci/scripts/jni_full_build.sh b/ci/scripts/jni_full_build.sh new file mode 100755 index 0000000000..1a39aeb510 --- /dev/null +++ b/ci/scripts/jni_full_build.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_java_dir="${1}" +arrow_dir="${2}" +dist_dir="${3}" + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" + +pushd "${arrow_java_dir}" + +# Ensure that there is no old jar +# inside the maven repository +maven_repo=~/.m2/repository/org/apache/arrow +if [ -d "$maven_repo" ]; then + find "$maven_repo" \ + "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ + -exec echo {} ";" \ + -exec rm -rf {} ";" +fi + +# generate dummy GPG key for -Papache-release. +# -Papache-release generates signs (*.asc) of artifacts. +# We don't use these signs in our release process. +( + echo "Key-Type: RSA" + echo "Key-Length: 4096" + echo "Name-Real: Build" + echo "Name-Email: build@example.com" + echo "%no-protection" +) | + gpg --full-generate-key --batch + +# build the entire project +mvn clean \ + install \ + -Papache-release \ + -Parrow-c-data \ + -Parrow-jni \ + -Darrow.cpp.build.dir="$dist_dir" \ + -Darrow.c.jni.dist.dir="$dist_dir" \ + --no-transfer-progress + +# copy all jar, zip and pom files to the distribution folder +find ~/.m2/repository/org/apache/arrow \ + "(" \ + -name "*.jar" -o \ + -name "*.json" -o \ + -name "*.pom" -o \ + -name "*.xml" -o \ + -name "*.zip" \ + ")" \ + -exec echo "{}" ";" \ + -exec cp "{}" "$dist_dir" ";" + +popd diff --git a/ci/scripts/jni_macos_build.sh b/ci/scripts/jni_macos_build.sh new file mode 100755 index 0000000000..eeabfd1334 --- /dev/null +++ b/ci/scripts/jni_macos_build.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is like java_jni_build.sh, but is meant for release artifacts +# and hardcodes assumptions about the environment it is being run in. + +set -ex + +arrow_java_dir="${1}" +arrow_dir="${2}" +build_dir="${3}" +normalized_arch="$(arch)" +case "${normalized_arch}" in +arm64) + normalized_arch=aarch_64 + ;; +i386) + normalized_arch=x86_64 + ;; +esac +# The directory where the final binaries will be stored when scripts finish +dist_dir="${4}" + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf "${build_dir}" + +echo "=== Building Arrow C++ libraries ===" +install_dir="${build_dir}/cpp-install" +: "${ARROW_ACERO:=ON}" +export ARROW_ACERO +: "${ARROW_BUILD_TESTS:=ON}" +: "${ARROW_DATASET:=ON}" +export ARROW_DATASET +: "${ARROW_GANDIVA:=ON}" +export ARROW_GANDIVA +: "${ARROW_ORC:=ON}" +export ARROW_ORC +: "${ARROW_PARQUET:=ON}" +: "${ARROW_S3:=ON}" +: "${ARROW_USE_CCACHE:=OFF}" +: "${CMAKE_BUILD_TYPE:=Release}" +: "${CMAKE_UNITY_BUILD:=ON}" + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics before build ===" + ccache -sv 2>/dev/null || ccache -s +fi + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}/cpp" +pushd "${build_dir}/cpp" + +cmake \ + -DARROW_ACERO="${ARROW_ACERO}" \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS="${ARROW_BUILD_TESTS}" \ + -DARROW_CSV="${ARROW_DATASET}" \ + -DARROW_DATASET="${ARROW_DATASET}" \ + -DARROW_SUBSTRAIT="${ARROW_DATASET}" \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_GANDIVA="${ARROW_GANDIVA}" \ + -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_JSON="${ARROW_DATASET}" \ + -DARROW_ORC="${ARROW_ORC}" \ + -DARROW_PARQUET="${ARROW_PARQUET}" \ + -DARROW_S3="${ARROW_S3}" \ + -DARROW_USE_CCACHE="${ARROW_USE_CCACHE}" \ + -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \ + -DCMAKE_INSTALL_PREFIX="${install_dir}" \ + -DCMAKE_UNITY_BUILD="${CMAKE_UNITY_BUILD}" \ + -DGTest_SOURCE=BUNDLED \ + -DPARQUET_BUILD_EXAMPLES=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_REQUIRE_ENCRYPTION=OFF \ + -Dre2_SOURCE=BUNDLED \ + -GNinja \ + "${arrow_dir}/cpp" +cmake --build . --target install + +if [ "${ARROW_BUILD_TESTS}" == "ON" ]; then + # MinIO is required + exclude_tests="arrow-s3fs-test" + # unstable + exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" + exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" + ctest \ + --exclude-regex "${exclude_tests}" \ + --label-regex unittest \ + --output-on-failure \ + --parallel "$(sysctl -n hw.ncpu)" \ + --timeout 300 +fi + +popd + +export JAVA_JNI_CMAKE_ARGS="-DProtobuf_ROOT=${build_dir}/cpp/protobuf_ep-install" +"${arrow_java_dir}/ci/scripts/jni_build.sh" \ + "${arrow_java_dir}" \ + "${install_dir}" \ + "${build_dir}" \ + "${dist_dir}" + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics after build ===" + ccache -sv 2>/dev/null || ccache -s +fi + +echo "=== Checking shared dependencies for libraries ===" +pushd "${dist_dir}" +archery linking check-dependencies \ + --allow CoreFoundation \ + --allow Security \ + --allow libSystem \ + --allow libarrow_cdata_jni \ + --allow libarrow_dataset_jni \ + --allow libarrow_orc_jni \ + --allow libc++ \ + --allow libcurl \ + --allow libgandiva_jni \ + --allow libncurses \ + --allow libobjc \ + --allow libz \ + "arrow_cdata_jni/${normalized_arch}/libarrow_cdata_jni.dylib" \ + "arrow_dataset_jni/${normalized_arch}/libarrow_dataset_jni.dylib" \ + "arrow_orc_jni/${normalized_arch}/libarrow_orc_jni.dylib" \ + "gandiva_jni/${normalized_arch}/libgandiva_jni.dylib" +popd diff --git a/ci/scripts/jni_manylinux_build.sh b/ci/scripts/jni_manylinux_build.sh new file mode 100755 index 0000000000..2551d67239 --- /dev/null +++ b/ci/scripts/jni_manylinux_build.sh @@ -0,0 +1,178 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is like java_jni_build.sh, but is meant for release artifacts +# and hardcodes assumptions about the environment it is being run in. + +set -exo pipefail + +arrow_java_dir="${1}" +arrow_dir="${2}" +build_dir="${3}" +normalized_arch="$(arch)" +case "${normalized_arch}" in +aarch64) + normalized_arch=aarch_64 + ;; +esac +# The directory where the final binaries will be stored when scripts finish +dist_dir="${4}" + +echo "=== Install Archery ===" +pip install -e "${arrow_dir}/dev/archery[all]" + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf "${build_dir}" +rm -rf "${dist_dir}" + +echo "=== Building Arrow C++ libraries ===" +devtoolset_version="$(rpm -qa "devtoolset-*-gcc" --queryformat '%{VERSION}' | grep -o "^[0-9]*")" +devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" +: "${ARROW_ACERO:=ON}" +export ARROW_ACERO +: "${ARROW_BUILD_TESTS:=OFF}" +: "${ARROW_DATASET:=ON}" +export ARROW_DATASET +: "${ARROW_GANDIVA:=ON}" +export ARROW_GANDIVA +: "${ARROW_GCS:=ON}" +: "${ARROW_JEMALLOC:=OFF}" +: "${ARROW_MIMALLOC:=ON}" +: "${ARROW_RPATH_ORIGIN:=ON}" +: "${ARROW_ORC:=ON}" +export ARROW_ORC +: "${ARROW_PARQUET:=ON}" +: "${ARROW_S3:=ON}" +: "${ARROW_USE_CCACHE:=OFF}" +: "${CMAKE_BUILD_TYPE:=release}" +: "${CMAKE_UNITY_BUILD:=ON}" +: "${VCPKG_ROOT:=/opt/vcpkg}" +: "${VCPKG_FEATURE_FLAGS:=-manifests}" +: "${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}}" +: "${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread}" + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics before build ===" + ccache -sv 2>/dev/null || ccache -s +fi + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}/cpp" +pushd "${build_dir}/cpp" + +cmake \ + -DARROW_ACERO="${ARROW_ACERO}" \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS="${ARROW_BUILD_TESTS}" \ + -DARROW_CSV="${ARROW_DATASET}" \ + -DARROW_DATASET="${ARROW_DATASET}" \ + -DARROW_SUBSTRAIT="${ARROW_DATASET}" \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_GANDIVA_PC_CXX_FLAGS="${GANDIVA_CXX_FLAGS}" \ + -DARROW_GANDIVA="${ARROW_GANDIVA}" \ + -DARROW_GCS="${ARROW_GCS}" \ + -DARROW_JEMALLOC="${ARROW_JEMALLOC}" \ + -DARROW_JSON="${ARROW_DATASET}" \ + -DARROW_MIMALLOC="${ARROW_MIMALLOC}" \ + -DARROW_ORC="${ARROW_ORC}" \ + -DARROW_PARQUET="${ARROW_PARQUET}" \ + -DARROW_RPATH_ORIGIN="${ARROW_RPATH_ORIGIN}" \ + -DARROW_S3="${ARROW_S3}" \ + -DARROW_USE_CCACHE="${ARROW_USE_CCACHE}" \ + -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \ + -DCMAKE_INSTALL_PREFIX="${ARROW_HOME}" \ + -DCMAKE_UNITY_BUILD="${CMAKE_UNITY_BUILD}" \ + -DGTest_SOURCE=BUNDLED \ + -DORC_SOURCE=BUNDLED \ + -DORC_PROTOBUF_EXECUTABLE="${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc" \ + -DPARQUET_BUILD_EXAMPLES=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_REQUIRE_ENCRYPTION=OFF \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET="${VCPKG_TARGET_TRIPLET}" \ + -GNinja \ + "${arrow_dir}/cpp" +ninja install + +if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then + # MinIO is required + exclude_tests="arrow-s3fs-test" + case $(arch) in + aarch64) + # GCS testbench is crashed on aarch64: + # ImportError: ../grpc/_cython/cygrpc.cpython-38-aarch64-linux-gnu.so: + # undefined symbol: vtable for std::__cxx11::basic_ostringstream< + # char, std::char_traits, std::allocator > + exclude_tests="${exclude_tests}|arrow-gcsfs-test" + ;; + esac + # unstable + exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" + exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" + # external dependency + exclude_tests="${exclude_tests}|arrow-gcsfs-test" + # strptime + exclude_tests="${exclude_tests}|arrow-utility-test" + ctest \ + --exclude-regex "${exclude_tests}" \ + --label-regex unittest \ + --output-on-failure \ + --parallel "$(nproc)" \ + --timeout 300 +fi + +popd + +JAVA_JNI_CMAKE_ARGS="-DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" +JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}" +export JAVA_JNI_CMAKE_ARGS +"${arrow_java_dir}/ci/scripts/jni_build.sh" \ + "${arrow_java_dir}" \ + "${ARROW_HOME}" \ + "${build_dir}" \ + "${dist_dir}" + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics after build ===" + ccache -sv 2>/dev/null || ccache -s +fi + +echo "=== Checking shared dependencies for libraries ===" +pushd "${dist_dir}" +archery linking check-dependencies \ + --allow ld-linux-aarch64 \ + --allow ld-linux-x86-64 \ + --allow libc \ + --allow libdl \ + --allow libgcc_s \ + --allow libm \ + --allow libpthread \ + --allow librt \ + --allow libstdc++ \ + --allow libz \ + --allow linux-vdso \ + arrow_cdata_jni/"${normalized_arch}"/libarrow_cdata_jni.so \ + arrow_dataset_jni/"${normalized_arch}"/libarrow_dataset_jni.so \ + arrow_orc_jni/"${normalized_arch}"/libarrow_orc_jni.so \ + gandiva_jni/"${normalized_arch}"/libgandiva_jni.so +popd diff --git a/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java index eb73663191..0edc428254 100644 --- a/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java +++ b/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java @@ -95,8 +95,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { // DenseUnion List childFields = new ArrayList<>(); childFields.add( - new Field( - "int-child", new FieldType(false, new ArrowType.Int(32, true), null, null), null)); + new Field("int-child", new FieldType(true, new ArrowType.Int(32, true), null, null), null)); Field structField = new Field( "struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields); diff --git a/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java b/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java index f3ca04d77b..4b155137ed 100644 --- a/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java +++ b/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java @@ -123,8 +123,6 @@ protected void assertParquetFileEquals(String expectedURI, String actualURI) thr VectorSchemaRoot actualVsr = VectorSchemaRoot.create(actualFactory.inspect(), rootAllocator())) { - // fast-fail by comparing metadata - assertEquals(expectedBatches.toString(), actualBatches.toString()); // compare ArrowRecordBatches assertEquals(expectedBatches.size(), actualBatches.size()); VectorLoader expectLoader = new VectorLoader(expectVsr); diff --git a/docker-compose.yml b/docker-compose.yml index ae378865b3..44d58c96a0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -31,6 +31,8 @@ volumes: services: ubuntu: + # Build and test arrow-java on Ubuntu. + # # Usage: # docker compose build ubuntu # docker compose run ubuntu @@ -47,6 +49,10 @@ services: /arrow-java/ci/scripts/java_test.sh /arrow-java /build" conda-jni-cdata: + # Builds and tests just the C Data Interface JNI library and JARs. + # (No dependencies on arrow-cpp.) + # This build isn't meant for distribution. It's for testing only. + # # Usage: # docker compose build conda-jni-cdata # docker compose run conda-jni-cdata @@ -72,6 +78,32 @@ services: ARROW_JAVA_CDATA: "ON" command: /bin/bash -c " - /arrow-java/ci/scripts/java_jni_build.sh /arrow-java /build/jni /build /jni && + /arrow-java/ci/scripts/jni_build.sh /arrow-java /build/jni /build /jni && /arrow-java/ci/scripts/java_build.sh /arrow-java /build /jni && /arrow-java/ci/scripts/java_test.sh /arrow-java /build /jni" + + vcpkg-jni: + # Builds all the JNI libraries, but not the JARs. + # (Requires arrow-cpp.) + # The artifacts from this build are meant to be used for packaging. + # + # Usage: + # docker compose build vcpkg-jni + # docker compose run vcpkg-jni + image: ${REPO}:${ARCH}-vcpkg-jni + build: + context: . + dockerfile: ci/docker/vcpkg-jni.dockerfile + cache_from: + - ${REPO}:${ARCH}-vcpkg-jni + args: + base: ${ARROW_REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2014-vcpkg-${VCPKG} + volumes: + - .:/arrow-java:delegated + - ${ARROW_REPO_ROOT}:/arrow:delegated + - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated + environment: + ARROW_JAVA_CDATA: "ON" + command: + ["git config --global --add safe.directory /arrow-java && \ + /arrow-java/ci/scripts/jni_manylinux_build.sh /arrow-java /arrow /build /arrow-java/dist"]