From bf4937c3c4b09acb4cefe8856d4302a7a5ba218a Mon Sep 17 00:00:00 2001 From: Danny Mccormick Date: Thu, 6 Nov 2025 15:52:31 -0500 Subject: [PATCH 1/3] Split some requirements into extras --- sdks/python/apache_beam/io/tfrecordio.py | 17 +++++++++++++++-- sdks/python/container/common.gradle | 6 +++--- sdks/python/setup.py | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sdks/python/apache_beam/io/tfrecordio.py b/sdks/python/apache_beam/io/tfrecordio.py index c6c59b2c2bed..073cbc1d211b 100644 --- a/sdks/python/apache_beam/io/tfrecordio.py +++ b/sdks/python/apache_beam/io/tfrecordio.py @@ -24,8 +24,6 @@ import struct from functools import partial -import crcmod - from apache_beam import coders from apache_beam.io import filebasedsink from apache_beam.io.filebasedsource import FileBasedSource @@ -35,6 +33,16 @@ from apache_beam.io.iobase import Write from apache_beam.transforms import PTransform +try: + import crcmod +except ImportError: + logging.warning( + 'crcmod package not found. This package is required if ' + 'python-snappy or google-crc32c are not installed. To ensure crcmod is ' + 'installed, install the tfrecord extra: pip install ' + 'apache-beam[tfrecord]') + crcmod = None + __all__ = ['ReadFromTFRecord', 'ReadAllFromTFRecord', 'WriteToTFRecord'] _LOGGER = logging.getLogger(__name__) @@ -67,6 +75,11 @@ def _default_crc32c_fn(value): pass if not _default_crc32c_fn.fn: + if crcmod is None: + raise RuntimeError( + 'Could not find python-snappy, google-crc32c, or crcmod. To allow ' + 'execution to succeed, make sure that one of these packages is ' + 'installed or pip install apache-beam[tfrecord]') _LOGGER.warning( 'Couldn\'t find python-snappy or google-crc32c so the ' 'implementation of _TFRecordUtil._masked_crc32c is not as fast ' diff --git a/sdks/python/container/common.gradle b/sdks/python/container/common.gradle index 8ee31cf4e50d..ad64dbbb660b 100644 --- a/sdks/python/container/common.gradle +++ b/sdks/python/container/common.gradle @@ -42,7 +42,7 @@ def generatePythonRequirements = tasks.register("generatePythonRequirements") { "${files(configurations.sdkSourceTarball.files).singleFile} " + "base_image_requirements.txt " + "container " + - "[gcp,dataframe,test] " + + "[gcp,dataframe,test,tfrecord] " + "${pipExtraOptions}" } // Generate versions for ML dependencies @@ -53,7 +53,7 @@ def generatePythonRequirements = tasks.register("generatePythonRequirements") { "${files(configurations.sdkSourceTarball.files).singleFile} " + "base_image_requirements.txt " + "container/ml " + - "[gcp,dataframe,test,ml_cpu] " + + "[gcp,dataframe,test,ml_cpu,tfrecord] " + "${pipExtraOptions}" } // TODO(https://github.com/apache/beam/issues/36637) @@ -73,7 +73,7 @@ def generatePythonRequirements = tasks.register("generatePythonRequirements") { "${files(configurations.sdkSourceTarball.files).singleFile} " + "gpu_image_requirements.txt " + "container/ml " + - "[gcp,dataframe,test,tensorflow,torch,transformers,vllm] " + + "[gcp,dataframe,test,tensorflow,tfrecord,torch,transformers,vllm] " + "${pipExtraOptions}" } } diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c50050d9241e..3c486eebbbad 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -373,7 +373,6 @@ def get_portability_package_data(): }, ext_modules=extensions, install_requires=[ - 'crcmod>=1.7,<2.0', 'cryptography>=39.0.0,<48.0.0', 'fastavro>=0.23.6,<2', 'fasteners>=0.3,<1.0', @@ -596,6 +595,7 @@ def get_portability_package_data(): , 'dill' ], + 'tfrecord': ['crcmod>=1.7,<2.0'] 'onnx': [ 'onnxruntime==1.13.1', 'torch==1.13.1', From 845e815a4b66994d09d08aa3ea290cd7dd91e7c0 Mon Sep 17 00:00:00 2001 From: Danny Mccormick Date: Thu, 6 Nov 2025 15:58:56 -0500 Subject: [PATCH 2/3] comma --- sdks/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 3c486eebbbad..6c9a0d41f18b 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -595,7 +595,7 @@ def get_portability_package_data(): , 'dill' ], - 'tfrecord': ['crcmod>=1.7,<2.0'] + 'tfrecord': ['crcmod>=1.7,<2.0'], 'onnx': [ 'onnxruntime==1.13.1', 'torch==1.13.1', From 92db37f177c92bc6c46cedb3173e365afd7d6dee Mon Sep 17 00:00:00 2001 From: Danny Mccormick Date: Thu, 6 Nov 2025 16:25:35 -0500 Subject: [PATCH 3/3] test fixes --- sdks/python/apache_beam/io/tfrecordio_test.py | 7 ++++++- sdks/python/tox.ini | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/io/tfrecordio_test.py b/sdks/python/apache_beam/io/tfrecordio_test.py index 6522ade36d80..e88ed1778633 100644 --- a/sdks/python/apache_beam/io/tfrecordio_test.py +++ b/sdks/python/apache_beam/io/tfrecordio_test.py @@ -33,7 +33,6 @@ import zlib from datetime import datetime -import crcmod import pytz import apache_beam as beam @@ -61,6 +60,11 @@ tf = None # pylint: disable=invalid-name logging.warning('Tensorflow is not installed, so skipping some tests.') +try: + import crcmod +except ImportError: + crcmod = None + # Created by running following code in python: # >>> import tensorflow as tf # >>> import base64 @@ -121,6 +125,7 @@ def test_masked_crc32c(self): 0xe4999b0, _TFRecordUtil._masked_crc32c(b'\x03\x00\x00\x00\x00\x00\x00\x00')) + @unittest.skipIf(crcmod is None, 'crcmod not installed.') def test_masked_crc32c_crcmod(self): crc32c_fn = crcmod.predefined.mkPredefinedCrcFun('crc-32c') self.assertEqual( diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index d47de67df5d2..7d84ca7a2c62 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -33,7 +33,7 @@ pip_pre = True # allow apps that support color to use it. passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD # Set [] options for pip installation of apache-beam tarball. -extras = test,dataframe,yaml +extras = test,dataframe,tfrecord,yaml # Don't warn that these commands aren't installed. allowlist_externals = false