resolve the comments

jialuoo · jialuoo · commit 4688d70bade1 · 2025-07-29T23:51:11.000Z
diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py
@@ -103,6 +103,13 @@ class FunctionAxisOnePreviewWarning(PreviewWarning):
     """Remote Function and Managed UDF with axis=1 preview."""
 
 
+class FunctionPackageVersionWarning(PreviewWarning):
+    """Warns that managed UDF package versions may differ from expectations.
+
+    They may not match users' local environment or the versions specified.
+    """
+
+
 def format_message(message: str, fill: bool = True):
     """Formats a warning message with ANSI color codes for the warning color.
 
diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py
@@ -19,7 +19,6 @@
 import logging
 import os
 import random
-import re
 import shutil
 import string
 import tempfile
@@ -270,43 +269,6 @@ def provision_bq_managed_function(
         )
 
         udf_name = func.__name__
-        if capture_references:
-            # This code path ensures that if the udf body contains any
-            # references to variables and/or imports outside the body, they are
-            # captured as well.
-            import cloudpickle
-
-            pickled = cloudpickle.dumps(func)
-            func_code = textwrap.dedent(
-                f"""
-                import cloudpickle
-                {udf_name} = cloudpickle.loads({pickled})
-            """
-            )
-        else:
-            # This code path ensures that if the udf body is self contained,
-            # i.e. there are no references to variables or imports outside the
-            # body.
-            func_code = textwrap.dedent(inspect.getsource(func))
-            match = re.search(r"^def ", func_code, flags=re.MULTILINE)
-            if match is None:
-                raise ValueError("The UDF is not defined correctly.")
-            func_code = func_code[match.start() :]
-
-        if is_row_processor:
-            udf_code = textwrap.dedent(inspect.getsource(bff_template.get_pd_series))
-            udf_code = udf_code[udf_code.index("def") :]
-            bigframes_handler_code = textwrap.dedent(
-                f"""def bigframes_handler(str_arg):
-                    return {udf_name}({bff_template.get_pd_series.__name__}(str_arg))"""
-            )
-        else:
-            udf_code = ""
-            bigframes_handler_code = textwrap.dedent(
-                f"""def bigframes_handler(*args):
-                    return {udf_name}(*args)"""
-            )
-        udf_code = f"{udf_code}\n{func_code}"
 
         with_connection_clause = (
             (
@@ -316,6 +278,13 @@ def provision_bq_managed_function(
             else ""
         )
 
+        # Generate the complete Python code block for the managed Python UDF,
+        # including the user's function, necessary imports, and the BigQuery
+        # handler wrapper.
+        python_code_block = bff_template.generate_managed_function_code(
+            func, udf_name, is_row_processor, capture_references
+        )
+
         create_function_ddl = (
             textwrap.dedent(
                 f"""
@@ -326,12 +295,11 @@ def provision_bq_managed_function(
                 OPTIONS ({managed_function_options_str})
                 AS r'''
                 __UDF_PLACE_HOLDER__
-                {bigframes_handler_code}
                 '''
             """
             )
             .strip()
-            .replace("__UDF_PLACE_HOLDER__", udf_code)
+            .replace("__UDF_PLACE_HOLDER__", python_code_block)
         )
 
         self._ensure_dataset_exists()
diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py
@@ -18,6 +18,7 @@
 import sys
 import typing
 from typing import cast, Optional, Set
+import warnings
 
 import cloudpickle
 import google.api_core.exceptions
@@ -26,6 +27,7 @@
 import pandas
 import pyarrow
 
+import bigframes.exceptions as bfe
 import bigframes.formatting_helpers as bf_formatting
 from bigframes.functions import function_typing
 
@@ -81,9 +83,14 @@ def _get_updated_package_requirements(
         # Due to current limitations on the numpy version in Python UDFs, we use
         # `ignore_numpy_version` to optionally omit the version for managed
         # functions only.
-        numpy_package = (
-            "numpy" if ignore_numpy_version else f"numpy=={numpy.__version__}"
-        )
+        if ignore_numpy_version:
+            msg = bfe.format_message(
+                "Numpy version may not precisely match your local environment."
+            )
+            warnings.warn(msg, category=bfe.FunctionPackageVersionWarning)
+            numpy_package = "numpy"
+        else:
+            numpy_package = f"numpy=={numpy.__version__}"
         requirements.append(numpy_package)
 
     if package_requirements:
diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py
@@ -17,6 +17,7 @@
 import inspect
 import logging
 import os
+import re
 import textwrap
 from typing import Tuple
 
@@ -291,3 +292,55 @@ def generate_cloud_function_main_code(
     logger.debug(f"Wrote {os.path.abspath(main_py)}:\n{open(main_py).read()}")
 
     return handler_func_name
+
+
+def generate_managed_function_code(
+    def_,
+    udf_name: str,
+    is_row_processor: bool,
+    capture_references: bool,
+) -> str:
+    """Build the Python source text embedded in a managed Python UDF.
+
+    With `capture_references` the function is shipped cloudpickled, so
+    outside variables and imports travel with it; otherwise its own source
+    is used from the first top-level `def` on. With `is_row_processor` the
+    handler feeds its single argument through `get_pd_series` first.
+
+    Raises:
+        ValueError: If the function's source has no top-level `def`.
+    """
+    if not capture_references:
+        source = textwrap.dedent(inspect.getsource(def_))
+        def_start = re.search(r"^def ", source, flags=re.MULTILINE)
+        if def_start is None:
+            raise ValueError("The UDF is not defined correctly.")
+        # Drop anything (e.g. decorators) that precedes the `def` line.
+        user_code = source[def_start.start() :]
+    else:
+        import cloudpickle
+
+        pickled = cloudpickle.dumps(def_)
+        user_code = textwrap.dedent(
+            f"""
+            import cloudpickle
+            {udf_name} = cloudpickle.loads({pickled})
+        """
+        )
+
+    if is_row_processor:
+        helper_source = textwrap.dedent(inspect.getsource(get_pd_series))
+        helper_code = helper_source[helper_source.index("def") :]
+        handler_code = textwrap.dedent(
+            f"""def bigframes_handler(str_arg):
+                return {udf_name}({get_pd_series.__name__}(str_arg))"""
+        )
+    else:
+        helper_code = ""
+        handler_code = textwrap.dedent(
+            f"""def bigframes_handler(*args):
+                return {udf_name}(*args)"""
+        )
+
+    # Helper (if any), then the user's function, then the handler wrapper.
+    return textwrap.dedent(f"{helper_code}\n{user_code}\n{handler_code}")
diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py
@@ -659,6 +659,7 @@ def serialize_row(row):
             # BigFrames and pandas. Without it, BigFrames return plain Python
             # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
             # which could lead to mismatches and requires further investigation.
+            # See b/435021126.
             custom = {
                 "name": int(row.name),
                 "index": [idx for idx in row.index],
@@ -719,6 +720,7 @@ def analyze(row):
             # BigFrames and pandas. Without it, BigFrames return plain Python
             # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
             # which could lead to mismatches and requires further investigation.
+            # See b/435021126.
             return str(
                 {
                     "dtype": row.dtype,
@@ -731,12 +733,17 @@ def analyze(row):
                 }
             )
 
-        analyze_mf = session.udf(
-            input_types=bigframes.series.Series,
-            output_type=str,
-            dataset=dataset_id,
-            name=prefixer.create_prefix(),
-        )(analyze)
+        with pytest.warns(
+            bfe.PreviewWarning,
+            match=("Numpy version may not precisely match your local environment."),
+        ):
+
+            analyze_mf = session.udf(
+                input_types=bigframes.series.Series,
+                output_type=str,
+                dataset=dataset_id,
+                name=prefixer.create_prefix(),
+            )(analyze)
 
         assert getattr(analyze_mf, "is_row_processor")
 
@@ -831,6 +838,7 @@ def serialize_row(row):
             # BigFrames and pandas. Without it, BigFrames return plain Python
             # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
             # which could lead to mismatches and requires further investigation.
+            # See b/435021126.
             custom = {
                 "name": int(row.name),
                 "index": [idx for idx in row.index],
@@ -870,3 +878,69 @@ def serialize_row(row):
         cleanup_function_assets(
             serialize_row_mf, session.bqclient, session.cloudfunctionsclient
         )
+
+
+@pytest.mark.skip(reason="Revert after this bug b/435018880 is fixed.")
+def test_managed_function_df_apply_axis_1_na_nan_inf(dataset_id, session):
+    """This test is for special cases of float values, to make sure any (nan,
+    inf, -inf) produced by user code is honored.
+    """
+    bf_df = session.read_gbq(
+        """\
+SELECT "1" AS text, 1 AS num
+UNION ALL
+SELECT "2.5" AS text, 2.5 AS num
+UNION ALL
+SELECT "nan" AS text, IEEE_DIVIDE(0, 0) AS num
+UNION ALL
+SELECT "inf" AS text, IEEE_DIVIDE(1, 0) AS num
+UNION ALL
+SELECT "-inf" AS text, IEEE_DIVIDE(-1, 0) AS num
+UNION ALL
+SELECT "numpy nan" AS text, IEEE_DIVIDE(0, 0) AS num
+UNION ALL
+SELECT "pandas na" AS text, NULL AS num
+                             """
+    )
+
+    pd_df = bf_df.to_pandas()
+
+    def float_parser(row):
+        import numpy as mynp
+        import pandas as mypd
+
+        if row["text"] == "pandas na":
+            return mypd.NA
+        if row["text"] == "numpy nan":
+            return mynp.nan
+        return float(row["text"])
+
+    # Created outside `try` so `finally` never sees an unbound name.
+    float_parser_mf = session.udf(
+        input_types=bigframes.series.Series,
+        output_type=float,
+        dataset=dataset_id,
+        name=prefixer.create_prefix(),
+    )(float_parser)
+
+    try:
+        assert getattr(float_parser_mf, "is_row_processor")
+
+        pd_result = pd_df.apply(float_parser, axis=1)
+        bf_result = bf_df.apply(float_parser_mf, axis=1).to_pandas()
+
+        # bf_result.dtype is 'Float64' while pd_result.dtype is 'object'
+        # , ignore this mismatch by using check_dtype=False.
+        pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+        # Let's also assert that the data is consistent in this round trip
+        # (BQ -> BigFrames -> BQ -> GCF -> BQ -> BigFrames) w.r.t. their
+        # expected values in BQ.
+        bq_result = bf_df["num"].to_pandas()
+        bq_result.name = None
+        pandas.testing.assert_series_equal(bq_result, bf_result)
+    finally:
+        # clean up the gcp assets created for the managed function.
+        cleanup_function_assets(
+            float_parser_mf, session.bqclient, session.cloudfunctionsclient
+        )