Add error handling to blob image_normalize: UDFs always return a JSON status/content payload

shuoweil · shuoweil · commit b4ef718f272b · 2025-10-14T18:46:19.000Z
diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
@@ -26,7 +26,6 @@
     float: "FLOAT64",
     str: "STRING",
     bytes: "BYTES",
-    bool: "BOOL",
 }
 
 
@@ -340,11 +339,12 @@ def image_normalize_func(
     beta: float,
     norm_type: str,
     ext: str,
-    verbose: bool,
 ) -> str:
-    try:
-        import json
+    import json
+
+    result_dict = {"status": "", "content": dst_obj_ref_rt}
 
+    try:
         import cv2 as cv  # type: ignore
         import numpy as np
         import requests
@@ -392,18 +392,11 @@ def image_normalize_func(
             },
             timeout=30,
         )
-        if verbose:
-            result_dict = {"status": "", "content": dst_obj_ref_rt}
-            return json.dumps(result_dict)
-        else:
-            return dst_obj_ref_rt
 
     except Exception as e:
-        if verbose:
-            result_dict = {"status": str(e), "content": None}
-            return json.dumps(result_dict)
-        else:
-            return None
+        result_dict["status"] = str(e)
+
+    return json.dumps(result_dict)
 
 
 image_normalize_def = FunctionDef(
@@ -417,7 +410,6 @@ def image_normalize_to_bytes_func(
     beta: float,
     norm_type: str,
     ext: str,
-    verbose: bool,
 ) -> str:
     try:
         import base64
@@ -453,21 +445,15 @@ def image_normalize_to_bytes_func(
         )
         bts = cv.imencode(".jpeg", img_normalized)[1].tobytes()
 
-        if verbose:
-            content_b64 = base64.b64encode(bts).decode("utf-8")
-            result_dict = {"status": "", "content": content_b64}
-            result_json = json.dumps(result_dict)
-            return result_json
-        else:
-            return bts
+        content_b64 = base64.b64encode(bts).decode("utf-8")
+        result_dict = {"status": "", "content": content_b64}
+        result_json = json.dumps(result_dict)
 
     except Exception as e:
-        if verbose:
-            result_dict = {"status": str(e), "content": b""}
-            result_json = json.dumps(result_dict)
-            return result_json
-        else:
-            return b""
+        result_dict = {"status": str(e), "content": b""}
+        result_json = json.dumps(result_dict)
+
+    return result_json
 
 
 image_normalize_to_bytes_def = FunctionDef(
diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
@@ -637,8 +637,6 @@ def image_normalize(
         if engine is None or engine.casefold() != "opencv":
             raise ValueError("Must specify the engine, supported value is 'opencv'.")
 
-        import base64
-
         import bigframes.bigquery as bbq
         import bigframes.blob._functions as blob_func
         import bigframes.pandas as bpd
@@ -662,25 +660,25 @@ def image_normalize(
             df["beta"] = beta
             df["norm_type"] = norm_type
             df["ext"] = ext  # type: ignore
-            df["verbose"] = verbose
             res = self._df_apply_udf(df, image_normalize_udf)
 
+            normalized_content_b64_series = res._apply_unary_op(
+                ops.JSONValue(json_path="$.content")
+            )
+            normalized_bytes = bbq.sql_scalar(
+                "FROM_BASE64({0})", columns=[normalized_content_b64_series]
+            )
             if verbose:
                 normalized_status_series = res._apply_unary_op(
                     ops.JSONValue(json_path="$.status")
                 )
-                normalized_content_b64_series = res._apply_unary_op(
-                    ops.JSONValue(json_path="$.content")
-                )
-                # TODO this is not allowed, I need to find another way
-                normalized_bytes = base64.b64decode(normalized_content_b64_series)
                 results_df = bpd.DataFrame(
                     {"status": normalized_status_series, "content": normalized_bytes}
                 )
                 results_struct = bbq.struct(results_df).rename("normalized_results")
                 return results_struct
             else:
-                return res
+                return normalized_bytes.rename("normalized_bytes")
 
         if isinstance(dst, str):
             dst = os.path.join(dst, "")
@@ -708,22 +706,31 @@ def image_normalize(
         df["beta"] = beta
         df["norm_type"] = norm_type
         df["ext"] = ext  # type: ignore
-        # df["verbose"] = verbose
 
         res = self._df_apply_udf(df, image_normalize_udf)
         res.cache()  # to execute the udf
 
+        normalized_content_series = res._apply_unary_op(
+            ops.JSONValue(json_path="$.content")
+        )
+        normalized_content_blobs = normalized_content_series.str.to_blob(
+            connection=connection
+        )
+
         if verbose:
             normalized_status_series = res._apply_unary_op(
                 ops.JSONValue(json_path="$.status")
             )
             results_df = bpd.DataFrame(
-                {"status": normalized_status_series, "content": dst}
+                {
+                    "status": normalized_status_series,
+                    "content": normalized_content_blobs,
+                }
             )
             results_struct = bbq.struct(results_df).rename("normalized_results")
             return results_struct
         else:
-            return dst
+            return normalized_content_blobs.rename("normalized_content")
 
     def pdf_extract(
         self,
@@ -781,7 +788,7 @@ def pdf_extract(
 
         extracted_content_series = res._apply_unary_op(
             ops.JSONValue(json_path="$.content")
-        ).rename("extracted_content")
+        )
 
         if verbose:
             status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
@@ -791,7 +798,7 @@ def pdf_extract(
             results_struct = bbq.struct(results_df).rename("extracted_results")
             return results_struct
         else:
-            return extracted_content_series
+            return extracted_content_series.rename("extracted_content")
 
     def pdf_chunk(
         self,
@@ -865,9 +872,8 @@ def pdf_chunk(
 
         res = self._df_apply_udf(df, pdf_chunk_udf)
 
-        chunked_content_series = bbq.json_extract_string_array(res, "$.content").rename(
-            "chunked_content"
-        )
+        chunked_content_series = bbq.json_extract_string_array(res, "$.content")
+
         if verbose:
             status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
             results_df = bpd.DataFrame(
@@ -876,7 +882,7 @@ def pdf_chunk(
             resultes_struct = bbq.struct(results_df).rename("chunked_results")
             return resultes_struct
         else:
-            return chunked_content_series
+            return chunked_content_series.rename("chunked_content")
 
     def audio_transcribe(
         self,
@@ -934,6 +940,7 @@ def audio_transcribe(
             model_params={"generationConfig": {"temperature": 0.0}},
         )
 
+
         transcribed_content_series = transcribed_results.struct.field("result").rename(
             "transcribed_content"
         )
@@ -949,4 +956,4 @@ def audio_transcribe(
             results_struct = bbq.struct(results_df).rename("transcription_results")
             return results_struct
         else:
-            return transcribed_content_series
+            return transcribed_content_series.rename("transcribed_content")
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
@@ -228,11 +228,13 @@ def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str
     assert actual.dtype == dtypes.BYTES_DTYPE
 
 
+@pytest.mark.parametrize("verbose", [True, False])
 def test_blob_image_normalize_to_series(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
     images_output_uris: list[str],
     session: bigframes.Session,
+    verbose: bool,
 ):
     series = bpd.Series(images_output_uris, session=session).str.to_blob(
         connection=bq_connection
@@ -246,30 +248,48 @@ def test_blob_image_normalize_to_series(
         connection=bq_connection,
         engine="opencv",
     )
-    expected_df = pd.DataFrame(
-        {
-            "uri": images_output_uris,
-            "version": [None, None],
-            "authorizer": [bq_connection.casefold(), bq_connection.casefold()],
-            "details": [None, None],
-        }
-    )
-    pd.testing.assert_frame_equal(
-        actual.struct.explode().to_pandas(),
-        expected_df,
-        check_dtype=False,
-        check_index_type=False,
-    )
 
-    # verify the files exist
-    assert not actual.blob.size().isna().any()
+    if verbose:
+
+        assert hasattr(actual, "struct")
+        actual_exploded = actual.struct.explode()
+        assert "status" in actual_exploded.columns
+        assert "content" in actual_exploded.columns
+
+        status_series = actual_exploded["status"]
+        assert status_series.dtype == dtypes.STRING_DTYPE
+
+        content_series = actual_exploded["content"]
+        # Content should be blob objects for GCS destination
+        assert hasattr(content_series, "blob")
+
+    else:
+        expected_df = pd.DataFrame(
+            {
+                "uri": images_output_uris,
+                "version": [None, None],
+                "authorizer": [bq_connection.casefold(), bq_connection.casefold()],
+                "details": [None, None],
+            }
+        )
+        pd.testing.assert_frame_equal(
+            actual.struct.explode().to_pandas(),
+            expected_df,
+            check_dtype=False,
+            check_index_type=False,
+        )
+
+        # verify the files exist
+        assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.parametrize("verbose", [True, False])
 def test_blob_image_normalize_to_folder(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
     images_output_folder: str,
     images_output_uris: list[str],
+    verbose: bool,
 ):
     actual = images_mm_df["blob_col"].blob.image_normalize(
         alpha=50.0,
@@ -298,10 +318,7 @@ def test_blob_image_normalize_to_folder(
     assert not actual.blob.size().isna().any()
 
 
-@pytest.mark.parametrize(
-    "verbose",
-    [True, False],
-)
+@pytest.mark.parametrize("verbose", [True, False])
 def test_blob_image_normalize_to_bq(
     images_mm_df: bpd.DataFrame, bq_connection: str, verbose: bool
 ):