Skip to content

Commit 055b45e

Browse files
fix: Handle generic types in UnsupportedTypeError
This commit fixes a test failure caused by an `AttributeError` when handling generic types from the `typing` module in the `UnsupportedTypeError` class. The `__init__` method of `UnsupportedTypeError` has been updated to check if a type is a generic from the `typing` module and, if so, convert it to a string directly to get the full type representation (e.g., `list[str]`). This ensures that the error message is generated correctly without raising an `AttributeError`. A new unit test has also been added to `tests/unit/functions/test_function_typing.py` to verify the fix.
1 parent ea7e8bb commit 055b45e

File tree

14 files changed

+263
-892
lines changed

14 files changed

+263
-892
lines changed

bigframes/blob/_functions.py

Lines changed: 97 additions & 188 deletions
Large diffs are not rendered by default.

bigframes/core/compile/sqlglot/expressions/comparison_ops.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,6 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
109109
return sge.LTE(this=left_expr, expression=right_expr)
110110

111111

112-
@register_binary_op(ops.minimum_op)
113-
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
114-
return sge.Least(this=left.expr, expressions=right.expr)
115-
116-
117112
@register_binary_op(ops.ne_op)
118113
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
119114
left_expr = _coerce_bool_to_int(left)

bigframes/core/compile/sqlglot/expressions/generic_ops.py

Lines changed: 0 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -159,30 +159,6 @@ def _(*cases_and_outputs: TypedExpr) -> sge.Expression:
159159
)
160160

161161

162-
@register_nary_op(ops.RowKey)
163-
def _(*values: TypedExpr) -> sge.Expression:
164-
# All inputs into hash must be non-null or resulting hash will be null
165-
str_values = [_convert_to_nonnull_string_sqlglot(value) for value in values]
166-
167-
full_row_hash_p1 = sge.func("FARM_FINGERPRINT", sge.Concat(expressions=str_values))
168-
169-
# By modifying value slightly, we get another hash uncorrelated with the first
170-
full_row_hash_p2 = sge.func(
171-
"FARM_FINGERPRINT", sge.Concat(expressions=[*str_values, sge.convert("_")])
172-
)
173-
174-
# Used to disambiguate between identical rows (which will have identical hash)
175-
random_hash_p3 = sge.func("RAND")
176-
177-
return sge.Concat(
178-
expressions=[
179-
sge.Cast(this=full_row_hash_p1, to="STRING"),
180-
sge.Cast(this=full_row_hash_p2, to="STRING"),
181-
sge.Cast(this=random_hash_p3, to="STRING"),
182-
]
183-
)
184-
185-
186162
# Helper functions
187163
def _cast_to_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression:
188164
from_type = expr.dtype
@@ -242,32 +218,3 @@ def _cast(expr: sge.Expression, to: str, safe: bool):
242218
return sge.TryCast(this=expr, to=to)
243219
else:
244220
return sge.Cast(this=expr, to=to)
245-
246-
247-
def _convert_to_nonnull_string_sqlglot(expr: TypedExpr) -> sge.Expression:
248-
col_type = expr.dtype
249-
sg_expr = expr.expr
250-
251-
if col_type == dtypes.STRING_DTYPE:
252-
result = sg_expr
253-
elif (
254-
dtypes.is_numeric(col_type)
255-
or dtypes.is_time_or_date_like(col_type)
256-
or col_type == dtypes.BYTES_DTYPE
257-
):
258-
result = sge.Cast(this=sg_expr, to="STRING")
259-
elif col_type == dtypes.GEO_DTYPE:
260-
result = sge.func("ST_ASTEXT", sg_expr)
261-
else:
262-
# TO_JSON_STRING works with all data types, but isn't the most efficient
263-
# Needed for JSON, STRUCT and ARRAY datatypes
264-
result = sge.func("TO_JSON_STRING", sg_expr)
265-
266-
# Escape backslashes and use backslash as delineator
267-
escaped = sge.func(
268-
"REPLACE",
269-
sge.func("COALESCE", result, sge.convert("")),
270-
sge.convert("\\"),
271-
sge.convert("\\\\"),
272-
)
273-
return sge.Concat(expressions=[sge.convert("\\"), escaped])

bigframes/core/compile/sqlglot/expressions/numeric_ops.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -377,14 +377,6 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
377377
return result
378378

379379

380-
@register_binary_op(ops.round_op)
381-
def _(expr: TypedExpr, n_digits: TypedExpr) -> sge.Expression:
382-
rounded = sge.Round(this=expr.expr, decimals=n_digits.expr)
383-
if expr.dtype == dtypes.INT_DTYPE:
384-
return sge.Cast(this=rounded, to="INT64")
385-
return rounded
386-
387-
388380
@register_binary_op(ops.sub_op)
389381
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
390382
if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype):

bigframes/functions/function_typing.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,16 @@ def __init__(self, type_, supported_types):
6565
if isinstance(supported_types, dict):
6666
types_to_format = supported_types.keys()
6767

68-
supported_types_str = ", ".join(sorted([t.__name__ for t in types_to_format]))
68+
supported_types_str = ", ".join(
69+
sorted([getattr(t, "__name__", str(t)) for t in types_to_format])
70+
)
71+
if get_origin(type_) is not None:
72+
type_str = str(type_)
73+
else:
74+
type_str = getattr(type_, "__name__", str(type_))
6975

7076
super().__init__(
71-
f"'{type_.__name__}' must be one of the supported types ({supported_types_str}) "
77+
f"'{type_str}' must be one of the supported types ({supported_types_str}) "
7278
"or a list of one of those types."
7379
)
7480

bigframes/operations/blob.py

Lines changed: 30 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -193,20 +193,6 @@ def _df_apply_udf(
193193

194194
return s
195195

196-
def _apply_udf_or_raise_error(
197-
self, df: bigframes.dataframe.DataFrame, udf, operation_name: str
198-
) -> bigframes.series.Series:
199-
"""Helper to apply UDF with consistent error handling."""
200-
try:
201-
res = self._df_apply_udf(df, udf)
202-
except Exception as e:
203-
raise RuntimeError(f"{operation_name} UDF execution failed: {e}") from e
204-
205-
if res is None:
206-
raise RuntimeError(f"{operation_name} returned None result")
207-
208-
return res
209-
210196
def read_url(self) -> bigframes.series.Series:
211197
"""Retrieve the read URL of the Blob.
212198
@@ -357,10 +343,6 @@ def exif(
357343
358344
Returns:
359345
bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True.
360-
361-
Raises:
362-
ValueError: If engine is not 'pillow'.
363-
RuntimeError: If EXIF extraction fails or returns invalid structure.
364346
"""
365347
if engine is None or engine.casefold() != "pillow":
366348
raise ValueError("Must specify the engine, supported value is 'pillow'.")
@@ -382,28 +364,22 @@ def exif(
382364
container_memory=container_memory,
383365
).udf()
384366

385-
res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction")
367+
res = self._df_apply_udf(df, exif_udf)
386368

387369
if verbose:
388-
try:
389-
exif_content_series = bbq.parse_json(
390-
res._apply_unary_op(ops.JSONValue(json_path="$.content"))
391-
).rename("exif_content")
392-
exif_status_series = res._apply_unary_op(
393-
ops.JSONValue(json_path="$.status")
394-
)
395-
except Exception as e:
396-
raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
370+
exif_content_series = bbq.parse_json(
371+
res._apply_unary_op(ops.JSONValue(json_path="$.content"))
372+
).rename("exif_content")
373+
exif_status_series = res._apply_unary_op(
374+
ops.JSONValue(json_path="$.status")
375+
)
397376
results_df = bpd.DataFrame(
398377
{"status": exif_status_series, "content": exif_content_series}
399378
)
400379
results_struct = bbq.struct(results_df).rename("exif_results")
401380
return results_struct
402381
else:
403-
try:
404-
return bbq.parse_json(res)
405-
except Exception as e:
406-
raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
382+
return bbq.parse_json(res)
407383

408384
def image_blur(
409385
self,
@@ -435,10 +411,6 @@ def image_blur(
435411
436412
Returns:
437413
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content.
438-
439-
Raises:
440-
ValueError: If engine is not 'opencv' or parameters are invalid.
441-
RuntimeError: If image blur operation fails.
442414
"""
443415
if engine is None or engine.casefold() != "opencv":
444416
raise ValueError("Must specify the engine, supported value is 'opencv'.")
@@ -465,7 +437,7 @@ def image_blur(
465437
df["ksize_x"], df["ksize_y"] = ksize
466438
df["ext"] = ext # type: ignore
467439
df["verbose"] = verbose
468-
res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur")
440+
res = self._df_apply_udf(df, image_blur_udf)
469441

470442
if verbose:
471443
blurred_content_b64_series = res._apply_unary_op(
@@ -514,7 +486,7 @@ def image_blur(
514486
df["ext"] = ext # type: ignore
515487
df["verbose"] = verbose
516488

517-
res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur")
489+
res = self._df_apply_udf(df, image_blur_udf)
518490
res.cache() # to execute the udf
519491

520492
if verbose:
@@ -568,10 +540,6 @@ def image_resize(
568540
569541
Returns:
570542
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content.
571-
572-
Raises:
573-
ValueError: If engine is not 'opencv' or parameters are invalid.
574-
RuntimeError: If image resize operation fails.
575543
"""
576544
if engine is None or engine.casefold() != "opencv":
577545
raise ValueError("Must specify the engine, supported value is 'opencv'.")
@@ -602,11 +570,11 @@ def image_resize(
602570
container_memory=container_memory,
603571
).udf()
604572

605-
df["dsize_x"], df["dsize_y"] = dsize
573+
df["dsize_x"], df["dsizye_y"] = dsize
606574
df["fx"], df["fy"] = fx, fy
607575
df["ext"] = ext # type: ignore
608576
df["verbose"] = verbose
609-
res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize")
577+
res = self._df_apply_udf(df, image_resize_udf)
610578

611579
if verbose:
612580
resized_content_b64_series = res._apply_unary_op(
@@ -652,12 +620,12 @@ def image_resize(
652620
dst_rt = dst.blob.get_runtime_json_str(mode="RW")
653621

654622
df = df.join(dst_rt, how="outer")
655-
df["dsize_x"], df["dsize_y"] = dsize
623+
df["dsize_x"], df["dsizye_y"] = dsize
656624
df["fx"], df["fy"] = fx, fy
657625
df["ext"] = ext # type: ignore
658626
df["verbose"] = verbose
659627

660-
res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize")
628+
res = self._df_apply_udf(df, image_resize_udf)
661629
res.cache() # to execute the udf
662630

663631
if verbose:
@@ -711,10 +679,6 @@ def image_normalize(
711679
712680
Returns:
713681
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content.
714-
715-
Raises:
716-
ValueError: If engine is not 'opencv' or parameters are invalid.
717-
RuntimeError: If image normalize operation fails.
718682
"""
719683
if engine is None or engine.casefold() != "opencv":
720684
raise ValueError("Must specify the engine, supported value is 'opencv'.")
@@ -743,9 +707,7 @@ def image_normalize(
743707
df["norm_type"] = norm_type
744708
df["ext"] = ext # type: ignore
745709
df["verbose"] = verbose
746-
res = self._apply_udf_or_raise_error(
747-
df, image_normalize_udf, "Image normalize"
748-
)
710+
res = self._df_apply_udf(df, image_normalize_udf)
749711

750712
if verbose:
751713
normalized_content_b64_series = res._apply_unary_op(
@@ -796,7 +758,7 @@ def image_normalize(
796758
df["ext"] = ext # type: ignore
797759
df["verbose"] = verbose
798760

799-
res = self._apply_udf_or_raise_error(df, image_normalize_udf, "Image normalize")
761+
res = self._df_apply_udf(df, image_normalize_udf)
800762
res.cache() # to execute the udf
801763

802764
if verbose:
@@ -847,10 +809,6 @@ def pdf_extract(
847809
depend on the "verbose" parameter.
848810
Contains the extracted text from the PDF file.
849811
Includes error messages if verbosity is enabled.
850-
851-
Raises:
852-
ValueError: If engine is not 'pypdf'.
853-
RuntimeError: If PDF extraction fails or returns invalid structure.
854812
"""
855813
if engine is None or engine.casefold() != "pypdf":
856814
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
@@ -872,29 +830,18 @@ def pdf_extract(
872830

873831
df = self.get_runtime_json_str(mode="R").to_frame()
874832
df["verbose"] = verbose
875-
876-
res = self._apply_udf_or_raise_error(df, pdf_extract_udf, "PDF extraction")
833+
res = self._df_apply_udf(df, pdf_extract_udf)
877834

878835
if verbose:
879-
# Extract content with error handling
880-
try:
881-
content_series = res._apply_unary_op(
882-
ops.JSONValue(json_path="$.content")
883-
)
884-
except Exception as e:
885-
raise RuntimeError(
886-
f"Failed to extract content field from PDF result: {e}"
887-
) from e
888-
try:
889-
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
890-
except Exception as e:
891-
raise RuntimeError(
892-
f"Failed to extract status field from PDF result: {e}"
893-
) from e
894-
895-
res_df = bpd.DataFrame({"status": status_series, "content": content_series})
896-
struct_series = bbq.struct(res_df).rename("extracted_results")
897-
return struct_series
836+
extracted_content_series = res._apply_unary_op(
837+
ops.JSONValue(json_path="$.content")
838+
)
839+
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
840+
results_df = bpd.DataFrame(
841+
{"status": status_series, "content": extracted_content_series}
842+
)
843+
results_struct = bbq.struct(results_df).rename("extracted_results")
844+
return results_struct
898845
else:
899846
return res.rename("extracted_content")
900847

@@ -937,10 +884,6 @@ def pdf_chunk(
937884
depend on the "verbose" parameter.
938885
where each string is a chunk of text extracted from PDF.
939886
Includes error messages if verbosity is enabled.
940-
941-
Raises:
942-
ValueError: If engine is not 'pypdf'.
943-
RuntimeError: If PDF chunking fails or returns invalid structure.
944887
"""
945888
if engine is None or engine.casefold() != "pypdf":
946889
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
@@ -972,25 +915,13 @@ def pdf_chunk(
972915
df["overlap_size"] = overlap_size
973916
df["verbose"] = verbose
974917

975-
res = self._apply_udf_or_raise_error(df, pdf_chunk_udf, "PDF chunking")
976-
977-
try:
978-
content_series = bbq.json_extract_string_array(res, "$.content")
979-
except Exception as e:
980-
raise RuntimeError(
981-
f"Failed to extract content array from PDF chunk result: {e}"
982-
) from e
918+
res = self._df_apply_udf(df, pdf_chunk_udf)
983919

984920
if verbose:
985-
try:
986-
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
987-
except Exception as e:
988-
raise RuntimeError(
989-
f"Failed to extract status field from PDF chunk result: {e}"
990-
) from e
991-
921+
chunked_content_series = bbq.json_extract_string_array(res, "$.content")
922+
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
992923
results_df = bpd.DataFrame(
993-
{"status": status_series, "content": content_series}
924+
{"status": status_series, "content": chunked_content_series}
994925
)
995926
resultes_struct = bbq.struct(results_df).rename("chunked_results")
996927
return resultes_struct
@@ -1031,10 +962,6 @@ def audio_transcribe(
1031962
depend on the "verbose" parameter.
1032963
Contains the transcribed text from the audio file.
1033964
Includes error messages if verbosity is enabled.
1034-
1035-
Raises:
1036-
ValueError: If engine is not 'bigquery'.
1037-
RuntimeError: If the transcription result structure is invalid.
1038965
"""
1039966
if engine.casefold() != "bigquery":
1040967
raise ValueError("Must specify the engine, supported value is 'bigquery'.")
@@ -1057,10 +984,6 @@ def audio_transcribe(
1057984
model_params={"generationConfig": {"temperature": 0.0}},
1058985
)
1059986

1060-
# Validate that the result is not None
1061-
if transcribed_results is None:
1062-
raise RuntimeError("Transcription returned None result")
1063-
1064987
transcribed_content_series = transcribed_results.struct.field("result").rename(
1065988
"transcribed_content"
1066989
)

0 commit comments

Comments (0)