Skip to content

Commit b754f59

Browse files
committed
add error handling for pdf functions
1 parent 462c3bd commit b754f59

File tree

1 file changed

+56
-58
lines changed

1 file changed

+56
-58
lines changed

bigframes/operations/blob.py

Lines changed: 56 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -790,25 +790,9 @@ def pdf_extract(
790790
) -> bigframes.series.Series:
791791
"""Extracts text from PDF URLs and saves the text as string.
792792
793-
Args:
794-
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
795-
connection (str or None, default None): BQ connection used for
796-
function internet transactions, and the output blob if "dst"
797-
is str. If None, uses default connection of the session.
798-
max_batching_rows (int, default 1): Max number of rows per batch
799-
sent to Cloud Run to execute the function.
800-
container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
801-
container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
802-
verbose (bool, default False): controls the verbosity of the output.
803-
When set to True, both error messages and the extracted content
804-
are displayed. Conversely, when set to False, only the extracted
805-
content is presented, suppressing error messages.
806-
807-
Returns:
808-
bigframes.series.Series: str or struct[str, str],
809-
depending on the "verbose" parameter.
810-
Contains the extracted text from the PDF file.
811-
Includes error messages if verbosity is enabled.
793+
Raises:
794+
ValueError: If engine is not 'pypdf'.
795+
RuntimeError: If PDF extraction fails or returns invalid structure.
812796
"""
813797
if engine is None or engine.casefold() != "pypdf":
814798
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
@@ -830,20 +814,37 @@ def pdf_extract(
830814

831815
df = self.get_runtime_json_str(mode="R").to_frame()
832816
df["verbose"] = verbose
833-
res = self._df_apply_udf(df, pdf_extract_udf)
817+
818+
try:
819+
res = self._df_apply_udf(df, pdf_extract_udf)
820+
except Exception as e:
821+
raise RuntimeError(f"PDF extraction UDF failed: {e}") from e
822+
823+
# Validate result is not None
824+
if res is None:
825+
raise RuntimeError("PDF extraction returned None result")
826+
827+
# Extract content with error handling
828+
try:
829+
content_series = res._apply_unary_op(ops.JSONValue(json_path="$.content"))
830+
except Exception as e:
831+
raise RuntimeError(
832+
f"Failed to extract content field from PDF result: {e}"
833+
) from e
834834

835835
if verbose:
836-
extracted_content_series = res._apply_unary_op(
837-
ops.JSONValue(json_path="$.content")
838-
)
839-
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
840-
results_df = bpd.DataFrame(
841-
{"status": status_series, "content": extracted_content_series}
842-
)
843-
results_struct = bbq.struct(results_df).rename("extracted_results")
844-
return results_struct
836+
try:
837+
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
838+
except Exception as e:
839+
raise RuntimeError(
840+
f"Failed to extract status field from PDF result: {e}"
841+
) from e
842+
843+
res_df = bpd.DataFrame({"status": status_series, "content": content_series})
844+
struct_series = bbq.struct(res_df)
845+
return struct_series
845846
else:
846-
return res.rename("extracted_content")
847+
return content_series
847848

848849
def pdf_chunk(
849850
self,
@@ -860,30 +861,9 @@ def pdf_chunk(
860861
"""Extracts and chunks text from PDF URLs and saves the text as
861862
arrays of strings.
862863
863-
Args:
864-
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
865-
connection (str or None, default None): BQ connection used for
866-
function internet transactions, and the output blob if "dst"
867-
is str. If None, uses default connection of the session.
868-
chunk_size (int, default 2000): the desired size of each text chunk
869-
(number of characters).
870-
overlap_size (int, default 200): the number of overlapping characters
871-
between consecutive chunks. This helps to ensure context is
872-
preserved across chunk boundaries.
873-
max_batching_rows (int, default 1): Max number of rows per batch
874-
sent to Cloud Run to execute the function.
875-
container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
876-
container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
877-
verbose (bool, default False): controls the verbosity of the output.
878-
When set to True, both error messages and the extracted content
879-
are displayed. Conversely, when set to False, only the extracted
880-
content is presented, suppressing error messages.
881-
882-
Returns:
883-
bigframes.series.Series: array[str] or struct[str, array[str]],
884-
depending on the "verbose" parameter.
885-
where each string is a chunk of text extracted from PDF.
886-
Includes error messages if verbosity is enabled.
864+
Raises:
865+
ValueError: If engine is not 'pypdf'.
866+
RuntimeError: If PDF chunking fails or returns invalid structure.
887867
"""
888868
if engine is None or engine.casefold() != "pypdf":
889869
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
@@ -915,13 +895,31 @@ def pdf_chunk(
915895
df["overlap_size"] = overlap_size
916896
df["verbose"] = verbose
917897

918-
res = self._df_apply_udf(df, pdf_chunk_udf)
898+
try:
899+
res = self._df_apply_udf(df, pdf_chunk_udf)
900+
except Exception as e:
901+
raise RuntimeError(f"PDF chunking UDF failed: {e}") from e
902+
903+
if res is None:
904+
raise RuntimeError("PDF chunking returned None result")
905+
906+
try:
907+
content_series = bbq.json_extract_string_array(res, "$.content")
908+
except Exception as e:
909+
raise RuntimeError(
910+
f"Failed to extract content array from PDF chunk result: {e}"
911+
) from e
919912

920913
if verbose:
921-
chunked_content_series = bbq.json_extract_string_array(res, "$.content")
922-
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
914+
try:
915+
status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status"))
916+
except Exception as e:
917+
raise RuntimeError(
918+
f"Failed to extract status field from PDF chunk result: {e}"
919+
) from e
920+
923921
results_df = bpd.DataFrame(
924-
{"status": status_series, "content": chunked_content_series}
922+
{"status": status_series, "content": content_series}
925923
)
926924
resultes_struct = bbq.struct(results_df).rename("chunked_results")
927925
return resultes_struct

0 commit comments

Comments
 (0)