@@ -790,25 +790,9 @@ def pdf_extract(
790790 ) -> bigframes .series .Series :
791791 """Extracts text from PDF URLs and saves the text as string.
792792
793- Args:
794- engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
795- connection (str or None, default None): BQ connection used for
796- function internet transactions, and the output blob if "dst"
797- is str. If None, uses default connection of the session.
798- max_batching_rows (int, default 1): Max number of rows per batch
799- sent to Cloud Run to execute the function.
800- container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
801- container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
802- verbose (bool, default False): controls the verbosity of the output.
803- When set to True, both error messages and the extracted content
804- are displayed. Conversely, when set to False, only the extracted
805- content is presented, suppressing error messages.
806-
807- Returns:
808- bigframes.series.Series: str or struct[str, str],
809- depending on the "verbose" parameter.
810- Contains the extracted text from the PDF file.
811- Includes error messages if verbosity is enabled.
793+ Raises:
794+ ValueError: If engine is not 'pypdf'.
795+ RuntimeError: If PDF extraction fails or returns invalid structure.
812796 """
813797 if engine is None or engine .casefold () != "pypdf" :
814798 raise ValueError ("Must specify the engine, supported value is 'pypdf'." )
@@ -830,20 +814,37 @@ def pdf_extract(
830814
831815 df = self .get_runtime_json_str (mode = "R" ).to_frame ()
832816 df ["verbose" ] = verbose
833- res = self ._df_apply_udf (df , pdf_extract_udf )
817+
818+ try :
819+ res = self ._df_apply_udf (df , pdf_extract_udf )
820+ except Exception as e :
821+ raise RuntimeError (f"PDF extraction UDF failed: { e } " ) from e
822+
823+ # Validate result is not None
824+ if res is None :
825+ raise RuntimeError ("PDF extraction returned None result" )
826+
827+ # Extract content with error handling
828+ try :
829+ content_series = res ._apply_unary_op (ops .JSONValue (json_path = "$.content" ))
830+ except Exception as e :
831+ raise RuntimeError (
832+ f"Failed to extract content field from PDF result: { e } "
833+ ) from e
834834
835835 if verbose :
836- extracted_content_series = res ._apply_unary_op (
837- ops .JSONValue (json_path = "$.content" )
838- )
839- status_series = res ._apply_unary_op (ops .JSONValue (json_path = "$.status" ))
840- results_df = bpd .DataFrame (
841- {"status" : status_series , "content" : extracted_content_series }
842- )
843- results_struct = bbq .struct (results_df ).rename ("extracted_results" )
844- return results_struct
836+ try :
837+ status_series = res ._apply_unary_op (ops .JSONValue (json_path = "$.status" ))
838+ except Exception as e :
839+ raise RuntimeError (
840+ f"Failed to extract status field from PDF result: { e } "
841+ ) from e
842+
843+ res_df = bpd .DataFrame ({"status" : status_series , "content" : content_series })
844+ struct_series = bbq .struct (res_df )
845+ return struct_series
845846 else :
846- return res . rename ( "extracted_content" )
847+ return content_series
847848
848849 def pdf_chunk (
849850 self ,
@@ -860,30 +861,9 @@ def pdf_chunk(
860861 """Extracts and chunks text from PDF URLs and saves the text as
861862 arrays of strings.
862863
863- Args:
864- engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
865- connection (str or None, default None): BQ connection used for
866- function internet transactions, and the output blob if "dst"
867- is str. If None, uses default connection of the session.
868- chunk_size (int, default 2000): the desired size of each text chunk
869- (number of characters).
870- overlap_size (int, default 200): the number of overlapping characters
871- between consecutive chunks. This helps to ensure context is
872- preserved across chunk boundaries.
873- max_batching_rows (int, default 1): Max number of rows per batch
874- sent to Cloud Run to execute the function.
875- container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
876- container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
877- verbose (bool, default False): controls the verbosity of the output.
878- When set to True, both error messages and the extracted content
879- are displayed. Conversely, when set to False, only the extracted
880- content is presented, suppressing error messages.
881-
882- Returns:
883- bigframes.series.Series: array[str] or struct[str, array[str]],
884- depending on the "verbose" parameter.
885- where each string is a chunk of text extracted from PDF.
886- Includes error messages if verbosity is enabled.
864+ Raises:
865+ ValueError: If engine is not 'pypdf'.
866+ RuntimeError: If PDF chunking fails or returns invalid structure.
887867 """
888868 if engine is None or engine .casefold () != "pypdf" :
889869 raise ValueError ("Must specify the engine, supported value is 'pypdf'." )
@@ -915,13 +895,31 @@ def pdf_chunk(
915895 df ["overlap_size" ] = overlap_size
916896 df ["verbose" ] = verbose
917897
918- res = self ._df_apply_udf (df , pdf_chunk_udf )
898+ try :
899+ res = self ._df_apply_udf (df , pdf_chunk_udf )
900+ except Exception as e :
901+ raise RuntimeError (f"PDF chunking UDF failed: { e } " ) from e
902+
903+ if res is None :
904+ raise RuntimeError ("PDF chunking returned None result" )
905+
906+ try :
907+ content_series = bbq .json_extract_string_array (res , "$.content" )
908+ except Exception as e :
909+ raise RuntimeError (
910+ f"Failed to extract content array from PDF chunk result: { e } "
911+ ) from e
919912
920913 if verbose :
921- chunked_content_series = bbq .json_extract_string_array (res , "$.content" )
922- status_series = res ._apply_unary_op (ops .JSONValue (json_path = "$.status" ))
914+ try :
915+ status_series = res ._apply_unary_op (ops .JSONValue (json_path = "$.status" ))
916+ except Exception as e :
917+ raise RuntimeError (
918+ f"Failed to extract status field from PDF chunk result: { e } "
919+ ) from e
920+
923921 results_df = bpd .DataFrame (
924- {"status" : status_series , "content" : chunked_content_series }
922+ {"status" : status_series , "content" : content_series }
925923 )
926924 resultes_struct = bbq .struct (results_df ).rename ("chunked_results" )
927925 return resultes_struct
0 commit comments