From faa516e293913fc5a6003f850bceb43dc7b29296 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 12 Aug 2025 21:01:37 +0000 Subject: [PATCH 1/4] fix: Correct pypdf dependency specifier for remote PDF functions --- bigframes/blob/_functions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 51c030a23b..2e161e37ad 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: return result_json -pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"]) +pdf_extract_def = FunctionDef( + pdf_extract_func, ["pypdf", "requests", "cryptography>=3.4.0"] +) # Extracts text from a PDF url and chunks it simultaneously @@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s return result_json -pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"]) +pdf_chunk_def = FunctionDef( + pdf_chunk_func, ["pypdf", "requests", "cryptography>=3.4.0"] +) From 2f358295792ac718eaf643d06ff08a671dc7d03c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 12 Aug 2025 21:12:15 +0000 Subject: [PATCH 2/4] specfy a version for pypdf as well --- bigframes/blob/_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 2e161e37ad..aa7cd4443e 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: pdf_extract_def = FunctionDef( - pdf_extract_func, ["pypdf", "requests", "cryptography>=3.4.0"] + pdf_extract_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"] ) @@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s pdf_chunk_def = FunctionDef( - pdf_chunk_func, ["pypdf", "requests", "cryptography>=3.4.0"] + pdf_chunk_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"] ) From 5116a2df6eb47803ffc29c2447a3593a5fb07ff6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 13 Aug 2025 17:47:06 +0000 Subject: [PATCH 3/4] testcase change --- bigframes/blob/_functions.py | 4 +- tests/system/large/blob/test_function.py | 127 +++++++++++------------ 2 files changed, 64 insertions(+), 67 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index aa7cd4443e..057db0089d 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: pdf_extract_def = FunctionDef( - pdf_extract_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"] + pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"] ) @@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s pdf_chunk_def = FunctionDef( - pdf_chunk_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"] + pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"] ) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index a594b144f5..c8fa63d493 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: @pytest.mark.parametrize( - "verbose, expected", + "verbose", [ - ( - True, - pd.Series( - [ - {"status": "File has not been decrypted", "content": ""}, - { - "status": "", - "content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ", - }, - ] - ), - ), - ( - False, - pd.Series( - [ - "", - "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ", - ], - name="pdf", - ), - ), + (True), + (False), ], ) def test_blob_pdf_extract( pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, - expected: pd.Series, ): actual = ( pdf_mm_df["pdf"] @@ -341,49 +320,44 @@ def test_blob_pdf_extract( .to_pandas() ) - pd.testing.assert_series_equal( - actual, - expected, - check_dtype=False, - check_index=False, + # check relative length + expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes." + expected_len = len(expected_text) + + actual_text = "" + if verbose: + # The first entry is for a file that doesn't exist, so we check the second one + successful_results = actual[actual.apply(lambda x: x["status"] == "")] + actual_text = successful_results.apply(lambda x: x["content"]).iloc[0] + else: + actual_text = actual[actual != ""].iloc[0] + actual_len = len(actual_text) + + relative_length_tolerance = 0.25 + min_acceptable_len = expected_len * (1 - relative_length_tolerance) + max_acceptable_len = expected_len * (1 + relative_length_tolerance) + assert min_acceptable_len <= actual_len <= max_acceptable_len, ( + f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range " + f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. " + f"Expected reference length was {expected_len}. " ) + # check for major keywords + major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"] + for keyword in major_keywords: + assert ( + keyword.lower() in actual_text.lower() + ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. " + @pytest.mark.parametrize( - "verbose, expected", + "verbose", [ - ( - True, - pd.Series( - [ - {"status": "File has not been decrypted", "content": []}, - { - "status": "", - "content": [ - "Sample PDF This is a testing file. Some ", - "dummy messages are used for testing ", - "purposes. ", - ], - }, - ] - ), - ), - ( - False, - pd.Series( - [ - pd.NA, - "Sample PDF This is a testing file. Some ", - "dummy messages are used for testing ", - "purposes. ", - ], - ), - ), + (True), + (False), ], ) -def test_blob_pdf_chunk( - pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series -): +def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str): actual = ( pdf_mm_df["pdf"] .blob.pdf_chunk( @@ -397,13 +371,36 @@ def test_blob_pdf_chunk( .to_pandas() ) - pd.testing.assert_series_equal( - actual, - expected, - check_dtype=False, - check_index=False, + # check relative length + expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes." + expected_len = len(expected_text) + + actual_text = "" + if verbose: + # The first entry is for a file that doesn't exist, so we check the second one + successful_results = actual[actual.apply(lambda x: x["status"] == "")] + actual_text = "".join(successful_results.apply(lambda x: x["content"]).iloc[0]) + else: + # First entry is NA + actual_text = "".join(actual.dropna()) + actual_len = len(actual_text) + + relative_length_tolerance = 0.25 + min_acceptable_len = expected_len * (1 - relative_length_tolerance) + max_acceptable_len = expected_len * (1 + relative_length_tolerance) + assert min_acceptable_len <= actual_len <= max_acceptable_len, ( + f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range " + f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. " + f"Expected reference length was {expected_len}. " ) + # check for major keywords + major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"] + for keyword in major_keywords: + assert ( + keyword.lower() in actual_text.lower() + ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. " + @pytest.mark.parametrize( "model_name, verbose", From b4459699842fda873f83a10f4cad335a7f739cd5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 13 Aug 2025 20:40:50 +0000 Subject: [PATCH 4/4] specify a version for cryptography --- bigframes/blob/_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 057db0089d..8d1ca38e62 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: pdf_extract_def = FunctionDef( - pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"] + pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"] ) @@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s pdf_chunk_def = FunctionDef( - pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"] + pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"] )