diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 51c030a23b..8d1ca38e62 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str: return result_json -pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"]) +pdf_extract_def = FunctionDef( + pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"] +) # Extracts text from a PDF url and chunks it simultaneously @@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s return result_json -pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"]) +pdf_chunk_def = FunctionDef( + pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"] +) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index a594b144f5..c8fa63d493 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: @pytest.mark.parametrize( - "verbose, expected", + "verbose", [ - ( - True, - pd.Series( - [ - {"status": "File has not been decrypted", "content": ""}, - { - "status": "", - "content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ", - }, - ] - ), - ), - ( - False, - pd.Series( - [ - "", - "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ", - ], - name="pdf", - ), - ), + (True), + (False), ], ) def test_blob_pdf_extract( pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, - expected: pd.Series, ): actual = ( pdf_mm_df["pdf"] @@ -341,49 +320,44 @@ def test_blob_pdf_extract( .to_pandas() ) - pd.testing.assert_series_equal( - actual, - expected, - check_dtype=False, - check_index=False, + # check relative length + expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes." + expected_len = len(expected_text) + + actual_text = "" + if verbose: + # The first entry is for a file that doesn't exist, so we check the second one + successful_results = actual[actual.apply(lambda x: x["status"] == "")] + actual_text = successful_results.apply(lambda x: x["content"]).iloc[0] + else: + actual_text = actual[actual != ""].iloc[0] + actual_len = len(actual_text) + + relative_length_tolerance = 0.25 + min_acceptable_len = expected_len * (1 - relative_length_tolerance) + max_acceptable_len = expected_len * (1 + relative_length_tolerance) + assert min_acceptable_len <= actual_len <= max_acceptable_len, ( + f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range " + f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. " + f"Expected reference length was {expected_len}. " ) + # check for major keywords + major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"] + for keyword in major_keywords: + assert ( + keyword.lower() in actual_text.lower() + ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. " + @pytest.mark.parametrize( - "verbose, expected", + "verbose", [ - ( - True, - pd.Series( - [ - {"status": "File has not been decrypted", "content": []}, - { - "status": "", - "content": [ - "Sample PDF This is a testing file. Some ", - "dummy messages are used for testing ", - "purposes. ", - ], - }, - ] - ), - ), - ( - False, - pd.Series( - [ - pd.NA, - "Sample PDF This is a testing file. Some ", - "dummy messages are used for testing ", - "purposes. ", - ], - ), - ), + (True), + (False), ], ) -def test_blob_pdf_chunk( - pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series -): +def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str): actual = ( pdf_mm_df["pdf"] .blob.pdf_chunk( @@ -397,13 +371,36 @@ def test_blob_pdf_chunk( .to_pandas() ) - pd.testing.assert_series_equal( - actual, - expected, - check_dtype=False, - check_index=False, + # check relative length + expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes." + expected_len = len(expected_text) + + actual_text = "" + if verbose: + # The first entry is for a file that doesn't exist, so we check the second one + successful_results = actual[actual.apply(lambda x: x["status"] == "")] + actual_text = "".join(successful_results.apply(lambda x: x["content"]).iloc[0]) + else: + # First entry is NA + actual_text = "".join(actual.dropna()) + actual_len = len(actual_text) + + relative_length_tolerance = 0.25 + min_acceptable_len = expected_len * (1 - relative_length_tolerance) + max_acceptable_len = expected_len * (1 + relative_length_tolerance) + assert min_acceptable_len <= actual_len <= max_acceptable_len, ( + f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range " + f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. " + f"Expected reference length was {expected_len}. " ) + # check for major keywords + major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"] + for keyword in major_keywords: + assert ( + keyword.lower() in actual_text.lower() + ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. " + @pytest.mark.parametrize( "model_name, verbose",