From faa516e293913fc5a6003f850bceb43dc7b29296 Mon Sep 17 00:00:00 2001
From: Shuowei Li <shuowei@google.com>
Date: Tue, 12 Aug 2025 21:01:37 +0000
Subject: [PATCH 1/4] fix: Correct pypdf dependency specifier for remote PDF
 functions

---
 bigframes/blob/_functions.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index 51c030a23b..2e161e37ad 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
     return result_json
 
 
-pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"])
+pdf_extract_def = FunctionDef(
+    pdf_extract_func, ["pypdf", "requests", "cryptography>=3.4.0"]
+)
 
 
 # Extracts text from a PDF url and chunks it simultaneously
@@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
     return result_json
 
 
-pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"])
+pdf_chunk_def = FunctionDef(
+    pdf_chunk_func, ["pypdf", "requests", "cryptography>=3.4.0"]
+)

From 2f358295792ac718eaf643d06ff08a671dc7d03c Mon Sep 17 00:00:00 2001
From: Shuowei Li <shuowei@google.com>
Date: Tue, 12 Aug 2025 21:12:15 +0000
Subject: [PATCH 2/4] specify a version for pypdf as well

---
 bigframes/blob/_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index 2e161e37ad..aa7cd4443e 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
 
 
 pdf_extract_def = FunctionDef(
-    pdf_extract_func, ["pypdf", "requests", "cryptography>=3.4.0"]
+    pdf_extract_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"]
 )
 
 
@@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
 
 
 pdf_chunk_def = FunctionDef(
-    pdf_chunk_func, ["pypdf", "requests", "cryptography>=3.4.0"]
+    pdf_chunk_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"]
 )

From 5116a2df6eb47803ffc29c2447a3593a5fb07ff6 Mon Sep 17 00:00:00 2001
From: Shuowei Li <shuowei@google.com>
Date: Wed, 13 Aug 2025 17:47:06 +0000
Subject: [PATCH 3/4] test: relax PDF extract/chunk assertions to length and keyword checks; add upper bounds to pypdf and cryptography

---
 bigframes/blob/_functions.py             |   4 +-
 tests/system/large/blob/test_function.py | 127 +++++++++++------------
 2 files changed, 64 insertions(+), 67 deletions(-)

diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index aa7cd4443e..057db0089d 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
 
 
 pdf_extract_def = FunctionDef(
-    pdf_extract_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"]
+    pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"]
 )
 
 
@@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
 
 
 pdf_chunk_def = FunctionDef(
-    pdf_chunk_func, ["pypdf>=5.3.1", "requests", "cryptography>=3.4.0"]
+    pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"]
 )
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
index a594b144f5..c8fa63d493 100644
--- a/tests/system/large/blob/test_function.py
+++ b/tests/system/large/blob/test_function.py
@@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:
 
 
 @pytest.mark.parametrize(
-    "verbose, expected",
+    "verbose",
     [
-        (
-            True,
-            pd.Series(
-                [
-                    {"status": "File has not been decrypted", "content": ""},
-                    {
-                        "status": "",
-                        "content": "Sample  PDF    This  is  a  testing  file.  Some  dummy  messages  are  used  for  testing  purposes.   ",
-                    },
-                ]
-            ),
-        ),
-        (
-            False,
-            pd.Series(
-                [
-                    "",
-                    "Sample  PDF    This  is  a  testing  file.  Some  dummy  messages  are  used  for  testing  purposes.   ",
-                ],
-                name="pdf",
-            ),
-        ),
+        (True),
+        (False),
     ],
 )
 def test_blob_pdf_extract(
     pdf_mm_df: bpd.DataFrame,
     verbose: bool,
     bq_connection: str,
-    expected: pd.Series,
 ):
     actual = (
         pdf_mm_df["pdf"]
@@ -341,49 +320,44 @@ def test_blob_pdf_extract(
         .to_pandas()
     )
 
-    pd.testing.assert_series_equal(
-        actual,
-        expected,
-        check_dtype=False,
-        check_index=False,
+    # check relative length
+    expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
+    expected_len = len(expected_text)
+
+    actual_text = ""
+    if verbose:
+        # Keep only entries with an empty status (the first file previously reported "File has not been decrypted") and check the first success
+        successful_results = actual[actual.apply(lambda x: x["status"] == "")]
+        actual_text = successful_results.apply(lambda x: x["content"]).iloc[0]
+    else:
+        actual_text = actual[actual != ""].iloc[0]
+    actual_len = len(actual_text)
+
+    relative_length_tolerance = 0.25
+    min_acceptable_len = expected_len * (1 - relative_length_tolerance)
+    max_acceptable_len = expected_len * (1 + relative_length_tolerance)
+    assert min_acceptable_len <= actual_len <= max_acceptable_len, (
+        f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
+        f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
+        f"Expected reference length was {expected_len}. "
     )
 
+    # check for major keywords
+    major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
+    for keyword in major_keywords:
+        assert (
+            keyword.lower() in actual_text.lower()
+        ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "
+
 
 @pytest.mark.parametrize(
-    "verbose, expected",
+    "verbose",
     [
-        (
-            True,
-            pd.Series(
-                [
-                    {"status": "File has not been decrypted", "content": []},
-                    {
-                        "status": "",
-                        "content": [
-                            "Sample  PDF    This  is  a  testing  file.  Some ",
-                            "dummy  messages  are  used  for  testing ",
-                            "purposes.   ",
-                        ],
-                    },
-                ]
-            ),
-        ),
-        (
-            False,
-            pd.Series(
-                [
-                    pd.NA,
-                    "Sample  PDF    This  is  a  testing  file.  Some ",
-                    "dummy  messages  are  used  for  testing ",
-                    "purposes.   ",
-                ],
-            ),
-        ),
+        (True),
+        (False),
     ],
 )
-def test_blob_pdf_chunk(
-    pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series
-):
+def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str):
     actual = (
         pdf_mm_df["pdf"]
         .blob.pdf_chunk(
@@ -397,13 +371,36 @@ def test_blob_pdf_chunk(
         .to_pandas()
     )
 
-    pd.testing.assert_series_equal(
-        actual,
-        expected,
-        check_dtype=False,
-        check_index=False,
+    # check relative length
+    expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
+    expected_len = len(expected_text)
+
+    actual_text = ""
+    if verbose:
+        # Keep only entries with an empty status (the first file previously reported "File has not been decrypted") and check the first success
+        successful_results = actual[actual.apply(lambda x: x["status"] == "")]
+        actual_text = "".join(successful_results.apply(lambda x: x["content"]).iloc[0])
+    else:
+        # First entry is NA
+        actual_text = "".join(actual.dropna())
+    actual_len = len(actual_text)
+
+    relative_length_tolerance = 0.25
+    min_acceptable_len = expected_len * (1 - relative_length_tolerance)
+    max_acceptable_len = expected_len * (1 + relative_length_tolerance)
+    assert min_acceptable_len <= actual_len <= max_acceptable_len, (
+        f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
+        f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
+        f"Expected reference length was {expected_len}. "
     )
 
+    # check for major keywords
+    major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
+    for keyword in major_keywords:
+        assert (
+            keyword.lower() in actual_text.lower()
+        ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "
+
 
 @pytest.mark.parametrize(
     "model_name, verbose",

From b4459699842fda873f83a10f4cad335a7f739cd5 Mon Sep 17 00:00:00 2001
From: Shuowei Li <shuowei@google.com>
Date: Wed, 13 Aug 2025 20:40:50 +0000
Subject: [PATCH 4/4] specify a version for cryptography

---
 bigframes/blob/_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index 057db0089d..8d1ca38e62 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -474,7 +474,7 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
 
 
 pdf_extract_def = FunctionDef(
-    pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"]
+    pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
 )
 
 
@@ -530,5 +530,5 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
 
 
 pdf_chunk_def = FunctionDef(
-    pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography>=3.4.0,<6.0.0"]
+    pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
 )