Merge branch 'main' into b407969673-dataframe-resample

tswast · web-flow · commit 7375fdbe1c09 · 2025-11-04T09:57:21.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,29 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [2.28.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.27.0...v2.28.0) (2025-11-03)
+
+
+### Features
+
+* Add bigframes.bigquery.st_simplify ([#2210](https://github.com/googleapis/python-bigquery-dataframes/issues/2210)) ([ecee2bc](https://github.com/googleapis/python-bigquery-dataframes/commit/ecee2bc6ada0bc968fc56ed7194dc8c043547e93))
+* Add Series.dt.day_name ([#2218](https://github.com/googleapis/python-bigquery-dataframes/issues/2218)) ([5e006e4](https://github.com/googleapis/python-bigquery-dataframes/commit/5e006e404b65c32e5b1d342ebfcfce59ee592c8c))
+* Polars engine supports std, var ([#2215](https://github.com/googleapis/python-bigquery-dataframes/issues/2215)) ([ef5e83a](https://github.com/googleapis/python-bigquery-dataframes/commit/ef5e83acedf005cbe1e6ad174bec523ac50517d7))
+* Support INFORMATION_SCHEMA views in `read_gbq` ([#1895](https://github.com/googleapis/python-bigquery-dataframes/issues/1895)) ([d97cafc](https://github.com/googleapis/python-bigquery-dataframes/commit/d97cafcb5921fca2351b18011b0e54e2631cc53d))
+* Support some python standard lib callables in apply/combine ([#2187](https://github.com/googleapis/python-bigquery-dataframes/issues/2187)) ([86a2756](https://github.com/googleapis/python-bigquery-dataframes/commit/86a27564b48b854a32b3d11cd2105aa0fa496279))
+
+
+### Bug Fixes
+
+* Correct connection normalization in blob system tests ([#2222](https://github.com/googleapis/python-bigquery-dataframes/issues/2222)) ([a0e1e50](https://github.com/googleapis/python-bigquery-dataframes/commit/a0e1e50e47c758bdceb54d04180ed36b35cf2e35))
+* Improve error handling in blob operations ([#2194](https://github.com/googleapis/python-bigquery-dataframes/issues/2194)) ([d410046](https://github.com/googleapis/python-bigquery-dataframes/commit/d4100466612df0523d01ed01ca1e115dabd6ef45))
+* Resolve AttributeError in TableWidget and improve initialization ([#1937](https://github.com/googleapis/python-bigquery-dataframes/issues/1937)) ([4c4c9b1](https://github.com/googleapis/python-bigquery-dataframes/commit/4c4c9b14657b7cda1940ef39e7d4db20a9ff5308))
+
+
+### Documentation
+
+* Update bq_dataframes_llm_output_schema.ipynb ([#2004](https://github.com/googleapis/python-bigquery-dataframes/issues/2004)) ([316ba9f](https://github.com/googleapis/python-bigquery-dataframes/commit/316ba9f557d792117d5a7845d7567498f78dd513))
+
 ## [2.27.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.26.0...v2.27.0) (2025-10-24)
 
 
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -68,6 +68,7 @@
 import bigframes.operations.aggregations as agg_ops
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
+from bigframes.session._io import pandas as io_pandas
 
 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -711,12 +712,15 @@ def to_pandas_batches(
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        empty_val = pd.DataFrame(
-            {
-                col: pd.Series([], dtype=self.expr.get_column_type(col))
-                for col in itertools.chain(self.value_columns, self.index_columns)
-            }
-        )
+        try:
+            empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
+        except pa.ArrowNotImplementedError:
+            # Bug in some pyarrow versions (https://github.com/apache/arrow/issues/45262):
+            # empty_table only supports base storage types, not extension types.
+            empty_arrow_table = self.expr.schema.to_pyarrow(
+                use_storage_types=True
+            ).empty_table()
+        empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
@@ -45,7 +45,6 @@
 import google.cloud.bigquery.table
 from google.cloud.bigquery_storage_v1 import types as bq_storage_types
 import pandas
-import pyarrow as pa
 
 import bigframes._tools
 import bigframes._tools.strings
@@ -1307,22 +1306,6 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
     return configuration
 
 
-def _has_json_arrow_type(arrow_type: pa.DataType) -> bool:
-    """
-    Searches recursively for JSON array type within a PyArrow DataType.
-    """
-    if arrow_type == bigframes.dtypes.JSON_ARROW_TYPE:
-        return True
-    if pa.types.is_list(arrow_type):
-        return _has_json_arrow_type(arrow_type.value_type)
-    if pa.types.is_struct(arrow_type):
-        for i in range(arrow_type.num_fields):
-            if _has_json_arrow_type(arrow_type.field(i).type):
-                return True
-        return False
-    return False
-
-
 def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
     """
     Determines whether a datatype is supported by bq load jobs.
@@ -1339,7 +1322,9 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
     if column_type == bigframes.dtypes.JSON_DTYPE:
         return
 
-    if isinstance(column_type, pandas.ArrowDtype) and _has_json_arrow_type(
+    if isinstance(
+        column_type, pandas.ArrowDtype
+    ) and bigframes.dtypes.contains_db_dtypes_json_arrow_type(
         column_type.pyarrow_dtype
     ):
         raise NotImplementedError(
diff --git a/bigframes/version.py b/bigframes/version.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2.27.0"
+__version__ = "2.28.0"
 
 # {x-release-please-start-date}
-__release_date__ = "2025-10-24"
+__release_date__ = "2025-11-03"
 # {x-release-please-end}
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -70,6 +70,23 @@ def _hash_digest_file(hasher, filepath):
             hasher.update(chunk)
 
 
+@pytest.fixture(scope="session")
+def normalize_connection_id():
+    """Normalizes the connection ID by casefolding only the LOCATION component.
+
+    Connection format: PROJECT.LOCATION.CONNECTION_NAME
+    Only LOCATION is casefolded; PROJECT and CONNECTION_NAME are returned unchanged.
+    """
+
+    def normalize(connection_id: str) -> str:
+        parts = connection_id.split(".")
+        if len(parts) == 3:
+            return f"{parts[0]}.{parts[1].casefold()}.{parts[2]}"
+        return connection_id  # Return unchanged if invalid format
+
+    return normalize
+
+
 @pytest.fixture(scope="session")
 def tokyo_location() -> str:
     return TOKYO_LOCATION
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
@@ -52,6 +52,7 @@ def images_output_uris(images_output_folder: str) -> list[str]:
     ]
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_exif(
     bq_connection: str,
     session: bigframes.Session,
@@ -103,6 +104,7 @@ def test_blob_exif_verbose(
     assert content_series.dtype == dtypes.JSON_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_blur_to_series(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -136,6 +138,7 @@ def test_blob_image_blur_to_series(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_blur_to_series_verbose(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -163,6 +166,7 @@ def test_blob_image_blur_to_series_verbose(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_blur_to_folder(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -195,6 +199,7 @@ def test_blob_image_blur_to_folder(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_blur_to_folder_verbose(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -254,6 +259,7 @@ def test_blob_image_blur_to_bq_verbose(images_mm_df: bpd.DataFrame, bq_connectio
     assert content_series.dtype == dtypes.BYTES_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_resize_to_series(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -291,6 +297,7 @@ def test_blob_image_resize_to_series(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_resize_to_series_verbose(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -325,6 +332,7 @@ def test_blob_image_resize_to_series_verbose(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_resize_to_folder(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -358,6 +366,7 @@ def test_blob_image_resize_to_folder(
     assert not actual.blob.size().isna().any()
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_resize_to_folder_verbose(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -420,6 +429,7 @@ def test_blob_image_resize_to_bq_verbose(
     assert content_series.dtype == dtypes.BYTES_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_normalize_to_series(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -492,6 +502,7 @@ def test_blob_image_normalize_to_series_verbose(
     assert hasattr(content_series, "blob")
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_image_normalize_to_folder(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -598,6 +609,7 @@ def test_blob_image_normalize_to_bq_verbose(
     assert content_series.dtype == dtypes.BYTES_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_pdf_extract(
     pdf_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -633,6 +645,7 @@ def test_blob_pdf_extract(
         ), f"Item (verbose=False): Expected keyword '{keyword}' not found in extracted text. "
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_pdf_extract_verbose(
     pdf_mm_df: bpd.DataFrame,
     bq_connection: str,
@@ -670,6 +683,7 @@ def test_blob_pdf_extract_verbose(
         ), f"Item (verbose=True): Expected keyword '{keyword}' not found in extracted text. "
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, bq_connection: str):
     actual = (
         pdf_mm_df["pdf"]
@@ -709,6 +723,7 @@ def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, bq_connection: str):
         ), f"Item (verbose=False): Expected keyword '{keyword}' not found in extracted text. "
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_blob_pdf_chunk_verbose(pdf_mm_df: bpd.DataFrame, bq_connection: str):
     actual = (
         pdf_mm_df["pdf"]
diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py
@@ -273,6 +273,7 @@ def test_ai_if(session):
     assert result.dtype == dtypes.BOOL_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_ai_if_multi_model(session):
     df = session.from_glob_path(
         "gs://bigframes-dev-testing/a_multimodel/images/*", name="image"
@@ -293,6 +294,7 @@ def test_ai_classify(session):
     assert result.dtype == dtypes.STRING_DTYPE
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_ai_classify_multi_model(session):
     df = session.from_glob_path(
         "gs://bigframes-dev-testing/a_multimodel/images/*", name="image"
diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py
@@ -12,27 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Callable
 from unittest import mock
 
 import IPython.display
 import pandas as pd
+import pytest
 
 import bigframes
 import bigframes.pandas as bpd
 
 
 def test_blob_create_from_uri_str(
-    bq_connection: str, session: bigframes.Session, images_uris
+    bq_connection: str,
+    session: bigframes.Session,
+    images_uris,
+    normalize_connection_id: Callable[[str], str],
 ):
     uri_series = bpd.Series(images_uris, session=session)
     blob_series = uri_series.str.to_blob(connection=bq_connection)
 
     pd_blob_df = blob_series.struct.explode().to_pandas()
+    pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id)
     expected_pd_df = pd.DataFrame(
         {
             "uri": images_uris,
             "version": [None, None],
-            "authorizer": [bq_connection.casefold(), bq_connection.casefold()],
+            "authorizer": [
+                normalize_connection_id(bq_connection),
+                normalize_connection_id(bq_connection),
+            ],
             "details": [None, None],
         }
     )
@@ -43,7 +52,11 @@ def test_blob_create_from_uri_str(
 
 
 def test_blob_create_from_glob_path(
-    bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris
+    bq_connection: str,
+    session: bigframes.Session,
+    images_gcs_path,
+    images_uris,
+    normalize_connection_id: Callable[[str], str],
 ):
     blob_df = session.from_glob_path(
         images_gcs_path, connection=bq_connection, name="blob_col"
@@ -55,12 +68,16 @@ def test_blob_create_from_glob_path(
         .sort_values("uri")
         .reset_index(drop=True)
     )
+    pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id)
 
     expected_df = pd.DataFrame(
         {
             "uri": images_uris,
             "version": [None, None],
-            "authorizer": [bq_connection.casefold(), bq_connection.casefold()],
+            "authorizer": [
+                normalize_connection_id(bq_connection),
+                normalize_connection_id(bq_connection),
+            ],
             "details": [None, None],
         }
     )
@@ -71,7 +88,11 @@ def test_blob_create_from_glob_path(
 
 
 def test_blob_create_read_gbq_object_table(
-    bq_connection: str, session: bigframes.Session, images_gcs_path, images_uris
+    bq_connection: str,
+    session: bigframes.Session,
+    images_gcs_path,
+    images_uris,
+    normalize_connection_id: Callable[[str], str],
 ):
     obj_table = session._create_object_table(images_gcs_path, bq_connection)
 
@@ -83,11 +104,15 @@ def test_blob_create_read_gbq_object_table(
         .sort_values("uri")
         .reset_index(drop=True)
     )
+    pd_blob_df["authorizer"] = pd_blob_df["authorizer"].apply(normalize_connection_id)
     expected_df = pd.DataFrame(
         {
             "uri": images_uris,
             "version": [None, None],
-            "authorizer": [bq_connection.casefold(), bq_connection.casefold()],
+            "authorizer": [
+                normalize_connection_id(bq_connection),
+                normalize_connection_id(bq_connection),
+            ],
             "details": [None, None],
         }
     )
@@ -97,6 +122,7 @@ def test_blob_create_read_gbq_object_table(
     )
 
 
+@pytest.mark.skip(reason="b/457416070")
 def test_display_images(monkeypatch, images_mm_df: bpd.DataFrame):
     mock_display = mock.Mock()
     monkeypatch.setattr(IPython.display, "display", mock_display)
diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py
diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py