Skip to content

Commit a86d953

Browse files
committed
Correctly display DataFrames with JSON columns in anywidget
2 parents 39cf595 + 8c34512 commit a86d953

File tree

5 files changed

+92
-20
lines changed

5 files changed

+92
-20
lines changed

bigframes/core/blocks.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import warnings
4444

4545
import bigframes_vendored.constants as constants
46+
import db_dtypes
4647
import google.cloud.bigquery as bigquery
4748
import numpy
4849
import pandas as pd
@@ -134,6 +135,21 @@ class MaterializationOptions:
134135
ordered: bool = True
135136

136137

138+
def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
    """Recursively replace JSONArrowType with string type.

    PyArrow cannot create empty arrays with ``db_dtypes.JSONArrowType``,
    especially when it is nested inside containers, so callers build the
    data with plain strings and cast back to the JSON dtype afterwards.

    Args:
        pa_type: The Arrow type to sanitize.

    Returns:
        An equivalent Arrow type in which every occurrence of
        ``db_dtypes.JSONArrowType`` has been replaced by ``pa.string()``.
    """
    if isinstance(pa_type, db_dtypes.JSONArrowType):
        return pa.string()
    # Recurse into container types so JSON nested at any depth is replaced.
    if isinstance(pa_type, pa.ListType):
        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
    # Large lists carry the same element types but were previously missed,
    # which would let a nested JSONArrowType leak through.
    if isinstance(pa_type, pa.LargeListType):
        return pa.large_list(_replace_json_arrow_with_string(pa_type.value_type))
    if isinstance(pa_type, pa.StructType):
        new_fields = [
            field.with_type(_replace_json_arrow_with_string(field.type))
            for field in pa_type
        ]
        return pa.struct(new_fields)
    return pa_type
151+
152+
137153
class Block:
138154
"""An immutable 2D data structure."""
139155

@@ -715,12 +731,32 @@ def to_pandas_batches(
715731
# To reduce the number of edge cases to consider when working with the
716732
# results of this, always return at least one DataFrame. See:
717733
# b/428918844.
718-
empty_val = pd.DataFrame(
719-
{
720-
col: pd.Series([], dtype=self.expr.get_column_type(col))
721-
for col in itertools.chain(self.value_columns, self.index_columns)
722-
}
723-
)
734+
series_map = {}
735+
for col in itertools.chain(self.value_columns, self.index_columns):
736+
dtype = self.expr.get_column_type(col)
737+
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
738+
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
739+
# natively supported by the to_pandas_batches() method, which is
740+
# used by the anywidget backend.
741+
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
742+
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
743+
# especially when nested.
744+
# Create with string type and then cast.
745+
746+
# MyPy doesn't automatically narrow the type of 'dtype' here,
747+
# so we add an explicit check.
748+
if isinstance(dtype, pd.ArrowDtype):
749+
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
750+
safe_dtype = pd.ArrowDtype(safe_pa_type)
751+
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
752+
else:
753+
# This branch should ideally not be reached if
754+
# contains_db_dtypes_json_dtype is accurate,
755+
# but it's here for MyPy's sake.
756+
series_map[col] = pd.Series([], dtype=dtype)
757+
else:
758+
series_map[col] = pd.Series([], dtype=dtype)
759+
empty_val = pd.DataFrame(series_map)
724760
dfs = map(
725761
lambda a: a[0],
726762
itertools.zip_longest(

bigframes/dataframe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,6 @@ def __repr__(self) -> str:
783783

784784
opts = bigframes.options.display
785785
max_results = opts.max_rows
786-
# anywdiget mode uses the same display logic as the "deferred" mode
787-
# for faster execution
788786
if opts.repr_mode in ("deferred", "anywidget"):
789787
return formatter.repr_query_job(self._compute_dry_run())
790788

bigframes/session/executor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
5252
result_rows = 0
5353

5454
for batch in self._arrow_batches:
55+
# Convert JSON columns to strings before casting
56+
batch = self._convert_json_to_string(batch)
5557
batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
5658
result_rows += batch.num_rows
5759

@@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
6769

6870
yield batch
6971

72+
def _convert_json_to_string(
    self, batch: pyarrow.RecordBatch
) -> pyarrow.RecordBatch:
    """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.

    Args:
        batch: The record batch whose JSON columns should be downgraded.

    Returns:
        A record batch where every column that the session schema marks as
        JSON (and that arrives as ``JSON_ARROW_TYPE``) is cast to
        ``pyarrow.string()``; all other columns are passed through untouched.
    """
    import logging

    # Build the set of JSON column names once, instead of scanning
    # self.schema.items with next(...) for every column in the batch
    # (avoids O(columns * schema) lookups).
    json_columns = {
        item.column
        for item in self.schema.items
        if item.dtype == bigframes.dtypes.JSON_DTYPE
    }
    # Named logger + lazy %-args instead of logging.info(f"...") on the
    # root logger, which both formats eagerly and bypasses module config.
    logger = logging.getLogger(__name__)

    new_arrays = []
    new_fields = []

    for field, array in zip(batch.schema, batch.columns):
        if field.name in json_columns:
            logger.info("Converting JSON column: %s", field.name)
            # Only cast when the batch actually carries the JSON extension
            # type; otherwise keep the column as delivered.
            if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
                array = array.cast(pyarrow.string())
                field = pyarrow.field(field.name, pyarrow.string())
        new_arrays.append(array)
        new_fields.append(field)

    return pyarrow.RecordBatch.from_arrays(
        new_arrays, schema=pyarrow.schema(new_fields)
    )
103+
70104
def to_arrow_table(self) -> pyarrow.Table:
71105
# Need to provide schema if no result rows, as arrow can't infer
72106
# If there are rows, it is safest to infer schema from batches.

mypy.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,6 @@ ignore_missing_imports = True
4444

4545
[mypy-anywidget]
4646
ignore_missing_imports = True
47+
48+
[mypy-db_dtypes]
49+
ignore_missing_imports = True

notebooks/dataframes/anywidget_mode.ipynb

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,16 @@
3535
"execution_count": 2,
3636
"id": "ca22f059",
3737
"metadata": {},
38-
"outputs": [],
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n",
44+
" warnings.warn(message, FutureWarning)\n"
45+
]
46+
}
47+
],
3948
"source": [
4049
"import bigframes.pandas as bpd"
4150
]
@@ -144,7 +153,7 @@
144153
"application/vnd.jupyter.widget-view+json": {
145154
"model_id": "93dd10072d564a02a0278817d14855a9",
146155
"version_major": 2,
147-
"version_minor": 0
156+
"version_minor": 1
148157
},
149158
"text/plain": [
150159
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -207,7 +216,7 @@
207216
"application/vnd.jupyter.widget-view+json": {
208217
"model_id": "6e2538d446e344ac8505e4706730243e",
209218
"version_major": 2,
210-
"version_minor": 0
219+
"version_minor": 1
211220
},
212221
"text/plain": [
213222
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -306,7 +315,7 @@
306315
"application/vnd.jupyter.widget-view+json": {
307316
"model_id": "d6faf367ea5d44ad9d275506d870557a",
308317
"version_major": 2,
309-
"version_minor": 0
318+
"version_minor": 1
310319
},
311320
"text/plain": [
312321
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
@@ -333,14 +342,6 @@
333342
"The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly."
334343
]
335344
},
336-
{
337-
"cell_type": "code",
338-
"execution_count": null,
339-
"id": "fdadcad6",
340-
"metadata": {},
341-
"outputs": [],
342-
"source": []
343-
},
344345
{
345346
"cell_type": "code",
346347
"execution_count": 10,

0 commit comments

Comments
 (0)