Skip to content

Commit a86d953

Browse files
committed
Correctly display DataFrames with JSON columns in anywidget
2 parents 39cf595 + 8c34512 commit a86d953

File tree

5 files changed

+92
-20
lines changed

5 files changed

+92
-20
lines changed

bigframes/core/blocks.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import warnings
4444

4545
import bigframes_vendored.constants as constants
46+
import db_dtypes
4647
import google.cloud.bigquery as bigquery
4748
import numpy
4849
import pandas as pd
@@ -134,6 +135,21 @@ class MaterializationOptions:
134135
ordered: bool = True
135136

136137

138+
def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
    """Recursively replace JSONArrowType with string type.

    PyArrow cannot create empty arrays with ``db_dtypes.JSONArrowType``,
    especially when it is nested inside containers, so callers build the
    data with plain strings and cast back to the JSON dtype afterwards.

    Args:
        pa_type: The Arrow type to sanitize.

    Returns:
        An equivalent Arrow type in which every occurrence of
        ``db_dtypes.JSONArrowType`` has been replaced by ``pa.string()``.
    """
    if isinstance(pa_type, db_dtypes.JSONArrowType):
        return pa.string()
    # Recurse into container types so JSON nested at any depth is replaced.
    if isinstance(pa_type, pa.ListType):
        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
    # Large lists carry the same element types but were previously missed,
    # which would let a nested JSONArrowType leak through.
    if isinstance(pa_type, pa.LargeListType):
        return pa.large_list(_replace_json_arrow_with_string(pa_type.value_type))
    if isinstance(pa_type, pa.StructType):
        new_fields = [
            field.with_type(_replace_json_arrow_with_string(field.type))
            for field in pa_type
        ]
        return pa.struct(new_fields)
    return pa_type
151+
152+
137153
class Block:
138154
"""An immutable 2D data structure."""
139155

@@ -715,12 +731,32 @@ def to_pandas_batches(
715731
# To reduce the number of edge cases to consider when working with the
716732
# results of this, always return at least one DataFrame. See:
717733
# b/428918844.
718-
empty_val = pd.DataFrame(
719-
{
720-
col: pd.Series([], dtype=self.expr.get_column_type(col))
721-
for col in itertools.chain(self.value_columns, self.index_columns)
722-
}
723-
)
734+
series_map = {}
735+
for col in itertools.chain(self.value_columns, self.index_columns):
736+
dtype = self.expr.get_column_type(col)
737+
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
738+
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
739+
# natively supported by the to_pandas_batches() method, which is
740+
# used by the anywidget backend.
741+
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
742+
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
743+
# especially when nested.
744+
# Create with string type and then cast.
745+
746+
# MyPy doesn't automatically narrow the type of 'dtype' here,
747+
# so we add an explicit check.
748+
if isinstance(dtype, pd.ArrowDtype):
749+
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
750+
safe_dtype = pd.ArrowDtype(safe_pa_type)
751+
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
752+
else:
753+
# This branch should ideally not be reached if
754+
# contains_db_dtypes_json_dtype is accurate,
755+
# but it's here for MyPy's sake.
756+
series_map[col] = pd.Series([], dtype=dtype)
757+
else:
758+
series_map[col] = pd.Series([], dtype=dtype)
759+
empty_val = pd.DataFrame(series_map)
724760
dfs = map(
725761
lambda a: a[0],
726762
itertools.zip_longest(

bigframes/dataframe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,6 @@ def __repr__(self) -> str:
783783

784784
opts = bigframes.options.display
785785
max_results = opts.max_rows
786-
# anywdiget mode uses the same display logic as the "deferred" mode
787-
# for faster execution
788786
if opts.repr_mode in ("deferred", "anywidget"):
789787
return formatter.repr_query_job(self._compute_dry_run())
790788

bigframes/session/executor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
5252
result_rows = 0
5353

5454
for batch in self._arrow_batches:
55+
# Convert JSON columns to strings before casting
56+
batch = self._convert_json_to_string(batch)
5557
batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
5658
result_rows += batch.num_rows
5759

@@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
6769

6870
yield batch
6971

72+
def _convert_json_to_string(
    self, batch: pyarrow.RecordBatch
) -> pyarrow.RecordBatch:
    """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.

    Args:
        batch: The record batch whose JSON columns should be downgraded.

    Returns:
        A record batch where every column that the session schema marks as
        JSON (and that arrives as ``JSON_ARROW_TYPE``) is cast to
        ``pyarrow.string()``; all other columns are passed through untouched.
    """
    import logging

    # Build the set of JSON column names once, instead of scanning
    # self.schema.items with next(...) for every column in the batch
    # (avoids O(columns * schema) lookups).
    json_columns = {
        item.column
        for item in self.schema.items
        if item.dtype == bigframes.dtypes.JSON_DTYPE
    }
    # Named logger + lazy %-args instead of logging.info(f"...") on the
    # root logger, which both formats eagerly and bypasses module config.
    logger = logging.getLogger(__name__)

    new_arrays = []
    new_fields = []

    for field, array in zip(batch.schema, batch.columns):
        if field.name in json_columns:
            logger.info("Converting JSON column: %s", field.name)
            # Only cast when the batch actually carries the JSON extension
            # type; otherwise keep the column as delivered.
            if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
                array = array.cast(pyarrow.string())
                field = pyarrow.field(field.name, pyarrow.string())
        new_arrays.append(array)
        new_fields.append(field)

    return pyarrow.RecordBatch.from_arrays(
        new_arrays, schema=pyarrow.schema(new_fields)
    )
103+
70104
def to_arrow_table(self) -> pyarrow.Table:
71105
# Need to provide schema if no result rows, as arrow can't infer
72106
# If there are rows, it is safest to infer schema from batches.

mypy.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,6 @@ ignore_missing_imports = True
4444

4545
[mypy-anywidget]
4646
ignore_missing_imports = True
47+
48+
[mypy-db_dtypes]
49+
ignore_missing_imports = True

notebooks/dataframes/anywidget_mode.ipynb

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,16 @@
3535
"execution_count": 2,
3636
"id": "ca22f059",
3737
"metadata": {},
38-
"outputs": [],
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n",
44+
" warnings.warn(message, FutureWarning)\n"
45+
]
46+
}
47+
],
3948
"source": [
4049
"import bigframes.pandas as bpd"
4150
]
@@ -144,7 +153,7 @@
144153
"application/vnd.jupyter.widget-view+json": {
145154
"model_id": "93dd10072d564a02a0278817d14855a9",
146155
"version_major": 2,
147-
"version_minor": 0
156+
"version_minor": 1
148157
},
149158
"text/plain": [
150159
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -207,7 +216,7 @@
207216
"application/vnd.jupyter.widget-view+json": {
208217
"model_id": "6e2538d446e344ac8505e4706730243e",
209218
"version_major": 2,
210-
"version_minor": 0
219+
"version_minor": 1
211220
},
212221
"text/plain": [
213222
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -306,7 +315,7 @@
306315
"application/vnd.jupyter.widget-view+json": {
307316
"model_id": "d6faf367ea5d44ad9d275506d870557a",
308317
"version_major": 2,
309-
"version_minor": 0
318+
"version_minor": 1
310319
},
311320
"text/plain": [
312321
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
@@ -333,14 +342,6 @@
333342
"The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly."
334343
]
335344
},
336-
{
337-
"cell_type": "code",
338-
"execution_count": null,
339-
"id": "fdadcad6",
340-
"metadata": {},
341-
"outputs": [],
342-
"source": []
343-
},
344345
{
345346
"cell_type": "code",
346347
"execution_count": 10,

0 commit comments

Comments
 (0)