|
@@ -43,6 +43,7 @@
 import warnings
 
 import bigframes_vendored.constants as constants
+import db_dtypes
 import google.cloud.bigquery as bigquery
 import numpy
 import pandas as pd
@@ -134,6 +135,21 @@ class MaterializationOptions:
     ordered: bool = True
 
 
+def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
+    """Recursively replace JSONArrowType with the plain string type."""
+    if isinstance(pa_type, db_dtypes.JSONArrowType):
+        return pa.string()
+    if isinstance(pa_type, pa.ListType):
+        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
+    if isinstance(pa_type, pa.StructType):
+        new_fields = [
+            field.with_type(_replace_json_arrow_with_string(field.type))
+            for field in pa_type
+        ]
+        return pa.struct(new_fields)
+    return pa_type
+
+
 class Block:
     """An immutable 2D data structure."""
 
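As an aside, here is a minimal sketch of what the new helper does to a nested Arrow type. It is not part of the diff; it assumes pyarrow and db_dtypes are installed, that _replace_json_arrow_with_string is in scope, and the "payload"/"tags" field names are purely illustrative:

import db_dtypes
import pyarrow as pa

# A struct mixing a top-level JSON field with a list of JSON values.
nested = pa.struct(
    [
        pa.field("payload", db_dtypes.JSONArrowType()),
        pa.field("tags", pa.list_(db_dtypes.JSONArrowType())),
    ]
)

# Every JSONArrowType leaf becomes pa.string(); the surrounding
# struct/list shape is preserved.
safe = _replace_json_arrow_with_string(nested)
print(safe)  # struct<payload: string, tags: list<item: string>>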
|
@@ -715,12 +731,32 @@ def to_pandas_batches(
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        empty_val = pd.DataFrame(
-            {
-                col: pd.Series([], dtype=self.expr.get_column_type(col))
-                for col in itertools.chain(self.value_columns, self.index_columns)
-            }
-        )
+        series_map = {}
+        for col in itertools.chain(self.value_columns, self.index_columns):
+            dtype = self.expr.get_column_type(col)
+            if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
+                # Due to a limitation in Apache Arrow (apache/arrow#45262), JSON
+                # columns are not natively supported by the to_pandas_batches()
+                # method, which is used by the anywidget backend. Workaround for
+                # https://github.com/googleapis/python-bigquery-dataframes/issues/1273:
+                # PyArrow doesn't support creating an empty array with
+                # db_dtypes.JSONArrowType, especially when nested, so create the
+                # series with a string type and then cast it to the JSON dtype.
+
+                # MyPy doesn't automatically narrow the type of `dtype` here,
+                # so we add an explicit isinstance check.
+                if isinstance(dtype, pd.ArrowDtype):
+                    safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
+                    safe_dtype = pd.ArrowDtype(safe_pa_type)
+                    series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
+                else:
+                    # This branch should not be reached if
+                    # contains_db_dtypes_json_dtype is accurate, but it is
+                    # kept for MyPy's sake.
+                    series_map[col] = pd.Series([], dtype=dtype)
+            else:
+                series_map[col] = pd.Series([], dtype=dtype)
+        empty_val = pd.DataFrame(series_map)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
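And a hedged sketch of the empty-DataFrame workaround itself, under the same assumptions (json_dtype is an illustrative stand-in for a JSON column type; per the comments in the diff, constructing the empty series directly from the JSON dtype is what fails):

import db_dtypes
import pandas as pd
import pyarrow as pa

# A pandas dtype wrapping a nested JSON Arrow type, similar to what
# self.expr.get_column_type() returns for a JSON column.
json_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))

# pd.Series([], dtype=json_dtype) trips apache/arrow#45262, so build
# the empty series with string leaves, then cast back to JSON.
safe_dtype = pd.ArrowDtype(_replace_json_arrow_with_string(json_dtype.pyarrow_dtype))
empty = pd.Series([], dtype=safe_dtype).astype(json_dtype)
print(len(empty), empty.dtype)  # 0 rows, original JSON dtype restored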
|