
Commit c47db36

feat: add json_arrow_type parameter to allow overriding the JSON data type in to_arrow and to_arrow_iterable
1 parent 15ec79e commit c47db36

File tree: 4 files changed, +329 −24 lines changed
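
In practice, the new keyword lets a caller choose the Arrow type that BigQuery JSON columns decode to. A minimal sketch of the intended call site (the client construction and query text are illustrative only; pyarrow.json_() is the JSON extension type added in pyarrow 19.0.0):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials
rows = client.query('SELECT JSON \'{"a": 1}\' AS payload').result()

# By default, JSON columns decode as pyarrow.string(); json_arrow_type
# overrides that, e.g. with the Arrow JSON extension type.
table = rows.to_arrow(json_arrow_type=pyarrow.json_(pyarrow.string()))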

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 74 additions & 14 deletions
@@ -27,7 +27,7 @@
 import queue
 import threading
 import warnings
-from typing import Any, Union, Optional, Callable, Generator, List
+from typing import Any, Callable, Generator, Iterable, List, Optional, Union


 from google.cloud.bigquery import _pyarrow_helpers
@@ -162,10 +162,14 @@ def finish(self):
 }


-def bq_to_arrow_struct_data_type(field):
+def bq_to_arrow_struct_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
     arrow_fields = []
     for subfield in field.fields:
-        arrow_subfield = bq_to_arrow_field(subfield)
+        arrow_subfield = bq_to_arrow_field(subfield, json_arrow_type=json_arrow_type)
         if arrow_subfield:
             arrow_fields.append(arrow_subfield)
         else:
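
Because json_arrow_type is keyword-only and threaded through bq_to_arrow_field, a JSON column nested inside a STRUCT picks up the override as well. A rough sketch against this private helper (internal API, subject to change):

import pyarrow
from google.cloud import bigquery
from google.cloud.bigquery import _pandas_helpers

struct_field = bigquery.SchemaField(
    "event",
    "RECORD",
    fields=(
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("attrs", "JSON"),
    ),
)

# Yields struct<name: string, attrs: string>; a different json_arrow_type
# would change only the "attrs" subfield.
arrow_type = _pandas_helpers.bq_to_arrow_struct_data_type(
    struct_field, json_arrow_type=pyarrow.string()
)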
@@ -186,40 +190,73 @@ def bq_to_arrow_range_data_type(field):
     return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])


-def bq_to_arrow_data_type(field):
+def bq_to_arrow_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
     """Return the Arrow data type, corresponding to a given BigQuery column.

+    Args:
+        field (SchemaField):
+            BigQuery field to convert to Arrow.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
         None: if default Arrow type inspection should be used.
     """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
     if field.mode is not None and field.mode.upper() == "REPEATED":
         inner_type = bq_to_arrow_data_type(
-            schema.SchemaField(field.name, field.field_type, fields=field.fields)
+            schema.SchemaField(field.name, field.field_type, fields=field.fields),
+            json_arrow_type=json_arrow_type,
         )
         if inner_type:
             return pyarrow.list_(inner_type)
         return None

     field_type_upper = field.field_type.upper() if field.field_type else ""
     if field_type_upper in schema._STRUCT_TYPES:
-        return bq_to_arrow_struct_data_type(field)
+        return bq_to_arrow_struct_data_type(field, json_arrow_type=json_arrow_type)

     if field_type_upper == "RANGE":
         return bq_to_arrow_range_data_type(field.range_element_type)

-    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
+    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
+        field_type_upper, json_arrow_type=json_arrow_type
+    )
     if data_type_constructor is None:
         return None
     return data_type_constructor()


-def bq_to_arrow_field(bq_field, array_type=None):
+def bq_to_arrow_field(
+    bq_field: schema.SchemaField,
+    array_type: Optional["pyarrow.DataType"] = None,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Field":
     """Return the Arrow field, corresponding to a given BigQuery column.

+    Args:
+        bq_field (SchemaField):
+            BigQuery field to convert to Arrow.
+        array_type (Optional[pyarrow.DataType]):
+            The type that the pyarrow.array constructor determined, such as
+            when converting from a local pandas DataFrame to a BigQuery schema.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
         None: if the Arrow type cannot be determined.
     """
-    arrow_type = bq_to_arrow_data_type(bq_field)
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
+    arrow_type = bq_to_arrow_data_type(bq_field, json_arrow_type=json_arrow_type)
     if arrow_type is not None:
         if array_type is not None:
             arrow_type = array_type  # For GEOGRAPHY, at least initially
@@ -243,15 +280,29 @@ def bq_to_arrow_field(bq_field, array_type=None):
     return None


-def bq_to_arrow_schema(bq_schema):
+def bq_to_arrow_schema(
+    bq_schema: Iterable[schema.SchemaField],
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Schema":
     """Return the Arrow schema, corresponding to a given BigQuery schema.

+    Args:
+        bq_schema (Iterable[SchemaField]):
+            BigQuery schema to convert to Arrow.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
+        pyarrow.Schema: if all BigQuery types can be converted to Arrow.
         None: if any Arrow type cannot be determined.
     """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
     arrow_fields = []
     for bq_field in bq_schema:
-        arrow_field = bq_to_arrow_field(bq_field)
+        arrow_field = bq_to_arrow_field(bq_field, json_arrow_type=json_arrow_type)
         if arrow_field is None:
             # Auto-detect the schema if there is an unknown field type.
             return None
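
Putting the helpers together, converting a whole BigQuery schema now requires the keyword at every level. A short sketch (private API; the example schema is made up):

import pyarrow
from google.cloud import bigquery
from google.cloud.bigquery import _pandas_helpers

bq_schema = [
    bigquery.SchemaField("id", "INT64"),
    bigquery.SchemaField("payload", "JSON"),
]

# Produces an Arrow schema with id: int64 and payload: string here;
# returns None if any field type cannot be mapped.
arrow_schema = _pandas_helpers.bq_to_arrow_schema(
    bq_schema, json_arrow_type=pyarrow.string()
)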
@@ -766,7 +817,7 @@ def _row_iterator_page_to_arrow(page, column_names, arrow_types):
     return pyarrow.RecordBatch.from_arrays(arrays, names=column_names)


-def download_arrow_row_iterator(pages, bq_schema):
+def download_arrow_row_iterator(pages, bq_schema, json_arrow_type=None):
     """Use HTTP JSON RowIterator to construct an iterable of RecordBatches.

     Args:
@@ -777,13 +828,22 @@ def download_arrow_row_iterator(pages, bq_schema):
             Mapping[str, Any] \
         ]]):
             A description of the fields in result pages.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Yields:
         :class:`pyarrow.RecordBatch`
             The next page of records as a ``pyarrow`` record batch.
     """
     bq_schema = schema._to_schema_fields(bq_schema)
-    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
-    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]
+    column_names = bq_to_arrow_schema(bq_schema, json_arrow_type=json_arrow_type) or [
+        field.name for field in bq_schema
+    ]
+    arrow_types = [
+        bq_to_arrow_data_type(field, json_arrow_type=json_arrow_type)
+        for field in bq_schema
+    ]

     for page in pages:
         yield _row_iterator_page_to_arrow(page, column_names, arrow_types)

google/cloud/bigquery/_pyarrow_helpers.py

Lines changed: 10 additions & 8 deletions
@@ -77,11 +77,6 @@ def pyarrow_timestamp():
     "GEOGRAPHY": pyarrow.string,
     "INT64": pyarrow.int64,
     "INTEGER": pyarrow.int64,
-    # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
-    # but we'd like this to map as closely to the BQ Storage API as
-    # possible, which uses the string() dtype, as JSON support in Arrow
-    # predates JSON support in BigQuery by several years.
-    "JSON": pyarrow.string,
     "NUMERIC": pyarrow_numeric,
     "STRING": pyarrow.string,
     "TIME": pyarrow_time,
@@ -124,15 +119,22 @@ def pyarrow_timestamp():
 _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"


-def bq_to_arrow_scalars(bq_scalar: str):
+def bq_to_arrow_scalars(bq_scalar: str, *, json_arrow_type: "pyarrow.DataType"):
     """
     DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
     to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.

     Returns:
-        The Arrow scalar type that the input BigQuery scalar type maps to.
-        If it cannot find the BigQuery scalar, return None.
+        A function that returns an Arrow scalar type that the input BigQuery
+        scalar type maps to. If it cannot find the BigQuery scalar, return
+        None.
     """
+    # TODO(tswast): Why is this returning a callable instead of the actual data
+    # type? Seems like we should be able to remove that level of indirection,
+    # especially for these scalar types.
+    if bq_scalar == "JSON":
+        return lambda: json_arrow_type
+
     return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
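
As the TODO notes, the lookup still returns a constructor rather than a type; the JSON branch simply closes over whatever type the caller requested. A quick sketch of the resulting behavior (private module, signatures as in this commit):

import pyarrow
from google.cloud.bigquery import _pyarrow_helpers

# JSON is special-cased: the returned callable yields the requested type.
ctor = _pyarrow_helpers.bq_to_arrow_scalars("JSON", json_arrow_type=pyarrow.string())
assert ctor() == pyarrow.string()

# Other scalars still come from the static lookup table.
ctor = _pyarrow_helpers.bq_to_arrow_scalars("INT64", json_arrow_type=pyarrow.string())
assert ctor() == pyarrow.int64()

# Unknown scalars still map to None.
assert _pyarrow_helpers.bq_to_arrow_scalars("BOGUS", json_arrow_type=pyarrow.string()) is None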

google/cloud/bigquery/table.py

Lines changed: 31 additions & 2 deletions
@@ -2019,6 +2019,7 @@ def to_arrow_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
         max_stream_count: Optional[int] = None,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -2058,6 +2059,9 @@
                 especially with very large queries. In that case,
                 setting this parameter value to a value > 0 can help
                 reduce system resource consumption.
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.RecordBatch:
@@ -2067,6 +2071,13 @@
         """
         self._maybe_warn_max_results(bqstorage_client)

+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        if json_arrow_type is None:
+            json_arrow_type = pyarrow.string()
+
         bqstorage_download = functools.partial(
             _pandas_helpers.download_arrow_bqstorage,
             self._billing_project,
@@ -2076,9 +2087,13 @@
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
             max_stream_count=max_stream_count,
+            json_arrow_type=json_arrow_type,
         )
         tabledata_list_download = functools.partial(
-            _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
+            _pandas_helpers.download_arrow_row_iterator,
+            iter(self.pages),
+            self.schema,
+            json_arrow_type=json_arrow_type,
         )
         return self._to_page_iterable(
             bqstorage_download,
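
With both download paths now forwarding the keyword, a streaming caller can request a non-default JSON type. A hedged usage sketch (client and query are illustrative; the exact Arrow type surfaced can depend on whether the REST or BQ Storage path is used):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()
rows = client.query("SELECT JSON '[1, 2, 3]' AS payload").result()

# Stream RecordBatches with JSON columns decoded to the requested type.
for batch in rows.to_arrow_iterable(json_arrow_type=pyarrow.large_string()):
    print(batch.schema.field("payload").type)  # expected: large_string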
@@ -2093,6 +2108,7 @@ def to_arrow(
         progress_bar_type: Optional[str] = None,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         create_bqstorage_client: bool = True,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> "pyarrow.Table":
         """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
         table or query.
@@ -2134,6 +2150,9 @@
                 This argument does nothing if ``bqstorage_client`` is supplied.

                 .. versionadded:: 1.24.0
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.Table
@@ -2152,6 +2171,13 @@

         self._maybe_warn_max_results(bqstorage_client)

+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        if json_arrow_type is None:
+            json_arrow_type = pyarrow.string()
+
         if not self._should_use_bqstorage(bqstorage_client, create_bqstorage_client):
             create_bqstorage_client = False
             bqstorage_client = None
@@ -2194,7 +2220,10 @@
             # we used the REST API (bqstorage_client is None),
             # which doesn't add arrow extension metadata, so we let
             # `bq_to_arrow_schema` do it.
-            arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
+            arrow_schema = _pandas_helpers.bq_to_arrow_schema(
+                self._schema,
+                json_arrow_type=json_arrow_type,
+            )
             return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)

     def to_dataframe_iterable(
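
End to end, the default and the override can be compared on the resulting schema. A sketch under the same assumptions as above (pyarrow.json_() requires pyarrow >= 19.0):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()
sql = "SELECT JSON '{\"k\": \"v\"}' AS payload"

# Default: JSON columns come back as plain Arrow strings.
table = client.query(sql).result().to_arrow()
print(table.schema.field("payload").type)  # expected: string

# Override: request the Arrow JSON extension type instead.
table = client.query(sql).result().to_arrow(
    json_arrow_type=pyarrow.json_(pyarrow.string())
)
print(table.schema.field("payload").type)  # expected: json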
