
Commit c47db36

feat: add json_arrow_type parameter to allow overriding the JSON data type in to_arrow and to_arrow_iterable
1 parent 15ec79e commit c47db36

File tree: 4 files changed, +329 −24 lines changed
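
In practice, the new keyword lets a caller choose the Arrow type that BigQuery JSON columns decode to. A minimal sketch of the intended call site (the client construction and query text are illustrative only; pyarrow.json_() is the JSON extension type added in pyarrow 19.0.0):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials
rows = client.query('SELECT JSON \'{"a": 1}\' AS payload').result()

# By default, JSON columns decode as pyarrow.string(); json_arrow_type
# overrides that, e.g. with the Arrow JSON extension type.
table = rows.to_arrow(json_arrow_type=pyarrow.json_(pyarrow.string()))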

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 74 additions & 14 deletions
@@ -27,7 +27,7 @@
 import queue
 import threading
 import warnings
-from typing import Any, Union, Optional, Callable, Generator, List
+from typing import Any, Callable, Generator, Iterable, List, Optional, Union


 from google.cloud.bigquery import _pyarrow_helpers
@@ -162,10 +162,14 @@ def finish(self):
 }


-def bq_to_arrow_struct_data_type(field):
+def bq_to_arrow_struct_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
     arrow_fields = []
     for subfield in field.fields:
-        arrow_subfield = bq_to_arrow_field(subfield)
+        arrow_subfield = bq_to_arrow_field(subfield, json_arrow_type=json_arrow_type)
         if arrow_subfield:
             arrow_fields.append(arrow_subfield)
         else:
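
Because json_arrow_type is keyword-only and threaded through bq_to_arrow_field, a JSON column nested inside a STRUCT picks up the override as well. A rough sketch against this private helper (internal API, subject to change):

import pyarrow
from google.cloud import bigquery
from google.cloud.bigquery import _pandas_helpers

struct_field = bigquery.SchemaField(
    "event",
    "RECORD",
    fields=(
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("attrs", "JSON"),
    ),
)

# Yields struct<name: string, attrs: string>; a different json_arrow_type
# would change only the "attrs" subfield.
arrow_type = _pandas_helpers.bq_to_arrow_struct_data_type(
    struct_field, json_arrow_type=pyarrow.string()
)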
@@ -186,40 +190,73 @@ def bq_to_arrow_range_data_type(field):
     return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])


-def bq_to_arrow_data_type(field):
+def bq_to_arrow_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
     """Return the Arrow data type, corresponding to a given BigQuery column.

+    Args:
+        field (SchemaField):
+            BigQuery field to convert to Arrow.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
         None: if default Arrow type inspection should be used.
     """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
     if field.mode is not None and field.mode.upper() == "REPEATED":
         inner_type = bq_to_arrow_data_type(
-            schema.SchemaField(field.name, field.field_type, fields=field.fields)
+            schema.SchemaField(field.name, field.field_type, fields=field.fields),
+            json_arrow_type=json_arrow_type,
         )
         if inner_type:
             return pyarrow.list_(inner_type)
         return None

     field_type_upper = field.field_type.upper() if field.field_type else ""
     if field_type_upper in schema._STRUCT_TYPES:
-        return bq_to_arrow_struct_data_type(field)
+        return bq_to_arrow_struct_data_type(field, json_arrow_type=json_arrow_type)

     if field_type_upper == "RANGE":
         return bq_to_arrow_range_data_type(field.range_element_type)

-    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
+    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
+        field_type_upper, json_arrow_type=json_arrow_type
+    )
     if data_type_constructor is None:
         return None
     return data_type_constructor()


-def bq_to_arrow_field(bq_field, array_type=None):
+def bq_to_arrow_field(
+    bq_field: schema.SchemaField,
+    array_type: Optional["pyarrow.DataType"] = None,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Field":
     """Return the Arrow field, corresponding to a given BigQuery column.

+    Args:
+        bq_field (SchemaField):
+            BigQuery field to convert to Arrow.
+        array_type (Optional[pyarrow.DataType]):
+            The type that the pyarrow.array constructor determined, such as
+            when converting from a local pandas DataFrame to a BigQuery schema.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
         None: if the Arrow type cannot be determined.
     """
-    arrow_type = bq_to_arrow_data_type(bq_field)
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
+    arrow_type = bq_to_arrow_data_type(bq_field, json_arrow_type=json_arrow_type)
     if arrow_type is not None:
         if array_type is not None:
             arrow_type = array_type  # For GEOGRAPHY, at least initially
@@ -243,15 +280,29 @@ def bq_to_arrow_field(bq_field, array_type=None):
     return None


-def bq_to_arrow_schema(bq_schema):
+def bq_to_arrow_schema(
+    bq_schema: Iterable[schema.SchemaField],
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Schema":
     """Return the Arrow schema, corresponding to a given BigQuery schema.

+    Args:
+        bq_schema (Iterable[SchemaField]):
+            BigQuery schema to convert to Arrow.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Returns:
+        pyarrow.Schema: if all BigQuery types can be converted to Arrow.
         None: if any Arrow type cannot be determined.
     """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
     arrow_fields = []
     for bq_field in bq_schema:
-        arrow_field = bq_to_arrow_field(bq_field)
+        arrow_field = bq_to_arrow_field(bq_field, json_arrow_type=json_arrow_type)
         if arrow_field is None:
             # Auto-detect the schema if there is an unknown field type.
             return None
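
Putting the helpers together, converting a whole BigQuery schema now requires the keyword at every level. A short sketch (private API; the example schema is made up):

import pyarrow
from google.cloud import bigquery
from google.cloud.bigquery import _pandas_helpers

bq_schema = [
    bigquery.SchemaField("id", "INT64"),
    bigquery.SchemaField("payload", "JSON"),
]

# Produces an Arrow schema with id: int64 and payload: string here;
# returns None if any field type cannot be mapped.
arrow_schema = _pandas_helpers.bq_to_arrow_schema(
    bq_schema, json_arrow_type=pyarrow.string()
)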
@@ -766,7 +817,7 @@ def _row_iterator_page_to_arrow(page, column_names, arrow_types):
     return pyarrow.RecordBatch.from_arrays(arrays, names=column_names)


-def download_arrow_row_iterator(pages, bq_schema):
+def download_arrow_row_iterator(pages, bq_schema, json_arrow_type=None):
     """Use HTTP JSON RowIterator to construct an iterable of RecordBatches.

     Args:
@@ -777,13 +828,22 @@ def download_arrow_row_iterator(pages, bq_schema):
             Mapping[str, Any] \
         ]]):
             A description of the fields in result pages.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
     Yields:
         :class:`pyarrow.RecordBatch`
             The next page of records as a ``pyarrow`` record batch.
     """
     bq_schema = schema._to_schema_fields(bq_schema)
-    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
-    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]
+    column_names = bq_to_arrow_schema(bq_schema, json_arrow_type=json_arrow_type) or [
+        field.name for field in bq_schema
+    ]
+    arrow_types = [
+        bq_to_arrow_data_type(field, json_arrow_type=json_arrow_type)
+        for field in bq_schema
+    ]

     for page in pages:
         yield _row_iterator_page_to_arrow(page, column_names, arrow_types)

google/cloud/bigquery/_pyarrow_helpers.py

Lines changed: 10 additions & 8 deletions
@@ -77,11 +77,6 @@ def pyarrow_timestamp():
     "GEOGRAPHY": pyarrow.string,
     "INT64": pyarrow.int64,
     "INTEGER": pyarrow.int64,
-    # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
-    # but we'd like this to map as closely to the BQ Storage API as
-    # possible, which uses the string() dtype, as JSON support in Arrow
-    # predates JSON support in BigQuery by several years.
-    "JSON": pyarrow.string,
     "NUMERIC": pyarrow_numeric,
     "STRING": pyarrow.string,
     "TIME": pyarrow_time,
@@ -124,15 +119,22 @@ def pyarrow_timestamp():
 _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"


-def bq_to_arrow_scalars(bq_scalar: str):
+def bq_to_arrow_scalars(bq_scalar: str, *, json_arrow_type: "pyarrow.DataType"):
     """
     DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
     to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.

     Returns:
-        The Arrow scalar type that the input BigQuery scalar type maps to.
-        If it cannot find the BigQuery scalar, return None.
+        A function that returns an Arrow scalar type that the input BigQuery
+        scalar type maps to. If it cannot find the BigQuery scalar, return
+        None.
     """
+    # TODO(tswast): Why is this returning a callable instead of the actual data
+    # type? Seems like we should be able to remove that level of indirection,
+    # especially for these scalar types.
+    if bq_scalar == "JSON":
+        return lambda: json_arrow_type
+
     return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
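
As the TODO notes, the lookup still returns a constructor rather than a type; the JSON branch simply closes over whatever type the caller requested. A quick sketch of the resulting behavior (private module, signatures as in this commit):

import pyarrow
from google.cloud.bigquery import _pyarrow_helpers

# JSON is special-cased: the returned callable yields the requested type.
ctor = _pyarrow_helpers.bq_to_arrow_scalars("JSON", json_arrow_type=pyarrow.string())
assert ctor() == pyarrow.string()

# Other scalars still come from the static lookup table.
ctor = _pyarrow_helpers.bq_to_arrow_scalars("INT64", json_arrow_type=pyarrow.string())
assert ctor() == pyarrow.int64()

# Unknown scalars still map to None.
assert _pyarrow_helpers.bq_to_arrow_scalars("BOGUS", json_arrow_type=pyarrow.string()) is None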

google/cloud/bigquery/table.py

Lines changed: 31 additions & 2 deletions
@@ -2019,6 +2019,7 @@ def to_arrow_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
         max_stream_count: Optional[int] = None,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -2058,6 +2059,9 @@
                 especially with very large queries. In that case,
                 setting this parameter value to a value > 0 can help
                 reduce system resource consumption.
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.RecordBatch:
@@ -2067,6 +2071,13 @@
         """
         self._maybe_warn_max_results(bqstorage_client)

+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        if json_arrow_type is None:
+            json_arrow_type = pyarrow.string()
+
         bqstorage_download = functools.partial(
             _pandas_helpers.download_arrow_bqstorage,
             self._billing_project,
@@ -2076,9 +2087,13 @@
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
             max_stream_count=max_stream_count,
+            json_arrow_type=json_arrow_type,
         )
         tabledata_list_download = functools.partial(
-            _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
+            _pandas_helpers.download_arrow_row_iterator,
+            iter(self.pages),
+            self.schema,
+            json_arrow_type=json_arrow_type,
         )
         return self._to_page_iterable(
             bqstorage_download,
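
With both download paths now forwarding the keyword, a streaming caller can request a non-default JSON type. A hedged usage sketch (client and query are illustrative; the exact Arrow type surfaced can depend on whether the REST or BQ Storage path is used):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()
rows = client.query("SELECT JSON '[1, 2, 3]' AS payload").result()

# Stream RecordBatches with JSON columns decoded to the requested type.
for batch in rows.to_arrow_iterable(json_arrow_type=pyarrow.large_string()):
    print(batch.schema.field("payload").type)  # expected: large_string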
@@ -2093,6 +2108,7 @@ def to_arrow(
         progress_bar_type: Optional[str] = None,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         create_bqstorage_client: bool = True,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> "pyarrow.Table":
         """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
         table or query.
@@ -2134,6 +2150,9 @@
                 This argument does nothing if ``bqstorage_client`` is supplied.

                 .. versionadded:: 1.24.0
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.Table
@@ -2152,6 +2171,13 @@

         self._maybe_warn_max_results(bqstorage_client)

+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        if json_arrow_type is None:
+            json_arrow_type = pyarrow.string()
+
         if not self._should_use_bqstorage(bqstorage_client, create_bqstorage_client):
             create_bqstorage_client = False
             bqstorage_client = None
@@ -2194,7 +2220,10 @@
             # we used the REST API (bqstorage_client is None),
             # which doesn't add arrow extension metadata, so we let
             # `bq_to_arrow_schema` do it.
-            arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
+            arrow_schema = _pandas_helpers.bq_to_arrow_schema(
+                self._schema,
+                json_arrow_type=json_arrow_type,
+            )
             return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)

     def to_dataframe_iterable(
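
End to end, the default and the override can be compared on the resulting schema. A sketch under the same assumptions as above (pyarrow.json_() requires pyarrow >= 19.0):

import pyarrow
from google.cloud import bigquery

client = bigquery.Client()
sql = "SELECT JSON '{\"k\": \"v\"}' AS payload"

# Default: JSON columns come back as plain Arrow strings.
table = client.query(sql).result().to_arrow()
print(table.schema.field("payload").type)  # expected: string

# Override: request the Arrow JSON extension type instead.
table = client.query(sql).result().to_arrow(
    json_arrow_type=pyarrow.json_(pyarrow.string())
)
print(table.schema.field("payload").type)  # expected: json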
