import queue
import threading
import warnings
-from typing import Any, Union, Optional, Callable, Generator, List
+from typing import Any, Callable, Generator, Iterable, List, Optional, Union


from google.cloud.bigquery import _pyarrow_helpers
@@ -162,10 +162,14 @@ def finish(self):
}


-def bq_to_arrow_struct_data_type(field):
+def bq_to_arrow_struct_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
    arrow_fields = []
    for subfield in field.fields:
-        arrow_subfield = bq_to_arrow_field(subfield)
+        arrow_subfield = bq_to_arrow_field(subfield, json_arrow_type=json_arrow_type)
        if arrow_subfield:
            arrow_fields.append(arrow_subfield)
        else:
@@ -186,40 +190,73 @@ def bq_to_arrow_range_data_type(field):
    return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])


-def bq_to_arrow_data_type(field):
+def bq_to_arrow_data_type(
+    field: schema.SchemaField,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.DataType":
    """Return the Arrow data type, corresponding to a given BigQuery column.

+    Args:
+        field (SchemaField):
+            BigQuery field to convert to Arrow.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
    Returns:
        None: if default Arrow type inspection should be used.
    """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
-            schema.SchemaField(field.name, field.field_type, fields=field.fields)
+            schema.SchemaField(field.name, field.field_type, fields=field.fields),
+            json_arrow_type=json_arrow_type,
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
-        return bq_to_arrow_struct_data_type(field)
+        return bq_to_arrow_struct_data_type(field, json_arrow_type=json_arrow_type)

    if field_type_upper == "RANGE":
        return bq_to_arrow_range_data_type(field.range_element_type)

-    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
+    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
+        field_type_upper, json_arrow_type=json_arrow_type
+    )
    if data_type_constructor is None:
        return None
    return data_type_constructor()


-def bq_to_arrow_field(bq_field, array_type=None):
+def bq_to_arrow_field(
+    bq_field: schema.SchemaField,
+    array_type: Optional["pyarrow.DataType"] = None,
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Field":
    """Return the Arrow field, corresponding to a given BigQuery column.

+    Args:
+        bq_field (SchemaField):
+            BigQuery field to convert to Arrow.
+        array_type (Optional[pyarrow.DataType]):
+            The type that the pyarrow.array constructor determined, such as
+            when converting from a local pandas DataFrame to a BigQuery schema.
+        json_arrow_type (pyarrow.DataType):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
    Returns:
        None: if the Arrow type cannot be determined.
    """
-    arrow_type = bq_to_arrow_data_type(bq_field)
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
+    arrow_type = bq_to_arrow_data_type(bq_field, json_arrow_type=json_arrow_type)
    if arrow_type is not None:
        if array_type is not None:
            arrow_type = array_type  # For GEOGRAPHY, at least initially
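
For context (not part of the diff), here is a minimal sketch of how the new keyword-only ``json_arrow_type`` argument would be used for a single JSON column. It assumes these helpers are importable from ``google.cloud.bigquery._pandas_helpers`` and that the JSON scalar mapping in ``_pyarrow_helpers.bq_to_arrow_scalars`` honors the passed-in type; neither assumption is confirmed by the diff itself.

# Illustrative sketch only; the import path below is an assumption.
import pyarrow

from google.cloud.bigquery import _pandas_helpers, schema

json_field = schema.SchemaField("payload", "JSON")

# Per the new docstrings, JSON columns default to ``pyarrow.string()``.
default_type = _pandas_helpers.bq_to_arrow_data_type(
    json_field, json_arrow_type=pyarrow.string()
)

# Callers that want a different physical representation (for example a
# larger string type) can thread it through; struct and REPEATED fields
# forward the same keyword recursively.
wide_field = _pandas_helpers.bq_to_arrow_field(
    json_field, json_arrow_type=pyarrow.large_string()
)
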
@@ -243,15 +280,29 @@ def bq_to_arrow_field(bq_field, array_type=None):
    return None


-def bq_to_arrow_schema(bq_schema):
+def bq_to_arrow_schema(
+    bq_schema: Iterable[schema.SchemaField],
+    *,
+    json_arrow_type: "pyarrow.DataType",
+) -> "pyarrow.Schema":
    """Return the Arrow schema, corresponding to a given BigQuery schema.

+    Args:
+        bq_schema (Iterable[SchemaField]):
+            BigQuery schema to convert to Arrow.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
    Returns:
+        pyarrow.Schema: if all BigQuery types can be converted to Arrow.
        None: if any Arrow type cannot be determined.
    """
+    # TODO(https://github.com/googleapis/python-bigquery-pandas/pull/893):
+    # move to pandas_gbq.schema.bigquery_to_pyarrow module.
    arrow_fields = []
    for bq_field in bq_schema:
-        arrow_field = bq_to_arrow_field(bq_field)
+        arrow_field = bq_to_arrow_field(bq_field, json_arrow_type=json_arrow_type)
        if arrow_field is None:
            # Auto-detect the schema if there is an unknown field type.
            return None
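
At the schema level, the same keyword threads through every field. A hedged sketch of the conversion this hunk enables, again assuming the ``google.cloud.bigquery._pandas_helpers`` import path:

import pyarrow

from google.cloud.bigquery import _pandas_helpers, schema

bq_schema = [
    schema.SchemaField("id", "INTEGER"),
    schema.SchemaField("payload", "JSON"),
]

arrow_schema = _pandas_helpers.bq_to_arrow_schema(
    bq_schema, json_arrow_type=pyarrow.string()
)
# Expected: a pyarrow.Schema with an int64 "id" and a string "payload".
# If any field type could not be mapped, the function returns None and
# callers fall back to Arrow's own schema auto-detection.
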
@@ -766,7 +817,7 @@ def _row_iterator_page_to_arrow(page, column_names, arrow_types):
    return pyarrow.RecordBatch.from_arrays(arrays, names=column_names)


-def download_arrow_row_iterator(pages, bq_schema):
+def download_arrow_row_iterator(pages, bq_schema, json_arrow_type=None):
    """Use HTTP JSON RowIterator to construct an iterable of RecordBatches.

    Args:
@@ -777,13 +828,22 @@ def download_arrow_row_iterator(pages, bq_schema):
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
+        json_arrow_type (Optional[pyarrow.DataType]):
+            Arrow type to use for JSON columns. This defaults to
+            ``pyarrow.string()``.
+
    Yields:
        :class:`pyarrow.RecordBatch`
            The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
-    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
-    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]
+    column_names = bq_to_arrow_schema(bq_schema, json_arrow_type=json_arrow_type) or [
+        field.name for field in bq_schema
+    ]
+    arrow_types = [
+        bq_to_arrow_data_type(field, json_arrow_type=json_arrow_type)
+        for field in bq_schema
+    ]

    for page in pages:
        yield _row_iterator_page_to_arrow(page, column_names, arrow_types)
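
Unlike the lower-level helpers, ``download_arrow_row_iterator`` keeps ``json_arrow_type`` optional (defaulting to ``None``), so existing call sites are unaffected. A hypothetical call sketch, kept runnable by passing an empty pages iterable; the import path, and how ``None`` maps to the string default, are assumptions based on the docstring rather than shown in the diff:

import pyarrow

from google.cloud.bigquery import _pandas_helpers, schema

batches = list(
    _pandas_helpers.download_arrow_row_iterator(
        pages=iter(()),  # real callers pass HTTP result pages here
        bq_schema=[schema.SchemaField("payload", "JSON")],
        json_arrow_type=pyarrow.string(),
    )
)
# With no pages the generator yields nothing (batches == []); with real
# pages each element is a pyarrow.RecordBatch whose JSON column uses the
# requested Arrow type.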