Commit 3f427bb

add todos for new approach
1 parent 0a5efe9 commit 3f427bb

2 files changed: +230, -3 lines changed


google/cloud/bigquery/table.py

Lines changed: 16 additions & 3 deletions
@@ -2019,6 +2019,7 @@ def to_arrow_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
         max_stream_count: Optional[int] = None,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -2058,6 +2059,9 @@ def to_arrow_iterable(
                 especially with very large queries. In that case,
                 setting this parameter value to a value > 0 can help
                 reduce system resource consumption.
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.RecordBatch:
@@ -2078,13 +2082,17 @@ def to_arrow_iterable(
             max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
-            _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
+            _pandas_helpers.download_arrow_row_iterator,
+            iter(self.pages),
+            self.schema,
         )
-        return self._to_page_iterable(
+        for table in self._to_page_iterable(
             bqstorage_download,
             tabledata_list_download,
             bqstorage_client=bqstorage_client,
-        )
+        ):
+            # TODO: convert json_arrow_type if set.
+            yield table

     # If changing the signature of this method, make sure to apply the same
     # changes to job.QueryJob.to_arrow()
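
The TODO in the loop above (and its twin in to_arrow() below) marks where the conversion would happen. A minimal sketch of what that step might look like, assuming json_arrow_type is an Arrow extension type whose storage type is pyarrow.string(), as db_dtypes.JSONArrowType is; the helper name is hypothetical, not part of this commit, and JSON fields nested inside RECORD or REPEATED columns would need extra handling:

    import pyarrow

    def _convert_json_columns(batch, bq_schema, json_arrow_type):
        # Rebuild the record batch, swapping the string storage for the
        # JSON extension type on top-level JSON columns only.
        arrays = []
        fields = []
        for index, field in enumerate(bq_schema):
            array = batch.column(index)
            if field.field_type == "JSON" and field.mode != "REPEATED":
                # Wraps the existing string buffers without copying.
                array = pyarrow.ExtensionArray.from_storage(json_arrow_type, array)
                fields.append(pyarrow.field(field.name, json_arrow_type))
            else:
                fields.append(batch.schema.field(index))
            arrays.append(array)
        return pyarrow.RecordBatch.from_arrays(arrays, schema=pyarrow.schema(fields))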
@@ -2093,6 +2101,7 @@ def to_arrow(
         progress_bar_type: Optional[str] = None,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         create_bqstorage_client: bool = True,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> "pyarrow.Table":
         """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
         table or query.
@@ -2134,6 +2143,9 @@ def to_arrow(
                 This argument does nothing if ``bqstorage_client`` is supplied.

                 .. versionadded:: 1.24.0
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.Table
@@ -2196,6 +2208,7 @@ def to_arrow(
             # `bq_to_arrow_schema` do it.
             arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
             return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)
+            # TODO: convert json columns if json_arrow_type is set.

     def to_dataframe_iterable(
         self,
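
For context, this is roughly how a caller would use the new parameter once the TODOs are implemented (a sketch; in this commit the argument is accepted but not yet applied, and the query is illustrative):

    import db_dtypes  # JSONArrowType requires db-dtypes >= 1.4.0
    from google.cloud import bigquery

    client = bigquery.Client()
    rows = client.query_and_wait("""SELECT JSON '{"key": "value"}' AS json_col""")

    # JSON columns currently arrive as pyarrow.string(); json_arrow_type
    # would tag them with the given Arrow extension type instead.
    arrow_table = rows.to_arrow(json_arrow_type=db_dtypes.JSONArrowType())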

tests/unit/test_table_arrow.py

Lines changed: 214 additions & 0 deletions
@@ -98,6 +98,220 @@ def test_to_arrow_with_jobs_query_response():
     ]


+def test_to_arrow_with_json_arrow_type():
+    # JSONArrowType added in 1.4.0.
+    # https://github.com/googleapis/python-db-dtypes-pandas/pull/312
+    db_dtypes = pytest.importorskip("db_dtypes", minversion="1.4.0")
+
+    resource = {
+        "kind": "bigquery#queryResponse",
+        "schema": {
+            "fields": [
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
+                {
+                    "name": "reference",
+                    "type": "RECORD",
+                    "mode": "NULLABLE",
+                    "fields": [
+                        {"name": "uri", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "authorizer", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "version", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "details", "type": "JSON", "mode": "NULLABLE"},
+                    ],
+                },
+                {"name": "repeated_json", "type": "JSON", "mode": "REPEATED"},
+            ]
+        },
+        "jobReference": {
+            "projectId": "test-project",
+            "jobId": "job_ocd3cb-N62QIslU7R5qKKa2_427J",
+            "location": "US",
+        },
+        "totalRows": "9",
+        "rows": [
+            {
+                "f": [
+                    {"v": '{"key": "value1"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri1"},
+                                {"v": "auth1"},
+                                {"v": "v1"},
+                                {"v": '{"detail_key": "detail_value1"}'},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item1": 1}'}, {"v": '{"item2": 2}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value2", "num": 2}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri2"},
+                                {"v": "auth2"},
+                                {"v": "v2"},
+                                {"v": '{"detail_key": "detail_value2", "num": 2}'},
+                            ]
+                        }
+                    },
+                    {
+                        "v": [
+                            {"v": '{"item3": 3}'},
+                            {"v": '{"item4": 4}'},
+                            {"v": '{"item5": 5}'},
+                        ]
+                    },
+                ]
+            },
+            {
+                "f": [
+                    {"v": "null"},
+                    {"v": None},
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value4"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri4"},
+                                {"v": "auth4"},
+                                {"v": "v4"},
+                                {"v": None},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item6": 6}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value5"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": None},
+                                {"v": None},
+                                {"v": None},
+                                {"v": None},
+                            ]
+                        }
+                    },
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": None},
+                    {"v": None},
+                    {"v": []},  # Note: None is not supported for REPEATED fields.
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value7"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri7"},
+                                {"v": "auth7"},
+                                {"v": "v7"},
+                                {"v": '{"detail_key": "detail_value7"}'},
+                            ]
+                        }
+                    },
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value8"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri8"},
+                                {"v": "auth8"},
+                                {"v": "v8"},
+                                {"v": '{"detail_key": "detail_value8"}'},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item9": 9}'}, {"v": '{"item10": 10}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value9"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri9"},
+                                {"v": "auth9"},
+                                {"v": "v9"},
+                                {"v": '{"detail_key": "detail_value9"}'},
+                            ]
+                        }
+                    },
+                    {
+                        "v": [
+                            {"v": '{"item11": 11}'},
+                            {"v": '{"item12": 12}'},
+                            {"v": '{"item13": 13}'},
+                        ]
+                    },
+                ]
+            },
+        ],
+        "totalBytesProcessed": "154775150",
+        "jobComplete": True,
+        "cacheHit": False,
+        "queryId": "job_ocd3cb-N62QIslU7R5qKKa2_427J",
+    }
+
+    rows = google.cloud.bigquery.table.RowIterator(
+        client=None,
+        api_request=None,
+        path=None,
+        schema=[
+            bigquery.SchemaField.from_api_repr(field)
+            for field in resource["schema"]["fields"]
+        ],
+        first_page_response=resource,
+    )
+    records = rows.to_arrow(json_arrow_type=db_dtypes.JSONArrowType())
+
+    assert records.column_names == ["json", "reference", "repeated_json"]
+    assert records.num_rows == 9
+    # TODO: once the conversion in to_arrow()/to_arrow_iterable() is
+    # implemented, assert that the JSON columns use json_arrow_type.
+
+
 def test_to_arrow_with_jobs_query_response_and_max_results():
     resource = {
         "kind": "bigquery#queryResponse",

0 commit comments