Commit 3f427bb

add todos for new approach
1 parent 0a5efe9 commit 3f427bb

2 files changed: +230, -3 lines changed


google/cloud/bigquery/table.py

Lines changed: 16 additions & 3 deletions
@@ -2019,6 +2019,7 @@ def to_arrow_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
         max_stream_count: Optional[int] = None,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -2058,6 +2059,9 @@ def to_arrow_iterable(
                 especially with very large queries. In that case,
                 setting this parameter value to a value > 0 can help
                 reduce system resource consumption.
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.RecordBatch:
@@ -2078,13 +2082,17 @@ def to_arrow_iterable(
             max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
-            _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
+            _pandas_helpers.download_arrow_row_iterator,
+            iter(self.pages),
+            self.schema,
         )
-        return self._to_page_iterable(
+        for table in self._to_page_iterable(
             bqstorage_download,
             tabledata_list_download,
             bqstorage_client=bqstorage_client,
-        )
+        ):
+            # TODO: convert json_arrow_type if set.
+            yield table

     # If changing the signature of this method, make sure to apply the same
     # changes to job.QueryJob.to_arrow()
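
The TODO in the loop above (and its twin in to_arrow() below) marks where the conversion would happen. A minimal sketch of what that step might look like, assuming json_arrow_type is an Arrow extension type whose storage type is pyarrow.string(), as db_dtypes.JSONArrowType is; the helper name is hypothetical, not part of this commit, and JSON fields nested inside RECORD or REPEATED columns would need extra handling:

    import pyarrow

    def _convert_json_columns(batch, bq_schema, json_arrow_type):
        # Rebuild the record batch, swapping the string storage for the
        # JSON extension type on top-level JSON columns only.
        arrays = []
        fields = []
        for index, field in enumerate(bq_schema):
            array = batch.column(index)
            if field.field_type == "JSON" and field.mode != "REPEATED":
                # Wraps the existing string buffers without copying.
                array = pyarrow.ExtensionArray.from_storage(json_arrow_type, array)
                fields.append(pyarrow.field(field.name, json_arrow_type))
            else:
                fields.append(batch.schema.field(index))
            arrays.append(array)
        return pyarrow.RecordBatch.from_arrays(arrays, schema=pyarrow.schema(fields))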
@@ -2093,6 +2101,7 @@ def to_arrow(
         progress_bar_type: Optional[str] = None,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         create_bqstorage_client: bool = True,
+        json_arrow_type: Optional["pyarrow.DataType"] = None,
     ) -> "pyarrow.Table":
         """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
         table or query.
@@ -2134,6 +2143,9 @@ def to_arrow(
                 This argument does nothing if ``bqstorage_client`` is supplied.

                 .. versionadded:: 1.24.0
+            json_arrow_type (Optional[pyarrow.DataType]):
+                Arrow type to use for JSON columns. This defaults to
+                ``pyarrow.string()``.

         Returns:
             pyarrow.Table
@@ -2196,6 +2208,7 @@ def to_arrow(
             # `bq_to_arrow_schema` do it.
             arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
             return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)
+            # TODO: convert json columns if json_arrow_type is set.

     def to_dataframe_iterable(
         self,
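
For context, this is roughly how a caller would use the new parameter once the TODOs are implemented (a sketch; in this commit the argument is accepted but not yet applied, and the query is illustrative):

    import db_dtypes  # JSONArrowType requires db-dtypes >= 1.4.0
    from google.cloud import bigquery

    client = bigquery.Client()
    rows = client.query_and_wait("""SELECT JSON '{"key": "value"}' AS json_col""")

    # JSON columns currently arrive as pyarrow.string(); json_arrow_type
    # would tag them with the given Arrow extension type instead.
    arrow_table = rows.to_arrow(json_arrow_type=db_dtypes.JSONArrowType())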

tests/unit/test_table_arrow.py

Lines changed: 214 additions & 0 deletions
@@ -98,6 +98,220 @@ def test_to_arrow_with_jobs_query_response():
     ]


+def test_to_arrow_with_json_arrow_type():
+    # JSONArrowType added in 1.4.0.
+    # https://github.com/googleapis/python-db-dtypes-pandas/pull/312
+    db_dtypes = pytest.importorskip("db_dtypes", minversion="1.4.0")
+
+    resource = {
+        "kind": "bigquery#queryResponse",
+        "schema": {
+            "fields": [
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
+                {
+                    "name": "reference",
+                    "type": "RECORD",
+                    "mode": "NULLABLE",
+                    "fields": [
+                        {"name": "uri", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "authorizer", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "version", "type": "STRING", "mode": "NULLABLE"},
+                        {"name": "details", "type": "JSON", "mode": "NULLABLE"},
+                    ],
+                },
+                {"name": "repeated_json", "type": "JSON", "mode": "REPEATED"},
+            ]
+        },
+        "jobReference": {
+            "projectId": "test-project",
+            "jobId": "job_ocd3cb-N62QIslU7R5qKKa2_427J",
+            "location": "US",
+        },
+        "totalRows": "9",
+        "rows": [
+            {
+                "f": [
+                    {"v": '{"key": "value1"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri1"},
+                                {"v": "auth1"},
+                                {"v": "v1"},
+                                {"v": '{"detail_key": "detail_value1"}'},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item1": 1}'}, {"v": '{"item2": 2}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value2", "num": 2}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri2"},
+                                {"v": "auth2"},
+                                {"v": "v2"},
+                                {"v": '{"detail_key": "detail_value2", "num": 2}'},
+                            ]
+                        }
+                    },
+                    {
+                        "v": [
+                            {"v": '{"item3": 3}'},
+                            {"v": '{"item4": 4}'},
+                            {"v": '{"item5": 5}'},
+                        ]
+                    },
+                ]
+            },
+            {
+                "f": [
+                    {"v": "null"},
+                    {"v": None},
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value4"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri4"},
+                                {"v": "auth4"},
+                                {"v": "v4"},
+                                {"v": None},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item6": 6}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value5"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": None},
+                                {"v": None},
+                                {"v": None},
+                                {"v": None},
+                            ]
+                        }
+                    },
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": None},
+                    {"v": None},
+                    {"v": []},  # Note: None is not supported for REPEATED fields.
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value7"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri7"},
+                                {"v": "auth7"},
+                                {"v": "v7"},
+                                {"v": '{"detail_key": "detail_value7"}'},
+                            ]
+                        }
+                    },
+                    {"v": []},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value8"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri8"},
+                                {"v": "auth8"},
+                                {"v": "v8"},
+                                {"v": '{"detail_key": "detail_value8"}'},
+                            ]
+                        }
+                    },
+                    {"v": [{"v": '{"item9": 9}'}, {"v": '{"item10": 10}'}]},
+                ]
+            },
+            {
+                "f": [
+                    {"v": '{"key": "value9"}'},
+                    {
+                        "v": {
+                            "f": [
+                                {"v": "uri9"},
+                                {"v": "auth9"},
+                                {"v": "v9"},
+                                {"v": '{"detail_key": "detail_value9"}'},
+                            ]
+                        }
+                    },
+                    {
+                        "v": [
+                            {"v": '{"item11": 11}'},
+                            {"v": '{"item12": 12}'},
+                            {"v": '{"item13": 13}'},
+                        ]
+                    },
+                ]
+            },
+        ],
+        "totalBytesProcessed": "154775150",
+        "jobComplete": True,
+        "cacheHit": False,
+        "queryId": "job_ocd3cb-N62QIslU7R5qKKa2_427J",
+    }
+
+    rows = google.cloud.bigquery.table.RowIterator(
+        client=None,
+        api_request=None,
+        path=None,
+        schema=[
+            bigquery.SchemaField.from_api_repr(field)
+            for field in resource["schema"]["fields"]
+        ],
+        first_page_response=resource,
+    )
+    records = rows.to_arrow(json_arrow_type=db_dtypes.JSONArrowType())
+
+    assert records.column_names == ["json", "reference", "repeated_json"]
+    assert records.num_rows == 9
+    # TODO: once the conversion in to_arrow()/to_arrow_iterable() is
+    # implemented, assert that the JSON columns use json_arrow_type.
+
+
 def test_to_arrow_with_jobs_query_response_and_max_results():
     resource = {
         "kind": "bigquery#queryResponse",

0 commit comments