Skip to content

Commit d07ba7e

Browse files
committed
testcase update
1 parent 5955bfe commit d07ba7e

File tree

2 files changed

+37
-68
lines changed

2 files changed

+37
-68
lines changed

bigframes/core/blocks.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -711,12 +711,6 @@ def to_pandas_batches(
711711
# To reduce the number of edge cases to consider when working with the
712712
# results of this, always return at least one DataFrame. See:
713713
# b/428918844.
714-
empty_val = pd.DataFrame(
715-
{
716-
col: pd.Series([], dtype=self.expr.get_column_type(col))
717-
for col in itertools.chain(self.value_columns, self.index_columns)
718-
}
719-
)
720714
series_map = {}
721715
for col in itertools.chain(self.value_columns, self.index_columns):
722716
dtype = self.expr.get_column_type(col)
@@ -733,10 +727,8 @@ def to_pandas_batches(
733727
safe_dtype = pd.ArrowDtype(safe_pa_type)
734728
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
735729
else:
736-
# This branch should ideally not be reached if
737-
# contains_db_dtypes_json_dtype is accurate,
738-
# but it's here for MyPy's sake.
739-
series_map[col] = pd.Series([], dtype=dtype)
730+
# Fallback for other types that might error
731+
series_map[col] = pd.Series([], dtype="object").astype(dtype)
740732
empty_val = pd.DataFrame(series_map)
741733
dfs = map(
742734
lambda a: a[0],

tests/system/small/test_dataframe_io.py

Lines changed: 35 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -376,75 +376,52 @@ def test_to_pandas_batches_w_empty_dataframe(session):
376376
pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
377377

378378

379-
def test_to_pandas_batches_w_empty_dataframe_json_in_list(session):
380-
"""Tests to_pandas_batches() with an empty DataFrame containing a list of JSON.
381-
382-
Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
379+
def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session):
380+
"""Verifies to_pandas_batches() preserves dtypes for nested JSON."""
381+
# This SQL query only tests the POPULATED case.
382+
sql = """
383+
SELECT
384+
0 AS id,
385+
[JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
386+
STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
383387
"""
384-
import db_dtypes
388+
df = session.read_gbq(sql, index_col="id")
385389

386-
json_list_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))
387-
empty_df_with_json_list = bpd.DataFrame(
388-
{
389-
"idx": pd.Series([], dtype="Int64"),
390-
"json_list_col": pd.Series([], dtype=json_list_dtype),
391-
},
392-
session=session,
393-
).set_index("idx", drop=True)
390+
batches = list(df.to_pandas_batches())
394391

395-
results = list(empty_df_with_json_list.to_pandas_batches())
392+
# Check that we processed the row
393+
assert sum(len(b) for b in batches) == 1
396394

397-
assert len(results) == 1
398-
assert list(results[0].columns) == ["json_list_col"]
399-
assert results[0].dtypes["json_list_col"] == json_list_dtype
400-
assert len(results[0]) == 0
395+
# Check dtypes on the resulting batch
396+
assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
397+
assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
398+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
399+
assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
401400

402401

403-
def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
404-
"""Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON.
402+
def test_to_pandas_batches_should_not_error_on_empty_nested_json(session):
403+
"""Verify to_pandas_batches() works with empty nested JSON types.
405404
406-
Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
405+
Regression test for PyArrow limitation with empty JSON arrays.
407406
"""
408-
import db_dtypes
409-
410-
json_struct_dtype = pd.ArrowDtype(
411-
pa.struct([("json_field", db_dtypes.JSONArrowType())])
412-
)
413-
empty_df_with_json_struct = bpd.DataFrame(
414-
{
415-
"idx": pd.Series([], dtype="Int64"),
416-
"json_struct_col": pd.Series([], dtype=json_struct_dtype),
417-
},
418-
session=session,
419-
).set_index("idx", drop=True)
420-
421-
results = list(empty_df_with_json_struct.to_pandas_batches())
422-
423-
assert len(results) == 1
424-
assert list(results[0].columns) == ["json_struct_col"]
425-
assert results[0].dtypes["json_struct_col"] == json_struct_dtype
426-
assert len(results[0]) == 0
427-
428-
429-
def test_to_pandas_batches_w_empty_dataframe_simple_json(session):
430-
"""Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column.
431-
432-
Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
407+
# This SQL query is MINIMAL and tests only the EMPTY regression case.
408+
sql = """
409+
SELECT
410+
1 AS id,
411+
[] AS json_array,
412+
STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
433413
"""
434-
empty_df_with_json = bpd.DataFrame(
435-
{
436-
"idx": pd.Series([], dtype="Int64"),
437-
"json_col": pd.Series([], dtype=dtypes.JSON_DTYPE),
438-
},
439-
session=session,
440-
).set_index("idx", drop=True)
414+
df = session.read_gbq(sql, index_col="id")
441415

442-
results = list(empty_df_with_json.to_pandas_batches())
416+
# The main point of this test is that this line does not raise an error.
417+
batches = list(df.to_pandas_batches())
443418

444-
assert len(results) == 1
445-
assert list(results[0].columns) == ["json_col"]
446-
assert results[0].dtypes["json_col"] == dtypes.JSON_DTYPE
447-
assert len(results[0]) == 0
419+
# Verify the row was actually processed and not just skipped
420+
assert sum(len(b) for b in batches) == 1
421+
422+
# Verify dtypes are still correct, even with empty data
423+
assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
424+
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
448425

449426

450427
@pytest.mark.parametrize("allow_large_results", (True, False))

0 commit comments

Comments
 (0)