From 33109f27c7b164e68928139037679373a3cef9e3 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 1 Jul 2025 22:29:55 +0000 Subject: [PATCH 1/2] feat: support bpd.Series(json_data, dtype="json") --- bigframes/core/indexes/base.py | 2 ++ bigframes/dataframe.py | 2 ++ bigframes/dtypes.py | 12 +++++++----- bigframes/operations/base.py | 2 ++ tests/system/small/test_dataframe.py | 20 ++++++++++++++++++++ tests/system/small/test_index.py | 21 +++++++++++++++++++++ tests/system/small/test_series.py | 12 ++++++++++-- 7 files changed, 64 insertions(+), 7 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index f653b8700b..94f1cf4135 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -86,6 +86,8 @@ def __new__( pd_df = pandas.DataFrame(index=data) block = df.DataFrame(pd_df, session=session)._block else: + if isinstance(dtype, str): + dtype = bigframes.dtypes.bigframes_type(dtype) pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1884f0beff..cdd5dc0b44 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -198,6 +198,8 @@ def __init__( else: import bigframes.pandas + if isinstance(dtype, str): + dtype = bigframes.dtypes.bigframes_type(dtype) pd_dataframe = pandas.DataFrame( data=data, index=index, # type:ignore diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 0be31505df..f0c4f329ee 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -637,17 +637,19 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: return BIGFRAMES_STRING_TO_BIGFRAMES[ typing.cast(DtypeString, str(dtype_string)) ] + if isinstance(dtype_string, str) and dtype_string.lower() == "json": + return JSON_DTYPE raise TypeError( textwrap.dedent( f""" - Unexpected data type string {dtype_string}. The following + Unexpected data type string `{dtype_string}`. The following dtypes are supppted: 'boolean','Float64','Int64', - 'int64[pyarrow]','string','string[pyarrow]', + 'int64[pyarrow]','string','string[pyarrow]','json', 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', 'date32[day][pyarrow]','time64[us][pyarrow]'. - The following pandas.ExtensionDtype are supported: - pandas.BooleanDtype(), pandas.Float64Dtype(), - pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), + The following pandas `ExtensionDtype` are supported: + pd.BooleanDtype(), pd.Float64Dtype(), + pd.Int64Dtype(), pd.StringDtype(storage="pyarrow"), pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), pd.ArrowDtype(pa.timestamp("us")), pd.ArrowDtype(pa.timestamp("us", tz="UTC")). diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index c316d28321..89c6f13ce3 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -121,6 +121,8 @@ def __init__( bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: + if isinstance(dtype, str): + dtype = bigframes.dtypes.bigframes_type(dtype) pd_series = pd.Series( data=data, index=index, # type:ignore diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 91a83dfd73..fb51cc3b12 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -180,6 +180,26 @@ def test_df_construct_from_dict(): ) +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_df_construct_w_json_dtype(json_type): + data = [ + "1", + "false", + '["a", {"b": 1}, null]', + None, + ] + df = dataframe.DataFrame({"json_col": data}, dtype=json_type) + + assert df["json_col"].dtype == dtypes.JSON_DTYPE + assert df["json_col"][1] == "false" + + def test_df_construct_inline_respects_location(reset_default_session_and_location): # Note: This starts a thread-local session. with bpd.option_context("bigquery.location", "europe-west1"): diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index c7e316a9d2..2b2364d3bc 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -18,6 +18,7 @@ import pandas as pd import pytest +from bigframes import dtypes import bigframes.pandas as bpd from bigframes.testing.utils import assert_pandas_index_equal_ignore_index_type @@ -61,6 +62,26 @@ def test_index_construct_from_index(): pd.testing.assert_index_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_index_construct_w_json_dtype(json_type): + data = [ + "1", + "false", + '["a", {"b": 1}, null]', + None, + ] + index = bpd.Index(data, dtype=json_type) + + assert index.dtype == dtypes.JSON_DTYPE + assert index[1] == "false" + + def test_get_index(scalars_df_index, scalars_pandas_df_index): index = scalars_df_index.index bf_result = index.to_pandas() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d513b0e780..a69c6b945b 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -326,7 +326,14 @@ def test_series_construct_local_unordered_has_sequential_index(unordered_session pd.testing.assert_index_equal(series.index.to_pandas(), expected) -def test_series_construct_w_dtype_for_json(): +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_series_construct_w_json_dtype(json_type): data = [ "1", '"str"', @@ -335,8 +342,9 @@ def test_series_construct_w_dtype_for_json(): None, '{"a": {"b": [1, 2, 3], "c": true}}', ] - s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) + s = bigframes.pandas.Series(data, dtype=json_type) + assert s.dtype == dtypes.JSON_DTYPE assert s[0] == "1" assert s[1] == '"str"' assert s[2] == "false" From 83bacaeb905e5b577dcacd6069a17297ea6f534e Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 2 Jul 2025 22:13:58 +0000 Subject: [PATCH 2/2] undo dtypes._dtype_from_string and apply json only --- bigframes/core/indexes/base.py | 4 ++-- bigframes/dataframe.py | 5 +++-- bigframes/dtypes.py | 12 +++++------- bigframes/operations/base.py | 4 ++-- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 94f1cf4135..ea34a465c1 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -86,8 +86,8 @@ def __new__( pd_df = pandas.DataFrame(index=data) block = df.DataFrame(pd_df, session=session)._block else: - if isinstance(dtype, str): - dtype = bigframes.dtypes.bigframes_type(dtype) + if isinstance(dtype, str) and dtype.lower() == "json": + dtype = bigframes.dtypes.JSON_DTYPE pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index cdd5dc0b44..432ded8380 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -196,10 +196,11 @@ def __init__( block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: + if isinstance(dtype, str) and dtype.lower() == "json": + dtype = bigframes.dtypes.JSON_DTYPE + import bigframes.pandas - if isinstance(dtype, str): - dtype = bigframes.dtypes.bigframes_type(dtype) pd_dataframe = pandas.DataFrame( data=data, index=index, # type:ignore diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index f0c4f329ee..0be31505df 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -637,19 +637,17 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: return BIGFRAMES_STRING_TO_BIGFRAMES[ typing.cast(DtypeString, str(dtype_string)) ] - if isinstance(dtype_string, str) and dtype_string.lower() == "json": - return JSON_DTYPE raise TypeError( textwrap.dedent( f""" - Unexpected data type string `{dtype_string}`. The following + Unexpected data type string {dtype_string}. The following dtypes are supppted: 'boolean','Float64','Int64', - 'int64[pyarrow]','string','string[pyarrow]','json', + 'int64[pyarrow]','string','string[pyarrow]', 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', 'date32[day][pyarrow]','time64[us][pyarrow]'. - The following pandas `ExtensionDtype` are supported: - pd.BooleanDtype(), pd.Float64Dtype(), - pd.Int64Dtype(), pd.StringDtype(storage="pyarrow"), + The following pandas.ExtensionDtype are supported: + pandas.BooleanDtype(), pandas.Float64Dtype(), + pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), pd.ArrowDtype(pa.timestamp("us")), pd.ArrowDtype(pa.timestamp("us", tz="UTC")). diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 89c6f13ce3..f2bbcb3320 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -121,8 +121,8 @@ def __init__( bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) else: - if isinstance(dtype, str): - dtype = bigframes.dtypes.bigframes_type(dtype) + if isinstance(dtype, str) and dtype.lower() == "json": + dtype = bigframes.dtypes.JSON_DTYPE pd_series = pd.Series( data=data, index=index, # type:ignore