Skip to content

Commit a965cc4

Browse files
committed
fix: only show JSON dtype warning when accessing dtypes directly
1 parent 32502f4 commit a965cc4

File tree

8 files changed

+129
-45
lines changed

8 files changed

+129
-45
lines changed

bigframes/core/array_value.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import functools
1919
import typing
2020
from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
21-
import warnings
2221

2322
import google.cloud.bigquery
2423
import pandas
@@ -37,7 +36,6 @@
3736
import bigframes.core.tree_properties
3837
from bigframes.core.window_spec import WindowSpec
3938
import bigframes.dtypes
40-
import bigframes.exceptions as bfe
4139
import bigframes.operations as ops
4240
import bigframes.operations.aggregations as agg_ops
4341

@@ -101,12 +99,6 @@ def from_table(
10199
):
102100
if offsets_col and primary_key:
103101
raise ValueError("must set at most one of 'offsets', 'primary_key'")
104-
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
105-
msg = bfe.format_message(
106-
"JSON column interpretation as a custom PyArrow extension in `db_dtypes` "
107-
"is a preview feature and subject to change."
108-
)
109-
warnings.warn(msg, bfe.PreviewWarning)
110102
# define data source only for needed columns, this makes row-hashing cheaper
111103
table_def = nodes.GbqTable.from_table(table, columns=schema.names)
112104

bigframes/core/indexes/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,16 @@ def shape(self) -> typing.Tuple[int]:
171171

172172
@property
173173
def dtype(self):
174-
return self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
174+
dtype = self._block.index.dtypes[0] if self.nlevels == 1 else np.dtype("O")
175+
bigframes.dtypes.warn_on_db_dtypes_json_dtype([dtype])
176+
return dtype
175177

176178
@property
177179
def dtypes(self) -> pandas.Series:
180+
dtypes = self._block.index.dtypes
181+
bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
178182
return pandas.Series(
179-
data=self._block.index.dtypes,
183+
data=dtypes,
180184
index=typing.cast(typing.Tuple, self._block.index.names),
181185
)
182186

bigframes/dataframe.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,9 @@ def at(self) -> indexers.AtDataFrameIndexer:
321321

322322
@property
323323
def dtypes(self) -> pandas.Series:
324-
return pandas.Series(data=self._block.dtypes, index=self._block.column_labels)
324+
dtypes = self._block.dtypes
325+
bigframes.dtypes.warn_on_db_dtypes_json_dtype(dtypes)
326+
return pandas.Series(data=dtypes, index=self._block.column_labels)
325327

326328
@property
327329
def columns(self) -> pandas.Index:

bigframes/dtypes.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import textwrap
2121
import typing
2222
from typing import Any, Dict, List, Literal, Sequence, Union
23+
import warnings
2324

2425
import bigframes_vendored.constants as constants
2526
import db_dtypes # type: ignore
@@ -30,6 +31,8 @@
3031
import pyarrow as pa
3132
import shapely.geometry # type: ignore
3233

34+
import bigframes.exceptions
35+
3336
# Type hints for Pandas dtypes supported by BigQuery DataFrame
3437
Dtype = Union[
3538
pd.BooleanDtype,
@@ -62,7 +65,8 @@
6265
# No arrow equivalent
6366
GEO_DTYPE = gpd.array.GeometryDtype()
6467
# JSON
65-
# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
68+
# TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
69+
# pyarrow.json_(pyarrow.string()) when pandas 3+ and pyarrow 18+ is installed.
6670
JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
6771
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
6872
OBJ_REF_DTYPE = pd.ArrowDtype(
@@ -915,3 +919,39 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:
915919

916920

917921
TIMEDELTA_DESCRIPTION_TAG = "#microseconds"
922+
923+
924+
def contains_db_dtypes_json_arrow_type(type_):
925+
if isinstance(type_, db_dtypes.JSONArrowType):
926+
return True
927+
928+
if isinstance(type_, pa.ListType):
929+
return contains_db_dtypes_json_arrow_type(type_.value_type)
930+
931+
if isinstance(type_, pa.StructType):
932+
return any(
933+
contains_db_dtypes_json_arrow_type(field.type) for field in type_.fields
934+
)
935+
return False
936+
937+
938+
def contains_db_dtypes_json_dtype(dtype):
939+
if not isinstance(dtype, pd.ArrowDtype):
940+
return False
941+
942+
return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
943+
944+
945+
def warn_on_db_dtypes_json_dtype(dtypes):
946+
"""Warn that the JSON dtype is changing.
947+
948+
Note: only call this function if the user is explicitly checking the
949+
dtypes.
950+
"""
951+
if any(contains_db_dtypes_json_dtype(dtype) for dtype in dtypes):
952+
msg = bigframes.exceptions.format_message(
953+
"JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_()) "
954+
"instead of using `db_dtypes` in the future when available in pandas "
955+
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow."
956+
)
957+
warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)

bigframes/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ class FunctionAxisOnePreviewWarning(PreviewWarning):
111111
"""Remote Function and Managed UDF with axis=1 preview."""
112112

113113

114+
class JSONDtypeWarning(PreviewWarning):
115+
"""JSON dtype will be pd.ArrowDtype(pa.json_()) in the future."""
116+
117+
114118
class FunctionConflictTypeHintWarning(UserWarning):
115119
"""Conflicting type hints in a BigFrames function."""
116120

bigframes/series.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,12 @@ def dt(self) -> dt.DatetimeMethods:
113113

114114
@property
115115
def dtype(self):
116+
bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
116117
return self._dtype
117118

118119
@property
119120
def dtypes(self):
121+
bigframes.dtypes.warn_on_db_dtypes_json_dtype([self._dtype])
120122
return self._dtype
121123

122124
@property
Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121
import pyarrow as pa # type: ignore
2222
import pytest
23-
import shapely.geometry # type: ignore
2423

2524
import bigframes.core.compile.ibis_types
2625
import bigframes.dtypes
@@ -225,22 +224,6 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
225224
assert result == ibis_dtype
226225

227226

228-
@pytest.mark.parametrize(
229-
["python_type", "expected_dtype"],
230-
[
231-
(bool, bigframes.dtypes.BOOL_DTYPE),
232-
(int, bigframes.dtypes.INT_DTYPE),
233-
(str, bigframes.dtypes.STRING_DTYPE),
234-
(shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
235-
(shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
236-
(shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
237-
],
238-
)
239-
def test_bigframes_type_supports_python_types(python_type, expected_dtype):
240-
got_dtype = bigframes.dtypes.bigframes_type(python_type)
241-
assert got_dtype == expected_dtype
242-
243-
244227
def test_unsupported_dtype_raises_unexpected_datatype():
245228
"""Incompatible dtypes should fail when passed into BigQuery DataFrames"""
246229
with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
@@ -265,19 +248,3 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
265248
assert bigframes.core.compile.ibis_types.literal_to_ibis_scalar(literal).equals(
266249
ibis_scalar
267250
)
268-
269-
270-
@pytest.mark.parametrize(
271-
["scalar", "expected_dtype"],
272-
[
273-
(pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
274-
(pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
275-
(pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
276-
# Support NULL scalars.
277-
(pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
278-
(pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
279-
(pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
280-
],
281-
)
282-
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
283-
assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype

tests/unit/test_dtypes.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import db_dtypes
16+
import pyarrow as pa # type: ignore
17+
import pytest
18+
import shapely.geometry # type: ignore
19+
20+
import bigframes.dtypes
21+
22+
23+
@pytest.mark.parametrize(
24+
["python_type", "expected_dtype"],
25+
[
26+
(bool, bigframes.dtypes.BOOL_DTYPE),
27+
(int, bigframes.dtypes.INT_DTYPE),
28+
(str, bigframes.dtypes.STRING_DTYPE),
29+
(shapely.geometry.Point, bigframes.dtypes.GEO_DTYPE),
30+
(shapely.geometry.Polygon, bigframes.dtypes.GEO_DTYPE),
31+
(shapely.geometry.base.BaseGeometry, bigframes.dtypes.GEO_DTYPE),
32+
],
33+
)
34+
def test_bigframes_type_supports_python_types(python_type, expected_dtype):
35+
got_dtype = bigframes.dtypes.bigframes_type(python_type)
36+
assert got_dtype == expected_dtype
37+
38+
39+
@pytest.mark.parametrize(
40+
["scalar", "expected_dtype"],
41+
[
42+
(pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
43+
(pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
44+
(pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
45+
# Support NULL scalars.
46+
(pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
47+
(pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
48+
(pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
49+
],
50+
)
51+
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
52+
assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype
53+
54+
55+
@pytest.mark.parametrize(
56+
["type_", "expected"],
57+
[
58+
(pa.int64(), False),
59+
(db_dtypes.JSONArrowType(), True),
60+
(pa.struct([("int", pa.int64()), ("str", pa.string())]), False),
61+
(pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())]), True),
62+
(pa.list_(pa.int64()), False),
63+
(pa.list_(db_dtypes.JSONArrowType()), True),
64+
(
65+
pa.list_(
66+
pa.struct([("int", pa.int64()), ("json", db_dtypes.JSONArrowType())])
67+
),
68+
True,
69+
),
70+
],
71+
)
72+
def test_contains_db_dtypes_json_arrow_type(type_, expected):
73+
assert bigframes.dtypes.contains_db_dtypes_json_arrow_type(type_) == expected

0 commit comments

Comments
 (0)