diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 0be31505df..a58619dc21 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: def infer_literal_type(literal) -> typing.Optional[Dtype]: # Maybe also normalize literal to canonical python representation to remove this burden from compilers? + if isinstance(literal, pa.Scalar): + return arrow_dtype_to_bigframes_dtype(literal.type) if pd.api.types.is_list_like(literal): element_types = [infer_literal_type(i) for i in literal] common_type = lcd_type(*element_types) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index caf39bd9e9..bc773d05b2 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs): assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) -def test_assign_new_column(scalars_dfs): +@pytest.mark.parametrize( + ("literal", "expected_dtype"), + ( + pytest.param( + 2, + dtypes.INT_DTYPE, + id="INT64", + ), + # ==================================================================== + # NULL values + # + # These are regression tests for b/428999884. It needs to be possible to + # set a column to NULL with a desired type (not just the pandas default + # of float64). 
+ # ==================================================================== + pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), + pytest.param( + pa.scalar(None, type=pa.int64()), + dtypes.INT_DTYPE, + id="NULL-pyarrow-INT64", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us", tz="UTC")), + dtypes.TIMESTAMP_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us")), + dtypes.DATETIME_DTYPE, + id="NULL-pyarrow-DATETIME", + ), + ), +) +def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"new_col": 2} - df = scalars_df.assign(**kwargs) + df = scalars_df.assign(new_col=literal) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") + new_col_pd = literal + if isinstance(literal, pa.Scalar): + # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. + new_col_pd = literal.as_py() + + # Pandas might not pick the same dtype as BigFrames, but it should at least + # be castable to it. 
+ pd_result = scalars_pandas_df.assign(new_col=new_col_pd) + pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 4bb1c6589a..a04da64af0 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs": 10000, + "timeoutMs": 120_000, } }, pytest.param( diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 77392bea2f..cd23614bbf 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -272,3 +272,19 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): ValueError, ): bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"}) + + +@pytest.mark.parametrize( + ["scalar", "expected_dtype"], + [ + (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE), + # Support NULL scalars. 
+ (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE), + ], +) +def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): + assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype diff --git a/third_party/bigframes_vendored/ibis/common/temporal.py b/third_party/bigframes_vendored/ibis/common/temporal.py index 1b0e4fa985..8d84caf5a1 100644 --- a/third_party/bigframes_vendored/ibis/common/temporal.py +++ b/third_party/bigframes_vendored/ibis/common/temporal.py @@ -260,3 +260,8 @@ def _from_numpy_datetime64(value): raise TypeError("Unable to convert np.datetime64 without pandas") else: return pd.Timestamp(value).to_pydatetime() + + +@normalize_datetime.register("pyarrow.Scalar") +def _from_pyarrow_scalar(value): + return value.as_py() diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py index e390cea02c..85be0ac749 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py @@ -27,6 +27,7 @@ import bigframes_vendored.ibis.expr.datatypes as dt from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence from public import public +import pyarrow as pa import toolz @@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array: return dt.Array(highest_precedence(map(infer, values))) +@infer.register("pyarrow.Scalar") +def infer_pyarrow_scalar(value: "pa.Scalar"): + """Infer the type of a PyArrow Scalar value.""" + import bigframes_vendored.ibis.formats.pyarrow + + return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type) + + @infer.register(datetime.time) def infer_time(value: datetime.time) -> dt.Time: return dt.time @@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon: def 
normalize(typ, value): """Ensure that the Python type underlying a literal resolves to a single type.""" + if pa is not None and isinstance(value, pa.Scalar): + value = value.as_py() + dtype = dt.dtype(typ) if value is None: if not dtype.nullable: diff --git a/third_party/bigframes_vendored/ibis/formats/pyarrow.py b/third_party/bigframes_vendored/ibis/formats/pyarrow.py index a6861b52e1..491e551ec1 100644 --- a/third_party/bigframes_vendored/ibis/formats/pyarrow.py +++ b/third_party/bigframes_vendored/ibis/formats/pyarrow.py @@ -24,7 +24,6 @@ @functools.cache def _from_pyarrow_types(): import pyarrow as pa - import pyarrow_hotfix # noqa: F401 return { pa.int8(): dt.Int8, @@ -87,7 +86,6 @@ class PyArrowType(TypeMapper): def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType: """Convert a pyarrow type to an ibis type.""" import pyarrow as pa - import pyarrow_hotfix # noqa: F401 if pa.types.is_null(typ): return dt.null