Skip to content

Commit 03f9abd

Browse files
authored
Merge branch 'main' into main_chelsealin_refactor
2 parents ef6c299 + ac25618 commit 03f9abd

File tree

39 files changed

+1145
-202
lines changed

39 files changed

+1145
-202
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ repos:
2020
hooks:
2121
- id: trailing-whitespace
2222
- id: end-of-file-fixer
23-
exclude: "^tests/unit/core/compile/sqlglot/snapshots"
23+
exclude: "^tests/unit/core/compile/sqlglot/.*snapshots"
2424
- id: check-yaml
2525
- repo: https://github.com/pycqa/isort
2626
rev: 5.12.0

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.20.0...v2.21.0) (2025-09-17)
8+
9+
10+
### Features
11+
12+
* Add bigframes.bigquery.to_json ([#2078](https://github.com/googleapis/python-bigquery-dataframes/issues/2078)) ([0fc795a](https://github.com/googleapis/python-bigquery-dataframes/commit/0fc795a9fb56f469b62603462c3f0f56f52bfe04))
13+
* Support average='binary' in precision_score() ([#2080](https://github.com/googleapis/python-bigquery-dataframes/issues/2080)) ([920f381](https://github.com/googleapis/python-bigquery-dataframes/commit/920f381aec7e0a0b986886cdbc333e86335c6d7d))
14+
* Support pandas series in ai.generate_bool ([#2086](https://github.com/googleapis/python-bigquery-dataframes/issues/2086)) ([a3de53f](https://github.com/googleapis/python-bigquery-dataframes/commit/a3de53f68b2a24f4ed85a474dfaff9b59570a2f1))
15+
16+
17+
### Bug Fixes
18+
19+
* Allow bigframes.options.bigquery.credentials to be `None` ([#2092](https://github.com/googleapis/python-bigquery-dataframes/issues/2092)) ([78f4001](https://github.com/googleapis/python-bigquery-dataframes/commit/78f4001e8fcfc77fc82f3893d58e0d04c0f6d3db))
20+
721
## [2.20.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.19.0...v2.20.0) (2025-09-16)
822

923

bigframes/bigquery/_operations/ai.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,25 @@
1919
from __future__ import annotations
2020

2121
import json
22-
from typing import Any, List, Literal, Mapping, Tuple
22+
from typing import Any, List, Literal, Mapping, Tuple, Union
2323

24-
from bigframes import clients, dtypes, series
25-
from bigframes.core import log_adapter
24+
import pandas as pd
25+
26+
from bigframes import clients, dtypes, series, session
27+
from bigframes.core import convert, log_adapter
2628
from bigframes.operations import ai_ops
2729

30+
PROMPT_TYPE = Union[
31+
series.Series,
32+
pd.Series,
33+
List[Union[str, series.Series, pd.Series]],
34+
Tuple[Union[str, series.Series, pd.Series], ...],
35+
]
36+
2837

2938
@log_adapter.method_logger(custom_base_name="bigquery_ai")
3039
def generate_bool(
31-
prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
40+
prompt: PROMPT_TYPE,
3241
*,
3342
connection_id: str | None = None,
3443
endpoint: str | None = None,
@@ -51,7 +60,7 @@ def generate_bool(
5160
0 {'result': True, 'full_response': '{"candidate...
5261
1 {'result': True, 'full_response': '{"candidate...
5362
2 {'result': False, 'full_response': '{"candidat...
54-
dtype: struct<result: bool, full_response: string, status: string>[pyarrow]
63+
dtype: struct<result: bool, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
5564
5665
>>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result")
5766
0 True
@@ -60,8 +69,9 @@ def generate_bool(
6069
Name: result, dtype: boolean
6170
6271
Args:
63-
prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series, ...]):
64-
A mixture of Series and string literals that specifies the prompt to send to the model.
72+
prompt (Series | List[str|Series] | Tuple[str|Series, ...]):
73+
A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series
74+
or pandas Series.
6575
connection_id (str, optional):
6676
Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`.
6777
If not provided, the connection from the current session will be used.
@@ -84,7 +94,7 @@ def generate_bool(
8494
Returns:
8595
bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
8696
* "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
87-
* "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model.
97+
* "full_response": a JSON value containing the response from the projects.locations.endpoints.generateContent call to the model.
8898
The generated text is in the text element.
8999
* "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful.
90100
"""
@@ -104,7 +114,7 @@ def generate_bool(
104114

105115

106116
def _separate_context_and_series(
107-
prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
117+
prompt: PROMPT_TYPE,
108118
) -> Tuple[List[str | None], List[series.Series]]:
109119
"""
110120
Returns two values. The first value is the prompt with all series replaced by None. The second value is all the series
@@ -123,18 +133,19 @@ def _separate_context_and_series(
123133
return [None], [prompt]
124134

125135
prompt_context: List[str | None] = []
126-
series_list: List[series.Series] = []
136+
series_list: List[series.Series | pd.Series] = []
127137

138+
session = None
128139
for item in prompt:
129140
if isinstance(item, str):
130141
prompt_context.append(item)
131142

132-
elif isinstance(item, series.Series):
143+
elif isinstance(item, (series.Series, pd.Series)):
133144
prompt_context.append(None)
134145

135-
if item.dtype == dtypes.OBJ_REF_DTYPE:
136-
# Multi-model support
137-
item = item.blob.read_url()
146+
if isinstance(item, series.Series) and session is None:
147+
# Use the first available BigFrames session, if there is one.
148+
session = item._session
138149
series_list.append(item)
139150

140151
else:
@@ -143,7 +154,20 @@ def _separate_context_and_series(
143154
if not series_list:
144155
raise ValueError("Please provide at least one Series in the prompt")
145156

146-
return prompt_context, series_list
157+
converted_list = [_convert_series(s, session) for s in series_list]
158+
159+
return prompt_context, converted_list
160+
161+
162+
def _convert_series(
163+
s: series.Series | pd.Series, session: session.Session | None
164+
) -> series.Series:
165+
result = convert.to_bf_series(s, default_index=None, session=session)
166+
167+
if result.dtype == dtypes.OBJ_REF_DTYPE:
168+
# Support multimodal input
169+
return result.blob.read_url()
170+
return result
147171

148172

149173
def _resolve_connection_id(series: series.Series, connection_id: str | None):

bigframes/core/array_value.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,14 @@ def relational_join(
480480
type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner",
481481
propogate_order: Optional[bool] = None,
482482
) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]:
483+
for lcol, rcol in conditions:
484+
ltype = self.get_column_type(lcol)
485+
rtype = other.get_column_type(rcol)
486+
if not bigframes.dtypes.can_compare(ltype, rtype):
487+
raise TypeError(
488+
f"Cannot join with non-comparable join key types: {ltype}, {rtype}"
489+
)
490+
483491
l_mapping = { # Identity mapping, only rename right side
484492
lcol.name: lcol.name for lcol in self.node.ids
485493
}

bigframes/core/blocks.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,10 @@ def from_local(
252252
pass
253253
return block
254254

255+
@property
256+
def has_index(self) -> bool:
257+
return len(self._index_columns) > 0
258+
255259
@property
256260
def index(self) -> BlockIndexProperties:
257261
"""Row identities for values in the Block."""
@@ -1371,10 +1375,16 @@ def aggregate(
13711375
) -> typing.Tuple[Block, typing.Sequence[str]]:
13721376
"""
13731377
Apply aggregations to the block.
1378+
13741379
Arguments:
13751380
by_column_id: column id of the aggregation key; this is preserved through the transform and used as the index.
13761381
aggregations: input_column_id, operation tuples
13771382
dropna: whether null keys should be dropped
1383+
1384+
Returns:
1385+
Tuple[Block, Sequence[str]]:
1386+
The first element is the grouped block. The second is the
1387+
column IDs corresponding to each applied aggregation.
13781388
"""
13791389
if column_labels is None:
13801390
column_labels = pd.Index(range(len(aggregations)))
@@ -1780,15 +1790,19 @@ def pivot(
17801790
else:
17811791
return result_block.with_column_labels(columns_values)
17821792

1783-
def stack(self, how="left", levels: int = 1):
1793+
def stack(
1794+
self, how="left", levels: int = 1, *, override_labels: Optional[pd.Index] = None
1795+
):
17841796
"""Unpivot last column axis level into row axis"""
17851797
if levels == 0:
17861798
return self
17871799

17881800
# These are the values that will be turned into rows
17891801

17901802
col_labels, row_labels = utils.split_index(self.column_labels, levels=levels)
1791-
row_labels = row_labels.drop_duplicates()
1803+
row_labels = (
1804+
row_labels.drop_duplicates() if override_labels is None else override_labels
1805+
)
17921806

17931807
if col_labels is None:
17941808
result_index: pd.Index = pd.Index([None])

bigframes/core/compile/api.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,18 @@
1515

1616
from typing import TYPE_CHECKING
1717

18-
from bigframes.core import rewrite
19-
from bigframes.core.compile.ibis_compiler import ibis_compiler
20-
2118
if TYPE_CHECKING:
2219
import bigframes.core.nodes
2320

2421

2522
def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
2623
"""Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
24+
from bigframes.core.compile.ibis_compiler import ibis_compiler
25+
import bigframes.core.rewrite
2726
import bigframes.core.schema
2827

2928
node = ibis_compiler._replace_unsupported_ops(node)
30-
node = rewrite.bake_order(node)
29+
node = bigframes.core.rewrite.bake_order(node)
3130
ir = ibis_compiler.compile_node(node)
3231
items = tuple(
3332
bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import typing
1818

19+
import pandas as pd
1920
import sqlglot.expressions as sge
2021

2122
from bigframes import dtypes
@@ -46,18 +47,22 @@ def _(
4647
return apply_window_if_present(sge.func("COUNT", column.expr), window)
4748

4849

49-
@UNARY_OP_REGISTRATION.register(agg_ops.SumOp)
50+
@UNARY_OP_REGISTRATION.register(agg_ops.MaxOp)
5051
def _(
51-
op: agg_ops.SumOp,
52+
op: agg_ops.MaxOp,
5253
column: typed_expr.TypedExpr,
5354
window: typing.Optional[window_spec.WindowSpec] = None,
5455
) -> sge.Expression:
55-
expr = column.expr
56-
if column.dtype == dtypes.BOOL_DTYPE:
57-
expr = sge.Cast(this=column.expr, to="INT64")
58-
# Will be null if all inputs are null. Pandas defaults to zero sum though.
59-
expr = apply_window_if_present(sge.func("SUM", expr), window)
60-
return sge.func("IFNULL", expr, ir._literal(0, column.dtype))
56+
return apply_window_if_present(sge.func("MAX", column.expr), window)
57+
58+
59+
@UNARY_OP_REGISTRATION.register(agg_ops.MinOp)
60+
def _(
61+
op: agg_ops.MinOp,
62+
column: typed_expr.TypedExpr,
63+
window: typing.Optional[window_spec.WindowSpec] = None,
64+
) -> sge.Expression:
65+
return apply_window_if_present(sge.func("MIN", column.expr), window)
6166

6267

6368
@UNARY_OP_REGISTRATION.register(agg_ops.SizeUnaryOp)
@@ -67,3 +72,20 @@ def _(
6772
window: typing.Optional[window_spec.WindowSpec] = None,
6873
) -> sge.Expression:
6974
return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window)
75+
76+
77+
@UNARY_OP_REGISTRATION.register(agg_ops.SumOp)
78+
def _(
79+
op: agg_ops.SumOp,
80+
column: typed_expr.TypedExpr,
81+
window: typing.Optional[window_spec.WindowSpec] = None,
82+
) -> sge.Expression:
83+
expr = column.expr
84+
if column.dtype == dtypes.BOOL_DTYPE:
85+
expr = sge.Cast(this=column.expr, to="INT64")
86+
87+
expr = apply_window_if_present(sge.func("SUM", expr), window)
88+
89+
# SUM is null if all inputs are null, but pandas defaults to a sum of zero.
90+
zero = pd.to_timedelta(0) if column.dtype == dtypes.TIMEDELTA_DTYPE else 0
91+
return sge.func("IFNULL", expr, ir._literal(zero, column.dtype))

bigframes/core/compile/sqlglot/scalar_compiler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def register_unary_op(
7979
"""
8080
key = typing.cast(str, op_ref.name)
8181

82-
def decorator(impl: typing.Callable[..., TypedExpr]):
82+
def decorator(impl: typing.Callable[..., sge.Expression]):
8383
def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
8484
if pass_op:
8585
return impl(args[0], op)
@@ -108,7 +108,7 @@ def register_binary_op(
108108
"""
109109
key = typing.cast(str, op_ref.name)
110110

111-
def decorator(impl: typing.Callable[..., TypedExpr]):
111+
def decorator(impl: typing.Callable[..., sge.Expression]):
112112
def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
113113
if pass_op:
114114
return impl(args[0], args[1], op)
@@ -132,7 +132,7 @@ def register_ternary_op(
132132
"""
133133
key = typing.cast(str, op_ref.name)
134134

135-
def decorator(impl: typing.Callable[..., TypedExpr]):
135+
def decorator(impl: typing.Callable[..., sge.Expression]):
136136
def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
137137
return impl(args[0], args[1], args[2])
138138

@@ -156,7 +156,7 @@ def register_nary_op(
156156
"""
157157
key = typing.cast(str, op_ref.name)
158158

159-
def decorator(impl: typing.Callable[..., TypedExpr]):
159+
def decorator(impl: typing.Callable[..., sge.Expression]):
160160
def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
161161
if pass_op:
162162
return impl(*args, op=op)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Optional, Sequence, Tuple, Union
19+
from typing import Iterable, Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -29,7 +29,7 @@
2929
from bigframes.core import log_adapter
3030
import bigframes.core.block_transforms as block_ops
3131
import bigframes.core.blocks as blocks
32-
from bigframes.core.groupby import aggs, series_group_by
32+
from bigframes.core.groupby import aggs, group_by, series_group_by
3333
import bigframes.core.ordering as order
3434
import bigframes.core.utils as utils
3535
import bigframes.core.validations as validations
@@ -54,6 +54,7 @@ def __init__(
5454
selected_cols: typing.Optional[typing.Sequence[str]] = None,
5555
dropna: bool = True,
5656
as_index: bool = True,
57+
by_key_is_singular: bool = False,
5758
):
5859
# TODO(tbergeron): Support more group-by expression types
5960
self._block = block
@@ -64,6 +65,9 @@ def __init__(
6465
)
6566
}
6667
self._by_col_ids = by_col_ids
68+
self._by_key_is_singular = by_key_is_singular
69+
if by_key_is_singular:
70+
assert len(by_col_ids) == 1, "singular key should be exactly one group key"
6771

6872
self._dropna = dropna
6973
self._as_index = as_index
@@ -149,6 +153,30 @@ def head(self, n: int = 5) -> df.DataFrame:
149153
)
150154
)
151155

156+
def describe(self, include: None | Literal["all"] = None):
157+
from bigframes.pandas.core.methods import describe
158+
159+
return df.DataFrame(
160+
describe._describe(
161+
self._block,
162+
self._selected_cols,
163+
include,
164+
as_index=self._as_index,
165+
by_col_ids=self._by_col_ids,
166+
dropna=self._dropna,
167+
)
168+
)
169+
170+
def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
171+
for group_keys, filtered_block in group_by.block_groupby_iter(
172+
self._block,
173+
by_col_ids=self._by_col_ids,
174+
by_key_is_singular=self._by_key_is_singular,
175+
dropna=self._dropna,
176+
):
177+
filtered_df = df.DataFrame(filtered_block)
178+
yield group_keys, filtered_df
179+
152180
def size(self) -> typing.Union[df.DataFrame, series.Series]:
153181
agg_block, _ = self._block.aggregate_size(
154182
by_column_ids=self._by_col_ids,

0 commit comments

Comments
 (0)