Skip to content

Commit 9aaf580

Browse files
authored
Merge branch 'main' into tswast-st-regionstats
2 parents e000ce0 + 196f6df commit 9aaf580

File tree

27 files changed

+573
-126
lines changed

27 files changed

+573
-126
lines changed

bigframes/core/blocks.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,6 +1996,31 @@ def _generate_resample_label(
19961996
Literal["epoch", "start", "start_day", "end", "end_day"],
19971997
] = "start_day",
19981998
) -> Block:
1999+
if not isinstance(rule, str):
2000+
raise NotImplementedError(
2001+
f"Only offset strings are currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
2002+
)
2003+
2004+
if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"):
2005+
raise NotImplementedError(
2006+
f"Offset strings 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', 'W' are not currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
2007+
)
2008+
2009+
if closed == "right":
2010+
raise NotImplementedError(
2011+
f"Only closed='left' is currently supported. {constants.FEEDBACK_LINK}",
2012+
)
2013+
2014+
if label == "right":
2015+
raise NotImplementedError(
2016+
f"Only label='left' is currently supported. {constants.FEEDBACK_LINK}",
2017+
)
2018+
2019+
if origin not in ("epoch", "start", "start_day"):
2020+
raise NotImplementedError(
2021+
f"Only origin='epoch', 'start', 'start_day' are currently supported, but got {repr(origin)}. {constants.FEEDBACK_LINK}"
2022+
)
2023+
19992024
# Validate and resolve the index or column to use for grouping
20002025
if on is None:
20012026
if len(self.index_columns) == 0:

bigframes/core/compile/sqlglot/aggregations/op_registration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,5 @@ def arg_checker(*args, **kwargs):
5252
def __getitem__(self, op: str | agg_ops.WindowOp) -> CompilationFunc:
5353
key = op if isinstance(op, type) else type(op)
5454
if str(key) not in self._registered_ops:
55-
raise ValueError(f"{key} is already not registered")
55+
raise ValueError(f"{key} is not registered")
5656
return self._registered_ops[str(key)]

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,20 @@ def _(
239239
return apply_window_if_present(sge.func("MIN", column.expr), window)
240240

241241

242+
@UNARY_OP_REGISTRATION.register(agg_ops.PopVarOp)
243+
def _(
244+
op: agg_ops.PopVarOp,
245+
column: typed_expr.TypedExpr,
246+
window: typing.Optional[window_spec.WindowSpec] = None,
247+
) -> sge.Expression:
248+
expr = column.expr
249+
if column.dtype == dtypes.BOOL_DTYPE:
250+
expr = sge.Cast(this=expr, to="INT64")
251+
252+
expr = sge.func("VAR_POP", expr)
253+
return apply_window_if_present(expr, window)
254+
255+
242256
@UNARY_OP_REGISTRATION.register(agg_ops.QuantileOp)
243257
def _(
244258
op: agg_ops.QuantileOp,
@@ -278,6 +292,22 @@ def _(
278292
return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window)
279293

280294

295+
@UNARY_OP_REGISTRATION.register(agg_ops.StdOp)
296+
def _(
297+
op: agg_ops.StdOp,
298+
column: typed_expr.TypedExpr,
299+
window: typing.Optional[window_spec.WindowSpec] = None,
300+
) -> sge.Expression:
301+
expr = column.expr
302+
if column.dtype == dtypes.BOOL_DTYPE:
303+
expr = sge.Cast(this=expr, to="INT64")
304+
305+
expr = sge.func("STDDEV", expr)
306+
if op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE:
307+
expr = sge.Cast(this=sge.func("FLOOR", expr), to="INT64")
308+
return apply_window_if_present(expr, window)
309+
310+
281311
@UNARY_OP_REGISTRATION.register(agg_ops.ShiftOp)
282312
def _(
283313
op: agg_ops.ShiftOp,
@@ -331,3 +361,17 @@ def _(
331361
expression=shifted,
332362
unit=sge.Identifier(this="MICROSECOND"),
333363
)
364+
365+
366+
@UNARY_OP_REGISTRATION.register(agg_ops.VarOp)
367+
def _(
368+
op: agg_ops.VarOp,
369+
column: typed_expr.TypedExpr,
370+
window: typing.Optional[window_spec.WindowSpec] = None,
371+
) -> sge.Expression:
372+
expr = column.expr
373+
if column.dtype == dtypes.BOOL_DTYPE:
374+
expr = sge.Cast(this=expr, to="INT64")
375+
376+
expr = sge.func("VAR_SAMP", expr)
377+
return apply_window_if_present(expr, window)

bigframes/core/compile/sqlglot/expressions/comparison_ops.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
109109
return sge.LTE(this=left_expr, expression=right_expr)
110110

111111

112+
@register_binary_op(ops.maximum_op)
113+
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
114+
return sge.Greatest(expressions=[left.expr, right.expr])
115+
116+
112117
@register_binary_op(ops.minimum_op)
113118
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
114119
return sge.Least(this=left.expr, expressions=right.expr)

bigframes/core/compile/sqlglot/expressions/generic_ops.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
2525

2626
register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op
27+
register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op
2728
register_nary_op = scalar_compiler.scalar_op_compiler.register_nary_op
2829
register_ternary_op = scalar_compiler.scalar_op_compiler.register_ternary_op
2930

@@ -159,6 +160,13 @@ def _(*cases_and_outputs: TypedExpr) -> sge.Expression:
159160
)
160161

161162

163+
@register_binary_op(ops.coalesce_op)
164+
def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
165+
if left.expr == right.expr:
166+
return left.expr
167+
return sge.Coalesce(this=left.expr, expressions=[right.expr])
168+
169+
162170
@register_nary_op(ops.RowKey)
163171
def _(*values: TypedExpr) -> sge.Expression:
164172
# All inputs into hash must be non-null or resulting hash will be null

bigframes/dataframe.py

Lines changed: 5 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -4182,10 +4182,12 @@ def _split(
41824182
return [DataFrame(block) for block in blocks]
41834183

41844184
@validations.requires_ordering()
4185-
def _resample(
4185+
def resample(
41864186
self,
41874187
rule: str,
41884188
*,
4189+
closed: Optional[Literal["right", "left"]] = None,
4190+
label: Optional[Literal["right", "left"]] = None,
41894191
on: blocks.Label = None,
41904192
level: Optional[LevelsType] = None,
41914193
origin: Union[
@@ -4195,64 +4197,10 @@ def _resample(
41954197
Literal["epoch", "start", "start_day", "end", "end_day"],
41964198
] = "start_day",
41974199
) -> bigframes.core.groupby.DataFrameGroupBy:
4198-
"""Internal function to support resample. Resample time-series data.
4199-
4200-
**Examples:**
4201-
4202-
>>> import bigframes.pandas as bpd
4203-
>>> data = {
4204-
... "timestamp_col": pd.date_range(
4205-
... start="2021-01-01 13:00:00", periods=30, freq="1s"
4206-
... ),
4207-
... "int64_col": range(30),
4208-
... "int64_too": range(10, 40),
4209-
... }
4210-
4211-
Resample on a DataFrame with index:
4212-
4213-
>>> df = bpd.DataFrame(data).set_index("timestamp_col")
4214-
>>> df._resample(rule="7s").min()
4215-
int64_col int64_too
4216-
2021-01-01 12:59:55 0 10
4217-
2021-01-01 13:00:02 2 12
4218-
2021-01-01 13:00:09 9 19
4219-
2021-01-01 13:00:16 16 26
4220-
2021-01-01 13:00:23 23 33
4221-
<BLANKLINE>
4222-
[5 rows x 2 columns]
4223-
4224-
Resample with column and origin set to 'start':
4225-
4226-
>>> df = bpd.DataFrame(data)
4227-
>>> df._resample(rule="7s", on = "timestamp_col", origin="start").min()
4228-
int64_col int64_too
4229-
2021-01-01 13:00:00 0 10
4230-
2021-01-01 13:00:07 7 17
4231-
2021-01-01 13:00:14 14 24
4232-
2021-01-01 13:00:21 21 31
4233-
2021-01-01 13:00:28 28 38
4234-
<BLANKLINE>
4235-
[5 rows x 2 columns]
4236-
4237-
Args:
4238-
rule (str):
4239-
The offset string representing target conversion.
4240-
on (str, default None):
4241-
For a DataFrame, column to use instead of index for resampling. Column
4242-
must be datetime-like.
4243-
level (str or int, default None):
4244-
For a MultiIndex, level (name or number) to use for resampling.
4245-
level must be datetime-like.
4246-
origin(str, default 'start_day'):
4247-
The timestamp on which to adjust the grouping. Must be one of the following:
4248-
'epoch': origin is 1970-01-01
4249-
'start': origin is the first value of the timeseries
4250-
'start_day': origin is the first day at midnight of the timeseries
4251-
Returns:
4252-
DataFrameGroupBy: DataFrameGroupBy object.
4253-
"""
42544200
block = self._block._generate_resample_label(
42554201
rule=rule,
4202+
closed=closed,
4203+
label=label,
42564204
on=on,
42574205
level=level,
42584206
origin=origin,

bigframes/series.py

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2505,7 +2505,7 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
25052505
)
25062506

25072507
@validations.requires_ordering()
2508-
def _resample(
2508+
def resample(
25092509
self,
25102510
rule: str,
25112511
*,
@@ -2519,43 +2519,6 @@ def _resample(
25192519
Literal["epoch", "start", "start_day", "end", "end_day"],
25202520
] = "start_day",
25212521
) -> bigframes.core.groupby.SeriesGroupBy:
2522-
"""Internal function to support resample. Resample time-series data.
2523-
2524-
**Examples:**
2525-
2526-
>>> import bigframes.pandas as bpd
2527-
>>> data = {
2528-
... "timestamp_col": pd.date_range(
2529-
... start="2021-01-01 13:00:00", periods=30, freq="1s"
2530-
... ),
2531-
... "int64_col": range(30),
2532-
... }
2533-
>>> s = bpd.DataFrame(data).set_index("timestamp_col")
2534-
>>> s._resample(rule="7s", origin="epoch").min()
2535-
int64_col
2536-
2021-01-01 12:59:56 0
2537-
2021-01-01 13:00:03 3
2538-
2021-01-01 13:00:10 10
2539-
2021-01-01 13:00:17 17
2540-
2021-01-01 13:00:24 24
2541-
<BLANKLINE>
2542-
[5 rows x 1 columns]
2543-
2544-
2545-
Args:
2546-
rule (str):
2547-
The offset string representing target conversion.
2548-
level (str or int, default None):
2549-
For a MultiIndex, level (name or number) to use for resampling.
2550-
level must be datetime-like.
2551-
origin(str, default 'start_day'):
2552-
The timestamp on which to adjust the grouping. Must be one of the following:
2553-
'epoch': origin is 1970-01-01
2554-
'start': origin is the first value of the timeseries
2555-
'start_day': origin is the first day at midnight of the timeseries
2556-
Returns:
2557-
SeriesGroupBy: SeriesGroupBy object.
2558-
"""
25592522
block = self._block._generate_resample_label(
25602523
rule=rule,
25612524
closed=closed,

tests/system/small/engines/test_aggregation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def test_engines_unary_aggregates(
111111
assert_equivalence_execution(node, REFERENCE_ENGINE, engine)
112112

113113

114-
@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
114+
@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
115115
@pytest.mark.parametrize(
116116
"op",
117117
[agg_ops.std_op, agg_ops.var_op, agg_ops.PopVarOp()],

tests/system/small/engines/test_generic_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ def test_engines_where_op(scalars_array_value: array_value.ArrayValue, engine):
329329
assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)
330330

331331

332-
@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
332+
@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
333333
def test_engines_coalesce_op(scalars_array_value: array_value.ArrayValue, engine):
334334
arr, _ = scalars_array_value.compute_values(
335335
[

tests/system/small/test_dataframe.py

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5915,21 +5915,15 @@ def test_dataframe_explode_xfail(col_names):
59155915
pytest.param("datetime_col", "5M", "epoch"),
59165916
pytest.param("datetime_col", "3Q", "start_day"),
59175917
pytest.param("datetime_col", "3YE", "start"),
5918-
pytest.param(
5919-
"int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError)
5920-
),
5921-
pytest.param(
5922-
"datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError)
5923-
),
59245918
],
59255919
)
5926-
def test__resample_with_column(
5920+
def test_resample_with_column(
59275921
scalars_df_index, scalars_pandas_df_index, on, rule, origin
59285922
):
59295923
# TODO: supply a reason why this isn't compatible with pandas 1.x
59305924
pytest.importorskip("pandas", minversion="2.0.0")
59315925
bf_result = (
5932-
scalars_df_index._resample(rule=rule, on=on, origin=origin)[
5926+
scalars_df_index.resample(rule=rule, on=on, origin=origin)[
59335927
["int64_col", "int64_too"]
59345928
]
59355929
.max()
@@ -5943,30 +5937,54 @@ def test__resample_with_column(
59435937
)
59445938

59455939

5940+
@pytest.mark.parametrize("index_col", ["timestamp_col", "datetime_col"])
5941+
@pytest.mark.parametrize(
5942+
("index_append", "level"),
5943+
[(True, 1), (False, None), (False, 0)],
5944+
)
59465945
@pytest.mark.parametrize(
5947-
("append", "level", "col", "rule"),
5946+
"rule",
59485947
[
5949-
pytest.param(False, None, "timestamp_col", "100d"),
5950-
pytest.param(True, 1, "timestamp_col", "1200h"),
5951-
pytest.param(False, None, "datetime_col", "100d"),
5948+
# TODO(tswast): support timedeltas and dataoffsets.
5949+
# TODO(tswast): support bins that default to "right".
5950+
"100d",
5951+
"1200h",
59525952
],
59535953
)
5954-
def test__resample_with_index(
5955-
scalars_df_index, scalars_pandas_df_index, append, level, col, rule
5954+
# TODO(tswast): support "right"
5955+
@pytest.mark.parametrize("closed", ["left", None])
5956+
# TODO(tswast): support "right"
5957+
@pytest.mark.parametrize("label", ["left", None])
5958+
@pytest.mark.parametrize(
5959+
"origin",
5960+
["epoch", "start", "start_day"], # TODO(tswast): support end, end_day.
5961+
)
5962+
def test_resample_with_index(
5963+
scalars_df_index,
5964+
scalars_pandas_df_index,
5965+
index_append,
5966+
level,
5967+
index_col,
5968+
rule,
5969+
closed,
5970+
origin,
5971+
label,
59565972
):
59575973
# TODO: supply a reason why this isn't compatible with pandas 1.x
59585974
pytest.importorskip("pandas", minversion="2.0.0")
5959-
scalars_df_index = scalars_df_index.set_index(col, append=append)
5960-
scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)
5975+
scalars_df_index = scalars_df_index.set_index(index_col, append=index_append)
5976+
scalars_pandas_df_index = scalars_pandas_df_index.set_index(
5977+
index_col, append=index_append
5978+
)
59615979
bf_result = (
59625980
scalars_df_index[["int64_col", "int64_too"]]
5963-
._resample(rule=rule, level=level)
5981+
.resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
59645982
.min()
59655983
.to_pandas()
59665984
)
59675985
pd_result = (
59685986
scalars_pandas_df_index[["int64_col", "int64_too"]]
5969-
.resample(rule=rule, level=level)
5987+
.resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
59705988
.min()
59715989
)
59725990
assert_pandas_df_equal(bf_result, pd_result)
@@ -6010,15 +6028,15 @@ def test__resample_with_index(
60106028
),
60116029
],
60126030
)
6013-
def test__resample_start_time(rule, origin, data):
6031+
def test_resample_start_time(rule, origin, data):
60146032
# TODO: supply a reason why this isn't compatible with pandas 1.x
60156033
pytest.importorskip("pandas", minversion="2.0.0")
60166034
col = "timestamp_col"
60176035
scalars_df_index = bpd.DataFrame(data).set_index(col)
60186036
scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
60196037
scalars_pandas_df_index.index.name = None
60206038

6021-
bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
6039+
bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas()
60226040

60236041
pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()
60246042

0 commit comments

Comments
 (0)