Skip to content

Commit 1431c24

Browse files
feat: Support some python standard lib callables in apply/combine
1 parent 2c50310 commit 1431c24

File tree

5 files changed

+296
-69
lines changed

5 files changed

+296
-69
lines changed

bigframes/core/compile/polars/operations/numeric_ops.py

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,6 @@
2929
import polars as pl
3030

3131

32-
@polars_compiler.register_op(numeric_ops.CosOp)
33-
def cos_op_impl(
34-
compiler: polars_compiler.PolarsExpressionCompiler,
35-
op: numeric_ops.CosOp, # type: ignore
36-
input: pl.Expr,
37-
) -> pl.Expr:
38-
return input.cos()
39-
40-
4132
@polars_compiler.register_op(numeric_ops.LnOp)
4233
def ln_op_impl(
4334
compiler: polars_compiler.PolarsExpressionCompiler,
@@ -80,6 +71,78 @@ def sin_op_impl(
8071
return input.sin()
8172

8273

74+
@polars_compiler.register_op(numeric_ops.CosOp)
75+
def cos_op_impl(
76+
compiler: polars_compiler.PolarsExpressionCompiler,
77+
op: numeric_ops.CosOp, # type: ignore
78+
input: pl.Expr,
79+
) -> pl.Expr:
80+
return input.cos()
81+
82+
83+
@polars_compiler.register_op(numeric_ops.TanOp)
84+
def tan_op_impl(
85+
compiler: polars_compiler.PolarsExpressionCompiler,
86+
op: numeric_ops.SinOp, # type: ignore
87+
input: pl.Expr,
88+
) -> pl.Expr:
89+
return input.tan()
90+
91+
92+
@polars_compiler.register_op(numeric_ops.SinhOp)
93+
def sinh_op_impl(
94+
compiler: polars_compiler.PolarsExpressionCompiler,
95+
op: numeric_ops.SinOp, # type: ignore
96+
input: pl.Expr,
97+
) -> pl.Expr:
98+
return input.sinh()
99+
100+
101+
@polars_compiler.register_op(numeric_ops.CoshOp)
102+
def cosh_op_impl(
103+
compiler: polars_compiler.PolarsExpressionCompiler,
104+
op: numeric_ops.CosOp, # type: ignore
105+
input: pl.Expr,
106+
) -> pl.Expr:
107+
return input.cosh()
108+
109+
110+
@polars_compiler.register_op(numeric_ops.TanhOp)
111+
def tanh_op_impl(
112+
compiler: polars_compiler.PolarsExpressionCompiler,
113+
op: numeric_ops.SinOp, # type: ignore
114+
input: pl.Expr,
115+
) -> pl.Expr:
116+
return input.tanh()
117+
118+
119+
@polars_compiler.register_op(numeric_ops.ArcsinOp)
120+
def asin_op_impl(
121+
compiler: polars_compiler.PolarsExpressionCompiler,
122+
op: numeric_ops.ArcsinOp, # type: ignore
123+
input: pl.Expr,
124+
) -> pl.Expr:
125+
return input.arcsin()
126+
127+
128+
@polars_compiler.register_op(numeric_ops.ArccosOp)
129+
def acos_op_impl(
130+
compiler: polars_compiler.PolarsExpressionCompiler,
131+
op: numeric_ops.ArccosOp, # type: ignore
132+
input: pl.Expr,
133+
) -> pl.Expr:
134+
return input.arccos()
135+
136+
137+
@polars_compiler.register_op(numeric_ops.ArctanOp)
138+
def atan_op_impl(
139+
compiler: polars_compiler.PolarsExpressionCompiler,
140+
op: numeric_ops.ArctanOp, # type: ignore
141+
input: pl.Expr,
142+
) -> pl.Expr:
143+
return input.arctan()
144+
145+
83146
@polars_compiler.register_op(numeric_ops.SqrtOp)
84147
def sqrt_op_impl(
85148
compiler: polars_compiler.PolarsExpressionCompiler,
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import math
16+
import operator
17+
from typing import Optional
18+
19+
import bigframes.operations
20+
from bigframes.operations import (
21+
aggregations,
22+
array_ops,
23+
bool_ops,
24+
comparison_ops,
25+
numeric_ops,
26+
string_ops,
27+
)
28+
29+
PYTHON_TO_BIGFRAMES = {
30+
## operators
31+
operator.add: numeric_ops.add_op,
32+
operator.sub: numeric_ops.sub_op,
33+
operator.mul: numeric_ops.mul_op,
34+
operator.truediv: numeric_ops.div_op,
35+
operator.mod: numeric_ops.mod_op,
36+
operator.pow: numeric_ops.pow_op,
37+
operator.pos: numeric_ops.pos_op,
38+
operator.neg: numeric_ops.neg_op,
39+
operator.abs: numeric_ops.abs_op,
40+
operator.eq: comparison_ops.eq_null_match_op,
41+
operator.ne: comparison_ops.ne_op,
42+
operator.gt: comparison_ops.gt_op,
43+
operator.lt: comparison_ops.lt_op,
44+
operator.ge: comparison_ops.ge_op,
45+
operator.le: comparison_ops.le_op,
46+
operator.and_: bool_ops.and_op,
47+
operator.or_: bool_ops.or_op,
48+
operator.xor: bool_ops.xor_op,
49+
## math
50+
math.log: numeric_ops.ln_op,
51+
math.log10: numeric_ops.log10_op,
52+
math.sin: numeric_ops.sin_op,
53+
math.cos: numeric_ops.cos_op,
54+
math.tan: numeric_ops.tan_op,
55+
math.sinh: numeric_ops.sinh_op,
56+
math.cosh: numeric_ops.cosh_op,
57+
math.tanh: numeric_ops.tanh_op,
58+
math.asin: numeric_ops.arcsin_op,
59+
math.acos: numeric_ops.arccos_op,
60+
math.atan: numeric_ops.arctan_op,
61+
## str
62+
str.upper: string_ops.upper_op,
63+
str.lower: string_ops.lower_op,
64+
## builtins
65+
len: string_ops.len_op,
66+
abs: numeric_ops.abs_op,
67+
pow: numeric_ops.pow_op,
68+
### builtins -- iterable
69+
all: array_ops.ArrayReduceOp(aggregations.all_op),
70+
any: array_ops.ArrayReduceOp(aggregations.any_op),
71+
sum: array_ops.ArrayReduceOp(aggregations.sum_op),
72+
min: array_ops.ArrayReduceOp(aggregations.min_op),
73+
max: array_ops.ArrayReduceOp(aggregations.max_op),
74+
}
75+
76+
77+
def python_callable_to_op(obj) -> Optional[bigframes.operations.RowOp]:
78+
if obj in PYTHON_TO_BIGFRAMES:
79+
return PYTHON_TO_BIGFRAMES[obj]
80+
return None

bigframes/series.py

Lines changed: 70 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
import bigframes.operations.datetimes as dt
7575
import bigframes.operations.lists as lists
7676
import bigframes.operations.plotting as plotting
77+
import bigframes.operations.python_op_maps as python_ops
7778
import bigframes.operations.strings as strings
7879
import bigframes.operations.structs as structs
7980
import bigframes.session
@@ -2030,88 +2031,97 @@ def apply(
20302031
if by_row not in ["compat", False]:
20312032
raise ValueError("Param by_row must be one of 'compat' or False")
20322033

2033-
if not callable(func):
2034+
if not callable(func) and not isinstance(func, numpy.ufunc):
20342035
raise ValueError(
20352036
"Only a ufunc (a function that applies to the entire Series) or"
20362037
" a BigFrames BigQuery function that only works on single values"
20372038
" are supported."
20382039
)
20392040

2040-
if not isinstance(func, bigframes.functions.BigqueryCallableRoutine):
2041-
# It is neither a remote function nor a managed function.
2042-
# Then it must be a vectorized function that applies to the Series
2043-
# as a whole.
2044-
if by_row:
2045-
raise ValueError(
2046-
"You have passed a function as-is. If your intention is to "
2047-
"apply this function in a vectorized way (i.e. to the "
2048-
"entire Series as a whole, and you are sure that it "
2049-
"performs only the operations that are implemented for a "
2050-
"Series (e.g. a chain of arithmetic/logical operations, "
2051-
"such as `def foo(s): return s % 2 == 1`), please also "
2052-
"specify `by_row=False`. If your function contains "
2053-
"arbitrary code, it can only be applied to every element "
2054-
"in the Series individually, in which case you must "
2055-
"convert it to a BigFrames BigQuery function using "
2056-
"`bigframes.pandas.udf`, "
2057-
"or `bigframes.pandas.remote_function` before passing."
2041+
if isinstance(func, bigframes.functions.BigqueryCallableRoutine):
2042+
# We are working with bigquery function at this point
2043+
if args:
2044+
result_series = self._apply_nary_op(
2045+
ops.NaryRemoteFunctionOp(function_def=func.udf_def), args
2046+
)
2047+
# TODO(jialuo): Investigate why `_apply_nary_op` drops the series
2048+
# `name`. Manually reassigning it here as a temporary fix.
2049+
result_series.name = self.name
2050+
else:
2051+
result_series = self._apply_unary_op(
2052+
ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True)
20582053
)
2054+
result_series = func._post_process_series(result_series)
20592055

2060-
try:
2061-
return func(self)
2062-
except Exception as ex:
2063-
# This could happen if any of the operators in func is not
2064-
# supported on a Series. Let's guide the customer to use a
2065-
# bigquery function instead
2066-
if hasattr(ex, "message"):
2067-
ex.message += f"\n{_bigquery_function_recommendation_message}"
2068-
raise
2069-
2070-
# We are working with bigquery function at this point
2071-
if args:
2072-
result_series = self._apply_nary_op(
2073-
ops.NaryRemoteFunctionOp(function_def=func.udf_def), args
2074-
)
2075-
# TODO(jialuo): Investigate why `_apply_nary_op` drops the series
2076-
# `name`. Manually reassigning it here as a temporary fix.
2077-
result_series.name = self.name
2078-
else:
2079-
result_series = self._apply_unary_op(
2080-
ops.RemoteFunctionOp(function_def=func.udf_def, apply_on_null=True)
2056+
return result_series
2057+
2058+
bf_op = python_ops.python_callable_to_op(func)
2059+
if bf_op and isinstance(bf_op, ops.UnaryOp):
2060+
return self._apply_unary_op(bf_op)
2061+
2062+
# It is neither a remote function nor a managed function.
2063+
# Then it must be a vectorized function that applies to the Series
2064+
# as a whole.
2065+
if by_row:
2066+
raise ValueError(
2067+
"You have passed a function as-is. If your intention is to "
2068+
"apply this function in a vectorized way (i.e. to the "
2069+
"entire Series as a whole, and you are sure that it "
2070+
"performs only the operations that are implemented for a "
2071+
"Series (e.g. a chain of arithmetic/logical operations, "
2072+
"such as `def foo(s): return s % 2 == 1`), please also "
2073+
"specify `by_row=False`. If your function contains "
2074+
"arbitrary code, it can only be applied to every element "
2075+
"in the Series individually, in which case you must "
2076+
"convert it to a BigFrames BigQuery function using "
2077+
"`bigframes.pandas.udf`, "
2078+
"or `bigframes.pandas.remote_function` before passing."
20812079
)
2082-
result_series = func._post_process_series(result_series)
20832080

2084-
return result_series
2081+
try:
2082+
return func(self) # type: ignore
2083+
except Exception as ex:
2084+
# This could happen if any of the operators in func is not
2085+
# supported on a Series. Let's guide the customer to use a
2086+
# bigquery function instead
2087+
if hasattr(ex, "message"):
2088+
ex.message += f"\n{_bigquery_function_recommendation_message}"
2089+
raise
20852090

20862091
def combine(
20872092
self,
20882093
other,
20892094
func,
20902095
) -> Series:
2091-
if not callable(func):
2096+
if not callable(func) and not isinstance(func, numpy.ufunc):
20922097
raise ValueError(
20932098
"Only a ufunc (a function that applies to the entire Series) or"
20942099
" a BigFrames BigQuery function that only works on single values"
20952100
" are supported."
20962101
)
20972102

2098-
if not isinstance(func, bigframes.functions.BigqueryCallableRoutine):
2099-
# Keep this in sync with .apply
2100-
try:
2101-
return func(self, other)
2102-
except Exception as ex:
2103-
# This could happen if any of the operators in func is not
2104-
# supported on a Series. Let's guide the customer to use a
2105-
# bigquery function instead
2106-
if hasattr(ex, "message"):
2107-
ex.message += f"\n{_bigquery_function_recommendation_message}"
2108-
raise
2109-
2110-
result_series = self._apply_binary_op(
2111-
other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def)
2112-
)
2113-
result_series = func._post_process_series(result_series)
2114-
return result_series
2103+
if isinstance(func, bigframes.functions.BigqueryCallableRoutine):
2104+
result_series = self._apply_binary_op(
2105+
other, ops.BinaryRemoteFunctionOp(function_def=func.udf_def)
2106+
)
2107+
result_series = func._post_process_series(result_series)
2108+
return result_series
2109+
2110+
bf_op = python_ops.python_callable_to_op(func)
2111+
if bf_op and isinstance(bf_op, ops.BinaryOp):
2112+
result_series = self._apply_binary_op(other, bf_op)
2113+
return result_series
2114+
2115+
# Keep this in sync with .apply
2116+
try:
2117+
return func(self, other)
2118+
except Exception as ex:
2119+
# This could happen if any of the operators in func is not
2120+
# supported on a Series. Let's guide the customer to use a
2121+
# bigquery function instead
2122+
if hasattr(ex, "message"):
2123+
ex.message += f"\n{_bigquery_function_recommendation_message}"
2124+
raise
21152125

21162126
@validations.requires_index
21172127
def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series:

tests/system/small/operations/test_lists.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,33 @@ def test_len(column_name, dtype, repeated_df, repeated_pandas_df):
106106
check_index_type=False,
107107
check_names=False,
108108
)
109+
110+
111+
@pytest.mark.parametrize(
112+
("column_name", "dtype"),
113+
[
114+
pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
115+
pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
116+
],
117+
)
118+
@pytest.mark.parametrize(
119+
("func",),
120+
[
121+
pytest.param(len),
122+
pytest.param(all),
123+
pytest.param(any),
124+
pytest.param(min),
125+
pytest.param(max),
126+
pytest.param(sum),
127+
],
128+
)
129+
def test_list_apply_callable(column_name, dtype, repeated_df, repeated_pandas_df, func):
130+
bf_result = repeated_df[column_name].apply(func).to_pandas()
131+
pd_result = repeated_pandas_df[column_name].astype(dtype).apply(func)
132+
pd_result.index = pd_result.index.astype("Int64")
133+
134+
assert_series_equal(
135+
pd_result,
136+
bf_result,
137+
check_dtype=False,
138+
)

0 commit comments

Comments
 (0)