Skip to content

Commit 6909924

Browse files
authored
Merge branch 'main' into sycai_type_parser
2 parents cc2da36 + 8fc051f commit 6909924

File tree

7 files changed

+72
-52
lines changed

7 files changed

+72
-52
lines changed

bigframes/functions/_function_session.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -983,7 +983,17 @@ def _convert_row_processor_sig(
983983
if len(signature.parameters) >= 1:
984984
first_param = next(iter(signature.parameters.values()))
985985
param_type = first_param.annotation
986-
if (param_type == bf_series.Series) or (param_type == pandas.Series):
986+
# Type hints for Series inputs should use pandas.Series because the
987+
# underlying serialization process converts the input to a string
988+
# representation of a pandas Series (not bigframes Series). Using
989+
# bigframes Series will lead to TypeError when creating the function
990+
# remotely. See more from b/445182819.
991+
if param_type == bf_series.Series:
992+
raise bf_formatting.create_exception_with_feedback_link(
993+
TypeError,
994+
"Argument type hint must be Pandas Series, not BigFrames Series.",
995+
)
996+
if param_type == pandas.Series:
987997
msg = bfe.format_message("input_types=Series is in preview.")
988998
warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning)
989999
return signature.replace(

bigframes/functions/function_template.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,16 @@ def generate_managed_function_code(
363363
return {udf_name}(*args)"""
364364
)
365365

366-
udf_code_block = textwrap.dedent(
367-
f"{udf_code}\n{func_code}\n{bigframes_handler_code}"
368-
)
369-
370-
return udf_code_block
366+
udf_code_block = []
367+
if not capture_references and is_row_processor:
368+
# Enable postponed evaluation of type annotations. This converts all
369+
# type hints to strings at runtime, which is necessary for correctly
370+
# handling the type annotation of pandas.Series after the UDF code is
371+
# serialized for remote execution. See more from b/445182819.
372+
udf_code_block.append("from __future__ import annotations")
373+
374+
udf_code_block.append(udf_code)
375+
udf_code_block.append(func_code)
376+
udf_code_block.append(bigframes_handler_code)
377+
378+
return textwrap.dedent("\n".join(udf_code_block))

bigframes/session/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2064,8 +2064,9 @@ def read_gbq_function(
20642064
note, row processor implies that the function has only one input
20652065
parameter.
20662066
2067+
>>> import pandas as pd
20672068
>>> @bpd.remote_function(cloud_function_service_account="default")
2068-
... def row_sum(s: bpd.Series) -> float:
2069+
... def row_sum(s: pd.Series) -> float:
20692070
... return s['a'] + s['b'] + s['c']
20702071
20712072
>>> row_sum_ref = bpd.read_gbq_function(

tests/system/large/functions/test_managed_function.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -701,8 +701,19 @@ def serialize_row(row):
701701
}
702702
)
703703

704+
with pytest.raises(
705+
TypeError,
706+
match="Argument type hint must be Pandas Series, not BigFrames Series.",
707+
):
708+
serialize_row_mf = session.udf(
709+
input_types=bigframes.series.Series,
710+
output_type=str,
711+
dataset=dataset_id,
712+
name=prefixer.create_prefix(),
713+
)(serialize_row)
714+
704715
serialize_row_mf = session.udf(
705-
input_types=bigframes.series.Series,
716+
input_types=pandas.Series,
706717
output_type=str,
707718
dataset=dataset_id,
708719
name=prefixer.create_prefix(),
@@ -762,7 +773,7 @@ def analyze(row):
762773
):
763774

764775
analyze_mf = session.udf(
765-
input_types=bigframes.series.Series,
776+
input_types=pandas.Series,
766777
output_type=str,
767778
dataset=dataset_id,
768779
name=prefixer.create_prefix(),
@@ -876,7 +887,7 @@ def serialize_row(row):
876887
)
877888

878889
serialize_row_mf = session.udf(
879-
input_types=bigframes.series.Series,
890+
input_types=pandas.Series,
880891
output_type=str,
881892
dataset=dataset_id,
882893
name=prefixer.create_prefix(),
@@ -926,7 +937,7 @@ def test_managed_function_df_apply_axis_1_na_nan_inf(dataset_id, session):
926937

927938
try:
928939

929-
def float_parser(row):
940+
def float_parser(row: pandas.Series):
930941
import numpy as mynp
931942
import pandas as mypd
932943

@@ -937,7 +948,7 @@ def float_parser(row):
937948
return float(row["text"])
938949

939950
float_parser_mf = session.udf(
940-
input_types=bigframes.series.Series,
951+
input_types=pandas.Series,
941952
output_type=float,
942953
dataset=dataset_id,
943954
name=prefixer.create_prefix(),
@@ -1027,7 +1038,7 @@ def test_managed_function_df_apply_axis_1_series_args(session, dataset_id, scala
10271038

10281039
try:
10291040

1030-
def analyze(s, x, y):
1041+
def analyze(s: pandas.Series, x: bool, y: float) -> str:
10311042
value = f"value is {s['int64_col']} and {s['float64_col']}"
10321043
if x:
10331044
return f"{value}, x is True!"
@@ -1036,8 +1047,6 @@ def analyze(s, x, y):
10361047
return f"{value}, x is False, y is non-positive!"
10371048

10381049
analyze_mf = session.udf(
1039-
input_types=[bigframes.series.Series, bool, float],
1040-
output_type=str,
10411050
dataset=dataset_id,
10421051
name=prefixer.create_prefix(),
10431052
)(analyze)
@@ -1151,7 +1160,7 @@ def is_sum_positive_series(s):
11511160
return s["int64_col"] + s["int64_too"] > 0
11521161

11531162
is_sum_positive_series_mf = session.udf(
1154-
input_types=bigframes.series.Series,
1163+
input_types=pandas.Series,
11551164
output_type=bool,
11561165
dataset=dataset_id,
11571166
name=prefixer.create_prefix(),
@@ -1217,12 +1226,10 @@ def func_for_other(x):
12171226
def test_managed_function_df_where_other_issue(session, dataset_id, scalars_df_index):
12181227
try:
12191228

1220-
def the_sum(s):
1229+
def the_sum(s: pandas.Series) -> int:
12211230
return s["int64_col"] + s["int64_too"]
12221231

12231232
the_sum_mf = session.udf(
1224-
input_types=bigframes.series.Series,
1225-
output_type=int,
12261233
dataset=dataset_id,
12271234
name=prefixer.create_prefix(),
12281235
)(the_sum)

tests/system/large/functions/test_remote_function.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,7 @@ def serialize_row(row):
17221722
)
17231723

17241724
serialize_row_remote = session.remote_function(
1725-
input_types=bigframes.series.Series,
1725+
input_types=pandas.Series,
17261726
output_type=str,
17271727
reuse=False,
17281728
cloud_function_service_account="default",
@@ -1771,7 +1771,7 @@ def analyze(row):
17711771
)
17721772

17731773
analyze_remote = session.remote_function(
1774-
input_types=bigframes.series.Series,
1774+
input_types=pandas.Series,
17751775
output_type=str,
17761776
reuse=False,
17771777
cloud_function_service_account="default",
@@ -1895,7 +1895,7 @@ def serialize_row(row):
18951895
)
18961896

18971897
serialize_row_remote = session.remote_function(
1898-
input_types=bigframes.series.Series,
1898+
input_types=pandas.Series,
18991899
output_type=str,
19001900
reuse=False,
19011901
cloud_function_service_account="default",
@@ -1944,7 +1944,7 @@ def test_df_apply_axis_1_na_nan_inf(session):
19441944

19451945
try:
19461946

1947-
def float_parser(row):
1947+
def float_parser(row: pandas.Series):
19481948
import numpy as mynp
19491949
import pandas as mypd
19501950

@@ -1955,7 +1955,6 @@ def float_parser(row):
19551955
return float(row["text"])
19561956

19571957
float_parser_remote = session.remote_function(
1958-
input_types=bigframes.series.Series,
19591958
output_type=float,
19601959
reuse=False,
19611960
cloud_function_service_account="default",
@@ -2055,12 +2054,12 @@ def test_df_apply_axis_1_series_args(session, scalars_dfs):
20552054
try:
20562055

20572056
@session.remote_function(
2058-
input_types=[bigframes.series.Series, float, str, bool],
2057+
input_types=[pandas.Series, float, str, bool],
20592058
output_type=list[str],
20602059
reuse=False,
20612060
cloud_function_service_account="default",
20622061
)
2063-
def foo_list(x, y0: float, y1, y2) -> list[str]:
2062+
def foo_list(x: pandas.Series, y0: float, y1, y2) -> list[str]:
20642063
return (
20652064
[str(x["int64_col"]), str(y0), str(y1), str(y2)]
20662065
if y2
@@ -3087,12 +3086,21 @@ def test_remote_function_df_where_mask_series(session, dataset_id, scalars_dfs):
30873086
try:
30883087

30893088
# The return type has to be bool type for callable where condition.
3090-
def is_sum_positive_series(s):
3089+
def is_sum_positive_series(s: pandas.Series) -> bool:
30913090
return s["int64_col"] + s["int64_too"] > 0
30923091

3092+
with pytest.raises(
3093+
TypeError,
3094+
match="Argument type hint must be Pandas Series, not BigFrames Series.",
3095+
):
3096+
session.remote_function(
3097+
input_types=bigframes.series.Series,
3098+
dataset=dataset_id,
3099+
reuse=False,
3100+
cloud_function_service_account="default",
3101+
)(is_sum_positive_series)
3102+
30933103
is_sum_positive_series_mf = session.remote_function(
3094-
input_types=bigframes.series.Series,
3095-
output_type=bool,
30963104
dataset=dataset_id,
30973105
reuse=False,
30983106
cloud_function_service_account="default",

tests/system/small/functions/test_remote_function.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import bigframes_vendored.constants as constants
2121
import google.api_core.exceptions
2222
from google.cloud import bigquery
23+
import pandas
2324
import pandas as pd
2425
import pyarrow
2526
import pytest
@@ -1166,16 +1167,14 @@ def test_df_apply_axis_1(session, scalars_dfs, dataset_id_permanent):
11661167
]
11671168
scalars_df, scalars_pandas_df = scalars_dfs
11681169

1169-
def add_ints(row):
1170+
def add_ints(row: pandas.Series) -> int:
11701171
return row["int64_col"] + row["int64_too"]
11711172

11721173
with pytest.warns(
11731174
bigframes.exceptions.PreviewWarning,
11741175
match="input_types=Series is in preview.",
11751176
):
11761177
add_ints_remote = session.remote_function(
1177-
input_types=bigframes.series.Series,
1178-
output_type=int,
11791178
dataset=dataset_id_permanent,
11801179
name=get_function_name(add_ints, is_row_processor=True),
11811180
cloud_function_service_account="default",
@@ -1223,11 +1222,11 @@ def test_df_apply_axis_1_ordering(session, scalars_dfs, dataset_id_permanent):
12231222
ordering_columns = ["bool_col", "int64_col"]
12241223
scalars_df, scalars_pandas_df = scalars_dfs
12251224

1226-
def add_ints(row):
1225+
def add_ints(row: pandas.Series) -> int:
12271226
return row["int64_col"] + row["int64_too"]
12281227

12291228
add_ints_remote = session.remote_function(
1230-
input_types=bigframes.series.Series,
1229+
input_types=pandas.Series,
12311230
output_type=int,
12321231
dataset=dataset_id_permanent,
12331232
name=get_function_name(add_ints, is_row_processor=True),
@@ -1267,7 +1266,7 @@ def add_numbers(row):
12671266
return row["x"] + row["y"]
12681267

12691268
add_numbers_remote = session.remote_function(
1270-
input_types=bigframes.series.Series,
1269+
input_types=pandas.Series,
12711270
output_type=float,
12721271
dataset=dataset_id_permanent,
12731272
name=get_function_name(add_numbers, is_row_processor=True),
@@ -1321,7 +1320,7 @@ def echo_len(row):
13211320
return len(row)
13221321

13231322
echo_len_remote = session.remote_function(
1324-
input_types=bigframes.series.Series,
1323+
input_types=pandas.Series,
13251324
output_type=float,
13261325
dataset=dataset_id_permanent,
13271326
name=get_function_name(echo_len, is_row_processor=True),

tests/unit/functions/test_remote_function.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,12 @@
1717
import pandas
1818
import pytest
1919

20+
import bigframes.exceptions
2021
import bigframes.functions.function as bff
21-
import bigframes.series
2222
from bigframes.testing import mocks
2323

2424

25-
@pytest.mark.parametrize(
26-
"series_type",
27-
(
28-
pytest.param(
29-
pandas.Series,
30-
id="pandas.Series",
31-
),
32-
pytest.param(
33-
bigframes.series.Series,
34-
id="bigframes.series.Series",
35-
),
36-
),
37-
)
38-
def test_series_input_types_to_str(series_type):
25+
def test_series_input_types_to_str():
3926
"""Check that is_row_processor=True uses str as the input type to serialize a row."""
4027
session = mocks.create_bigquery_session()
4128
remote_function_decorator = bff.remote_function(
@@ -48,7 +35,7 @@ def test_series_input_types_to_str(series_type):
4835
):
4936

5037
@remote_function_decorator
51-
def axis_1_function(myparam: series_type) -> str: # type: ignore
38+
def axis_1_function(myparam: pandas.Series) -> str: # type: ignore
5239
return "Hello, " + myparam["str_col"] + "!" # type: ignore
5340

5441
# Still works as a normal function.

0 commit comments

Comments
 (0)