From d250e14789b84842ff1acd1ce2ce79124fbf4f1b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 04:34:10 +0000 Subject: [PATCH 01/11] feat: Add ai_generate_bool to the bigframes.bigquery package --- bigframes/bigquery/__init__.py | 3 + bigframes/bigquery/_operations/ai.py | 177 ++++++++++++++++++ .../ibis_compiler/scalar_op_registry.py | 28 +++ bigframes/operations/__init__.py | 3 + bigframes/operations/ai_ops.py | 47 +++++ tests/system/large/bigquery/__init__.py | 13 ++ tests/system/large/bigquery/test_ai.py | 35 ++++ tests/system/small/bigquery/test_ai.py | 37 ++++ .../sql/compilers/bigquery/__init__.py | 15 ++ .../ibis/expr/operations/ai_ops.py | 30 +++ 10 files changed, 388 insertions(+) create mode 100644 bigframes/bigquery/_operations/ai.py create mode 100644 bigframes/operations/ai_ops.py create mode 100644 tests/system/large/bigquery/__init__.py create mode 100644 tests/system/large/bigquery/test_ai.py create mode 100644 tests/system/small/bigquery/test_ai.py create mode 100644 third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 32412648d6..c82306c764 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -18,6 +18,7 @@ import sys +from bigframes.bigquery._operations.ai import ai_generate_bool from bigframes.bigquery._operations.approx_agg import approx_top_count from bigframes.bigquery._operations.array import ( array_agg, @@ -57,6 +58,8 @@ from bigframes.core import log_adapter _functions = [ + # ai ops + ai_generate_bool, # approximate aggregate ops approx_top_count, # array ops diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py new file mode 100644 index 0000000000..6ba7423680 --- /dev/null +++ b/bigframes/bigquery/_operations/ai.py @@ -0,0 +1,177 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import functools +import json +from typing import Any, List, Literal, Mapping, Sequence, Tuple + +from bigframes import clients, dtypes, series +from bigframes.operations import ai_ops + + +def ai_generate_bool( + prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series], + *, + connection_id: str | None = None, + endpoint: str | None = None, + request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified", + model_params: Mapping[Any, Any] | None = None, +) -> series.Series: + """Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ + ... "col_1": ["apple", "bear", "pear"], + ... "col_2": ["fruit", "animal", "animal"] + ... }) + >>> bbq.ai_generate_bool((df["col_1"], " is a ", df["col_2"])) + 0 {'result': True, 'full_response': '{"candidate... + 1 {'result': True, 'full_response': '{"candidate... + 2 {'result': False, 'full_response': '{"candidat... + dtype: struct[pyarrow] + + >>> bbq.ai_generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result") + 0 True + 1 True + 2 False + Name: result, dtype: boolean + + >>> model_params = { + ... "generation_config": { + ... "thinking_config": { + ... "thinking_budget": 0 + ... } + ... } + ... } + >>> bbq.ai_generate_bool( + ... (df["col_1"], " is a ", df["col_2"]), + ... endpoint="gemini-2.5-pro", + ... model_params=model_params, + ... ).struct.field("result") + 0 True + 1 True + 2 False + Name: result, dtype: boolean + + Args: + prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series]): + A mixture of Series and string literals that specifies the prompt to send to the model. + connection_id (str, optional): + Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`. + If not provided, the connection from the current session will be used. + endpoint (str, optional): + Specifies the Vertex AI endpoint to use for the model. You can specify any generally available + or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and + uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects + a recent stable version of Gemini to use. + request_type (Literal["dedicated", "shared", "unspecified"]): + Specifies the type of inference request to send to the Gemini model. The request type determines what + quota the request uses. + * "dedicated": function only uses Provisioned Throughput quota. The AI.GENERATE function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. + * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. + * "unspecified": + * If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. + * If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. + model_params (Mapping[Any, Any]): + Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format. + + Returns: + bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: + * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI. + * "full_resposne": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. + * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. + """ + + if request_type not in ("dedicated", "shared", "unspecified"): + raise ValueError(f"Unsupported request type: {request_type}") + + prompt_context, series_list = _separate_context_and_series(prompt) + + if not series_list: + raise ValueError("Please provide at least one Series in the prompt") + + operator = ai_ops.AIGenerateBool( + prompt_context=tuple(prompt_context), + connection_id=_resolve_connection_id(series_list[0], connection_id), + endpoint=endpoint, + request_type=request_type, + model_params=json.dumps(model_params) if model_params else None, + ) + + return series_list[0]._apply_nary_op(operator, series_list[1:]) + + +@functools.singledispatch +def _separate_context_and_series( + prompt: Any, +) -> Tuple[List[str | None], List[series.Series]]: + """ + Returns the two values. The first value is the prompt with all series replaced by None. The second value is all the series + in the prompt. The original item order is kept. + For example: + Input: ("str1", series1, "str2", "str3", series2) + Output: ["str1", None, "str2", "str3", None], [series1, series2] + """ + raise ValueError(f"Unsupported prompt type: {type(prompt)}") + + +@_separate_context_and_series.register +def _( + prompt: series.Series, +) -> Tuple[List[str | None], List[series.Series]]: + if prompt.dtype == dtypes.OBJ_REF_DTYPE: + # Multi-model support + return [None], [prompt.blob.read_url()] + return [None], [prompt] + + +@_separate_context_and_series.register(list) +@_separate_context_and_series.register(tuple) +def _( + prompt: Sequence[str | series.Series], +) -> Tuple[List[str | None], List[series.Series]]: + + prompt_context: List[str | None] = [] + series_list: List[series.Series] = [] + + for item in prompt: + if isinstance(item, str): + prompt_context.append(item) + + elif isinstance(item, series.Series): + prompt_context.append(None) + + if item.dtype == dtypes.OBJ_REF_DTYPE: + # Multi-model support + item = item.blob.read_url() + series_list.append(item) + + else: + raise ValueError(f"Unsupported type in prompt: {type(item)}") + + return prompt_context, series_list + + +def _resolve_connection_id(series: series.Series, connection_id: str | None): + return clients.get_canonical_bq_connection_id( + connection_id or series._session._bq_connection, + series._session._project, + series._session._location, + ) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 044fc90306..8cfbb9936d 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -17,8 +17,10 @@ import functools import typing +from bigframes_vendored import ibis import bigframes_vendored.ibis.expr.api as ibis_api import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.operations.ai_ops as ai_ops import bigframes_vendored.ibis.expr.operations.generic as ibis_generic import bigframes_vendored.ibis.expr.operations.udf as ibis_udf import bigframes_vendored.ibis.expr.types as ibis_types @@ -1963,6 +1965,32 @@ def struct_op_impl( return ibis_types.struct(data) +@scalar_op_compiler.register_nary_op(ops.AIGenerateBool, pass_op=True) +def ai_generate_bool( + *values: ibis_types.Value, op: ops.AIGenerateBool +) -> ibis_dtypes.StructValue: + + prompt = {} + column_ref_idx = 0 + + for idx, elem in enumerate(op.prompt_context): + if elem is None: + value = values[column_ref_idx] + column_ref_idx += 1 + else: + value = elem + + prompt[f"_field_{idx + 1}"] = value + + return ai_ops.AIGenerateBool( + ibis.struct(prompt), + op.connection_id, + op.endpoint, + op.request_type.upper(), + op.model_params, + ).to_expr() + + @scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True) def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value: return bigframes.core.compile.default_ordering.gen_row_key(values) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e5888ace00..bb9ec4d294 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -14,6 +14,7 @@ from __future__ import annotations +from bigframes.operations.ai_ops import AIGenerateBool from bigframes.operations.array_ops import ( ArrayIndexOp, ArrayReduceOp, @@ -408,6 +409,8 @@ "geo_x_op", "geo_y_op", "GeoStDistanceOp", + # AI ops + "AIGenerateBool", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py new file mode 100644 index 0000000000..e9ba2ceeee --- /dev/null +++ b/bigframes/operations/ai_ops.py @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +from typing import ClassVar, Literal, Tuple + +import pandas as pd +import pyarrow as pa + +from bigframes import dtypes +from bigframes.operations import base_ops + + +@dataclasses.dataclass(frozen=True) +class AIGenerateBool(base_ops.NaryOp): + name: ClassVar[str] = "ai_generate_bool" + + # None are the placeholders for column references. + prompt_context: Tuple[str | None] + connection_id: str + endpoint: str | None + request_type: Literal["dedicated", "shared", "unspecified"] + model_params: str | None + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.bool_()), + pa.field("full_response", pa.string()), + pa.field("status", pa.string()), + ) + ) + ) diff --git a/tests/system/large/bigquery/__init__.py b/tests/system/large/bigquery/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/large/bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py new file mode 100644 index 0000000000..447105148e --- /dev/null +++ b/tests/system/large/bigquery/test_ai.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pandas.testing + +import bigframes.bigquery as bbq + + +def test_ai_generate_bool_multi_model(session): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + + result = bbq.ai_generate_bool((df["image"], " contains an animal")).struct.field( + "result" + ) + + pandas.testing.assert_series_equal( + result.to_pandas(), + pd.Series([True, True, False, False, False], name="result"), + check_dtype=False, + check_index=False, + ) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py new file mode 100644 index 0000000000..42cbaee7ed --- /dev/null +++ b/tests/system/small/bigquery/test_ai.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pandas.testing + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_ai_generate_bool(session): + s1 = bpd.Series(["apple", "bear"], session=session) + s2 = bpd.Series(["fruit", "tree"], session=session) + prompt = (s1, " is a ", s2) + model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} + + result = bbq.ai_generate_bool( + prompt, endpoint="gemini-2.5-flash", model_params=model_params + ).struct.field("result") + + pandas.testing.assert_series_equal( + result.to_pandas(), + pd.Series([True, False], name="result"), + check_dtype=False, + check_index=False, + ) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 9af2a4afe4..6ea11d5215 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1104,6 +1104,21 @@ def visit_StringAgg(self, op, *, arg, sep, order_by, where): expr = arg return self.agg.string_agg(expr, sep, where=where) + def visit_AIGenerateBool(self, op, **kwargs): + func_name = "AI.GENERATE_BOOL" + + args = [] + for key, val in kwargs.items(): + if val is None: + continue + + if key == "model_params": + val = sge.JSON(this=val) + + args.append(sge.Kwarg(this=sge.Identifier(this=key), expression=val)) + + return sge.func(func_name, *args) + def visit_FirstNonNullValue(self, op, *, arg): return sge.IgnoreNulls(this=sge.FirstValue(this=arg)) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py new file mode 100644 index 0000000000..5a81738c6b --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py @@ -0,0 +1,30 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/expr/operations/maps.py + +"""Operations for working with maps.""" + +from __future__ import annotations + +from bigframes_vendored.ibis.common.annotations import attribute +import bigframes_vendored.ibis.expr.datatypes as dt +from bigframes_vendored.ibis.expr.operations.core import Value +import bigframes_vendored.ibis.expr.rules as rlz +from public import public + + +@public +class AIGenerateBool(Value): + """Generate Bool based on the prompt""" + + prompt: Value + connection_id: Value[dt.String] + endpoint: Value[dt.String] | None + request_type: Value[dt.String] + model_params: Value[dt.String] | None + + shape = rlz.shape_like("prompt") + + @attribute + def dtype(self) -> dt.Struct: + return dt.Struct.from_tuples( + (("result", dt.bool), ("full_resposne", dt.string), ("status", dt.string)) + ) From 442654d0369fe0d0198117abfaf5379a6fab49c2 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 04:59:17 +0000 Subject: [PATCH 02/11] fix stuffs --- bigframes/bigquery/_operations/ai.py | 10 +++++----- .../ibis_compiler/scalar_op_registry.py | 20 +++++++++---------- bigframes/operations/ai_ops.py | 2 +- .../ibis/expr/operations/ai_ops.py | 6 ++++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 6ba7423680..aa701a4d66 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -23,7 +23,7 @@ def ai_generate_bool( - prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series], + prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...], *, connection_id: str | None = None, endpoint: str | None = None, @@ -71,7 +71,7 @@ def ai_generate_bool( Name: result, dtype: boolean Args: - prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series]): + prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. connection_id (str, optional): Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`. @@ -87,15 +87,15 @@ def ai_generate_bool( * "dedicated": function only uses Provisioned Throughput quota. The AI.GENERATE function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. * "unspecified": - * If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. - * If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. + If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. + If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. model_params (Mapping[Any, Any]): Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format. Returns: bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI. - * "full_resposne": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. + * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. """ diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 8cfbb9936d..2f7d943829 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1968,26 +1968,24 @@ def struct_op_impl( @scalar_op_compiler.register_nary_op(ops.AIGenerateBool, pass_op=True) def ai_generate_bool( *values: ibis_types.Value, op: ops.AIGenerateBool -) -> ibis_dtypes.StructValue: +) -> ibis_types.StructValue: - prompt = {} + prompt: dict[str, ibis_types.Value | str] = {} column_ref_idx = 0 for idx, elem in enumerate(op.prompt_context): if elem is None: - value = values[column_ref_idx] + prompt[f"_field_{idx + 1}"] = values[column_ref_idx] column_ref_idx += 1 else: - value = elem - - prompt[f"_field_{idx + 1}"] = value + prompt[f"_field_{idx + 1}"] = elem return ai_ops.AIGenerateBool( - ibis.struct(prompt), - op.connection_id, - op.endpoint, - op.request_type.upper(), - op.model_params, + ibis.struct(prompt), # type: ignore + op.connection_id,# type: ignore + op.endpoint,# type: ignore + op.request_type.upper(),# type: ignore + op.model_params,# type: ignore ).to_expr() diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py index e9ba2ceeee..fe5eb1406f 100644 --- a/bigframes/operations/ai_ops.py +++ b/bigframes/operations/ai_ops.py @@ -29,7 +29,7 @@ class AIGenerateBool(base_ops.NaryOp): name: ClassVar[str] = "ai_generate_bool" # None are the placeholders for column references. - prompt_context: Tuple[str | None] + prompt_context: Tuple[str | None, ...] connection_id: str endpoint: str | None request_type: Literal["dedicated", "shared", "unspecified"] diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py index 5a81738c6b..1f8306bad6 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py @@ -4,6 +4,8 @@ from __future__ import annotations +from typing import Optional + from bigframes_vendored.ibis.common.annotations import attribute import bigframes_vendored.ibis.expr.datatypes as dt from bigframes_vendored.ibis.expr.operations.core import Value @@ -17,9 +19,9 @@ class AIGenerateBool(Value): prompt: Value connection_id: Value[dt.String] - endpoint: Value[dt.String] | None + endpoint: Optional[Value[dt.String]] request_type: Value[dt.String] - model_params: Value[dt.String] | None + model_params: Optional[Value[dt.String]] shape = rlz.shape_like("prompt") From cf255ec7f1ad243f155a9182acf046b6e4debd88 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 05:05:16 +0000 Subject: [PATCH 03/11] Fix format --- bigframes/bigquery/_operations/ai.py | 13 +++++++------ .../compile/ibis_compiler/scalar_op_registry.py | 10 +++++----- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index aa701a4d66..d0625888f2 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -30,7 +30,8 @@ def ai_generate_bool( request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified", model_params: Mapping[Any, Any] | None = None, ) -> series.Series: - """Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data. + """ + Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data. **Examples:** @@ -77,14 +78,14 @@ def ai_generate_bool( Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`. If not provided, the connection from the current session will be used. endpoint (str, optional): - Specifies the Vertex AI endpoint to use for the model. You can specify any generally available - or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and - uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects - a recent stable version of Gemini to use. + Specifies the Vertex AI endpoint to use for the model. For example `"gemini-2.5-flash"`. You can specify any + generally available or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and + uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects a recent stable + version of Gemini to use. request_type (Literal["dedicated", "shared", "unspecified"]): Specifies the type of inference request to send to the Gemini model. The request type determines what quota the request uses. - * "dedicated": function only uses Provisioned Throughput quota. The AI.GENERATE function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. + * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 2f7d943829..3cc9cc32d5 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1981,11 +1981,11 @@ def ai_generate_bool( prompt[f"_field_{idx + 1}"] = elem return ai_ops.AIGenerateBool( - ibis.struct(prompt), # type: ignore - op.connection_id,# type: ignore - op.endpoint,# type: ignore - op.request_type.upper(),# type: ignore - op.model_params,# type: ignore + ibis.struct(prompt), # type: ignore + op.connection_id, # type: ignore + op.endpoint, # type: ignore + op.request_type.upper(), # type: ignore + op.model_params, # type: ignore ).to_expr() From 85d5f934757cad9f5a372b0b6c96a96f712feb03 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 05:10:51 +0000 Subject: [PATCH 04/11] fix doc format --- bigframes/bigquery/_operations/ai.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index d0625888f2..db581a89cf 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -83,20 +83,21 @@ def ai_generate_bool( uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects a recent stable version of Gemini to use. request_type (Literal["dedicated", "shared", "unspecified"]): - Specifies the type of inference request to send to the Gemini model. The request type determines what - quota the request uses. - * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. + Specifies the type of inference request to send to the Gemini model. The request type determines what quota the request uses. + * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not + purchased or is not active if Provisioned Throughput quota isn't available. * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. - * "unspecified": - If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. - If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. + * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. + If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. + If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. model_params (Mapping[Any, Any]): Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format. Returns: bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI. - * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. + * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. + The generated text is in the text element. * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. """ From 1f00e3c461c05222fccff4c22547959c3f7cce67 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 05:13:42 +0000 Subject: [PATCH 05/11] fix format --- bigframes/bigquery/_operations/ai.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index db581a89cf..43c640d17d 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -84,11 +84,11 @@ def ai_generate_bool( version of Gemini to use. request_type (Literal["dedicated", "shared", "unspecified"]): Specifies the type of inference request to send to the Gemini model. The request type determines what quota the request uses. - * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not + * "dedicated": function only uses Provisioned Throughput quota. The function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available. * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota. - * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. - If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. + * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota. + If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota. model_params (Mapping[Any, Any]): Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format. @@ -96,7 +96,7 @@ def ai_generate_bool( Returns: bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI. - * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. + * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. """ From b2446abf3961ea17465a31929d9ec399ffc0b870 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 9 Sep 2025 05:52:34 +0000 Subject: [PATCH 06/11] fix code --- bigframes/bigquery/_operations/ai.py | 42 ++++++++++------------------ 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 43c640d17d..4b50d7051b 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -14,9 +14,8 @@ from __future__ import annotations -import functools import json -from typing import Any, List, Literal, Mapping, Sequence, Tuple +from typing import Any, List, Literal, Mapping, Tuple from bigframes import clients, dtypes, series from bigframes.operations import ai_ops @@ -101,13 +100,8 @@ def ai_generate_bool( * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. """ - if request_type not in ("dedicated", "shared", "unspecified"): - raise ValueError(f"Unsupported request type: {request_type}") - prompt_context, series_list = _separate_context_and_series(prompt) - - if not series_list: - raise ValueError("Please provide at least one Series in the prompt") + assert len(series_list) > 0 operator = ai_ops.AIGenerateBool( prompt_context=tuple(prompt_context), @@ -120,9 +114,8 @@ def ai_generate_bool( return series_list[0]._apply_nary_op(operator, series_list[1:]) -@functools.singledispatch def _separate_context_and_series( - prompt: Any, + prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...], ) -> Tuple[List[str | None], List[series.Series]]: """ Returns the two values. The first value is the prompt with all series replaced by None. The second value is all the series @@ -131,24 +124,14 @@ def _separate_context_and_series( Input: ("str1", series1, "str2", "str3", series2) Output: ["str1", None, "str2", "str3", None], [series1, series2] """ - raise ValueError(f"Unsupported prompt type: {type(prompt)}") - + if not isinstance(prompt, (list, tuple, series.Series)): + raise ValueError(f"Unsupported prompt type: {type(prompt)}") -@_separate_context_and_series.register -def _( - prompt: series.Series, -) -> Tuple[List[str | None], List[series.Series]]: - if prompt.dtype == dtypes.OBJ_REF_DTYPE: - # Multi-model support - return [None], [prompt.blob.read_url()] - return [None], [prompt] - - -@_separate_context_and_series.register(list) -@_separate_context_and_series.register(tuple) -def _( - prompt: Sequence[str | series.Series], -) -> Tuple[List[str | None], List[series.Series]]: + if isinstance(prompt, series.Series): + if prompt.dtype == dtypes.OBJ_REF_DTYPE: + # Multi-model support + return [None], [prompt.blob.read_url()] + return [None], [prompt] prompt_context: List[str | None] = [] series_list: List[series.Series] = [] @@ -166,7 +149,10 @@ def _( series_list.append(item) else: - raise ValueError(f"Unsupported type in prompt: {type(item)}") + raise TypeError(f"Unsupported type in prompt: {type(item)}") + + if not series_list: + raise ValueError("Please provide at least one Series in the prompt") return prompt_context, series_list From 79b96511be7992597a718bd2ce2c999874700066 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 10 Sep 2025 18:15:11 +0000 Subject: [PATCH 07/11] expose ai module and rename the function --- bigframes/bigquery/__init__.py | 6 ++---- bigframes/bigquery/_operations/ai.py | 4 +++- tests/system/large/bigquery/test_ai.py | 2 +- tests/system/small/bigquery/test_ai.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index c82306c764..c9b81d191b 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -18,7 +18,7 @@ import sys -from bigframes.bigquery._operations.ai import ai_generate_bool +from bigframes.bigquery._operations import ai from bigframes.bigquery._operations.approx_agg import approx_top_count from bigframes.bigquery._operations.array import ( array_agg, @@ -58,8 +58,6 @@ from bigframes.core import log_adapter _functions = [ - # ai ops - ai_generate_bool, # approximate aggregate ops approx_top_count, # array ops @@ -99,7 +97,7 @@ struct, ] -__all__ = [f.__name__ for f in _functions] +__all__ = [f.__name__ for f in _functions] + ["ai"] _module = sys.modules[__name__] for f in _functions: diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 4b50d7051b..00d12dc7e2 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -18,10 +18,12 @@ from typing import Any, List, Literal, Mapping, Tuple from bigframes import clients, dtypes, series +from bigframes.core import log_adapter from bigframes.operations import ai_ops -def ai_generate_bool( +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_bool( prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...], *, connection_id: str | None = None, diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py index 447105148e..be0216a526 100644 --- a/tests/system/large/bigquery/test_ai.py +++ b/tests/system/large/bigquery/test_ai.py @@ -23,7 +23,7 @@ def test_ai_generate_bool_multi_model(session): "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" ) - result = bbq.ai_generate_bool((df["image"], " contains an animal")).struct.field( + result = bbq.ai.generate_bool((df["image"], " contains an animal")).struct.field( "result" ) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index 42cbaee7ed..06c31fad03 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -25,7 +25,7 @@ def test_ai_generate_bool(session): prompt = (s1, " is a ", s2) model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} - result = bbq.ai_generate_bool( + result = bbq.ai.generate_bool( prompt, endpoint="gemini-2.5-flash", model_params=model_params ).struct.field("result") From 99555441ebed79f0699512b24c1162ddba4bd72a Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 10 Sep 2025 20:13:05 +0000 Subject: [PATCH 08/11] add ai module to doc --- bigframes/bigquery/_operations/ai.py | 4 ++++ docs/reference/bigframes.bigquery/ai.rst | 7 +++++++ docs/reference/bigframes.bigquery/index.rst | 6 +++++- docs/templates/toc.yml | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 docs/reference/bigframes.bigquery/ai.rst diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 00d12dc7e2..e53960070b 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects, +such as AI.GENERTAL_BOOL: +https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool""" + from __future__ import annotations import json diff --git a/docs/reference/bigframes.bigquery/ai.rst b/docs/reference/bigframes.bigquery/ai.rst new file mode 100644 index 0000000000..2134125d6f --- /dev/null +++ b/docs/reference/bigframes.bigquery/ai.rst @@ -0,0 +1,7 @@ +bigframes.bigquery.ai +============================= + +.. automodule:: bigframes.bigquery._operations.ai + :members: + :inherited-members: + :undoc-members: \ No newline at end of file diff --git a/docs/reference/bigframes.bigquery/index.rst b/docs/reference/bigframes.bigquery/index.rst index 03e9bb48a4..f9d34f379d 100644 --- a/docs/reference/bigframes.bigquery/index.rst +++ b/docs/reference/bigframes.bigquery/index.rst @@ -5,5 +5,9 @@ BigQuery Built-in Functions .. automodule:: bigframes.bigquery :members: - :inherited-members: :undoc-members: + +.. toctree:: + :maxdepth: 2 + + ai diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index a27f162a9a..ad96977152 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -218,6 +218,8 @@ - items: - name: BigQuery built-in functions uid: bigframes.bigquery + - name: BigQuery AI Functions + uid: bigframes.bigquery.ai name: bigframes.bigquery - items: - name: GeoSeries From ce5cc3877aa625304556f32c40f3076e33ebb2dc Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 10 Sep 2025 20:48:06 +0000 Subject: [PATCH 09/11] fix test --- tests/system/small/bigquery/test_ai.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index 06c31fad03..616ba970db 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import pandas as pd import pandas.testing +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd @@ -35,3 +38,26 @@ def test_ai_generate_bool(session): check_dtype=False, check_index=False, ) + + +def test_ai_generate_bool_with_model_params(session): + if sys.version_info < (3, 12): + pytest.skip( + "Skip test because SQLGLot cannot compile model params to JSON at this env." + ) + + s1 = bpd.Series(["apple", "bear"], session=session) + s2 = bpd.Series(["fruit", "tree"], session=session) + prompt = (s1, " is a ", s2) + model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} + + result = bbq.ai.generate_bool( + prompt, endpoint="gemini-2.5-flash", model_params=model_params + ).struct.field("result") + + pandas.testing.assert_series_equal( + result.to_pandas(), + pd.Series([True, False], name="result"), + check_dtype=False, + check_index=False, + ) From 61e10ce69734fafe72851ba0ef8282dbb3a18884 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 11 Sep 2025 03:10:49 +0000 Subject: [PATCH 10/11] fix test --- tests/system/small/bigquery/test_ai.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index 616ba970db..01050ade04 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -26,11 +26,10 @@ def test_ai_generate_bool(session): s1 = bpd.Series(["apple", "bear"], session=session) s2 = bpd.Series(["fruit", "tree"], session=session) prompt = (s1, " is a ", s2) - model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}} - result = bbq.ai.generate_bool( - prompt, endpoint="gemini-2.5-flash", model_params=model_params - ).struct.field("result") + result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash").struct.field( + "result" + ) pandas.testing.assert_series_equal( result.to_pandas(), From 48563158a81ba745c1d57337baf89c7dce9bb4f3 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 15 Sep 2025 11:35:29 -0700 Subject: [PATCH 11/11] Update bigframes/bigquery/_operations/ai.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/bigquery/_operations/ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index e53960070b..d7ea29322d 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -13,7 +13,7 @@ # limitations under the License. """This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects, -such as AI.GENERTAL_BOOL: +such as AI.GENERATE_BOOL: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool""" from __future__ import annotations