diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 7b74c1eb88..072bd21da1 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -18,6 +18,7 @@
 
 import sys
 
+from bigframes.bigquery._operations import ai
 from bigframes.bigquery._operations.approx_agg import approx_top_count
 from bigframes.bigquery._operations.array import (
     array_agg,
@@ -98,7 +99,7 @@
     struct,
 ]
 
-__all__ = [f.__name__ for f in _functions]
+__all__ = [f.__name__ for f in _functions] + ["ai"]
 
 _module = sys.modules[__name__]
 for f in _functions:
diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
new file mode 100644
index 0000000000..d7ea29322d
--- /dev/null
+++ b/bigframes/bigquery/_operations/ai.py
@@ -0,0 +1,171 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects,
+such as AI.GENERATE_BOOL:
+https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, List, Literal, Mapping, Tuple
+
+from bigframes import clients, dtypes, series
+from bigframes.core import log_adapter
+from bigframes.operations import ai_ops
+
+
+@log_adapter.method_logger(custom_base_name="bigquery_ai")
+def generate_bool(
+    prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
+    *,
+    connection_id: str | None = None,
+    endpoint: str | None = None,
+    request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified",
+    model_params: Mapping[Any, Any] | None = None,
+) -> series.Series:
+    """
+    Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+        >>> df = bpd.DataFrame({
+        ...     "col_1": ["apple", "bear", "pear"],
+        ...     "col_2": ["fruit", "animal", "animal"]
+        ... })
+        >>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"]))
+        0    {'result': True, 'full_response': '{"candidate...
+        1    {'result': True, 'full_response': '{"candidate...
+        2    {'result': False, 'full_response': '{"candidat...
+        dtype: struct[pyarrow]
+
+        >>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result")
+        0     True
+        1     True
+        2    False
+        Name: result, dtype: boolean
+
+        >>> model_params = {
+        ...     "generation_config": {
+        ...         "thinking_config": {
+        ...             "thinking_budget": 0
+        ...         }
+        ...     }
+        ... }
+        >>> bbq.ai.generate_bool(
+        ...     (df["col_1"], " is a ", df["col_2"]),
+        ...     endpoint="gemini-2.5-pro",
+        ...     model_params=model_params,
+        ... ).struct.field("result")
+        0     True
+        1     True
+        2    False
+        Name: result, dtype: boolean
+
+    Args:
+        prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series, ...]):
+            A mixture of Series and string literals that specifies the prompt to send to the model.
+        connection_id (str, optional):
+            Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`.
+            If not provided, the connection from the current session will be used.
+        endpoint (str, optional):
+            Specifies the Vertex AI endpoint to use for the model. For example, `"gemini-2.5-flash"`. You can specify any
+            generally available or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies
+            and uses the full endpoint of the model. If you don't specify an endpoint, BigQuery ML selects a recent stable
+            version of Gemini to use.
+        request_type (Literal["dedicated", "shared", "unspecified"]):
+            Specifies the type of inference request to send to the Gemini model. The request type determines what quota the
+            request uses.
+            * "dedicated": the function only uses Provisioned Throughput quota. The function returns the error
+              `Provisioned throughput is not purchased or is not active` if Provisioned Throughput quota isn't available.
+            * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned
+              Throughput quota.
+            * "unspecified": if you haven't purchased Provisioned Throughput quota, the function uses DSQ quota.
+              If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first.
+              If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota.
+        model_params (Mapping[Any, Any]):
+            Provides additional parameters to the model. The `model_params` value must conform to the `generateContent`
+            request body format.
+
+    Returns:
+        bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
+            * "result": a BOOL value containing the model's response to the prompt. The result is None if the request
+              fails or is filtered by responsible AI.
+            * "full_response": a STRING value containing the JSON response from the `projects.locations.endpoints.generateContent`
+              call to the model. The generated text is in the `text` element.
+            * "status": a STRING value that contains the API response status for the corresponding row. This value is
+              empty if the operation was successful.
+    """
+
+    prompt_context, series_list = _separate_context_and_series(prompt)
+    assert len(series_list) > 0
+
+    operator = ai_ops.AIGenerateBool(
+        prompt_context=tuple(prompt_context),
+        connection_id=_resolve_connection_id(series_list[0], connection_id),
+        endpoint=endpoint,
+        request_type=request_type,
+        model_params=json.dumps(model_params) if model_params else None,
+    )
+
+    return series_list[0]._apply_nary_op(operator, series_list[1:])
+
+
+def _separate_context_and_series(
+    prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
+) -> Tuple[List[str | None], List[series.Series]]:
+    """
+    Returns two values. The first value is the prompt with every Series replaced by None. The second value is all the
+    Series in the prompt. The original item order is kept.
+    For example:
+        Input:  ("str1", series1, "str2", "str3", series2)
+        Output: ["str1", None, "str2", "str3", None], [series1, series2]
+    """
+    if not isinstance(prompt, (list, tuple, series.Series)):
+        raise ValueError(f"Unsupported prompt type: {type(prompt)}")
+
+    if isinstance(prompt, series.Series):
+        if prompt.dtype == dtypes.OBJ_REF_DTYPE:
+            # Multimodal support
+            return [None], [prompt.blob.read_url()]
+        return [None], [prompt]
+
+    prompt_context: List[str | None] = []
+    series_list: List[series.Series] = []
+
+    for item in prompt:
+        if isinstance(item, str):
+            prompt_context.append(item)
+
+        elif isinstance(item, series.Series):
+            prompt_context.append(None)
+
+            if item.dtype == dtypes.OBJ_REF_DTYPE:
+                # Multimodal support
+                item = item.blob.read_url()
+            series_list.append(item)
+
+        else:
+            raise TypeError(f"Unsupported type in prompt: {type(item)}")
+
+    if not series_list:
+        raise ValueError("Please provide at least one Series in the prompt")
+
+    return prompt_context, series_list
+
+
+def _resolve_connection_id(series: series.Series, connection_id: str | None):
+    return clients.get_canonical_bq_connection_id(
+        connection_id or series._session._bq_connection,
+        series._session._project,
+        series._session._location,
+    )
diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
index af98252643..95dd2bc6b6 100644
--- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
+++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -17,8 +17,10 @@
 import functools
 import typing
 
+from bigframes_vendored import ibis
 import bigframes_vendored.ibis.expr.api as ibis_api
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
+import bigframes_vendored.ibis.expr.operations.ai_ops as ai_ops
 import bigframes_vendored.ibis.expr.operations.generic as ibis_generic
 import bigframes_vendored.ibis.expr.operations.udf as ibis_udf
 import bigframes_vendored.ibis.expr.types as ibis_types
@@ -1963,6 +1965,30 @@
     return ibis_types.struct(data)
 
 
+@scalar_op_compiler.register_nary_op(ops.AIGenerateBool, pass_op=True)
+def ai_generate_bool(
+    *values: ibis_types.Value, op: ops.AIGenerateBool
+) -> ibis_types.StructValue:
+
+    prompt: dict[str, ibis_types.Value | str] = {}
+    column_ref_idx = 0
+
+    for idx, elem in enumerate(op.prompt_context):
+        if elem is None:
+            prompt[f"_field_{idx + 1}"] = values[column_ref_idx]
+            column_ref_idx += 1
+        else:
+            prompt[f"_field_{idx + 1}"] = elem
+
+    return ai_ops.AIGenerateBool(
+        ibis.struct(prompt),  # type: ignore
+        op.connection_id,  # type: ignore
+        op.endpoint,  # type: ignore
+        op.request_type.upper(),  # type: ignore
+        op.model_params,  # type: ignore
+    ).to_expr()
+
+
 @scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True)
 def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value:
     return bigframes.core.compile.default_ordering.gen_row_key(values)
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index e5888ace00..bb9ec4d294 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+from bigframes.operations.ai_ops import AIGenerateBool
 from bigframes.operations.array_ops import (
     ArrayIndexOp,
     ArrayReduceOp,
@@ -408,6 +409,8 @@
     "geo_x_op",
     "geo_y_op",
     "GeoStDistanceOp",
+    # AI ops
+    "AIGenerateBool",
     # Numpy ops mapping
     "NUMPY_TO_BINOP",
     "NUMPY_TO_OP",
diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py
new file mode 100644
index 0000000000..fe5eb1406f
--- /dev/null
+++ b/bigframes/operations/ai_ops.py
@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+from typing import ClassVar, Literal, Tuple
+
+import pandas as pd
+import pyarrow as pa
+
+from bigframes import dtypes
+from bigframes.operations import base_ops
+
+
+@dataclasses.dataclass(frozen=True)
+class AIGenerateBool(base_ops.NaryOp):
+    name: ClassVar[str] = "ai_generate_bool"
+
+    # None are the placeholders for column references.
+    prompt_context: Tuple[str | None, ...]
+    connection_id: str
+    endpoint: str | None
+    request_type: Literal["dedicated", "shared", "unspecified"]
+    model_params: str | None
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        return pd.ArrowDtype(
+            pa.struct(
+                (
+                    pa.field("result", pa.bool_()),
+                    pa.field("full_response", pa.string()),
+                    pa.field("status", pa.string()),
+                )
+            )
+        )
diff --git a/docs/reference/bigframes.bigquery/ai.rst b/docs/reference/bigframes.bigquery/ai.rst
new file mode 100644
index 0000000000..2134125d6f
--- /dev/null
+++ b/docs/reference/bigframes.bigquery/ai.rst
@@ -0,0 +1,7 @@
+bigframes.bigquery.ai
+=============================
+
+.. automodule:: bigframes.bigquery._operations.ai
+    :members:
+    :inherited-members:
+    :undoc-members:
\ No newline at end of file
diff --git a/docs/reference/bigframes.bigquery/index.rst b/docs/reference/bigframes.bigquery/index.rst
index 03e9bb48a4..f9d34f379d 100644
--- a/docs/reference/bigframes.bigquery/index.rst
+++ b/docs/reference/bigframes.bigquery/index.rst
@@ -5,5 +5,9 @@ BigQuery Built-in Functions
 
 .. automodule:: bigframes.bigquery
     :members:
-    :inherited-members:
     :undoc-members:
+
+.. toctree::
+    :maxdepth: 2
+
+    ai
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index a27f162a9a..ad96977152 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -218,6 +218,8 @@
   - items:
     - name: BigQuery built-in functions
       uid: bigframes.bigquery
+    - name: BigQuery AI Functions
+      uid: bigframes.bigquery.ai
     name: bigframes.bigquery
   - items:
     - name: GeoSeries
diff --git a/tests/system/large/bigquery/__init__.py b/tests/system/large/bigquery/__init__.py
new file mode 100644
index 0000000000..0a2669d7a2
--- /dev/null
+++ b/tests/system/large/bigquery/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
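
Editorial note (not part of the patch): the placeholder contract shared by `_separate_context_and_series` and the compiler rule `ai_generate_bool` above can be illustrated with a small, self-contained sketch. The `Col` class and both helpers below are hypothetical stand-ins that only mirror the logic; they are not part of the bigframes API.

    from typing import Any, List, Optional, Tuple

    class Col:
        # Hypothetical stand-in for a Series/column reference.
        def __init__(self, name: str) -> None:
            self.name = name

        def __repr__(self) -> str:
            return f"Col({self.name!r})"

    def split_prompt(prompt: Tuple[Any, ...]) -> Tuple[List[Optional[str]], List[Col]]:
        # Strings stay in the context list; None marks where a column reference
        # belongs, preserving the original prompt order.
        context: List[Optional[str]] = []
        columns: List[Col] = []
        for item in prompt:
            if isinstance(item, str):
                context.append(item)
            else:
                context.append(None)
                columns.append(item)
        return context, columns

    def rebuild_struct(context: List[Optional[str]], columns: List[Col]) -> dict:
        # Mirrors the compiler rule: struct fields are named _field_1, _field_2, ...
        # and None placeholders are replaced by the columns, in order.
        struct: dict = {}
        col_idx = 0
        for idx, elem in enumerate(context):
            if elem is None:
                struct[f"_field_{idx + 1}"] = columns[col_idx]
                col_idx += 1
            else:
                struct[f"_field_{idx + 1}"] = elem
        return struct

    context, columns = split_prompt((Col("col_1"), " is a ", Col("col_2")))
    print(context)                          # [None, ' is a ', None]
    print(rebuild_struct(context, columns))
    # {'_field_1': Col('col_1'), '_field_2': ' is a ', '_field_3': Col('col_2')}
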
diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py
new file mode 100644
index 0000000000..be0216a526
--- /dev/null
+++ b/tests/system/large/bigquery/test_ai.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pandas.testing
+
+import bigframes.bigquery as bbq
+
+
+def test_ai_generate_bool_multi_model(session):
+    df = session.from_glob_path(
+        "gs://bigframes-dev-testing/a_multimodel/images/*", name="image"
+    )
+
+    result = bbq.ai.generate_bool((df["image"], " contains an animal")).struct.field(
+        "result"
+    )
+
+    pandas.testing.assert_series_equal(
+        result.to_pandas(),
+        pd.Series([True, True, False, False, False], name="result"),
+        check_dtype=False,
+        check_index=False,
+    )
diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py
new file mode 100644
index 0000000000..01050ade04
--- /dev/null
+++ b/tests/system/small/bigquery/test_ai.py
@@ -0,0 +1,62 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import pandas as pd
+import pandas.testing
+import pytest
+
+import bigframes.bigquery as bbq
+import bigframes.pandas as bpd
+
+
+def test_ai_generate_bool(session):
+    s1 = bpd.Series(["apple", "bear"], session=session)
+    s2 = bpd.Series(["fruit", "tree"], session=session)
+    prompt = (s1, " is a ", s2)
+
+    result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash").struct.field(
+        "result"
+    )
+
+    pandas.testing.assert_series_equal(
+        result.to_pandas(),
+        pd.Series([True, False], name="result"),
+        check_dtype=False,
+        check_index=False,
+    )
+
+
+def test_ai_generate_bool_with_model_params(session):
+    if sys.version_info < (3, 12):
+        pytest.skip(
+            "Skip test because SQLGlot cannot compile model params to JSON in this environment."
+        )
+
+    s1 = bpd.Series(["apple", "bear"], session=session)
+    s2 = bpd.Series(["fruit", "tree"], session=session)
+    prompt = (s1, " is a ", s2)
+    model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}}
+
+    result = bbq.ai.generate_bool(
+        prompt, endpoint="gemini-2.5-flash", model_params=model_params
+    ).struct.field("result")
+
+    pandas.testing.assert_series_equal(
+        result.to_pandas(),
+        pd.Series([True, False], name="result"),
+        check_dtype=False,
+        check_index=False,
+    )
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
index 9af2a4afe4..6ea11d5215 100644
--- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
+++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
@@ -1104,6 +1104,21 @@ def visit_StringAgg(self, op, *, arg, sep, order_by, where):
             expr = arg
         return self.agg.string_agg(expr, sep, where=where)
 
+    def visit_AIGenerateBool(self, op, **kwargs):
+        func_name = "AI.GENERATE_BOOL"
+
+        args = []
+        for key, val in kwargs.items():
+            if val is None:
+                continue
+
+            if key == "model_params":
+                val = sge.JSON(this=val)
+
+            args.append(sge.Kwarg(this=sge.Identifier(this=key), expression=val))
+
+        return sge.func(func_name, *args)
+
     def visit_FirstNonNullValue(self, op, *, arg):
         return sge.IgnoreNulls(this=sge.FirstValue(this=arg))
 
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py
new file mode 100644
index 0000000000..1f8306bad6
--- /dev/null
+++ b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py
@@ -0,0 +1,32 @@
+# Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/expr/operations/maps.py
+
+"""Operations for BigQuery AI functions."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from bigframes_vendored.ibis.common.annotations import attribute
+import bigframes_vendored.ibis.expr.datatypes as dt
+from bigframes_vendored.ibis.expr.operations.core import Value
+import bigframes_vendored.ibis.expr.rules as rlz
+from public import public
+
+
+@public
+class AIGenerateBool(Value):
+    """Generates a boolean value based on the prompt."""
+
+    prompt: Value
+    connection_id: Value[dt.String]
+    endpoint: Optional[Value[dt.String]]
+    request_type: Value[dt.String]
+    model_params: Optional[Value[dt.String]]
+
+    shape = rlz.shape_like("prompt")
+
+    @attribute
+    def dtype(self) -> dt.Struct:
+        return dt.Struct.from_tuples(
+            (("result", dt.bool), ("full_response", dt.string), ("status", dt.string))
+        )
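
Usage sketch (not part of the patch): a minimal end-to-end example of the new surface, assuming a configured BigQuery session with a default BigQuery connection; the endpoint name is illustrative, and quota behavior follows the `request_type` notes in the docstring above.

    import bigframes.pandas as bpd
    import bigframes.bigquery as bbq

    # Interleave string literals with DataFrame columns to form the prompt,
    # mirroring the docstring examples and the system tests above.
    df = bpd.DataFrame(
        {
            "col_1": ["apple", "bear", "pear"],
            "col_2": ["fruit", "animal", "animal"],
        }
    )

    # AI.GENERATE_BOOL returns a struct Series; pull out the boolean "result" field.
    answers = bbq.ai.generate_bool(
        (df["col_1"], " is a ", df["col_2"]),
        endpoint="gemini-2.5-flash",  # assumption: any GA or preview Gemini endpoint works here
    ).struct.field("result")

    print(answers.to_pandas())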