googleapis
diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
Lines changed: 24 additions & 2 deletions
diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
Lines changed: 1 addition & 0 deletions
diff --git a/bigframes/core/compile/sqlglot/expressions/ai_ops.py b/bigframes/core/compile/sqlglot/expressions/ai_ops.py
Lines changed: 13 additions & 5 deletions
diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py
Lines changed: 2 additions & 2 deletions
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
Lines changed: 2 additions & 2 deletions
diff --git a/bigframes/core/compile/sqlglot/sqlglot_types.py b/bigframes/core/compile/sqlglot/sqlglot_types.py
Lines changed: 52 additions & 57 deletions
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
Lines changed: 52 additions & 0 deletions
diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py
Lines changed: 25 additions & 0 deletions
@@ -25,7 +25,7 @@
 
 from bigframes import clients, dtypes, series, session
 from bigframes.core import convert, log_adapter
-from bigframes.operations import ai_ops
+from bigframes.operations import ai_ops, output_schemas
 
 PROMPT_TYPE = Union[
     series.Series,
@@ -43,7 +43,7 @@ def generate(
     endpoint: str | None = None,
     request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified",
     model_params: Mapping[Any, Any] | None = None,
-    # TODO(b/446974666) Add output_schema parameter
+    output_schema: Mapping[str, str] | None = None,
 ) -> series.Series:
     """
     Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data.
@@ -64,6 +64,14 @@ def generate(
         1    Ottawa\\n
         Name: result, dtype: string
 
+        You get structured output when the `output_schema` parameter is set:
+
+        >>> animals = bpd.Series(["Rabbit", "Spider"])
+        >>> bbq.ai.generate(animals, output_schema={"number_of_legs": "INT64", "is_herbivore": "BOOL"})
+        0    {'is_herbivore': True, 'number_of_legs': 4, 'f...
+        1    {'is_herbivore': False, 'number_of_legs': 8, '...
+        dtype: struct<is_herbivore: bool, number_of_legs: int64, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
+
     Args:
         prompt (Series | List[str|Series] | Tuple[str|Series, ...]):
             A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series
@@ -86,10 +94,14 @@ def generate(
             If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota.
         model_params (Mapping[Any, Any]):
             Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format.
+        output_schema (Mapping[str, str]):
+            A mapping value that specifies the schema of the output, in the form {field_name: data_type}. Supported data types include
+            `STRING`, `INT64`, `FLOAT64`, `BOOL`, `ARRAY`, and `STRUCT`.
 
     Returns:
         bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
         * "result": a STRING value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
+        If you specify an output schema, the "result" field is replaced by the fields of your custom schema.
         * "full_response": a JSON value containing the response from the projects.locations.endpoints.generateContent call to the model.
         The generated text is in the text element.
         * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful.
@@ -98,12 +110,22 @@ def generate(
     prompt_context, series_list = _separate_context_and_series(prompt)
     assert len(series_list) > 0
 
+    if output_schema is None:
+        output_schema_str = None
+    else:
+        output_schema_str = ", ".join(
+            [f"{name} {sql_type}" for name, sql_type in output_schema.items()]
+        )
+        # Validate user input
+        output_schemas.parse_sql_fields(output_schema_str)
+
     operator = ai_ops.AIGenerate(
         prompt_context=tuple(prompt_context),
         connection_id=_resolve_connection_id(series_list[0], connection_id),
         endpoint=endpoint,
         request_type=request_type,
         model_params=json.dumps(model_params) if model_params else None,
+        output_schema=output_schema_str,
     )
 
     return series_list[0]._apply_nary_op(operator, series_list[1:])
 
@@ -1985,6 +1985,7 @@ def ai_generate(
         op.endpoint,  # type: ignore
         op.request_type.upper(),  # type: ignore
         op.model_params,  # type: ignore
+        op.output_schema,  # type: ignore
     ).to_expr()
 
 
 
@@ -15,7 +15,6 @@
 from __future__ import annotations
 
 from dataclasses import asdict
-import typing
 
 import sqlglot.expressions as sge
 
@@ -105,24 +104,24 @@ def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]:
 
     op_args = asdict(op)
 
-    connection_id = typing.cast(str, op_args["connection_id"])
+    connection_id = op_args["connection_id"]
     args.append(
         sge.Kwarg(this="connection_id", expression=sge.Literal.string(connection_id))
     )
 
-    endpoit = typing.cast(str, op_args.get("endpoint", None))
+    endpoit = op_args.get("endpoint", None)
     if endpoit is not None:
         args.append(sge.Kwarg(this="endpoint", expression=sge.Literal.string(endpoit)))
 
-    request_type = typing.cast(str, op_args.get("request_type", None))
+    request_type = op_args.get("request_type", None)
     if request_type is not None:
         args.append(
             sge.Kwarg(
                 this="request_type", expression=sge.Literal.string(request_type.upper())
             )
         )
 
-    model_params = typing.cast(str, op_args.get("model_params", None))
+    model_params = op_args.get("model_params", None)
     if model_params is not None:
         args.append(
             sge.Kwarg(
@@ -133,4 +132,13 @@ def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]:
             )
         )
 
+    output_schema = op_args.get("output_schema", None)
+    if output_schema is not None:
+        args.append(
+            sge.Kwarg(
+                this="output_schema",
+                expression=sge.Literal.string(output_schema),
+            )
+        )
+
     return args
@@ -18,9 +18,9 @@
 
 from bigframes import dtypes
 from bigframes import operations as ops
+from bigframes.core.compile.sqlglot import sqlglot_types
 from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr
 import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
-from bigframes.core.compile.sqlglot.sqlglot_types import SQLGlotType
 
 register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op
 
@@ -29,7 +29,7 @@
 def _(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression:
     from_type = expr.dtype
     to_type = op.to_type
-    sg_to_type = SQLGlotType.from_bigframes_dtype(to_type)
+    sg_to_type = sqlglot_types.from_bigframes_dtype(to_type)
     sg_expr = expr.expr
 
     if to_type == dtypes.JSON_DTYPE:
 
@@ -79,7 +79,7 @@ def from_pyarrow(
             expressions=[
                 sge.ColumnDef(
                     this=sge.to_identifier(field.column, quoted=True),
-                    kind=sgt.SQLGlotType.from_bigframes_dtype(field.dtype),
+                    kind=sgt.from_bigframes_dtype(field.dtype),
                 )
                 for field in schema.items
             ],
@@ -620,7 +620,7 @@ def _select_to_cte(expr: sge.Select, cte_name: sge.Identifier) -> sge.Select:
 
 
 def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
-    sqlglot_type = sgt.SQLGlotType.from_bigframes_dtype(dtype)
+    sqlglot_type = sgt.from_bigframes_dtype(dtype)
     if value is None:
         return _cast(sge.Null(), sqlglot_type)
     elif dtype == dtypes.BYTES_DTYPE:
 
@@ -25,62 +25,57 @@
 import bigframes.dtypes
 
 
-class SQLGlotType:
-    @classmethod
-    def from_bigframes_dtype(
-        cls,
-        bigframes_dtype: typing.Union[
-            bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any]
-        ],
-    ) -> str:
-        if bigframes_dtype == bigframes.dtypes.INT_DTYPE:
-            return "INT64"
-        elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE:
-            return "FLOAT64"
-        elif bigframes_dtype == bigframes.dtypes.STRING_DTYPE:
-            return "STRING"
-        elif bigframes_dtype == bigframes.dtypes.BOOL_DTYPE:
-            return "BOOLEAN"
-        elif bigframes_dtype == bigframes.dtypes.DATE_DTYPE:
-            return "DATE"
-        elif bigframes_dtype == bigframes.dtypes.TIME_DTYPE:
-            return "TIME"
-        elif bigframes_dtype == bigframes.dtypes.DATETIME_DTYPE:
-            return "DATETIME"
-        elif bigframes_dtype == bigframes.dtypes.TIMESTAMP_DTYPE:
-            return "TIMESTAMP"
-        elif bigframes_dtype == bigframes.dtypes.BYTES_DTYPE:
-            return "BYTES"
-        elif bigframes_dtype == bigframes.dtypes.NUMERIC_DTYPE:
-            return "NUMERIC"
-        elif bigframes_dtype == bigframes.dtypes.BIGNUMERIC_DTYPE:
-            return "BIGNUMERIC"
-        elif bigframes_dtype == bigframes.dtypes.JSON_DTYPE:
-            return "JSON"
-        elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE:
-            return "GEOGRAPHY"
-        elif bigframes_dtype == bigframes.dtypes.TIMEDELTA_DTYPE:
-            return "INT64"
-        elif isinstance(bigframes_dtype, pd.ArrowDtype):
-            if pa.types.is_list(bigframes_dtype.pyarrow_dtype):
-                inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(
-                    bigframes_dtype.pyarrow_dtype.value_type
+def from_bigframes_dtype(
+    bigframes_dtype: typing.Union[
+        bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any]
+    ],
+) -> str:
+    """Return the SQL type string (for example ``INT64``) for a BigFrames dtype.
+
+    Scalar dtypes map to a fixed type name. Arrow list dtypes become
+    ``ARRAY<...>`` and Arrow struct dtypes become ``STRUCT<...>``; the element
+    and field types are resolved by calling this function recursively.
+
+    Raises:
+        ValueError: If no branch below matches ``bigframes_dtype``.
+    """
+    if bigframes_dtype == bigframes.dtypes.INT_DTYPE:
+        return "INT64"
+    elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE:
+        return "FLOAT64"
+    elif bigframes_dtype == bigframes.dtypes.STRING_DTYPE:
+        return "STRING"
+    elif bigframes_dtype == bigframes.dtypes.BOOL_DTYPE:
+        return "BOOLEAN"
+    elif bigframes_dtype == bigframes.dtypes.DATE_DTYPE:
+        return "DATE"
+    elif bigframes_dtype == bigframes.dtypes.TIME_DTYPE:
+        return "TIME"
+    elif bigframes_dtype == bigframes.dtypes.DATETIME_DTYPE:
+        return "DATETIME"
+    elif bigframes_dtype == bigframes.dtypes.TIMESTAMP_DTYPE:
+        return "TIMESTAMP"
+    elif bigframes_dtype == bigframes.dtypes.BYTES_DTYPE:
+        return "BYTES"
+    elif bigframes_dtype == bigframes.dtypes.NUMERIC_DTYPE:
+        return "NUMERIC"
+    elif bigframes_dtype == bigframes.dtypes.BIGNUMERIC_DTYPE:
+        return "BIGNUMERIC"
+    elif bigframes_dtype == bigframes.dtypes.JSON_DTYPE:
+        return "JSON"
+    elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE:
+        return "GEOGRAPHY"
+    elif bigframes_dtype == bigframes.dtypes.TIMEDELTA_DTYPE:
+        # NOTE(review): timedelta maps to INT64 here -- presumably an integer
+        # duration; confirm the unit against the callers.
+        return "INT64"
+    elif isinstance(bigframes_dtype, pd.ArrowDtype):
+        if pa.types.is_list(bigframes_dtype.pyarrow_dtype):
+            inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(
+                bigframes_dtype.pyarrow_dtype.value_type
+            )
+            return f"ARRAY<{from_bigframes_dtype(inner_bigframes_dtype)}>"
+        elif pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
+            struct_type = typing.cast(pa.StructType, bigframes_dtype.pyarrow_dtype)
+            inner_fields: list[str] = []
+            for i in range(struct_type.num_fields):
+                field = struct_type.field(i)
+                key = sg.to_identifier(field.name).sql("bigquery")
+                dtype = from_bigframes_dtype(
+                    bigframes.dtypes.arrow_dtype_to_bigframes_dtype(field.type)
                 )
-                return (
-                    f"ARRAY<{SQLGlotType.from_bigframes_dtype(inner_bigframes_dtype)}>"
-                )
-            elif pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
-                struct_type = typing.cast(pa.StructType, bigframes_dtype.pyarrow_dtype)
-                inner_fields: list[str] = []
-                for i in range(struct_type.num_fields):
-                    field = struct_type.field(i)
-                    key = sg.to_identifier(field.name).sql("bigquery")
-                    dtype = SQLGlotType.from_bigframes_dtype(
-                        bigframes.dtypes.arrow_dtype_to_bigframes_dtype(field.type)
-                    )
-                    inner_fields.append(f"{key} {dtype}")
-                return "STRUCT<{}>".format(", ".join(inner_fields))
+                inner_fields.append(f"{key} {dtype}")
+            return "STRUCT<{}>".format(", ".join(inner_fields))
 
-        raise ValueError(
-            f"Unsupported type for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
-        )
+    raise ValueError(
+        f"Unsupported type for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
+    )
@@ -754,6 +754,58 @@ def item(self):
         # Docstring is in third_party/bigframes_vendored/pandas/core/indexes/base.py
         return self.to_series().peek(2).item()
 
+    def __eq__(self, other) -> Index:  # type: ignore
+        """Compare this index with ``other`` by applying ``ops.eq_op``.
+
+        The comparison is delegated to ``_apply_binop``; whatever that helper
+        returns or raises is propagated unchanged.
+        """
+        # NOTE(review): a class that defines __eq__ without __hash__ gets
+        # __hash__ set to None by Python -- confirm the enclosing class defines
+        # or deliberately omits __hash__ elsewhere.
+        equality_op = ops.eq_op
+        return self._apply_binop(other, equality_op)
+
+    def _apply_binop(self, other, op: ops.BinaryOp) -> Index:
+        """Apply the binary operator ``op`` between this index and ``other``.
+
+        Three shapes of ``other`` are handled:
+
+        * a ``Series`` or ``Index``: it is wrapped with ``Index(...)``,
+          row-joined with this index, and ``op`` is applied to the columns
+          pairwise.
+        * a local scalar, when this index has exactly one level: ``op`` is
+          applied between the first index column and the constant.
+        * a tuple whose length equals ``nlevels``: ``op`` is applied per level
+          against the matching tuple element.
+
+        Any other operand yields ``NotImplemented``.
+
+        Raises:
+            ValueError: If ``other`` is a Series/Index with a different number
+                of levels, or if the two expressions cannot be row-joined.
+        """
+        # TODO: Handle local objects, or objects not implicitly alignable? Gets ambiguous with partial ordering though
+        if isinstance(other, (bigframes.series.Series, Index)):
+            other = Index(other)
+            if other.nlevels != self.nlevels:
+                raise ValueError("Dimensions do not match")
+
+            left_expr = self._block.expr
+            right_expr = other._block.expr
+            joined = left_expr.try_row_join(right_expr)
+            if joined is None:
+                raise ValueError("Cannot align objects")
+
+            joined_expr, (left_ids, right_ids) = joined
+            # One comparison expression per aligned pair of columns.
+            pairwise_exprs = [
+                op.as_expr(left_ids[left_col], right_ids[right_col])
+                for left_col, right_col in zip(
+                    left_expr.column_ids, right_expr.column_ids
+                )
+            ]
+            joined_expr, result_ids = joined_expr.compute_values(pairwise_exprs)
+            result_block = blocks.Block(
+                joined_expr.select_columns(result_ids),
+                index_columns=result_ids,
+                column_labels=[],
+                index_labels=[None] * len(result_ids),
+            )
+            return Index(result_block)
+
+        if isinstance(other, bigframes.dtypes.LOCAL_SCALAR_TYPES) and self.nlevels == 1:
+            scalar_expr = op.as_expr(self._block.index_columns[0], ex.const(other))
+            block, result_id = self._block.project_expr(scalar_expr)
+            return Index(block.select_column(result_id))
+
+        if isinstance(other, tuple) and len(other) == self.nlevels:
+            level_exprs = [
+                op.as_expr(self._block.index_columns[level], ex.const(other[level]))
+                for level in range(self.nlevels)
+            ]
+            block = self._block.project_exprs(
+                level_exprs,
+                labels=[None] * self.nlevels,
+                drop=True,
+            )
+            return Index(block.set_index(block.value_columns))
+
+        return NotImplemented
+
 
 def _should_create_datetime_index(block: blocks.Block) -> bool:
     if len(block.index.dtypes) != 1:
 
@@ -19,6 +19,8 @@
 import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex
 import pandas
 
+from bigframes.core import blocks
+from bigframes.core import expression as ex
 from bigframes.core.indexes.base import Index
 
 
@@ -46,3 +48,26 @@ def from_arrays(
         pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names)
         # Index.__new__ should detect multiple levels and properly create a multiindex
         return cast(MultiIndex, Index(pd_index))
+
+    def __eq__(self, other) -> Index:  # type: ignore
+        """Compare this multi-level index with ``other``.
+
+        Each level is compared through ``_apply_binop`` with ``ops.eq_op``. The
+        per-level results are then collapsed into a single column: nulls are
+        replaced with ``False``, the columns are packed into an array, and the
+        array is reduced with ``all_op``.
+
+        Returns:
+            An ``Index`` with a single unnamed level holding the reduced
+            result, or ``NotImplemented`` when ``_apply_binop`` does not
+            support ``other``.
+        """
+        import bigframes.operations as ops
+        import bigframes.operations.aggregations as agg_ops
+
+        per_level = self._apply_binop(other, ops.eq_op)
+        if per_level is NotImplemented:
+            # _apply_binop signals unsupported operands with NotImplemented;
+            # pass it through instead of failing on ``._block`` below.
+            return NotImplemented
+        eq_result = per_level._block.expr
+
+        as_array = ops.ToArrayOp().as_expr(
+            *(
+                ops.fillna_op.as_expr(col, ex.const(False))
+                for col in eq_result.column_ids
+            )
+        )
+        reduced = ops.ArrayReduceOp(agg_ops.all_op).as_expr(as_array)
+        result_expr, result_ids = eq_result.compute_values([reduced])
+        return Index(
+            blocks.Block(
+                result_expr.select_columns(result_ids),
+                index_columns=result_ids,
+                column_labels=(),
+                index_labels=[None],
+            )
+        )