
Commit ac5d869

Merge branch 'main' into migrate-manhatton-dist
2 parents 8fa638f + 956a5b0 commit ac5d869


17 files changed (+187, -18 lines)


bigframes/core/compile/sqlglot/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -172,6 +172,7 @@ def compile_readtable(node: nodes.ReadTableNode, child: ir.SQLGlotIR):
         col_names=[col.source_id for col in node.scan_list.items],
         alias_names=[col.id.sql for col in node.scan_list.items],
         uid_gen=child.uid_gen,
+        system_time=node.source.at_time,
     )

bigframes/core/compile/sqlglot/expressions/generic_ops.py

Lines changed: 5 additions & 0 deletions
@@ -134,6 +134,11 @@ def _(
     )


+@register_binary_op(ops.fillna_op)
+def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
+    return sge.Coalesce(this=left.expr, expressions=[right.expr])
+
+
 @register_nary_op(ops.case_when_op)
 def _(*cases_and_outputs: TypedExpr) -> sge.Expression:
     # Need to upcast BOOL to INT if any output is numeric

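For reference, here is a minimal standalone sketch (not part of the commit; column name and fill value invented) of what the new fillna_op lowering produces: filling one expression with another compiles to a SQL COALESCE.

import sqlglot
import sqlglot.expressions as sge

left = sqlglot.column("int64_col")   # expression being filled
right = sge.Literal.number(0)        # fill value
expr = sge.Coalesce(this=left, expressions=[right])
print(expr.sql(dialect="bigquery"))  # COALESCE(int64_col, 0)
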
bigframes/core/compile/sqlglot/expressions/numeric_ops.py

Lines changed: 12 additions & 0 deletions
@@ -305,6 +305,18 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
     return result


+@register_binary_op(ops.euclidean_distance_op)
+def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
+    return sge.Anonymous(
+        this="ML.DISTANCE",
+        expressions=[
+            left.expr,
+            right.expr,
+            sge.Literal.string("EUCLIDEAN"),
+        ],
+    )
+
+
 @register_binary_op(ops.floordiv_op)
 def _(left: TypedExpr, right: TypedExpr) -> sge.Expression:
     left_expr = _coerce_bool_to_int(left)

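Purely illustrative as well (column names assumed), the new euclidean_distance_op renders as a BigQuery ML.DISTANCE call with the 'EUCLIDEAN' distance type:

import sqlglot
import sqlglot.expressions as sge

expr = sge.Anonymous(
    this="ML.DISTANCE",
    expressions=[
        sqlglot.column("vector_a"),
        sqlglot.column("vector_b"),
        sge.Literal.string("EUCLIDEAN"),
    ],
)
print(expr.sql(dialect="bigquery"))  # ML.DISTANCE(vector_a, vector_b, 'EUCLIDEAN')
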
bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,7 @@
 from __future__ import annotations

 import dataclasses
+import datetime
 import functools
 import typing

@@ -118,6 +119,7 @@ def from_table(
         col_names: typing.Sequence[str],
         alias_names: typing.Sequence[str],
         uid_gen: guid.SequentialUIDGenerator,
+        system_time: typing.Optional[datetime.datetime] = None,
     ) -> SQLGlotIR:
         """Builds a SQLGlotIR expression from a BigQuery table.
@@ -128,6 +130,7 @@ def from_table(
             col_names (typing.Sequence[str]): The names of the columns to select.
             alias_names (typing.Sequence[str]): The aliases for the selected columns.
             uid_gen (guid.SequentialUIDGenerator): A generator for unique identifiers.
+            system_time (typing.Optional[datetime.datetime]): An optional system time for time-travel queries.
         """
         selections = [
             sge.Alias(
@@ -138,10 +141,20 @@ def from_table(
             else sge.to_identifier(col_name, quoted=cls.quoted)
             for col_name, alias_name in zip(col_names, alias_names)
         ]
+        version = (
+            sge.Version(
+                this="TIMESTAMP",
+                expression=sge.Literal(this=system_time.isoformat(), is_string=True),
+                kind="AS OF",
+            )
+            if system_time
+            else None
+        )
         table_expr = sge.Table(
             this=sg.to_identifier(table_id, quoted=cls.quoted),
             db=sg.to_identifier(dataset_id, quoted=cls.quoted),
             catalog=sg.to_identifier(project_id, quoted=cls.quoted),
+            version=version,
         )
         select_expr = sge.Select().select(*selections).from_(table_expr)
         return cls(expr=select_expr, uid_gen=uid_gen)

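A hedged standalone sketch (identifiers and timestamp invented for illustration) of what the new version= argument contributes: when system_time is set, the compiled table reference carries a time-travel clause.

import datetime

import sqlglot as sg
import sqlglot.expressions as sge

system_time = datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)
table_expr = sge.Table(
    this=sg.to_identifier("scalar_types", quoted=True),
    db=sg.to_identifier("sqlglot_test", quoted=True),
    catalog=sg.to_identifier("bigframes-dev", quoted=True),
    version=sge.Version(
        this="TIMESTAMP",
        expression=sge.Literal(this=system_time.isoformat(), is_string=True),
        kind="AS OF",
    ),
)
# Prints the fully-qualified table reference followed by a time-travel clause
# ending in "AS OF '2025-01-01T00:00:00+00:00'".
print(table_expr.sql(dialect="bigquery"))
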
bigframes/core/groupby/dataframe_group_by.py

Lines changed: 5 additions & 5 deletions
@@ -593,6 +593,7 @@ def _agg_func(self, func) -> df.DataFrame:
     def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
         aggregations: typing.List[agg_expressions.Aggregation] = []
         column_labels = []
+        function_labels = []

         want_aggfunc_level = any(utils.is_list_like(aggs) for aggs in func.values())

@@ -602,8 +603,10 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
                 funcs_for_id if utils.is_list_like(funcs_for_id) else [funcs_for_id]
             )
             for f in func_list:
-                aggregations.append(aggs.agg(col_id, agg_ops.lookup_agg_func(f)[0]))
+                f_op, f_label = agg_ops.lookup_agg_func(f)
+                aggregations.append(aggs.agg(col_id, f_op))
                 column_labels.append(label)
+                function_labels.append(f_label)
         agg_block, _ = self._block.aggregate(
             by_column_ids=self._by_col_ids,
             aggregations=aggregations,
@@ -613,10 +616,7 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
             agg_block = agg_block.with_column_labels(
                 utils.combine_indices(
                     pd.Index(column_labels),
-                    pd.Index(
-                        typing.cast(agg_ops.AggregateOp, agg.op).name
-                        for agg in aggregations
-                    ),
+                    pd.Index(function_labels),
                 )
             )
         else:

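For context, a small pandas-only sketch (data invented) of the labelling this change lines up with: when callables are passed in a dict-of-lists agg, the second level of the resulting column MultiIndex uses the function's label rather than an internal op name.

import pandas as pd

df = pd.DataFrame({"key": [0, 1, 0, 1], "x": [1, 2, 3, 4]})
out = df.groupby("key").agg({"x": [sum, max]})
print(out.columns.tolist())  # typically [('x', 'sum'), ('x', 'max')]
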
bigframes/operations/aggregations.py

Lines changed: 8 additions & 2 deletions
@@ -717,9 +717,15 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
     np.all: all_op,
     np.any: any_op,
     np.unique: nunique_op,
-    # TODO(b/443252872): Solve
-    # list: ArrayAggOp(),
     np.size: size_op,
+    # TODO(b/443252872): Solve
+    list: ArrayAggOp(),
+    len: size_op,
+    sum: sum_op,
+    min: min_op,
+    max: max_op,
+    any: any_op,
+    all: all_op,
 }

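A hedged usage sketch of what the expanded lookup table enables (the table name is taken from the test fixtures in this commit; running it requires a configured BigQuery project with access to it): Python builtins can now be passed to groupby aggregation wherever string aliases or numpy functions were already accepted.

import bigframes.pandas as bpd

df = bpd.read_gbq("bigframes-dev.sqlglot_test.scalar_types")
result = df.groupby("bool_col").agg({"int64_col": [len, sum, min, max]})
print(result.to_pandas())
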
tests/system/small/engines/test_generic_ops.py

Lines changed: 1 addition & 1 deletion
@@ -343,7 +343,7 @@ def test_engines_coalesce_op(scalars_array_value: array_value.ArrayValue, engine
     assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)


-@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
+@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
 def test_engines_fillna_op(scalars_array_value: array_value.ArrayValue, engine):
     arr, _ = scalars_array_value.compute_values(
         [

tests/system/small/test_dataframe.py

Lines changed: 22 additions & 0 deletions
@@ -6151,6 +6151,28 @@ def test_agg_with_dict_strs(scalars_dfs):
     )


+def test_df_agg_with_builtins(scalars_dfs):
+    bf_df, pd_df = scalars_dfs
+
+    bf_result = (
+        bf_df[["int64_col", "bool_col"]]
+        .dropna()
+        .groupby(bf_df.int64_too % 2)
+        .agg({"int64_col": [len, sum, min, max, list], "bool_col": [all, any, max]})
+        .to_pandas()
+    )
+    pd_result = (
+        pd_df[["int64_col", "bool_col"]]
+        .dropna()
+        .groupby(pd_df.int64_too % 2)
+        .agg({"int64_col": [len, sum, min, max, list], "bool_col": [all, any, max]})
+    )
+
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs):
     bf_df, _ = scalars_dfs
     agg_funcs = {

tests/system/small/test_groupby.py

Lines changed: 0 additions & 2 deletions
@@ -282,8 +282,6 @@ def test_dataframe_groupby_agg_dict_with_list(
     )
     bf_result_computed = bf_result.to_pandas()

-    # some inconsistency between versions, so normalize to bigframes behavior
-    pd_result = pd_result.rename({"amax": "max"}, axis="columns")
     pd.testing.assert_frame_equal(
         pd_result, bf_result_computed, check_dtype=False, check_index_type=False
     )

tests/unit/core/compile/sqlglot/conftest.py

Lines changed: 13 additions & 7 deletions
@@ -97,7 +97,10 @@ def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
 def scalar_types_df(compiler_session) -> bpd.DataFrame:
     """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex`
     column as the index."""
-    bf_df = compiler_session.read_gbq_table("bigframes-dev.sqlglot_test.scalar_types")
+    bf_df = compiler_session._loader.read_gbq_table(
+        "bigframes-dev.sqlglot_test.scalar_types",
+        enable_snapshot=False,
+    )
     bf_df = bf_df.set_index("rowindex", drop=False)
     return bf_df

@@ -154,8 +157,9 @@ def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]
 def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame:
     """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex`
     column as the index."""
-    bf_df = compiler_session_w_nested_structs_types.read_gbq_table(
-        "bigframes-dev.sqlglot_test.nested_structs_types"
+    bf_df = compiler_session_w_nested_structs_types._loader.read_gbq_table(
+        "bigframes-dev.sqlglot_test.nested_structs_types",
+        enable_snapshot=False,
     )
     bf_df = bf_df.set_index("id", drop=False)
     return bf_df
@@ -204,8 +208,9 @@ def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
 def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame:
     """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex`
     column as the index."""
-    bf_df = compiler_session_w_repeated_types.read_gbq_table(
-        "bigframes-dev.sqlglot_test.repeated_types"
+    bf_df = compiler_session_w_repeated_types._loader.read_gbq_table(
+        "bigframes-dev.sqlglot_test.repeated_types",
+        enable_snapshot=False,
     )
     bf_df = bf_df.set_index("rowindex", drop=False)
     return bf_df
@@ -237,8 +242,9 @@ def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
 def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame:
     """Returns a BigFrames DataFrame containing JSON types and using the `rowindex`
     column as the index."""
-    bf_df = compiler_session_w_json_types.read_gbq_table(
-        "bigframes-dev.sqlglot_test.json_types"
+    bf_df = compiler_session_w_json_types._loader.read_gbq_table(
+        "bigframes-dev.sqlglot_test.json_types",
+        enable_snapshot=False,
     )
     # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns?
     bf_df = bf_df.set_index("rowindex", drop=True)
