Merge branch 'main' into sycai_ai_gen_bool_sqlglot

sycai · web-flow · commit 4ded421fbec6 · 2025-09-18T15:46:11.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
-        exclude: "^tests/unit/core/compile/sqlglot/snapshots"
+        exclude: "^tests/unit/core/compile/sqlglot/.*snapshots"
     -   id: check-yaml
 -   repo: https://github.com/pycqa/isort
     rev: 5.12.0
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -252,6 +252,10 @@ def from_local(
                 pass
         return block
 
+    @property
+    def has_index(self) -> bool:
+        return len(self._index_columns) > 0  # True if any index column exists
+
     @property
     def index(self) -> BlockIndexProperties:
         """Row identities for values in the Block."""
diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py
@@ -16,6 +16,7 @@
 
 import typing
 
+import pandas as pd
 import sqlglot.expressions as sge
 
 from bigframes import dtypes
@@ -46,18 +47,22 @@ def _(
     return apply_window_if_present(sge.func("COUNT", column.expr), window)
 
 
-@UNARY_OP_REGISTRATION.register(agg_ops.SumOp)
+@UNARY_OP_REGISTRATION.register(agg_ops.MaxOp)
 def _(
-    op: agg_ops.SumOp,
+    op: agg_ops.MaxOp,
     column: typed_expr.TypedExpr,
     window: typing.Optional[window_spec.WindowSpec] = None,
 ) -> sge.Expression:
-    expr = column.expr
-    if column.dtype == dtypes.BOOL_DTYPE:
-        expr = sge.Cast(this=column.expr, to="INT64")
-    # Will be null if all inputs are null. Pandas defaults to zero sum though.
-    expr = apply_window_if_present(sge.func("SUM", expr), window)
-    return sge.func("IFNULL", expr, ir._literal(0, column.dtype))
+    return apply_window_if_present(sge.func("MAX", column.expr), window)
+
+
+@UNARY_OP_REGISTRATION.register(agg_ops.MinOp)
+def _(
+    op: agg_ops.MinOp,  # unused in the body
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:  # MIN(column), passed through apply_window_if_present
+    return apply_window_if_present(sge.func("MIN", column.expr), window)
 
 
 @UNARY_OP_REGISTRATION.register(agg_ops.SizeUnaryOp)
@@ -67,3 +72,20 @@ def _(
     window: typing.Optional[window_spec.WindowSpec] = None,
 ) -> sge.Expression:
     return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window)
+
+
+@UNARY_OP_REGISTRATION.register(agg_ops.SumOp)
+def _(
+    op: agg_ops.SumOp,  # unused in the body
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    expr = column.expr
+    if column.dtype == dtypes.BOOL_DTYPE:
+        expr = sge.Cast(this=column.expr, to="INT64")  # sum booleans as integers
+
+    expr = apply_window_if_present(sge.func("SUM", expr), window)
+
+    # SUM yields NULL when all inputs are NULL; pandas returns zero, so fill it in.
+    zero = pd.to_timedelta(0) if column.dtype == dtypes.TIMEDELTA_DTYPE else 0
+    return sge.func("IFNULL", expr, ir._literal(zero, column.dtype))
diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py
@@ -79,7 +79,7 @@ def register_unary_op(
         """
         key = typing.cast(str, op_ref.name)
 
-        def decorator(impl: typing.Callable[..., TypedExpr]):
+        def decorator(impl: typing.Callable[..., sge.Expression]):
             def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
                 if pass_op:
                     return impl(args[0], op)
@@ -108,7 +108,7 @@ def register_binary_op(
         """
         key = typing.cast(str, op_ref.name)
 
-        def decorator(impl: typing.Callable[..., TypedExpr]):
+        def decorator(impl: typing.Callable[..., sge.Expression]):
             def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
                 if pass_op:
                     return impl(args[0], args[1], op)
@@ -132,7 +132,7 @@ def register_ternary_op(
         """
         key = typing.cast(str, op_ref.name)
 
-        def decorator(impl: typing.Callable[..., TypedExpr]):
+        def decorator(impl: typing.Callable[..., sge.Expression]):
             def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
                 return impl(args[0], args[1], args[2])
 
@@ -156,7 +156,7 @@ def register_nary_op(
         """
         key = typing.cast(str, op_ref.name)
 
-        def decorator(impl: typing.Callable[..., TypedExpr]):
+        def decorator(impl: typing.Callable[..., sge.Expression]):
             def normalized_impl(args: typing.Sequence[TypedExpr], op: ops.RowOp):
                 if pass_op:
                     return impl(*args, op=op)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -489,7 +489,6 @@ def memory_usage(self, index: bool = True):
             column_sizes = pandas.concat([index_size, column_sizes])
         return column_sizes
 
-    @validations.requires_index
     def info(
         self,
         verbose: Optional[bool] = None,
@@ -512,12 +511,17 @@ def info(
 
         obuf.write(f"{type(self)}\n")
 
-        index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"
+        if self._block.has_index:
+            index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"
 
-        # These accessses are kind of expensive, maybe should try to skip?
-        first_indice = self.index[0]
-        last_indice = self.index[-1]
-        obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n")
+            # These accesses are kind of expensive, maybe should try to skip?
+            first_indice = self.index[0]
+            last_indice = self.index[-1]
+            obuf.write(
+                f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n"
+            )
+        else:
+            obuf.write("NullIndex\n")
 
         dtype_strings = self.dtypes.astype("string")
         if show_all_columns:
diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 
+import io
+
 import pandas as pd
 import pytest
 
@@ -44,6 +46,38 @@ def test_null_index_materialize(scalars_df_null_index, scalars_pandas_df_default
     )
 
 
+def test_null_index_info(scalars_df_null_index):
+    expected = (
+        "<class 'bigframes.dataframe.DataFrame'>\n"
+        "NullIndex\n"
+        "Data columns (total 14 columns):\n"
+        "  #  Column         Non-Null Count    Dtype\n"
+        "---  -------------  ----------------  ------------------------------\n"
+        "  0  bool_col       8 non-null        boolean\n"
+        "  1  bytes_col      6 non-null        binary[pyarrow]\n"
+        "  2  date_col       7 non-null        date32[day][pyarrow]\n"
+        "  3  datetime_col   6 non-null        timestamp[us][pyarrow]\n"
+        "  4  geography_col  4 non-null        geometry\n"
+        "  5  int64_col      8 non-null        Int64\n"
+        "  6  int64_too      9 non-null        Int64\n"
+        "  7  numeric_col    6 non-null        decimal128(38, 9)[pyarrow]\n"
+        "  8  float64_col    7 non-null        Float64\n"
+        "  9  rowindex_2     9 non-null        Int64\n"
+        " 10  string_col     8 non-null        string\n"
+        " 11  time_col       6 non-null        time64[us][pyarrow]\n"
+        " 12  timestamp_col  6 non-null        timestamp[us, tz=UTC][pyarrow]\n"
+        " 13  duration_col   7 non-null        duration[us][pyarrow]\n"
+        "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n"
+        "memory usage: 1269 bytes\n"
+    )
+
+    bf_result = io.StringIO()  # in-memory buffer that info() writes to
+
+    scalars_df_null_index.drop(columns="rowindex").info(buf=bf_result)
+
+    assert expected == bf_result.getvalue()  # exact text match, no index summary
+
+
 def test_null_index_series_repr(scalars_df_null_index, scalars_pandas_df_default_index):
     bf_result = scalars_df_null_index["int64_too"].head(5).__repr__()
     pd_result = (
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_count/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_count/out.sql
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col` AS `bfcol_0`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    COUNT(`bfcol_0`) AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int64_col`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_max/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_max/out.sql
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col` AS `bfcol_0`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    MAX(`bfcol_0`) AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int64_col`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_min/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_min/out.sql
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col` AS `bfcol_0`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    MIN(`bfcol_0`) AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int64_col`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size_unary/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size_unary/out.sql
@@ -1,12 +1,12 @@
 WITH `bfcte_0` AS (
   SELECT
-    `string_col` AS `bfcol_0`
+    `float64_col` AS `bfcol_0`
   FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
 ), `bfcte_1` AS (
   SELECT
     COUNT(1) AS `bfcol_1`
   FROM `bfcte_0`
 )
 SELECT
-  `bfcol_1` AS `string_col_agg`
+  `bfcol_1` AS `float64_col`
 FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql
@@ -1,12 +1,15 @@
 WITH `bfcte_0` AS (
   SELECT
-    `int64_col` AS `bfcol_0`
+    `bool_col` AS `bfcol_0`,
+    `int64_col` AS `bfcol_1`
   FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
 ), `bfcte_1` AS (
   SELECT
-    COALESCE(SUM(`bfcol_0`), 0) AS `bfcol_1`
+    COALESCE(SUM(`bfcol_1`), 0) AS `bfcol_4`,
+    COALESCE(SUM(CAST(`bfcol_0` AS INT64)), 0) AS `bfcol_5`
   FROM `bfcte_0`
 )
 SELECT
-  `bfcol_1` AS `int64_col_agg`
+  `bfcol_4` AS `int64_col`,
+  `bfcol_5` AS `bool_col`
 FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py
@@ -12,40 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import typing
+
 import pytest
 
-from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes
+from bigframes.core import agg_expressions as agg_exprs
+from bigframes.core import array_value, identifiers, nodes
 from bigframes.operations import aggregations as agg_ops
 import bigframes.pandas as bpd
 
 pytest.importorskip("pytest_snapshot")
 
 
-def _apply_unary_op(obj: bpd.DataFrame, op: agg_ops.UnaryWindowOp, arg: str) -> str:
-    agg_node = nodes.AggregateNode(
-        obj._block.expr.node,
-        aggregations=(
-            (
-                agg_expressions.UnaryAggregation(op, expression.deref(arg)),
-                identifiers.ColumnId(arg + "_agg"),
-            ),
-        ),
-    )
+def _apply_unary_agg_ops(
+    obj: bpd.DataFrame,
+    ops_list: typing.Sequence[agg_exprs.UnaryAggregation],
+    new_names: typing.Sequence[str],
+) -> str:
+    aggs = [(op, identifiers.ColumnId(name)) for op, name in zip(ops_list, new_names)]
+
+    agg_node = nodes.AggregateNode(obj._block.expr.node, aggregations=tuple(aggs))
     result = array_value.ArrayValue(agg_node)
 
     sql = result.session._executor.to_sql(result, enable_cache=False)
     return sql
 
 
-def test_size(scalar_types_df: bpd.DataFrame, snapshot):
-    bf_df = scalar_types_df[["string_col"]]
-    sql = _apply_unary_op(bf_df, agg_ops.SizeUnaryOp(), "string_col")
+def test_count(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.CountOp().as_expr(col_name)  # count aggregation on the column
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
+
+    snapshot.assert_match(sql, "out.sql")  # compare with the stored SQL snapshot
+
+
+def test_max(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.MaxOp().as_expr(col_name)  # max aggregation on the column
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
+
+    snapshot.assert_match(sql, "out.sql")  # compare with the stored SQL snapshot
+
+
+def test_min(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.MinOp().as_expr(col_name)
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
 
     snapshot.assert_match(sql, "out.sql")
 
 
 def test_sum(scalar_types_df: bpd.DataFrame, snapshot):
-    bf_df = scalar_types_df[["int64_col"]]
-    sql = _apply_unary_op(bf_df, agg_ops.SumOp(), "int64_col")
+    bf_df = scalar_types_df[["int64_col", "bool_col"]]
+    agg_ops_map = {
+        "int64_col": agg_ops.SumOp().as_expr("int64_col"),
+        "bool_col": agg_ops.SumOp().as_expr("bool_col"),
+    }
+    sql = _apply_unary_agg_ops(
+        bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys())
+    )
 
     snapshot.assert_match(sql, "out.sql")