
Commit 7d48abd

Merge branch 'main' into shuowei-anywidget-sort-by-col-name

2 parents a04c92b + da06439

6 files changed: +164 additions, -40 deletions


README.rst

Lines changed: 18 additions & 40 deletions
@@ -6,15 +6,25 @@ BigQuery DataFrames (BigFrames)
 |GA| |pypi| |versions|

 BigQuery DataFrames (also known as BigFrames) provides a Pythonic DataFrame
-and machine learning (ML) API powered by the BigQuery engine.
+and machine learning (ML) API powered by the BigQuery engine. It provides modules
+for many use cases, including:

-* `bigframes.pandas` provides a pandas API for analytics. Many workloads can be
+* `bigframes.pandas <https://dataframes.bigquery.dev/reference/api/bigframes.pandas.html>`_
+  is a pandas API for analytics. Many workloads can be
   migrated from pandas to bigframes by just changing a few imports.
-* ``bigframes.ml`` provides a scikit-learn-like API for ML.
+* `bigframes.ml <https://dataframes.bigquery.dev/reference/index.html#ml-apis>`_
+  is a scikit-learn-like API for ML.
+* `bigframes.bigquery.ai <https://dataframes.bigquery.dev/reference/api/bigframes.bigquery.ai.html>`_
+  is a collection of powerful AI methods, powered by Gemini.

-BigQuery DataFrames is an open-source package.
+BigQuery DataFrames is an `open-source package <https://github.com/googleapis/python-bigquery-dataframes>`_.

-**Version 2.0 introduces breaking changes for improved security and performance. See below for details.**
+.. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg
+   :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability
+.. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg
+   :target: https://pypi.org/project/bigframes/
+.. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg
+   :target: https://pypi.org/project/bigframes/

 Getting started with BigQuery DataFrames
 ----------------------------------------
@@ -38,7 +48,8 @@ To use BigFrames in your local development environment,

    import bigframes.pandas as bpd

-   bpd.options.bigquery.project = your_gcp_project_id
+   bpd.options.bigquery.project = your_gcp_project_id  # Optional in BQ Studio.
+   bpd.options.bigquery.ordering_mode = "partial"  # Recommended for performance.
    df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")
    print(
        df.groupby("name")
@@ -48,49 +59,16 @@ To use BigFrames in your local development environment,
        .to_pandas()
    )
-
 Documentation
 -------------

 To learn more about BigQuery DataFrames, visit these pages

 * `Introduction to BigQuery DataFrames (BigFrames) <https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction>`_
 * `Sample notebooks <https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks>`_
-* `API reference <https://cloud.google.com/python/docs/reference/bigframes/latest/summary_overview>`_
+* `API reference <https://dataframes.bigquery.dev/>`_
 * `Source code (GitHub) <https://github.com/googleapis/python-bigquery-dataframes>`_

-⚠️ Warning: Breaking Changes in BigQuery DataFrames v2.0
---------------------------------------------------------
-
-Version 2.0 introduces breaking changes for improved security and performance. Key default behaviors have changed, including:
-
-* **Large Results (>10GB):** The default value for ``allow_large_results`` has changed to ``False``.
-  Methods like ``to_pandas()`` will now fail if the query result's compressed data size exceeds 10GB,
-  unless large results are explicitly permitted.
-* **Remote Function Security:** The library no longer automatically lets the Compute Engine default service
-  account become the identity of the Cloud Run functions. If that is desired, it has to be indicated by passing
-  ``cloud_function_service_account="default"``. And network ingress now defaults to ``"internal-only"``.
-* **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset``
-  to ``remote_function`` must now be passed using keyword syntax, as positional arguments are no longer supported.
-* **@udf Argument Passing:** Arguments ``dataset`` and ``name`` to ``udf`` are now mandatory.
-* **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed.
-* **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model.
-  PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default
-  model will be removed in Version 3.0.
-
-**Important:** If you are not ready to adapt to these changes, please pin your dependency to a version less than 2.0
-(e.g., ``bigframes==1.42.0``) to avoid disruption.
-
-To learn about these changes and how to migrate to version 2.0, see the
-`updated introduction guide <https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction>`_.
-
-.. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg
-   :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability
-.. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg
-   :target: https://pypi.org/project/bigframes/
-.. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg
-   :target: https://pypi.org/project/bigframes/
-
 License
 -------

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 63 additions & 0 deletions
@@ -386,6 +386,17 @@ def _(
     return apply_window_if_present(sge.func("MIN", column.expr), window)


+@UNARY_OP_REGISTRATION.register(agg_ops.NuniqueOp)
+def _(
+    op: agg_ops.NuniqueOp,
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    return apply_window_if_present(
+        sge.func("COUNT", sge.Distinct(expressions=[column.expr])), window
+    )
+
+
 @UNARY_OP_REGISTRATION.register(agg_ops.PopVarOp)
 def _(
     op: agg_ops.PopVarOp,
@@ -400,6 +411,58 @@ def _(
     return apply_window_if_present(expr, window)


+@UNARY_OP_REGISTRATION.register(agg_ops.ProductOp)
+def _(
+    op: agg_ops.ProductOp,
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    # Need to short-circuit, as taking the log of zero is illegal SQL.
+    is_zero = sge.EQ(this=column.expr, expression=sge.convert(0))
+
+    # There is no product SQL aggregate function, so implement it as a sum of
+    # logs followed by exponentiation. Note: the log and power bases must be
+    # equal; this implementation uses the natural log.
+    logs = (
+        sge.Case()
+        .when(is_zero, sge.convert(0))
+        .else_(sge.func("LN", sge.func("ABS", column.expr)))
+    )
+    logs_sum = apply_window_if_present(sge.func("SUM", logs), window)
+    magnitude = sge.func("EXP", logs_sum)
+
+    # The sign cannot be recovered from the logs, so determine the parity of
+    # the count of negative inputs instead.
+    is_negative = (
+        sge.Case()
+        .when(
+            sge.LT(this=sge.func("SIGN", column.expr), expression=sge.convert(0)),
+            sge.convert(1),
+        )
+        .else_(sge.convert(0))
+    )
+    negative_count = apply_window_if_present(sge.func("SUM", is_negative), window)
+    negative_count_parity = sge.Mod(
+        this=negative_count, expression=sge.convert(2)
+    )  # 1 if the result should be negative, otherwise 0.
+
+    any_zeroes = apply_window_if_present(sge.func("LOGICAL_OR", is_zero), window)
+
+    float_result = (
+        sge.Case()
+        .when(any_zeroes, sge.convert(0))
+        .else_(
+            sge.Mul(
+                this=magnitude,
+                expression=sge.If(
+                    this=sge.EQ(this=negative_count_parity, expression=sge.convert(1)),
+                    true=sge.convert(-1),
+                    false=sge.convert(1),
+                ),
+            )
+        )
+    )
+    return float_result
+
+
 @UNARY_OP_REGISTRATION.register(agg_ops.QcutOp)
 def _(
     op: agg_ops.QcutOp,
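The `ProductOp` lowering above emulates a missing `PRODUCT` aggregate as `EXP(SUM(LN(ABS(x))))`, with separate handling for zeros and for the sign. The same strategy can be sketched in plain Python (`product_via_logs` is a hypothetical helper for illustration, not part of the library):

```python
import math

def product_via_logs(values):
    # Mirror the SQL strategy: short-circuit on zeros, sum natural logs of
    # absolute values to get the magnitude, and recover the sign from the
    # parity of the count of negative inputs.
    if any(v == 0 for v in values):
        return 0.0
    log_sum = sum(math.log(abs(v)) for v in values)
    negative_count = sum(1 for v in values if v < 0)
    sign = -1.0 if negative_count % 2 == 1 else 1.0
    return sign * math.exp(log_sum)

print(product_via_logs([2, -3, 4]))  # approximately -24.0 (floating point)
print(product_via_logs([5, 0, 7]))  # 0.0
```

Note the result is a float even for integer inputs, and `EXP`/`LN` round-tripping introduces small floating-point error, which is why the compiled expression is named `float_result`.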
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    COUNT(DISTINCT `int64_col`) AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int64_col`
+FROM `bfcte_1`
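The snapshot above compiles `NuniqueOp` to `COUNT(DISTINCT ...)`. Its semantics can be checked with SQLite as a lightweight stand-in for BigQuery (a sketch with made-up sample data, not part of the test suite):

```python
import sqlite3

# COUNT(DISTINCT col) counts each non-NULL value exactly once, which is the
# nunique semantics the compiler targets.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE scalar_types (int64_col INTEGER)")
conn.executemany(
    "INSERT INTO scalar_types VALUES (?)",
    [(1,), (2,), (2,), (None,), (3,)],
)
(nunique,) = conn.execute(
    "SELECT COUNT(DISTINCT int64_col) FROM scalar_types"
).fetchone()
print(nunique)  # 3 -- duplicates collapse, NULL is ignored
```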
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    CASE
+      WHEN LOGICAL_OR(`int64_col` = 0)
+      THEN 0
+      ELSE EXP(SUM(CASE WHEN `int64_col` = 0 THEN 0 ELSE LN(ABS(`int64_col`)) END)) * IF(MOD(SUM(CASE WHEN SIGN(`int64_col`) < 0 THEN 1 ELSE 0 END), 2) = 1, -1, 1)
+    END AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int64_col`
+FROM `bfcte_1`
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col`,
+    `string_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    CASE
+      WHEN LOGICAL_OR(`int64_col` = 0) OVER (PARTITION BY `string_col`)
+      THEN 0
+      ELSE EXP(
+        SUM(CASE WHEN `int64_col` = 0 THEN 0 ELSE LN(ABS(`int64_col`)) END) OVER (PARTITION BY `string_col`)
+      ) * IF(
+        MOD(
+          SUM(CASE WHEN SIGN(`int64_col`) < 0 THEN 1 ELSE 0 END) OVER (PARTITION BY `string_col`),
+          2
+        ) = 1,
+        -1,
+        1
+      )
+    END AS `bfcol_2`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_2` AS `agg_int64`
+FROM `bfcte_1`
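In the windowed snapshot above, every row receives the product of its whole `string_col` partition. A pure-Python analogue of that window semantics, operating on hypothetical `(group, value)` pairs rather than the library's real code path:

```python
import math
from collections import defaultdict

def partitioned_product(rows):
    # First pass: per-partition log-sum of |x|, count of negatives, zero flag,
    # matching the three windowed aggregates in the generated SQL.
    stats = defaultdict(lambda: {"log_sum": 0.0, "neg": 0, "zero": False})
    for grp, x in rows:
        s = stats[grp]
        if x == 0:
            s["zero"] = True
        else:
            s["log_sum"] += math.log(abs(x))
            if x < 0:
                s["neg"] += 1
    # Second pass: emit one value per input row, like a window function.
    out = []
    for grp, _ in rows:
        s = stats[grp]
        if s["zero"]:
            out.append(0.0)
        else:
            sign = -1.0 if s["neg"] % 2 == 1 else 1.0
            out.append(sign * math.exp(s["log_sum"]))
    return out

print(partitioned_product([("a", 2), ("a", -3), ("b", 0), ("b", 5)]))
# approximately [-6.0, -6.0, 0.0, 0.0]
```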

tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py

Lines changed: 28 additions & 0 deletions
@@ -412,6 +412,15 @@ def test_min(scalar_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(sql_window_partition, "window_partition_out.sql")


+def test_nunique(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.NuniqueOp().as_expr(col_name)
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
+
+    snapshot.assert_match(sql, "out.sql")
+
+
 def test_pop_var(scalar_types_df: bpd.DataFrame, snapshot):
     col_names = ["int64_col", "bool_col"]
     bf_df = scalar_types_df[col_names]
@@ -434,6 +443,25 @@ def test_pop_var(scalar_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(sql_window, "window_out.sql")


+def test_product(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.ProductOp().as_expr(col_name)
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
+
+    snapshot.assert_match(sql, "out.sql")
+
+    bf_df_str = scalar_types_df[[col_name, "string_col"]]
+    window_partition = window_spec.WindowSpec(
+        grouping_keys=(expression.deref("string_col"),),
+    )
+    sql_window_partition = _apply_unary_window_op(
+        bf_df_str, agg_expr, window_partition, "agg_int64"
+    )
+
+    snapshot.assert_match(sql_window_partition, "window_partition_out.sql")
+
+
 def test_qcut(scalar_types_df: bpd.DataFrame, snapshot):
     if sys.version_info < (3, 12):
         pytest.skip(

0 commit comments
