@@ -2499,25 +2499,17 @@ def to_gbq(
         if_exists: Optional[Literal["fail", "replace", "append"]] = None,
         index: bool = True,
         ordering_id: Optional[str] = None,
+        clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]] = (),
     ) -> str:
         dispositions = {
             "fail": bigquery.WriteDisposition.WRITE_EMPTY,
             "replace": bigquery.WriteDisposition.WRITE_TRUNCATE,
             "append": bigquery.WriteDisposition.WRITE_APPEND,
         }

-        if destination_table is None:
-            # TODO(swast): If there have been no modifications to the DataFrame
-            # since the last time it was written (cached), then return that.
-            # For `read_gbq` nodes, return the underlying table clone.
-            destination_table = bigframes.session._io.bigquery.create_temp_table(
-                self._session.bqclient,
-                self._session._anonymous_dataset,
-                # TODO(swast): allow custom expiration times, probably via session configuration.
-                datetime.datetime.now(datetime.timezone.utc)
-                + constants.DEFAULT_EXPIRATION,
-            )
+        temp_table_ref = None

+        if destination_table is None:
             if if_exists is not None and if_exists != "replace":
                 raise ValueError(
                     f"Got invalid value {repr(if_exists)} for if_exists. "
@@ -2526,6 +2518,11 @@ def to_gbq(
                 )
             if_exists = "replace"

+            temp_table_ref = bigframes.session._io.bigquery.random_table(
+                self._session._anonymous_dataset
+            )
+            destination_table = f"{temp_table_ref.project}.{temp_table_ref.dataset_id}.{temp_table_ref.table_id}"
+
         table_parts = destination_table.split(".")
         default_project = self._block.expr.session.bqclient.project

@@ -2553,15 +2550,29 @@ def to_gbq(
         except google.api_core.exceptions.NotFound:
             self._session.bqclient.create_dataset(destination_dataset, exists_ok=True)

+        clustering_fields = self._map_clustering_columns(
+            clustering_columns, index=index
+        )
+
         job_config = bigquery.QueryJobConfig(
             write_disposition=dispositions[if_exists],
             destination=bigquery.table.TableReference.from_string(
                 destination_table,
                 default_project=default_project,
             ),
+            clustering_fields=clustering_fields if clustering_fields else None,
         )

         self._run_io_query(index=index, ordering_id=ordering_id, job_config=job_config)
+
+        if temp_table_ref:
+            bigframes.session._io.bigquery.set_table_expiration(
+                self._session.bqclient,
+                temp_table_ref,
+                datetime.datetime.now(datetime.timezone.utc)
+                + constants.DEFAULT_EXPIRATION,
+            )
+
         return destination_table

     def to_numpy(
@@ -2756,6 +2767,52 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
         block = self._block.multi_apply_unary_op(self._block.value_columns, operation)
         return DataFrame(block)

+    def _map_clustering_columns(
+        self,
+        clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]],
+        index: bool,
+    ) -> List[str]:
+        """Maps the provided clustering columns to the existing columns in the DataFrame."""
+
+        def map_columns_on_occurrence(columns):
+            mapped_columns = []
+            for col in clustering_columns:
+                if col in columns:
+                    count = columns.count(col)
+                    mapped_columns.extend([col] * count)
+            return mapped_columns
+
+        if not clustering_columns:
+            return []
+
+        if len(list(clustering_columns)) != len(set(clustering_columns)):
+            raise ValueError("Duplicates are not supported in clustering_columns")
+
+        all_possible_columns = (
+            (set(self.columns) | set(self.index.names)) if index else set(self.columns)
+        )
+        missing_columns = set(clustering_columns) - all_possible_columns
+        if missing_columns:
+            raise ValueError(
+                f"Clustering columns not found in DataFrame: {missing_columns}"
+            )
+
+        clustering_columns_for_df = map_columns_on_occurrence(
+            list(self._block.column_labels)
+        )
+        clustering_columns_for_index = (
+            map_columns_on_occurrence(list(self.index.names)) if index else []
+        )
+
+        (
+            clustering_columns_for_df,
+            clustering_columns_for_index,
+        ) = utils.get_standardized_ids(
+            clustering_columns_for_df, clustering_columns_for_index
+        )
+
+        return clustering_columns_for_index + clustering_columns_for_df
+
     def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
         """Create query text representing this dataframe for I/O."""
         array_value = self._block.expr
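
A minimal usage sketch of the clustering_columns parameter added above; the project, dataset, table names, and column labels ("id", "region") are hypothetical. With index=True, index labels may be passed as well, and _map_clustering_columns orders index columns ahead of value columns in the resulting clustering_fields; duplicate labels, or labels missing from the frame, raise ValueError:

    import bigframes.pandas as bpd

    # Hypothetical source table and labels.
    df = bpd.read_gbq("my-project.my_dataset.source_table").set_index("id")

    # Write to a clustered destination table, clustering first on the
    # index column "id", then on the value column "region".
    df.to_gbq(
        "my-project.my_dataset.clustered_table",  # hypothetical destination
        if_exists="replace",
        index=True,  # index labels are only clusterable when index=True
        clustering_columns=["id", "region"],
    )

Note the related behavior change when no destination table is given: to_gbq now writes to a random_table reference in the session's anonymous dataset and sets the expiration via set_table_expiration only after the query runs, rather than pre-creating an expiring temp table with create_temp_table.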