feat: Add DataFrame/Series.squeeze

TrevorBergeron · TrevorBergeron · commit 44bc5ee1ae59 · 2025-11-12T22:15:37.000Z
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2033,6 +2033,22 @@ def nsmallest(
         column_ids = self._sql_names(columns)
         return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep))
 
+    def squeeze(self, axis: typing.Optional[typing.Union[int, str]] = None):
+        # Resolve `axis` before touching data so bad input never triggers extra work.
+        axis_n = None if axis is None else utils.get_axis_number(axis)
+        # None may collapse either axis. The column count comes from the labels;
+        # the row count (self.shape) is only requested if rows may be squeezed.
+        one_col = axis_n in (None, 1) and len(self.columns) == 1
+        one_row = axis_n in (None, 0) and self.shape[0] == 1
+        if one_col and one_row:
+            return self.to_pandas().iloc[0, 0]  # 1x1 frame -> local scalar
+        if one_col:
+            return bigframes.series.Series(self._block)
+        if one_row:
+            # Transpose the lone row so it can be wrapped as a Series.
+            return bigframes.series.Series(self._block.transpose(single_row_mode=True))
+        return self
+
     def insert(
         self,
         loc: int,
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -1133,6 +1133,11 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
             block_ops.nsmallest(self._block, n, [self._value_column], keep=keep)
         )
 
+    def squeeze(self, axis=None):
+        # `axis` is unused here: a Series has a single axis to squeeze.
+        has_single_value = len(self) == 1
+        return self.to_pandas().iloc[0] if has_single_value else self
+
     def isin(self, values) -> "Series":
         if isinstance(values, Series):
             return Series(self._block.isin(values._block))
diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py
@@ -1785,6 +1785,57 @@ def test_dataframe_sort_index_inplace(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    ("axis",),
+    ((0,), ("columns",), (None,)),
+)
+def test_dataframe_squeeze_noop(scalars_dfs, axis):
+    # Expect the frame back unchanged: the fixture is assumed to have no length-1 axis.
+    bf_df, pd_df = scalars_dfs
+    expected = pd_df.squeeze(axis=axis)
+    actual = bf_df.squeeze(axis=axis).to_pandas()
+
+    pandas.testing.assert_frame_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    ("axis",),
+    ((1,), (None,)),
+)
+def test_dataframe_squeeze_cols(scalars_dfs, axis):
+    # A one-column frame should squeeze down to that column as a Series.
+    bf_df, pd_df = scalars_dfs
+    expected = pd_df[["int64_col"]].squeeze(axis)
+    actual = bf_df[["int64_col"]].squeeze(axis).to_pandas()
+
+    pandas.testing.assert_series_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    ("axis",),
+    ((0,), (None,)),
+)
+def test_dataframe_squeeze_rows(scalars_dfs, axis):
+    bf_df, pd_df = scalars_dfs
+    # Squeezing the single row transposes it, so the selected columns must have
+    # compatible types.
+    cols = ["int64_col", "int64_too"]
+    expected = pd_df[cols].head(1).squeeze(axis)
+    actual = bf_df[cols].head(1).squeeze(axis).to_pandas()
+    pandas.testing.assert_series_equal(actual, expected, check_index_type=False)
+
+
+def test_dataframe_squeeze_both_axes(scalars_dfs):
+    # A 1x1 frame squeezed on every axis yields a bare scalar, compared directly.
+    bf_df, pd_df = scalars_dfs
+    one_cell_pd = pd_df[["int64_col"]].head(1)
+    one_cell_bf = bf_df[["int64_col"]].head(1)
+
+    expected = one_cell_pd.squeeze()
+    actual = one_cell_bf.squeeze()
+    assert expected == actual
+
+
 def test_df_abs(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     columns = ["int64_col", "int64_too", "float64_col"]
diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py
@@ -544,6 +544,25 @@ def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_inde
     assert pd_result == bf_result
 
 
+def test_series_squeeze_noop(scalars_dfs):
+    # The fixture column is assumed to hold several values, so nothing collapses.
+    bf_df, pd_df = scalars_dfs
+    expected = pd_df["int64_too"].squeeze()
+    actual = bf_df["int64_too"].squeeze().to_pandas()
+
+    assert_series_equal(actual, expected)
+
+
+def test_series_squeeze_squeezes(scalars_dfs):
+    bf_df, pd_df = scalars_dfs
+
+    # A single-element Series collapses to a plain scalar on both sides.
+    expected = pd_df["int64_too"].head(1).squeeze()
+    actual = bf_df["int64_too"].head(1).squeeze()
+
+    assert expected == actual
+
+
 def test_series_get_with_default_index(scalars_dfs):
     col_name = "float64_col"
     key = 2
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -82,6 +82,108 @@ def __iter__(self) -> Iterator:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def squeeze(self, axis=None):
+        """
+        Squeeze length-1 axes, reducing the object to a Series or a scalar.
+
+        Series or DataFrames with a single element are squeezed to a scalar.
+        DataFrames with a single column or a single row are squeezed to a
+        Series. Otherwise the object is unchanged.
+
+        This method is most useful when you don't know if your
+        object is a Series or DataFrame, but you do know it has just a single
+        column. In that case you can safely call `squeeze` to ensure you have a
+        Series.
+
+        **Examples:**
+            >>> primes = bpd.Series([2, 3, 5, 7])
+
+            Filtering might produce a Series with a single value:
+
+            >>> even_primes = primes[primes % 2 == 0]
+            >>> even_primes
+            0    2
+            dtype: Int64
+
+            >>> even_primes.squeeze()
+            np.int64(2)
+
+            Squeezing objects with more than one value in every axis does nothing:
+
+            >>> odd_primes = primes[primes % 2 == 1]
+            >>> odd_primes
+            1    3
+            2    5
+            3    7
+            dtype: Int64
+
+            >>> odd_primes.squeeze()
+            1    3
+            2    5
+            3    7
+            dtype: Int64
+
+            Squeezing is even more effective when used with DataFrames.
+
+            >>> df = bpd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
+            >>> df
+               a  b
+            0  1  2
+            1  3  4
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+            Selecting a single column with a list produces a DataFrame whose
+            columns axis has length one:
+
+            >>> df_a = df[['a']]
+            >>> df_a
+               a
+            0  1
+            1  3
+            <BLANKLINE>
+            [2 rows x 1 columns]
+
+            So the columns can be squeezed down, resulting in a Series:
+
+            >>> df_a.squeeze('columns')
+            0    1
+            1    3
+            Name: a, dtype: Int64
+
+            Selecting a single row and a single column produces a 1x1
+            DataFrame holding one scalar:
+
+            >>> df_0a = df.loc[[0], ['a']]
+            >>> df_0a
+               a
+            0  1
+            <BLANKLINE>
+            [1 rows x 1 columns]
+
+            Squeezing only the rows produces a one-element Series:
+
+            >>> df_0a.squeeze('rows')
+            a    1
+            Name: 0, dtype: Int64
+
+            Squeezing all axes will project directly into a scalar:
+
+            >>> df_0a.squeeze()
+            np.int64(1)
+
+        Args:
+            axis ({0 or 'index', 1 or 'columns', None}, default None):
+                A specific axis to squeeze. By default, all length-1 axes are
+                squeezed. For `Series` this parameter is unused and defaults to `None`.
+
+        Returns:
+            DataFrame, Series, or scalar:
+                The projection after squeezing `axis` or all the axes.
+
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # -------------------------------------------------------------------------
     # Unary Methods