diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index c8632ebc8c..d2662da509 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -387,25 +387,42 @@ def reversed(self) -> Block:
             index_labels=self.index.names,
         )
 
-    def reset_index(self, drop: bool = True) -> Block:
+    def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
         """Reset the index of the block, promoting the old index to a value column.
 
         Arguments:
+            level: label(s) or position(s) of the index levels to remove; all if None.
+            drop: if True, discard the removed levels instead of adding them as columns.
             name: this is the column id for the new value id derived from the old index
 
         Returns:
             A new Block because dropping index columns can break references
             from Index classes that point to this block.
         """
+        if level is not None:
+            # Compare against None: level 0 is falsy but still names the first level.
+            resolved_ids = self.index.resolve_level(level)
+            # preserve original order, not user provided order
+            level_ids: Sequence[str] = [
+                col_id for col_id in self.index_columns if col_id in resolved_ids
+            ]
+        else:
+            level_ids = self.index_columns
+
         expr = self._expr
-        if (
+        if set(self.index_columns) > set(level_ids):
+            new_index_cols = [col for col in self.index_columns if col not in level_ids]
+            new_index_labels = [self.col_id_to_index_name[id] for id in new_index_cols]
+        elif (
             self.session._default_index_type
             == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64
         ):
             expr, new_index_col_id = expr.promote_offsets()
             new_index_cols = [new_index_col_id]
+            new_index_labels = [None]
         elif self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL:
             new_index_cols = []
+            new_index_labels = []
         else:
             raise ValueError(
                 f"Unrecognized default index kind: {self.session._default_index_type}"
@@ -415,22 +432,23 @@ def reset_index(self, drop: bool = True) -> Block:
             # Even though the index might be part of the ordering, keep that
             # ordering expression as reset_index shouldn't change the row
             # order.
-            expr = expr.drop_columns(self.index_columns)
+            expr = expr.drop_columns(level_ids)
             return Block(
                 expr,
                 index_columns=new_index_cols,
+                index_labels=new_index_labels,
                 column_labels=self.column_labels,
             )
         else:
             # Add index names to column index
-            index_labels = self.index.names
             column_labels_modified = self.column_labels
-            for level, label in enumerate(index_labels):
+            for position, level_id in enumerate(level_ids):
+                label = self.col_id_to_index_name[level_id]
                 if label is None:
-                    if "index" not in self.column_labels and len(index_labels) <= 1:
+                    if "index" not in self.column_labels and self.index.nlevels <= 1:
                         label = "index"
                     else:
-                        label = f"level_{level}"
+                        label = f"level_{self.index_columns.index(level_id)}"
 
                 if label in self.column_labels:
                     raise ValueError(f"cannot insert {label}, already exists")
@@ -439,11 +457,12 @@ def reset_index(self, drop: bool = True) -> Block:
                     label = tuple(label if i == 0 else "" for i in range(nlevels))
                 # Create index copy with label inserted
                 # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html
-                column_labels_modified = column_labels_modified.insert(level, label)
+                column_labels_modified = column_labels_modified.insert(position, label)
 
             return Block(
-                expr,
+                expr.select_columns((*new_index_cols, *level_ids, *self.value_columns)),
                 index_columns=new_index_cols,
+                index_labels=new_index_labels,
                 column_labels=column_labels_modified,
             )
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f9113e11c0..bcad00830d 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2315,9 +2315,39 @@ def _assign_series_join_on_index(
 
         return DataFrame(block.with_index_labels(self._block.index.names))
 
-    def reset_index(self, *, drop: bool = False) -> DataFrame:
-        block = self._block.reset_index(drop)
-        return DataFrame(block)
+    @overload  # type: ignore[override]
+    def reset_index(
+        self,
+        level: blocks.LevelsType = ...,
+        drop: bool = ...,
+        inplace: Literal[False] = ...,
+    ) -> DataFrame:
+        ...
+
+    @overload
+    def reset_index(
+        self,
+        level: blocks.LevelsType = ...,
+        drop: bool = ...,
+        inplace: Literal[True] = ...,
+    ) -> None:
+        ...
+
+    @overload
+    def reset_index(
+        self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ...
+    ) -> Optional[DataFrame]:
+        ...
+
+    def reset_index(
+        self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False
+    ) -> Optional[DataFrame]:
+        block = self._block.reset_index(level, drop)
+        if inplace:
+            self._set_block(block)
+            return None
+        else:
+            return DataFrame(block)
 
     def set_index(
         self,
diff --git a/bigframes/series.py b/bigframes/series.py
index bfc26adc38..321a023e0c 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -406,17 +406,59 @@ def equals(
             return False
         return block_ops.equals(self._block, other._block)
 
+    @overload  # type: ignore[override]
+    def reset_index(
+        self,
+        level: blocks.LevelsType = ...,
+        *,
+        name: typing.Optional[str] = ...,
+        drop: Literal[False] = ...,
+        inplace: Literal[False] = ...,
+    ) -> bigframes.dataframe.DataFrame:
+        ...
+
+    @overload
+    def reset_index(
+        self,
+        level: blocks.LevelsType = ...,
+        *,
+        name: typing.Optional[str] = ...,
+        drop: Literal[True] = ...,
+        inplace: Literal[False] = ...,
+    ) -> Series:
+        ...
+
+    @overload
+    def reset_index(
+        self,
+        level: blocks.LevelsType = ...,
+        *,
+        name: typing.Optional[str] = ...,
+        drop: bool = ...,
+        inplace: Literal[True] = ...,
+    ) -> None:
+        ...
+
     @validations.requires_ordering()
     def reset_index(
         self,
+        level: blocks.LevelsType = None,
         *,
         name: typing.Optional[str] = None,
         drop: bool = False,
-    ) -> bigframes.dataframe.DataFrame | Series:
-        block = self._block.reset_index(drop)
+        inplace: bool = False,
+    ) -> bigframes.dataframe.DataFrame | Series | None:
+        block = self._block.reset_index(level, drop)
         if drop:
+            if inplace:
+                self._set_block(block)
+                return None
             return Series(block)
         else:
+            if inplace:
+                raise ValueError(
+                    "Series.reset_index cannot combine inplace=True and drop=False"
+                )
             if name:
                 block = block.assign_label(self._value_column, name)
             return bigframes.dataframe.DataFrame(block)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 50989ae150..3b70dec0e9 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -2070,6 +2070,26 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    ("drop",),
+    ((True,), (False,)),
+)
+def test_reset_index_inplace(scalars_df_index, scalars_pandas_df_index, drop):
+    df = scalars_df_index.copy()
+    df.reset_index(drop=drop, inplace=True)
+    assert df.index.name is None
+
+    bf_result = df.to_pandas()
+    pd_result = scalars_pandas_df_index.copy()
+    pd_result.reset_index(drop=drop, inplace=True)
+
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+
+    # reset_index should maintain the original ordering.
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_reset_index_then_filter(
     scalars_df_index,
     scalars_pandas_df_index,
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 13b5b1886f..0c23ea97ae 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -101,20 +101,71 @@ def test_set_multi_index(scalars_df_index, scalars_pandas_df_index):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
-def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+    ("level", "drop"),
+    [
+        (None, True),
+        (None, False),
+        (0, True),
+        (1, True),
+        ("bool_col", True),
+        (["float64_col", "int64_too"], True),
+        ([2, 0], False),
+    ],
+)
+def test_df_reset_multi_index(scalars_df_index, scalars_pandas_df_index, level, drop):
     bf_result = (
-        scalars_df_index.set_index(["bool_col", "int64_too"]).reset_index().to_pandas()
+        scalars_df_index.set_index(["bool_col", "int64_too", "float64_col"])
+        .reset_index(level=level, drop=drop)
+        .to_pandas()
     )
     pd_result = scalars_pandas_df_index.set_index(
-        ["bool_col", "int64_too"]
-    ).reset_index()
+        ["bool_col", "int64_too", "float64_col"]
+    ).reset_index(level=level, drop=drop)
 
     # Pandas uses int64 instead of Int64 (nullable) dtype.
-    pd_result.index = pd_result.index.astype(pandas.Int64Dtype())
+    if pd_result.index.dtype != bf_result.index.dtype:
+        pd_result.index = pd_result.index.astype(pandas.Int64Dtype())
 
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    ("level", "drop"),
+    [
+        (None, True),
+        (None, False),
+        (0, True),
+        (1, True),
+        ("bool_col", True),
+        (["float64_col", "int64_too"], True),
+        ([2, 0], False),
+    ],
+)
+def test_series_reset_multi_index(
+    scalars_df_index, scalars_pandas_df_index, level, drop
+):
+    bf_result = (
+        scalars_df_index.set_index(["bool_col", "int64_too", "float64_col"])[
+            "string_col"
+        ]
+        .reset_index(level=level, drop=drop)
+        .to_pandas()
+    )
+    pd_result = scalars_pandas_df_index.set_index(
+        ["bool_col", "int64_too", "float64_col"]
+    )["string_col"].reset_index(level=level, drop=drop)
+
+    # Pandas uses int64 instead of Int64 (nullable) dtype.
+    if pd_result.index.dtype != bf_result.index.dtype:
+        pd_result.index = pd_result.index.astype(pandas.Int64Dtype())
+
+    if drop:
+        pandas.testing.assert_series_equal(bf_result, pd_result)
+    else:
+        pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index):
     bf_result = scalars_df_index.set_index(["bool_col", "int64_too"])[
         "float64_col"
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index e94250e98f..aa9afa6032 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -1339,6 +1339,18 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index):
     pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
 
 
+def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"]
+    bf_result.reset_index(drop=True, inplace=True)
+    pd_result = scalars_pandas_df_index.sort_index(ascending=False)["float64_col"]
+    pd_result.reset_index(drop=True, inplace=True)
+
+    # BigQuery DataFrames default indices use nullable Int64 always
+    pd_result.index = pd_result.index.astype("Int64")
+
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
+
+
 @pytest.mark.parametrize(
     ("name",),
     [
diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py
index 2070b25d66..a6f5c3d1ef 100644
--- a/tests/unit/test_dataframe_polars.py
+++ b/tests/unit/test_dataframe_polars.py
@@ -1657,13 +1657,11 @@ def test_reset_index_with_unnamed_index(
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
-def test_reset_index_with_unnamed_multiindex(
-    scalars_df_index,
-    scalars_pandas_df_index,
-):
+def test_reset_index_with_unnamed_multiindex(session):
     bf_df = dataframe.DataFrame(
         ([1, 2, 3], [2, 5, 7]),
         index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]),
+        session=session,
     )
     pd_df = pd.DataFrame(
         ([1, 2, 3], [2, 5, 7]),
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index d3b8b51b65..00984935a4 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1601,8 +1601,10 @@ def droplevel(self, level, axis: str | int = 0):
 
     def reset_index(
         self,
+        level=None,
         *,
         drop: bool = False,
+        inplace: bool = False,
     ) -> DataFrame | None:
         """Reset the index.
 
@@ -1696,9 +1698,14 @@ class name speed max
 
 
         Args:
+            level (int, str, tuple, or list, default None):
+                Only remove the given levels from the index. Removes all levels by
+                default.
             drop (bool, default False):
                 Do not try to insert index into dataframe columns. This resets
                 the index to the default integer index.
+            inplace (bool, default False):
+                Whether to modify the DataFrame rather than creating a new one.
 
         Returns:
             bigframes.pandas.DataFrame: DataFrame with the new index.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 0160a7eb50..7b420cf6e3 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -321,9 +321,11 @@ def transpose(self) -> Series:
 
     def reset_index(
         self,
+        level=None,
         *,
         drop: bool = False,
         name=pd_ext.no_default,
+        inplace: bool = False,
     ) -> DataFrame | Series | None:
         """
         Generate a new DataFrame or Series with the index reset.
@@ -399,6 +401,9 @@ def reset_index(
             [4 rows x 3 columns]
 
         Args:
+            level (int, str, tuple, or list, default None):
+                For a Series with a MultiIndex, only remove the specified levels
+                from the index. Removes all levels by default.
             drop (bool, default False):
                 Just reset the index, without inserting it as a column in
                 the new DataFrame.
@@ -406,6 +411,8 @@ def reset_index(
                 The name to use for the column containing the original Series
                 values. Uses ``self.name`` by default. This argument is ignored
                 when `drop` is True.
+            inplace (bool, default False):
+                Modify the Series in place (do not create a new object).
 
         Returns:
             bigframes.pandas.Series or bigframes.pandas.DataFrame or None: