From 6333c3b906d86b5bf2072012fa910ea05c766c40 Mon Sep 17 00:00:00 2001 From: U-S-jun Date: Thu, 28 Nov 2024 20:55:36 +0900 Subject: [PATCH 1/3] ENH: Add sort_columns parameter to combine_first --- pandas/core/frame.py | 27 ++++++++++++++++++- .../tests/frame/methods/test_combine_first.py | 8 ++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d1450537dd740..1132c6a355179 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8712,7 +8712,7 @@ def combine( frame_result = self._constructor(result, index=new_index, columns=new_columns) return frame_result.__finalize__(self, method="combine") - def combine_first(self, other: DataFrame) -> DataFrame: + def combine_first(self, other: DataFrame, sort_columns=True) -> DataFrame: """ Update null elements with value in the same location in `other`. @@ -8728,6 +8728,10 @@ def combine_first(self, other: DataFrame) -> DataFrame: ---------- other : DataFrame Provided DataFrame to use to fill null values. + sort_columns : bool, default True + Whether to sort the columns in the result DataFrame. If False, the + order of the columns in `self` is preserved. 
+ Returns ------- @@ -8741,6 +8745,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Examples -------- + Default behavior with `sort_columns=True` (default): + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2) @@ -8748,6 +8754,16 @@ def combine_first(self, other: DataFrame) -> DataFrame: 0 1.0 3.0 1 0.0 4.0 + + Preserving the column order of `self` with `sort_columns=False`: + + >>> df1 = pd.DataFrame({"B": [None, 4], "A": [0, None]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1.combine_first(df2, sort_columns=False) + B A + 0 3.0 0.0 + 1 4.0 1.0 + Null values still persist if the location of that null value does not exist in `other` @@ -8773,6 +8789,8 @@ def combiner(x: Series, y: Series): return y_values return expressions.where(mask, y_values, x_values) + + all_columns = self.columns.union(other.columns) if len(other) == 0: combined = self.reindex( @@ -8790,6 +8808,13 @@ def combiner(x: Series, y: Series): if dtypes: combined = combined.astype(dtypes) + + combined = combined.reindex(columns=all_columns, fill_value=None) + + if not sort_columns: + combined = combined[self.columns] + + return combined.__finalize__(self, method="combine_first") diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 87b7d5052a345..e60b2fbe524fc 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -560,3 +560,11 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + +def test_combine_first_column_order(): + df1 = pd.DataFrame({"B": [1, 2], "A": [3, 4]}) + df2 = pd.DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2,sort_columns=False) + expected = pd.DataFrame({"B": [1, 2], "A": [3, 4]}) + 
pd.testing.assert_frame_equal(result, expected) From 0b4ebc7fffe4854f96851776a998f47f3e654c6b Mon Sep 17 00:00:00 2001 From: U-S-jun Date: Thu, 28 Nov 2024 21:46:07 +0900 Subject: [PATCH 2/3] ENH: Add sort_columns parameter to combine_first --- doc/source/whatsnew/v3.0.0.rst | 4 ++++ pandas/core/frame.py | 18 ++++++++++-------- .../tests/frame/methods/test_combine_first.py | 11 ++++++----- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4bd31de185bb4..63062fbd44a63 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -13,6 +13,10 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +- Added a ``sort_columns`` parameter to :meth:`DataFrame.combine_first` to allow + control over whether the result's column order should follow the original + DataFrame's order or be sorted lexicographically. (:issue:`60427`) + .. _whatsnew_300.enhancements.enhancement1: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1132c6a355179..5f1df1dc92372 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8712,7 +8712,9 @@ def combine( frame_result = self._constructor(result, index=new_index, columns=new_columns) return frame_result.__finalize__(self, method="combine") - def combine_first(self, other: DataFrame, sort_columns=True) -> DataFrame: + def combine_first( + self, other: DataFrame, *, sort_columns: bool = True + ) -> DataFrame: """ Update null elements with value in the same location in `other`. 
@@ -8789,7 +8791,7 @@ def combiner(x: Series, y: Series): return y_values return expressions.where(mask, y_values, x_values) - + all_columns = self.columns.union(other.columns) if len(other) == 0: @@ -8808,13 +8810,11 @@ def combiner(x: Series, y: Series): if dtypes: combined = combined.astype(dtypes) - + combined = combined.reindex(columns=all_columns, fill_value=None) if not sort_columns: combined = combined[self.columns] - - return combined.__finalize__(self, method="combine_first") @@ -10543,9 +10543,11 @@ def _append( index = Index( [other.name], - name=self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name, + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), ) row_df = other.to_frame().T # infer_objects is needed for diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index e60b2fbe524fc..00f4393abb569 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -561,10 +561,11 @@ def test_combine_first_empty_columns(): expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + def test_combine_first_column_order(): - df1 = pd.DataFrame({"B": [1, 2], "A": [3, 4]}) - df2 = pd.DataFrame({"A": [5]}, index=[1]) + df1 = DataFrame({"B": [1, 2], "A": [3, 4]}) + df2 = DataFrame({"A": [5]}, index=[1]) - result = df1.combine_first(df2,sort_columns=False) - expected = pd.DataFrame({"B": [1, 2], "A": [3, 4]}) - pd.testing.assert_frame_equal(result, expected) + result = df1.combine_first(df2, sort_columns=False) + expected = DataFrame({"B": [1, 2], "A": [3, 4]}) + tm.assert_frame_equal(result, expected) From edc2e8d193145ebd2d2f20636bc26bb57dca1787 Mon Sep 17 00:00:00 2001 From: U-S-jun Date: Thu, 28 Nov 2024 23:23:02 +0900 Subject: [PATCH 3/3] ENH: Add sort_columns parameter to combine_first --- pandas/core/frame.py | 18 ++++++++---------- 1 file changed, 8 
insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f1df1dc92372..c07567d5d4786 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8721,10 +8721,10 @@ def combine_first( Combine two DataFrame objects by filling null values in one DataFrame with non-null values from other DataFrame. The row and column indexes of the resulting DataFrame will be the union of the two. The resulting - dataframe contains the 'first' dataframe values and overrides the - second one values where both first.loc[index, col] and - second.loc[index, col] are not missing values, upon calling - first.combine_first(second). + DataFrame contains the 'first' DataFrame values and overrides the + second one values where both `first.loc[index, col]` and + `second.loc[index, col]` are not missing values, upon calling + `first.combine_first(second)`. Parameters ---------- @@ -8734,7 +8734,6 @@ def combine_first( Whether to sort the columns in the result DataFrame. If False, the order of the columns in `self` is preserved. - Returns ------- DataFrame @@ -8752,27 +8751,26 @@ def combine_first( >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2) - A B + A B 0 1.0 3.0 1 0.0 4.0 - Preserving the column order of `self` with `sort_columns=False`: >>> df1 = pd.DataFrame({"B": [None, 4], "A": [0, None]}) >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2, sort_columns=False) - B A + B A 0 3.0 0.0 1 4.0 1.0 Null values still persist if the location of that null value - does not exist in `other` + does not exist in `other`. >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) - A B C + A B C 0 NaN 4.0 NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0