From d3850472ae1650f37a4054beab7b1853ae47ce6c Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 21:05:22 -0800 Subject: [PATCH 01/10] ENH: Support 'left_anti' and 'right_anti' joins in pd.merge --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_typing.py | 4 +- pandas/core/frame.py | 14 +- pandas/core/reshape/merge.py | 69 ++++- pandas/tests/frame/methods/test_join.py | 10 + .../reshape/merge/test_merge_antijoin.py | 262 ++++++++++++++++++ 6 files changed, 355 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_antijoin.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1e33971acac1a..43f65421843a5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) +- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). 
diff --git a/pandas/_typing.py b/pandas/_typing.py index b515305fb6903..4365ee85f72e3 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -442,7 +442,9 @@ def closed(self) -> bool: AnyAll = Literal["any", "all"] # merge -MergeHow = Literal["left", "right", "inner", "outer", "cross"] +MergeHow = Literal[ + "left", "right", "inner", "outer", "cross", "left_anti", "right_anti" +] MergeValidate = Literal[ "one_to_one", "1:1", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ffffaeba4196e..52f7775a49a9c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -315,7 +315,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -328,6 +329,10 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -10600,7 +10605,8 @@ def join( values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'left' How to handle the operation of the two objects. 
* left: use calling frame's index (or column if on is specified) @@ -10612,6 +10618,10 @@ def join( of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use set difference of calling frame's index and `other`'s + index. + * right_anti: use set difference of `other`'s index and calling frame's + index. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5fddd9f9aca5b..1c8bf7a0e866b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -180,7 +180,8 @@ def merge( First pandas object to merge. right : DataFrame or named Series Second pandas object to merge. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -193,6 +194,10 @@ def merge( join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. 
If `on` is None and not merging on indexes then this defaults @@ -969,6 +974,7 @@ def __init__( self.left = self.orig_left = _left self.right = self.orig_right = _right self.how = how + self.anti_join = False self.on = com.maybe_make_list(on) @@ -999,12 +1005,24 @@ def __init__( raise MergeError(msg) # GH 59435: raise when "how" is not a valid Merge type - merge_type = {"left", "right", "inner", "outer", "cross", "asof"} + merge_type = { + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + "asof", + } if how not in merge_type: raise ValueError( f"'{how}' is not a valid Merge type: " f"left, right, inner, outer, cross, asof" ) + if self.how in {"left_anti", "right_anti"}: + self.how = self.how.split("_")[0] + self.anti_join = True self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) @@ -1405,6 +1423,11 @@ def _get_join_info( n = len(left_ax) if left_indexer is None else len(left_indexer) join_index = default_index(n) + if self.anti_join: + join_index, left_indexer, right_indexer = self._handle_anti_join( + join_index, left_indexer, right_indexer + ) + return join_index, left_indexer, right_indexer @final @@ -1447,6 +1470,48 @@ def _create_join_index( return index.copy() return index.take(indexer) + @final + def _handle_anti_join( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + Handle anti join by returning the correct join index and indexers + + Parameters + ---------- + join_index : Index + join index + left_indexer : np.ndarray[np.intp] or None + left indexer + right_indexer : np.ndarray[np.intp] or None + right indexer + + Returns + ------- + Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None + """ + # Make sure indexers are not None + if left_indexer is None: + left_indexer = np.arange(len(self.left)) + if right_indexer is None: + 
right_indexer = np.arange(len(self.right)) + + assert self.how in {"left", "right"} + if self.how == "left": + # Filter to rows where left keys are not in right keys + filt = right_indexer == -1 + else: + # Filter to rows where right keys are not in left keys + filt = left_indexer == -1 + join_index = join_index[filt] + left_indexer = left_indexer[filt] + right_indexer = right_indexer[filt] + + return join_index, left_indexer, right_indexer + @final def _get_merge_keys( self, diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 479ea7d7ba692..7a80487cb15ee 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -277,6 +277,16 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) + # left anti + joined = f.join(f2, how="left_anti") + tm.assert_index_equal(joined.index, float_frame.index[:5]) + tm.assert_index_equal(joined.columns, expected_columns) + + # right anti + joined = f.join(f2, how="right_anti") + tm.assert_index_equal(joined.index, float_frame.index[10:][::-1]) + tm.assert_index_equal(joined.columns, expected_columns) + join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof" with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py new file mode 100644 index 0000000000000..5376cb415901d --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -0,0 +1,262 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm +from pandas.core.reshape.merge import merge + + +class TestMergeAntiJoin: + def test_merge_antijoin(self): + # GH#42916 + left = 
DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"]) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"]) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_on_different_columns(self): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [3.0], + "B": ["c"], + "C": [np.nan], + "D": np.array([np.nan], dtype=object), + }, + index=[2], + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan], + "B": np.array([np.nan], dtype=object), + "C": [2.0], + "D": ["d"], + }, + index=[1], + ) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_nonunique_keys(self): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [1.0], + "B": ["a"], + "C": [np.nan], + "D": np.array([np.nan], dtype=object), + }, + index=[0], + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan, np.nan], + "B": np.array([np.nan, np.nan], dtype=object), + "C": [2.0, 4.0], + "D": ["d", "d"], + }, + index=[2, 3], + ) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_same_df(self): + left = DataFrame({"A": [1, 2, 3]}, index=["a", 
"b", "c"], dtype=np.int64) + result = merge(left, left, how="left_anti", left_index=True, right_index=True) + expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_merge_antijoin_nans(self): + left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}) + right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}) + result = merge(left, right, how="left_anti", on="A") + expected = DataFrame( + {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=object)} + ) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_on_datetime64tz(self): + # GH11405 + left = DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1.0, 2.0], + } + ) + right = DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1.0, 2.0, 3.0], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=1, tz="US/Eastern"), + "value_x": [1.0], + "value_y": [np.nan], + }, + index=[0], + ) + result = merge(left, right, on="key", how="left_anti") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "key": pd.date_range("20151012", periods=2, tz="US/Eastern"), + "value_x": [np.nan, np.nan], + "value_y": [2.0, 3.0], + }, + index=[1, 2], + ) + result = merge(left, right, on="key", how="right_anti") + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_multiindex(self): + left = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"] + ), + ) + right = DataFrame( + { + "C": [7, 8, 9], + "D": [10, 11, 12], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"] + ), + ) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [3], + "B": [6], + "C": [np.nan], + 
"D": [np.nan], + }, + index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [9], + "D": [12], + }, + index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_merge_antijoin_extension_dtype(self, dtype): + left = DataFrame( + { + "join_col": [1, 3, 5], + "left_val": [1, 2, 3], + } + ) + right = DataFrame( + { + "join_col": [2, 3, 4], + "right_val": [1, 2, 3], + } + ) + left = left.astype({"join_col": dtype}) + right = right.astype({"join_col": dtype}) + result = merge(left, right, how="left_anti", on="join_col") + expected = DataFrame( + { + "join_col": [1, 5], + "left_val": [1, 3], + "right_val": [np.nan, np.nan], + }, + index=[0, 2], + ) + expected = expected.astype({"join_col": dtype}) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_empty_dataframe(self): + left = DataFrame({"A": [], "B": []}) + right = DataFrame({"C": [], "D": []}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="C") + expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="C") + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_no_common_elements(self): + left = DataFrame({"A": [1, 2, 3]}) + right = DataFrame({"B": [4, 5, 6]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, 
np.nan]}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_with_null_values(self): + left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) + right = DataFrame({"B": [2.0, None, 5.0]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) + tm.assert_frame_equal(result, expected) + + def test_merge_antijoin_with_mixed_dtypes(self): + left = DataFrame({"A": [1, "2", 3.0]}) + right = DataFrame({"B": ["2", 3.0, 4]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) + tm.assert_frame_equal(result, expected) From 3290518ba0d4360237eee36699a88e1125de50a9 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 21:38:02 -0800 Subject: [PATCH 02/10] Fix mypy errors --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1c8bf7a0e866b..9faad50e3fe4c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -958,7 +958,7 @@ def __init__( self, left: DataFrame | Series, right: DataFrame | Series, - how: JoinHow | Literal["asof"] = "inner", + how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner", on: IndexLabel | AnyArrayLike | None = None, left_on: IndexLabel 
| AnyArrayLike | None = None, right_on: IndexLabel | AnyArrayLike | None = None, @@ -1021,7 +1021,7 @@ def __init__( f"left, right, inner, outer, cross, asof" ) if self.how in {"left_anti", "right_anti"}: - self.how = self.how.split("_")[0] + self.how = self.how.split("_")[0] # type: ignore[assignment] self.anti_join = True self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) From 1723d702610a81c96855890cd30ac2a268ad6821 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 21:55:14 -0800 Subject: [PATCH 03/10] Fix another mypy error --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9faad50e3fe4c..d578ce9faf1fa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -938,7 +938,7 @@ class _MergeOperation: """ _merge_type = "merge" - how: JoinHow | Literal["asof"] + how: JoinHow | Literal["left_anti", "right_anti", "asof"] on: IndexLabel | None # left_on/right_on may be None when passed, but in validate_specification # get replaced with non-None. From aa83319c20ee91bce063800d2f8cb99b7425730d Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 22:30:05 -0800 Subject: [PATCH 04/10] Restructure a bit --- pandas/core/reshape/merge.py | 54 ++++++++++++++---------- pandas/tests/reshape/merge/test_merge.py | 5 ++- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d578ce9faf1fa..ea0d5836b16ac 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -938,7 +938,7 @@ class _MergeOperation: """ _merge_type = "merge" - how: JoinHow | Literal["left_anti", "right_anti", "asof"] + how: JoinHow | Literal["asof"] on: IndexLabel | None # left_on/right_on may be None when passed, but in validate_specification # get replaced with non-None. 
@@ -973,8 +973,7 @@ def __init__( _right = _validate_operand(right) self.left = self.orig_left = _left self.right = self.orig_right = _right - self.how = how - self.anti_join = False + self.how, self.anti_join = self._validate_how(how) self.on = com.maybe_make_list(on) @@ -1004,26 +1003,6 @@ def __init__( ) raise MergeError(msg) - # GH 59435: raise when "how" is not a valid Merge type - merge_type = { - "left", - "right", - "inner", - "outer", - "left_anti", - "right_anti", - "cross", - "asof", - } - if how not in merge_type: - raise ValueError( - f"'{how}' is not a valid Merge type: " - f"left, right, inner, outer, cross, asof" - ) - if self.how in {"left_anti", "right_anti"}: - self.how = self.how.split("_")[0] # type: ignore[assignment] - self.anti_join = True - self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) ( @@ -1053,6 +1032,35 @@ def __init__( if validate is not None: self._validate_validate_kwd(validate) + def _validate_how( + self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] + ) -> tuple[JoinHow, bool]: + """ + Validate the 'how' parameter and return the actual join type and whether + this is an anti join. 
+ """ + # GH 59435: raise when "how" is not a valid Merge type + merge_type = { + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + "asof", + } + if how not in merge_type: + raise ValueError( + f"'{how}' is not a valid Merge type: " + f"left, right, inner, outer, left_anti, right_anti, cross, asof" + ) + anti_join = False + if how in {"left_anti", "right_anti"}: + how = how.split("_")[0] # type: ignore[assignment] + anti_join = True + return how, anti_join + def _maybe_require_matching_dtypes( self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] ) -> None: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0abc1afc6ab0..f0f67aebd85ec 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1464,7 +1464,10 @@ def test_merge_how_validation(self): data2 = DataFrame( np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof" + msg = ( + "'full' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) with pytest.raises(ValueError, match=re.escape(msg)): data1.merge(data2, how="full") From 08c83e9987f9faf6209990fae11cf5e3da5406ad Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 22:56:15 -0800 Subject: [PATCH 05/10] Fix mypy typing error --- pandas/core/reshape/merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ea0d5836b16ac..7d14bab274185 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1034,7 +1034,7 @@ def __init__( def _validate_how( self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] - ) -> tuple[JoinHow, bool]: + ) -> tuple[JoinHow | Literal["asof"], bool]: """ Validate the 'how' parameter and return the actual join type and 
whether this is an anti join. @@ -1059,6 +1059,7 @@ def _validate_how( if how in {"left_anti", "right_anti"}: how = how.split("_")[0] # type: ignore[assignment] anti_join = True + how = cast(JoinHow | Literal["asof"], how) return how, anti_join def _maybe_require_matching_dtypes( From b0c435e3020dd2e981451f68f3a0a93d43a5b061 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 18 Jan 2025 23:19:37 -0800 Subject: [PATCH 06/10] Fix test --- pandas/tests/frame/methods/test_join.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 7a80487cb15ee..aaa9485cab580 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -287,7 +287,10 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index[10:][::-1]) tm.assert_index_equal(joined.columns, expected_columns) - join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof" + join_msg = ( + "'foo' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") From d6101028ff9258317d4d58cd39426069d47bf43b Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 19 Jan 2025 00:19:13 -0800 Subject: [PATCH 07/10] Fix arrow string test --- pandas/core/reshape/merge.py | 1 + pandas/tests/reshape/merge/test_merge_antijoin.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7d14bab274185..13e728d9de93c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1032,6 +1032,7 @@ def __init__( if validate is not None: self._validate_validate_kwd(validate) + @final def _validate_how( self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] ) -> tuple[JoinHow | Literal["asof"], bool]: diff 
--git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py index 5376cb415901d..1fac8d56fe199 100644 --- a/pandas/tests/reshape/merge/test_merge_antijoin.py +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -36,7 +36,7 @@ def test_merge_antijoin_on_different_columns(self): "A": [3.0], "B": ["c"], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": np.array([np.nan], dtype=right.D.dtype), }, index=[2], ) @@ -46,7 +46,7 @@ def test_merge_antijoin_on_different_columns(self): expected = DataFrame( { "A": [np.nan], - "B": np.array([np.nan], dtype=object), + "B": np.array([np.nan], dtype=left.B.dtype), "C": [2.0], "D": ["d"], }, @@ -64,7 +64,7 @@ def test_merge_antijoin_nonunique_keys(self): "A": [1.0], "B": ["a"], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": np.array([np.nan], dtype=right.D.dtype), }, index=[0], ) @@ -74,7 +74,7 @@ def test_merge_antijoin_nonunique_keys(self): expected = DataFrame( { "A": [np.nan, np.nan], - "B": np.array([np.nan, np.nan], dtype=object), + "B": np.array([np.nan, np.nan], dtype=left.B.dtype), "C": [2.0, 4.0], "D": ["d", "d"], }, @@ -93,7 +93,7 @@ def test_merge_antijoin_nans(self): right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}) result = merge(left, right, how="left_anti", on="A") expected = DataFrame( - {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=object)} + {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=right.D.dtype)} ) tm.assert_frame_equal(result, expected) From 803b16c9daf5001a7d528c10fac9ea2747225ecb Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 19 Jan 2025 01:20:18 -0800 Subject: [PATCH 08/10] Fix future string test --- .../reshape/merge/test_merge_antijoin.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py index 1fac8d56fe199..c300cdf04929c 100644 --- 
a/pandas/tests/reshape/merge/test_merge_antijoin.py +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -27,8 +27,12 @@ def test_merge_antijoin(self): tm.assert_frame_equal(result, expected) def test_merge_antijoin_on_different_columns(self): - left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}) - right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}) + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype( + {"B": object} + ) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype( + {"D": object} + ) result = merge(left, right, how="left_anti", left_on="B", right_on="D") expected = DataFrame( @@ -36,7 +40,7 @@ def test_merge_antijoin_on_different_columns(self): "A": [3.0], "B": ["c"], "C": [np.nan], - "D": np.array([np.nan], dtype=right.D.dtype), + "D": np.array([np.nan], dtype=object), }, index=[2], ) @@ -46,7 +50,7 @@ def test_merge_antijoin_on_different_columns(self): expected = DataFrame( { "A": [np.nan], - "B": np.array([np.nan], dtype=left.B.dtype), + "B": np.array([np.nan], dtype=object), "C": [2.0], "D": ["d"], }, @@ -55,8 +59,12 @@ def test_merge_antijoin_on_different_columns(self): tm.assert_frame_equal(result, expected) def test_merge_antijoin_nonunique_keys(self): - left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}) - right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}) + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype( + {"B": object} + ) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype( + {"D": object} + ) result = merge(left, right, how="left_anti", left_on="B", right_on="D") expected = DataFrame( @@ -64,7 +72,7 @@ def test_merge_antijoin_nonunique_keys(self): "A": [1.0], "B": ["a"], "C": [np.nan], - "D": np.array([np.nan], dtype=right.D.dtype), + "D": np.array([np.nan], dtype=object), }, index=[0], ) @@ -74,7 +82,7 @@ def test_merge_antijoin_nonunique_keys(self): expected = DataFrame( { "A": [np.nan, np.nan], - "B": 
np.array([np.nan, np.nan], dtype=left.B.dtype), + "B": np.array([np.nan, np.nan], dtype=object), "C": [2.0, 4.0], "D": ["d", "d"], }, @@ -89,11 +97,15 @@ def test_merge_antijoin_same_df(self): tm.assert_frame_equal(result, expected, check_index_type=False) def test_merge_antijoin_nans(self): - left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}) - right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}) + left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype( + {"C": object} + ) + right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype( + {"D": object} + ) result = merge(left, right, how="left_anti", on="A") expected = DataFrame( - {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=right.D.dtype)} + {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=object)} ) tm.assert_frame_equal(result, expected) From d40b7139ec879b47b834a6fd30397e3fa37009b1 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 19 Jan 2025 13:36:49 -0800 Subject: [PATCH 09/10] Retry fix --- .../reshape/merge/test_merge_antijoin.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py index c300cdf04929c..0fe6e1f4984ca 100644 --- a/pandas/tests/reshape/merge/test_merge_antijoin.py +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -40,22 +40,22 @@ def test_merge_antijoin_on_different_columns(self): "A": [3.0], "B": ["c"], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": [np.nan], }, index=[2], - ) + ).astype({"B": object, "D": object}) tm.assert_frame_equal(result, expected) result = merge(left, right, how="right_anti", left_on="B", right_on="D") expected = DataFrame( { "A": [np.nan], - "B": np.array([np.nan], dtype=object), + "B": [np.nan], "C": [2.0], "D": ["d"], }, index=[1], - ) + ).astype({"B": object, "D": object}) tm.assert_frame_equal(result, expected) def 
test_merge_antijoin_nonunique_keys(self): @@ -72,22 +72,22 @@ def test_merge_antijoin_nonunique_keys(self): "A": [1.0], "B": ["a"], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": [np.nan], }, index=[0], - ) + ).astype({"B": object, "D": object}) tm.assert_frame_equal(result, expected) result = merge(left, right, how="right_anti", left_on="B", right_on="D") expected = DataFrame( { "A": [np.nan, np.nan], - "B": np.array([np.nan, np.nan], dtype=object), + "B": [np.nan, np.nan], "C": [2.0, 4.0], "D": ["d", "d"], }, index=[2, 3], - ) + ).astype({"B": object, "D": object}) tm.assert_frame_equal(result, expected) def test_merge_antijoin_same_df(self): @@ -104,8 +104,8 @@ def test_merge_antijoin_nans(self): {"D": object} ) result = merge(left, right, how="left_anti", on="A") - expected = DataFrame( - {"A": [1.0], "C": ["a"], "D": np.array([np.nan], dtype=object)} + expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype( + {"C": object, "D": object} ) tm.assert_frame_equal(result, expected) From 3b783fc0fd79f24e5f5d9813642a66e5137e75ae Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 21 Jan 2025 17:09:39 -0800 Subject: [PATCH 10/10] Address review comment --- .../reshape/merge/test_merge_antijoin.py | 524 +++++++++--------- 1 file changed, 265 insertions(+), 259 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py index 0fe6e1f4984ca..006622c6e5e94 100644 --- a/pandas/tests/reshape/merge/test_merge_antijoin.py +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -12,263 +12,269 @@ from pandas.core.reshape.merge import merge -class TestMergeAntiJoin: - def test_merge_antijoin(self): - # GH#42916 - left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) - right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"]) - - result = merge(left, right, how="left_anti", left_index=True, right_index=True) - expected = DataFrame({"A": [3], "B": [np.nan]}, 
index=["c"]) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_index=True, right_index=True) - expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"]) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_on_different_columns(self): - left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype( - {"B": object} - ) - right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype( - {"D": object} - ) - - result = merge(left, right, how="left_anti", left_on="B", right_on="D") - expected = DataFrame( - { - "A": [3.0], - "B": ["c"], - "C": [np.nan], - "D": [np.nan], - }, - index=[2], - ).astype({"B": object, "D": object}) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="B", right_on="D") - expected = DataFrame( - { - "A": [np.nan], - "B": [np.nan], - "C": [2.0], - "D": ["d"], - }, - index=[1], - ).astype({"B": object, "D": object}) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_nonunique_keys(self): - left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype( - {"B": object} - ) - right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype( - {"D": object} - ) - - result = merge(left, right, how="left_anti", left_on="B", right_on="D") - expected = DataFrame( - { - "A": [1.0], - "B": ["a"], - "C": [np.nan], - "D": [np.nan], - }, - index=[0], - ).astype({"B": object, "D": object}) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="B", right_on="D") - expected = DataFrame( - { - "A": [np.nan, np.nan], - "B": [np.nan, np.nan], - "C": [2.0, 4.0], - "D": ["d", "d"], - }, - index=[2, 3], - ).astype({"B": object, "D": object}) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_same_df(self): - left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64) - result = merge(left, left, how="left_anti", left_index=True, 
right_index=True) - expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_merge_antijoin_nans(self): - left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype( - {"C": object} - ) - right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype( - {"D": object} - ) - result = merge(left, right, how="left_anti", on="A") - expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype( - {"C": object, "D": object} - ) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_on_datetime64tz(self): - # GH11405 - left = DataFrame( - { - "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), - "value": [1.0, 2.0], - } - ) - right = DataFrame( - { - "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), - "value": [1.0, 2.0, 3.0], - } - ) - - expected = DataFrame( - { - "key": pd.date_range("20151010", periods=1, tz="US/Eastern"), - "value_x": [1.0], - "value_y": [np.nan], - }, - index=[0], - ) - result = merge(left, right, on="key", how="left_anti") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - { - "key": pd.date_range("20151012", periods=2, tz="US/Eastern"), - "value_x": [np.nan, np.nan], - "value_y": [2.0, 3.0], - }, - index=[1, 2], - ) - result = merge(left, right, on="key", how="right_anti") - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_multiindex(self): - left = DataFrame( - { - "A": [1, 2, 3], - "B": [4, 5, 6], - }, - index=MultiIndex.from_tuples( - [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"] - ), - ) - right = DataFrame( - { - "C": [7, 8, 9], - "D": [10, 11, 12], - }, - index=MultiIndex.from_tuples( - [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"] - ), - ) - - result = merge(left, right, how="left_anti", left_index=True, right_index=True) - expected = DataFrame( - { - "A": [3], - "B": [6], - "C": [np.nan], - "D": [np.nan], - }, - 
index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]), - ) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_index=True, right_index=True) - expected = DataFrame( - { - "A": [np.nan], - "B": [np.nan], - "C": [9], - "D": [12], - }, - index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]), - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "dtype", - [ - "Int64", - pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], +def test_merge_antijoin(): + # GH#42916 + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"]) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_different_columns(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [3.0], + "B": ["c"], + "C": [np.nan], + "D": [np.nan], + }, + index=[2], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [2.0], + "D": ["d"], + }, + index=[1], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def 
test_merge_antijoin_nonunique_keys(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [1.0], + "B": ["a"], + "C": [np.nan], + "D": [np.nan], + }, + index=[0], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan, np.nan], + "B": [np.nan, np.nan], + "C": [2.0, 4.0], + "D": ["d", "d"], + }, + index=[2, 3], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_same_df(): + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64) + result = merge(left, left, how="left_anti", left_index=True, right_index=True) + expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64) + tm.assert_frame_equal(result, expected, check_index_type=False) + + +def test_merge_antijoin_nans(): + left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype( + {"C": object} + ) + right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype( + {"D": object} + ) + result = merge(left, right, how="left_anti", on="A") + expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype( + {"C": object, "D": object} + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_datetime64tz(): + # GH11405 + left = DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1.0, 2.0], + } + ) + right = DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1.0, 2.0, 3.0], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=1, tz="US/Eastern"), + "value_x": [1.0], + "value_y": [np.nan], + }, + index=[0], 
+ ) + result = merge(left, right, on="key", how="left_anti") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "key": pd.date_range("20151012", periods=2, tz="US/Eastern"), + "value_x": [np.nan, np.nan], + "value_y": [2.0, 3.0], + }, + index=[1, 2], + ) + result = merge(left, right, on="key", how="right_anti") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_multiindex(): + left = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"] + ), + ) + right = DataFrame( + { + "C": [7, 8, 9], + "D": [10, 11, 12], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"] + ), + ) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [3], + "B": [6], + "C": [np.nan], + "D": [np.nan], + }, + index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [9], + "D": [12], + }, + index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]), ) - def test_merge_antijoin_extension_dtype(self, dtype): - left = DataFrame( - { - "join_col": [1, 3, 5], - "left_val": [1, 2, 3], - } - ) - right = DataFrame( - { - "join_col": [2, 3, 4], - "right_val": [1, 2, 3], - } - ) - left = left.astype({"join_col": dtype}) - right = right.astype({"join_col": dtype}) - result = merge(left, right, how="left_anti", on="join_col") - expected = DataFrame( - { - "join_col": [1, 5], - "left_val": [1, 3], - "right_val": [np.nan, np.nan], - }, - index=[0, 2], - ) - expected = expected.astype({"join_col": dtype}) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_empty_dataframe(self): - left = DataFrame({"A": [], "B": []}) - 
right = DataFrame({"C": [], "D": []}) - - result = merge(left, right, how="left_anti", left_on="A", right_on="C") - expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="A", right_on="C") - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_no_common_elements(self): - left = DataFrame({"A": [1, 2, 3]}) - right = DataFrame({"B": [4, 5, 6]}) - - result = merge(left, right, how="left_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_with_null_values(self): - left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) - right = DataFrame({"B": [2.0, None, 5.0]}) - - result = merge(left, right, how="left_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3]) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) - tm.assert_frame_equal(result, expected) - - def test_merge_antijoin_with_mixed_dtypes(self): - left = DataFrame({"A": [1, "2", 3.0]}) - right = DataFrame({"B": ["2", 3.0, 4]}) - - result = merge(left, right, how="left_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) - tm.assert_frame_equal(result, expected) - - result = merge(left, right, how="right_anti", left_on="A", right_on="B") - expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + 
pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_merge_antijoin_extension_dtype(dtype): + left = DataFrame( + { + "join_col": [1, 3, 5], + "left_val": [1, 2, 3], + } + ) + right = DataFrame( + { + "join_col": [2, 3, 4], + "right_val": [1, 2, 3], + } + ) + left = left.astype({"join_col": dtype}) + right = right.astype({"join_col": dtype}) + result = merge(left, right, how="left_anti", on="join_col") + expected = DataFrame( + { + "join_col": [1, 5], + "left_val": [1, 3], + "right_val": [np.nan, np.nan], + }, + index=[0, 2], + ) + expected = expected.astype({"join_col": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_empty_dataframe(): + left = DataFrame({"A": [], "B": []}) + right = DataFrame({"C": [], "D": []}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="C") + expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="C") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_no_common_elements(): + left = DataFrame({"A": [1, 2, 3]}) + right = DataFrame({"B": [4, 5, 6]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_null_values(): + left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) + right = DataFrame({"B": [2.0, None, 5.0]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1.0, 4.0], "B": 
[np.nan, np.nan]}, index=[0, 3]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_mixed_dtypes(): + left = DataFrame({"A": [1, "2", 3.0]}) + right = DataFrame({"B": ["2", 3.0, 4]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) + tm.assert_frame_equal(result, expected)