@@ -2635,3 +2635,34 @@ def trigger_interrupt():
26352635
26362636 # Make sure the interrupt thread has finished
26372637 interrupt_thread .join (timeout = 1.0 )
2638+
2639+
def test_join_deduplicate_select():
    """Test that select works correctly after a deduplicated join.

    Two frames sharing an ``id`` column are joined on that column, and the
    shared column is then selected both alone and together with one column
    from each side.  Results are sorted by ``id`` before comparison because
    join output carries no row-order guarantee, and they are materialised
    with ``to_pydict()`` so that every record batch is included rather than
    only the first one.
    """
    ctx = SessionContext()

    left_df = ctx.from_pydict({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]})
    right_df = ctx.from_pydict(
        {"id": [2, 3, 4], "city": ["New York", "London", "Paris"]}
    )

    joined_df = left_df.join(right_df, on="id")

    # Selecting the join key alone must resolve to a single "id" column
    # holding only the ids present on both sides (2 and 3).
    id_only = (
        joined_df.select(column("id"))
        .sort(column("id").sort(ascending=True))
        .to_pydict()
    )
    assert id_only == {"id": [2, 3]}

    # Selecting the key together with one column from each input must also
    # work, with the rows still aligned across columns.
    multi_select = (
        joined_df.select(column("id"), column("name"), column("city"))
        .sort(column("id").sort(ascending=True))
        .to_pydict()
    )
    expected_data = {
        "id": [2, 3],
        "name": ["Bob", "Charlie"],
        "city": ["New York", "London"],
    }
    assert multi_select == expected_data
0 commit comments