From a34aead5c361cb3babc136af39d51e7826d28181 Mon Sep 17 00:00:00 2001 From: Nate Todd Date: Tue, 20 Jan 2026 16:11:55 -0500 Subject: [PATCH] Add nulls_equal option to DataFrame.join/3 Adds support for Polars' join_nulls parameter, allowing NULL values to match during join operations. Default is false (SQL semantics). Also refactors join options to use a map at the NIF boundary for extensibility, using Rustler's NifMap derive macro. --- lib/explorer/backend/data_frame.ex | 3 +- lib/explorer/data_frame.ex | 19 ++++++++++-- lib/explorer/polars_backend/data_frame.ex | 4 +-- lib/explorer/polars_backend/lazy_frame.ex | 18 +++++------ lib/explorer/polars_backend/native.ex | 2 +- native/explorer/src/lazyframe.rs | 15 +++++++-- test/explorer/data_frame/lazy_test.exs | 37 +++++++++++++++++++++++ test/explorer/data_frame_test.exs | 34 +++++++++++++++++++++ 8 files changed, 113 insertions(+), 19 deletions(-) diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index 7022b9640..5dd78c83d 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -243,7 +243,8 @@ defmodule Explorer.Backend.DataFrame do [df()], out_df :: df(), on :: list({column_name(), column_name()}), - how :: :left | :inner | :outer | :right | :cross + how :: :left | :inner | :outer | :right | :cross, + nulls_equal :: boolean() ) :: df @callback join_asof( diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 4a42e9334..bc6641f22 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -5151,6 +5151,8 @@ defmodule Explorer.DataFrame do * `:on` - The column(s) to join on. Defaults to overlapping columns. Does not apply to cross join. * `:how` - One of the join types (as an atom) described above. Defaults to `:inner`. + * `:nulls_equal` - If `true`, `nil` values are considered equal for matching. + Defaults to `false` (standard SQL semantics where `nil != nil`). 
## Examples @@ -5217,6 +5219,18 @@ defmodule Explorer.DataFrame do c string ["d", "e", "f", "d", "e", ...] > + Join with nulls equal: + + iex> left = Explorer.DataFrame.new(a: [1, nil], b: ["a", "b"]) + iex> right = Explorer.DataFrame.new(a: [1, nil], c: ["d", "e"]) + iex> Explorer.DataFrame.join(left, right, nulls_equal: true) + #Explorer.DataFrame< + Polars[2 x 3] + a s64 [1, nil] + b string ["a", "b"] + c string ["d", "e"] + > + Inner join with different names: iex> left = Explorer.DataFrame.new(a: [1, 2, 3], b: ["a", "b", "c"]) @@ -5323,7 +5337,8 @@ defmodule Explorer.DataFrame do opts = Keyword.validate!(opts, on: find_overlapping_columns(left_columns, right_columns), - how: :inner + how: :inner, + nulls_equal: false ) unless opts[:how] in @valid_join_types do @@ -5357,7 +5372,7 @@ defmodule Explorer.DataFrame do out_df = out_df_for_join(how, left, right, on) - Shared.apply_dataframe([left, right], :join, [out_df, on, how]) + Shared.apply_dataframe([left, right], :join, [out_df, on, how, opts[:nulls_equal]]) end defp find_overlapping_columns(left_columns, right_columns) do diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index 984b6a2ce..d690332ea 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -916,11 +916,11 @@ defmodule Explorer.PolarsBackend.DataFrame do # Two or more table verbs @impl true - def join([left, right], out_df, on, how) do + def join([left, right], out_df, on, how, nulls_equal) do left = lazy(left) right = lazy(right) - ldf = LazyFrame.join([left, right], out_df, on, how) + ldf = LazyFrame.join([left, right], out_df, on, how, nulls_equal) LazyFrame.collect(ldf) end diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index 8fed945f7..e493829ed 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -578,7 +578,7 @@ defmodule 
Explorer.PolarsBackend.LazyFrame do # Two or more tables @impl true - def join([%DF{} = left, %DF{} = right], %DF{} = out_df, on, how) + def join([%DF{} = left, %DF{} = right], %DF{} = out_df, on, how, nulls_equal) when is_list(on) and how in [:left, :inner, :cross, :outer] do how = Atom.to_string(how) @@ -587,16 +587,18 @@ defmodule Explorer.PolarsBackend.LazyFrame do |> Enum.map(fn {left, right} -> {Native.expr_column(left), Native.expr_column(right)} end) |> Enum.unzip() + opts = %{suffix: "_right", nulls_equal: nulls_equal} + Shared.apply_dataframe( left, out_df, :lf_join, - [right.data, left_on, right_on, how, "_right"] + [right.data, left_on, right_on, how, opts] ) end @impl true - def join([%DF{} = left, %DF{} = right], %DF{} = out_df, on, :right) + def join([%DF{} = left, %DF{} = right], %DF{} = out_df, on, :right, nulls_equal) when is_list(on) do # Right join is the opposite of left join. So we swap the "on" keys, and swap the DFs # in the join. @@ -605,17 +607,13 @@ defmodule Explorer.PolarsBackend.LazyFrame do |> Enum.map(fn {left, right} -> {Native.expr_column(right), Native.expr_column(left)} end) |> Enum.unzip() + opts = %{suffix: "_left", nulls_equal: nulls_equal} + Shared.apply_dataframe( right, out_df, :lf_join, - [ - left.data, - left_on, - right_on, - "left", - "_left" - ] + [left.data, left_on, right_on, "left", opts] ) end diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 26de3e44d..310f19b76 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -287,7 +287,7 @@ defmodule Explorer.PolarsBackend.Native do def lf_rename_columns(_df, _column_pairs), do: err() def lf_drop_nils(_df, _column_pairs), do: err() def lf_pivot_longer(_df, _id_vars, _value_vars, _names_to, _values_to), do: err() - def lf_join(_df, _other, _left_on, _right_on, _how, _suffix), do: err() + def lf_join(_df, _other, _left_on, _right_on, _how, _opts), do: err() def lf_join_asof( _df, diff 
--git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs index 9fb00d842..2a253bbc0 100644 --- a/native/explorer/src/lazyframe.rs +++ b/native/explorer/src/lazyframe.rs @@ -3,6 +3,13 @@ use crate::{ ExplorerError, }; use polars::{lazy::dsl::Selector, prelude::*}; +use rustler::NifMap; + +#[derive(NifMap)] +pub struct ExJoinOptions { + pub suffix: String, + pub nulls_equal: bool, +} // Loads the IO functions for read/writing CSV, NDJSON, Parquet, etc. pub mod io; @@ -320,7 +327,7 @@ pub fn lf_join( left_on: Vec<ExExpr>, right_on: Vec<ExExpr>, how: &str, - suffix: &str, + opts: ExJoinOptions, ) -> Result<ExLazyFrame, ExplorerError> { let how = match how { "left" => JoinType::Left, @@ -344,7 +351,8 @@ pub fn lf_join( .join_builder() .with(ldf1) .how(JoinType::Cross) - .suffix(suffix) + .suffix(&opts.suffix) + .join_nulls(opts.nulls_equal) .finish(), _ => ldf .join_builder() @@ -352,7 +360,8 @@ pub fn lf_join( .how(how) .left_on(ex_expr_to_exprs(left_on)) .right_on(ex_expr_to_exprs(right_on)) - .suffix(suffix) + .suffix(&opts.suffix) + .join_nulls(opts.nulls_equal) .finish(), }; diff --git a/test/explorer/data_frame/lazy_test.exs b/test/explorer/data_frame/lazy_test.exs index e9ba8b3aa..c3ead6f59 100644 --- a/test/explorer/data_frame/lazy_test.exs +++ b/test/explorer/data_frame/lazy_test.exs @@ -1429,6 +1429,43 @@ defmodule Explorer.DataFrame.LazyTest do d_left: [5, 6, 6] } end + + test "nulls_equal: false is the default (SQL semantics)" do + left = DF.new([a: [1, nil], b: ["x", "y"]], lazy: true) + right = DF.new([a: [1, nil], c: ["p", "q"]], lazy: true) + + # Default behavior: nil != nil + ldf = DF.join(left, right) + df = DF.collect(ldf) + assert DF.n_rows(df) == 1 + assert DF.to_columns(df, atom_keys: true) == %{a: [1], b: ["x"], c: ["p"]} + end + + test "nulls_equal: true matches nil values in inner join" do + left = DF.new([a: [1, 2, nil], b: ["a", "b", "c"]], lazy: true) + right = DF.new([a: [1, nil, 4], c: ["d", "e", "f"]], lazy: true) + + ldf = DF.join(left, right, nulls_equal: true) 
+ df = DF.collect(ldf) + assert DF.n_rows(df) == 2 + + assert DF.to_columns(df, atom_keys: true) == %{ + a: [1, nil], + b: ["a", "c"], + c: ["d", "e"] + } + end + + test "nulls_equal: true with right join" do + left = DF.new([a: [1, nil], b: ["a", "b"]], lazy: true) + right = DF.new([a: [nil, 2, 3], c: ["d", "e", "f"]], lazy: true) + + ldf = DF.join(left, right, how: :right, nulls_equal: true) + df = DF.collect(ldf) + assert DF.n_rows(df) == 3 + # Right join keeps right table order + assert DF.to_columns(df, atom_keys: true).c == ["d", "e", "f"] + end end describe "join_asof/3" do diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index a0c4393bf..2d55e8b7a 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2574,6 +2574,40 @@ defmodule Explorer.DataFrameTest do assert_raise ArgumentError, msg, fn -> DF.join(left, right, on: [0]) end end + + test "nulls_equal: false is the default (SQL semantics)" do + left = DF.new(a: [1, nil], b: ["x", "y"]) + right = DF.new(a: [1, nil], c: ["p", "q"]) + + # Default behavior: nil != nil + df = DF.join(left, right) + assert DF.n_rows(df) == 1 + assert DF.to_columns(df, atom_keys: true) == %{a: [1], b: ["x"], c: ["p"]} + end + + test "nulls_equal: true matches nil values in inner join" do + left = DF.new(a: [1, 2, nil], b: ["a", "b", "c"]) + right = DF.new(a: [1, nil, 4], c: ["d", "e", "f"]) + + df = DF.join(left, right, nulls_equal: true) + assert DF.n_rows(df) == 2 + + assert DF.to_columns(df, atom_keys: true) == %{ + a: [1, nil], + b: ["a", "c"], + c: ["d", "e"] + } + end + + test "nulls_equal: true with right join" do + left = DF.new(a: [1, nil], b: ["a", "b"]) + right = DF.new(a: [nil, 2, 3], c: ["d", "e", "f"]) + + df = DF.join(left, right, how: :right, nulls_equal: true) + assert DF.n_rows(df) == 3 + # Right join keeps right table order + assert DF.to_columns(df, atom_keys: true).c == ["d", "e", "f"] + end end describe "table/1" do