diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 4a42e9334..ca87a7b34 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -4404,16 +4404,16 @@ defmodule Explorer.DataFrame do iex> Explorer.DataFrame.sample(df, 3, seed: 100) #Explorer.DataFrame< Polars[3 x 10] - year s64 [2012, 2013, 2014] - country string ["SYRIAN ARAB REPUBLIC", "EGYPT", "AFGHANISTAN"] - total s64 [12198, 58198, 2675] - solid_fuel s64 [1, 224, 1194] - liquid_fuel s64 [7909, 26501, 1393] - gas_fuel s64 [3265, 24672, 74] - cement s64 [816, 6800, 14] - gas_flaring s64 [208, 0, 0] - per_capita f64 [0.61, 0.66, 0.08] - bunker_fuels s64 [437, 694, 9] + year s64 [2014, 2014, 2014] + country string ["GUYANA", "MALDIVES", "CAYMAN ISLANDS"] + total s64 [548, 364, 148] + solid_fuel s64 [0, 0, 0] + liquid_fuel s64 [548, 364, 148] + gas_fuel s64 [0, 0, 0] + cement s64 [0, 0, 0] + gas_flaring s64 [0, 0, 0] + per_capita f64 [0.72, 1.02, 2.5] + bunker_fuels s64 [8, 204, 9] > Or you can sample a proportion of rows: @@ -4422,16 +4422,16 @@ defmodule Explorer.DataFrame do iex> Explorer.DataFrame.sample(df, 0.03, seed: 100) #Explorer.DataFrame< Polars[32 x 10] - year s64 [2013, 2012, 2014, 2011, 2011, ...] - country string ["BRITISH VIRGIN ISLANDS", "TAJIKISTAN", "AFGHANISTAN", "ICELAND", "SINGAPORE", ...] - total s64 [48, 800, 2675, 513, 12332, ...] - solid_fuel s64 [0, 192, 1194, 94, 7, ...] - liquid_fuel s64 [48, 501, 1393, 400, 7774, ...] - gas_fuel s64 [0, 74, 74, 0, 4551, ...] - cement s64 [0, 34, 14, 19, 0, ...] - gas_flaring s64 [0, 0, 0, 0, 0, ...] - per_capita f64 [1.64, 0.1, 0.08, 1.6, 2.38, ...] - bunker_fuels s64 [0, 28, 9, 168, 41786, ...] + year s64 [2014, 2014, 2014, 2011, 2014, ...] + country string ["HAITI", "MALI", "CENTRAL AFRICAN REPUBLIC", "RUSSIAN FEDERATION", "HONG KONG SPECIAL ADMINSTRATIVE REGION OF CHINA", ...] + total s64 [780, 385, 82, 480885, 12605, ...] + solid_fuel s64 [0, 0, 0, 109218, 8420, ...] + liquid_fuel s64 [739, 385, 82, 108312, 2659, ...] + gas_fuel s64 [0, 0, 0, 247327, 1268, ...] + cement s64 [41, 0, 0, 7643, 258, ...] + gas_flaring s64 [0, 0, 0, 8385, 0, ...] + per_capita f64 [0.07, 0.02, 0.02, 3.36, 1.74, ...] + bunker_fuels s64 [29, 20, 29, 7962, 12769, ...] > ## Grouped examples @@ -4446,10 +4446,10 @@ defmodule Explorer.DataFrame do #Explorer.DataFrame< Polars[6 x 5] Groups: ["species"] - sepal_length f64 [4.8, 5.0, 5.5, 6.5, 7.4, ...] - sepal_width f64 [3.1, 3.6, 2.4, 2.8, 2.8, ...] - petal_length f64 [1.6, 1.4, 3.8, 4.6, 6.1, ...] - petal_width f64 [0.2, 0.2, 1.1, 1.5, 1.9, ...] + sepal_length f64 [5.0, 4.8, 5.0, 5.7, 6.8, ...] + sepal_width f64 [3.5, 3.0, 2.3, 3.0, 3.2, ...] + petal_length f64 [1.6, 1.4, 3.3, 4.2, 5.9, ...] + petal_width f64 [0.6, 0.3, 1.0, 1.2, 2.3, ...] species string ["Iris-setosa", "Iris-setosa", "Iris-versicolor", "Iris-versicolor", "Iris-virginica", ...] > @@ -4462,10 +4462,10 @@ defmodule Explorer.DataFrame do #Explorer.DataFrame< Polars[15 x 5] Groups: ["species"] - sepal_length f64 [5.2, 5.0, 5.2, 5.0, 5.0, ...] - sepal_width f64 [3.4, 3.6, 3.5, 3.0, 3.4, ...] - petal_length f64 [1.4, 1.4, 1.5, 1.6, 1.6, ...] - petal_width f64 [0.2, 0.2, 0.2, 0.2, 0.4, ...] + sepal_length f64 [5.0, 4.4, 5.1, 5.4, 5.0, ...] + sepal_width f64 [3.5, 3.2, 3.4, 3.9, 3.5, ...] + petal_length f64 [1.3, 1.3, 1.5, 1.3, 1.6, ...] + petal_width f64 [0.3, 0.2, 0.2, 0.4, 0.6, ...] species string ["Iris-setosa", "Iris-setosa", "Iris-setosa", "Iris-setosa", "Iris-setosa", ...] > @@ -4507,16 +4507,16 @@ defmodule Explorer.DataFrame do iex> Explorer.DataFrame.shuffle(df, seed: 100) #Explorer.DataFrame< Polars[1094 x 10] - year s64 [2014, 2014, 2014, 2012, 2010, ...] - country string ["ISRAEL", "ARGENTINA", "NETHERLANDS", "YEMEN", "GRENADA", ...] - total s64 [17617, 55638, 45624, 5091, 71, ...] - solid_fuel s64 [6775, 1588, 9070, 129, 0, ...] - liquid_fuel s64 [6013, 25685, 18272, 4173, 71, ...] - gas_fuel s64 [3930, 26368, 18010, 414, 0, ...] - cement s64 [898, 1551, 272, 375, 0, ...] - gas_flaring s64 [0, 446, 0, 0, 0, ...] - per_capita f64 [2.22, 1.29, 2.7, 0.2, 0.68, ...] - bunker_fuels s64 [1011, 2079, 14210, 111, 4, ...] + year s64 [2011, 2011, 2014, 2013, 2010, ...] + country string ["MARTINIQUE", "PARAGUAY", "HONDURAS", "GERMANY", "GERMANY", ...] + total s64 [605, 1451, 2583, 206521, 206943, ...] + solid_fuel s64 [0, 1, 145, 86226, 83574, ...] + liquid_fuel s64 [585, 1362, 2207, 70373, 71983, ...] + gas_fuel s64 [0, 0, 0, 45658, 47408, ...] + cement s64 [20, 88, 231, 4258, 3972, ...] + gas_flaring s64 [0, 0, 0, 5, 6, ...] + per_capita f64 [1.53, 0.23, 0.32, 2.56, 2.57, ...] + bunker_fuels s64 [120, 22, 57, 8982, 9078, ...] > """ diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index 8fed945f7..f1715dc08 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -525,8 +525,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do @impl true def drop_nil(%DF{} = df, columns) do - exprs = for col <- columns, do: Native.expr_column(col) - Shared.apply_dataframe(df, df, :lf_drop_nils, [exprs]) + Shared.apply_dataframe(df, df, :lf_drop_nils, [columns]) end @impl true diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 26de3e44d..857ed3d54 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -285,7 +285,7 @@ defmodule Explorer.PolarsBackend.Native do def lf_mutate_with(_df, _exprs), do: err() def lf_summarise_with(_df, _groups, _stable_groups?, _aggs), do: err() def lf_rename_columns(_df, _column_pairs), do: err() - def lf_drop_nils(_df, _column_pairs), do: err() + def lf_drop_nils(_df, _columns), do: err() def lf_pivot_longer(_df, _id_vars, _value_vars, _names_to, _values_to), do: err() def lf_join(_df, _other, _left_on, _right_on, _how, _suffix), do: err() diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 70aec204b..3b979d4ed 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -1577,28 +1577,28 @@ defmodule Explorer.Series do iex> Explorer.Series.sample(s, 10, seed: 100) #Explorer.Series< Polars[10] - s64 [57, 9, 54, 62, 50, 77, 35, 88, 1, 69] + s64 [80, 95, 78, 33, 84, 100, 23, 58, 21, 30] > iex> s = 1..100 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.sample(s, 0.05, seed: 100) #Explorer.Series< Polars[5] - s64 [9, 56, 79, 28, 54] + s64 [85, 89, 82, 35, 88] > iex> s = 1..5 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.sample(s, 7, seed: 100, replace: true) #Explorer.Series< Polars[7] - s64 [4, 1, 3, 4, 3, 4, 2] + s64 [5, 5, 5, 2, 5, 2, 2] > iex> s = 1..5 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.sample(s, 1.2, seed: 100, replace: true) #Explorer.Series< Polars[6] - s64 [4, 1, 3, 4, 3, 4] + s64 [5, 5, 5, 2, 5, 2] > iex> s = 0..9 |> Enum.to_list() |> Explorer.Series.from_list() @@ -1612,7 +1612,7 @@ defmodule Explorer.Series do iex> Explorer.Series.sample(s, 1.0, seed: 100, shuffle: true) #Explorer.Series< Polars[10] - s64 [7, 9, 2, 0, 4, 1, 3, 8, 5, 6] + s64 [3, 7, 8, 0, 5, 1, 2, 6, 4, 9] > """ @@ -1655,7 +1655,7 @@ defmodule Explorer.Series do iex> Explorer.Series.shuffle(s, seed: 100) #Explorer.Series< Polars[10] - s64 [8, 10, 3, 1, 5, 2, 4, 9, 6, 7] + s64 [4, 8, 9, 1, 6, 2, 3, 7, 5, 10] > """ @@ -2055,7 +2055,7 @@ defmodule Explorer.Series do iex> Explorer.Series.rank(s, method: :random, seed: 42) #Explorer.Series< Polars[5] - s64 [3, 4, 2, 1, 5] + s64 [3, 5, 1, 2, 4] > """ @doc type: :element_wise @@ -5116,7 +5116,7 @@ defmodule Explorer.Series do iex> Explorer.Series.window_sum(s, 2, weights: [1.0, 2.0]) #Explorer.Series< Polars[10] - f64 [1.0, 5.0, 8.0, 11.0, 14.0, 17.0, 20.0, 23.0, 26.0, 29.0] + f64 [2.0, 5.0, 8.0, 11.0, 14.0, 17.0, 20.0, 23.0, 26.0, 29.0] > """ @doc type: :window @@ -5149,7 +5149,7 @@ defmodule Explorer.Series do iex> Explorer.Series.window_mean(s, 2, weights: [0.25, 0.75]) #Explorer.Series< Polars[10] - f64 [0.25, 1.75, 2.75, 3.75, 4.75, 5.75, 6.75, 7.75, 8.75, 9.75] + f64 [1.0, 1.75, 2.75, 3.75, 4.75, 5.75, 6.75, 7.75, 8.75, 9.75] > iex> s = 1..10 |> Enum.to_list() |> Explorer.Series.from_list() @@ -5229,7 +5229,7 @@ defmodule Explorer.Series do iex> Explorer.Series.window_min(s, 2, weights: [1.0, 2.0]) #Explorer.Series< Polars[10] - f64 [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] + f64 [2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] > """ @doc type: :window @@ -5262,7 +5262,7 @@ defmodule Explorer.Series do iex> Explorer.Series.window_max(s, 2, weights: [1.0, 2.0]) #Explorer.Series< Polars[10] - f64 [1.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0] + f64 [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0] > """ @doc type: :window @@ -5295,7 +5295,7 @@ defmodule Explorer.Series do iex> Explorer.Series.window_standard_deviation(s, 2, weights: [0.25, 0.75]) #Explorer.Series< Polars[6] - f64 [0.4330127018922193, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193] + f64 [0.0, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193, 0.4330127018922193] > """ @doc type: :window @@ -5330,21 +5330,39 @@ defmodule Explorer.Series do iex> Explorer.Series.ewm_mean(s) #Explorer.Series< Polars[5] - f64 [1.0, 1.6666666666666667, 2.4285714285714284, 3.2666666666666666, 4.161290322580645] + f64 [1.0, 1.6666666666666665, 2.4285714285714284, 3.2666666666666666, 4.161290322580645] > iex> s = 1..5 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.ewm_mean(s, alpha: 0.1) #Explorer.Series< Polars[5] - f64 [1.0, 1.5263157894736843, 2.070110701107011, 2.6312881651642916, 3.2097140484969833] + f64 [1.0, 1.526315789473684, 2.0701107011070112, 2.631288165164292, 3.209714048496984] > """ @doc type: :window def ewm_mean(series, opts \\ []) do opts = Keyword.validate!(opts, alpha: 0.5, adjust: true, min_periods: 1, ignore_nils: true) - apply_series(series, :ewm_mean, [ + float_series = + case dtype(series) do + :f32 -> + series + + :f64 -> + series + + _ -> + try do + cast(series, :f64) + rescue + _ -> + raise ArgumentError, + "must pass float-compatible series, found dtype #{series.dtype}" + end + end + + apply_series(float_series, :ewm_mean, [ opts[:alpha], opts[:adjust], opts[:min_periods], @@ -5377,14 +5395,14 @@ defmodule Explorer.Series do iex> Explorer.Series.ewm_standard_deviation(s) #Explorer.Series< Polars[5] - f64 [0.0, 0.7071067811865476, 0.9636241116594314, 1.1771636613972951, 1.3452425132127066] + f64 [0.0, 0.7071067811865476, 0.9636241116594315, 1.1771636613972953, 1.3452425132127066] > iex> s = 1..5 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.ewm_standard_deviation(s, alpha: 0.1) #Explorer.Series< Polars[5] - f64 [0.0, 0.7071067811865476, 0.9990770648702808, 1.2879021599718157, 1.5741638698820746] + f64 [0.0, 0.7071067811865476, 0.999077064870281, 1.2879021599718157, 1.5741638698820741] > """ @doc type: :window @@ -5432,14 +5450,14 @@ defmodule Explorer.Series do iex> Explorer.Series.ewm_variance(s) #Explorer.Series< Polars[5] - f64 [0.0, 0.5, 0.9285714285714284, 1.385714285714286, 1.8096774193548393] + f64 [0.0, 0.5, 0.9285714285714286, 1.3857142857142861, 1.8096774193548393] > iex> s = 1..5 |> Enum.to_list() |> Explorer.Series.from_list() iex> Explorer.Series.ewm_variance(s, alpha: 0.1) #Explorer.Series< Polars[5] - f64 [0.0, 0.5, 0.9981549815498153, 1.6586919736600685, 2.4779918892421087] + f64 [0.0, 0.5000000000000001, 0.9981549815498157, 1.658691973660068, 2.477991889242108] > """ @doc type: :window @@ -6932,15 +6950,6 @@ defmodule Explorer.Series do Polars[1] struct[1] [%{"a" => 1}] > - - If the decoded value does not match the given `dtype`, - an error is raised: - - iex> s = Series.from_list(["\\"1\\""]) - iex> Series.json_decode(s, {:s, 64}) - ** (RuntimeError) Polars Error: error deserializing JSON: error deserializing value \"String(\"1\")\" as numeric. \\\n Try increasing `infer_schema_length` or specifying a schema.\n - - It raises an exception if the string is invalid JSON. """ @doc type: :string_wise @spec json_decode(Series.t(), dtype() | dtype_alias()) :: Series.t() diff --git a/native/explorer/Cargo.lock b/native/explorer/Cargo.lock index 334563ea2..cfb3a0151 100644 --- a/native/explorer/Cargo.lock +++ b/native/explorer/Cargo.lock @@ -197,11 +197,22 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bincode" -version = "1.3.3" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" dependencies = [ + "bincode_derive", "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", ] [[package]] @@ -243,9 +254,9 @@ checksum = "26c4925bc979b677330a8c7fe7a8c94af2dbb4a2d37b4a20a80d884400f46baa" [[package]] name = "brotli" -version = "7.0.0" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -254,9 +265,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.3" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -373,9 +384,9 @@ dependencies = [ [[package]] name = "compact_str" -version = "0.8.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" dependencies = [ "castaway", "cfg-if", @@ -417,6 +428,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.4.2" @@ -573,6 +593,7 @@ dependencies = [ "object_store", "polars", "polars-arrow", + "polars-lazy", "polars-ops", "rand 0.8.5", "rand_pcg", @@ -626,6 +647,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -804,37 +831,37 @@ dependencies = [ [[package]] name = "halfbrown" -version = "0.2.5" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8588661a8607108a5ca69cab034063441a0413a0b041c13618a7dd348021ef6f" +checksum = "0c7ed2f2edad8a14c8186b847909a41fbb9c3eafa44f88bd891114ed5019da09" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.16.1", "serde", ] [[package]] name = "hashbrown" -version = "0.14.5" +version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" dependencies = [ - "ahash", "allocator-api2", - "rayon", - "serde", + "equivalent", + "foldhash 0.1.5", ] [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.2.0", "rayon", "serde", + "serde_core", ] [[package]] @@ -1098,13 +1125,14 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.10.0" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "serde", + "serde_core", ] [[package]] @@ -1531,12 +1559,14 @@ dependencies = [ [[package]] name = "polars" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443824f43bca39b178353d6c09e4b44e115b21f107a5654d5f980d20b432a303" +checksum = "6bc9ea901050c1bb8747ee411bc7fbb390f3b399931e7484719512965132a248" dependencies = [ "getrandom 0.2.16", + "getrandom 0.3.3", "polars-arrow", + "polars-compute", "polars-core", "polars-error", "polars-io", @@ -1552,9 +1582,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809c5340e9e6c16eee5a07585161bae99f903f53af7402075efec23ee75fce5b" +checksum = "33d3fe43f8702cf7899ff3d516c2e5f7dc84ee6f6a3007e1a831a0ff87940704" dependencies = [ "atoi_simd", "bitflags", @@ -1565,7 +1595,8 @@ dependencies = [ "either", "ethnum", "getrandom 0.2.16", - "hashbrown 0.15.4", + "getrandom 0.3.3", + "hashbrown 0.16.1", "itoa", "lz4", "num-traits", @@ -1593,25 +1624,24 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b8802ff2cccea01a845ea8267a7600e495747ed109035bb5020c33eb8717ff4" +checksum = "d29cc7497378dee3a002f117e0b4e16b7cbe6c8ed3da16a0229c89294af7c3bf" dependencies = [ "atoi_simd", "bytemuck", "chrono", "either", "fast-float2", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "itoa", "num-traits", "polars-arrow", "polars-error", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "ryu", "serde", - "skiplist", "strength_reduce", "strum_macros", "version_check", @@ -1619,9 +1649,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fc3c99d7000be1be11665e1e260b93dc3b927342b9da3b53d9a1ac264e4343d" +checksum = "48409b7440cb1a4aa84953fe3a4189dfbfb300a3298266a92a37363476641e40" dependencies = [ "bitflags", "boxcar", @@ -1629,18 +1659,18 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.14.5", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "indexmap", "itoa", "num-traits", "polars-arrow", "polars-compute", + "polars-dtype", "polars-error", "polars-row", "polars-schema", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "rand_distr", "rayon", "regex", @@ -1652,11 +1682,26 @@ dependencies = [ "xxhash-rust", ] +[[package]] +name = "polars-dtype" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7007e9e8b7b657cbd339b65246af7e87f5756ee9a860119b9424ddffd2aaf133" +dependencies = [ + "boxcar", + "hashbrown 0.16.1", + "polars-arrow", + "polars-error", + "polars-utils", + "serde", + "uuid", +] + [[package]] name = "polars-error" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1397c17712e61a55fdd45c033a69f0451fde2973ff2609c22e363e21d68f11ef" +checksum = "f9a6be22566c89f6405f553bfdb7c8a6cb20ec51b35f3172de9a25fa3e252d85" dependencies = [ "object_store", "parking_lot", @@ -1668,32 +1713,35 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d3aa6722c9a3e0b721ec2bcdc4affd9e50e4cb606cd81bb94535a9a5a6ade9" +checksum = "6199a50d3e1afd0674fb009e340cbfb0010682b2387187a36328c00f3f2ca87b" dependencies = [ "bitflags", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "num-traits", "polars-arrow", "polars-compute", "polars-core", "polars-io", + "polars-json", "polars-ops", "polars-plan", "polars-row", "polars-time", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "rayon", "recursive", + "regex", + "version_check", ] [[package]] name = "polars-io" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a632d442a99821250a8fa66f7d488bf5ee98e5f515e65256b12956cb81fc110" +checksum = "be3714acdff87170141880a07f5d9233490d3bd5531c41898f6969d440feee11" dependencies = [ "async-trait", "atoi_simd", @@ -1706,7 +1754,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "home", "itoa", "memchr", @@ -1715,6 +1763,7 @@ dependencies = [ "object_store", "percent-encoding", "polars-arrow", + "polars-compute", "polars-core", "polars-error", "polars-json", @@ -1731,21 +1780,19 @@ dependencies = [ "simd-json", "simdutf8", "tokio", - "tokio-util", - "url", "zstd", ] [[package]] name = "polars-json" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd891735404ebb9d6ace066cfb4b8f6edb321bc841d354a0d917a3a1f2d1ca5b" +checksum = "3dd2126daebf58da564fc5840cd55eb8eb2479d24dfced0a1aea2178a9b33b12" dependencies = [ "chrono", "chrono-tz", "fallible-streaming-iterator", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "indexmap", "itoa", "num-traits", @@ -1760,9 +1807,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4ed0c87bdc8820447a38ae8efdb5a51a5a93e8bd528cffb05d05cf1145e4161" +checksum = "ea136c360d03aafe56e0233495e30044ce43639b8b0360a4a38e840233f048a1" dependencies = [ "bitflags", "chrono", @@ -1788,9 +1835,9 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "675294ddf9174029e48caa4e59b0665ea64bfb784a366b197690895a6ed65c68" +checksum = "0f6e455ceb6e5aee7ed7d5c8944104e66992173e03a9c42f9670226318672249" dependencies = [ "futures", "memmap2", @@ -1811,9 +1858,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eb4db68956f857c52eeda072d87644a7b42eac41d55073af94dfac8441af6cf" +checksum = "7b59c80a019ef0e6f09b4416d2647076a52839305c9eb11919e8298ec667f853" dependencies = [ "argminmax", "base64", @@ -1821,7 +1868,7 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "hex", "indexmap", "jsonpath_lib_polars_vendor", @@ -1835,7 +1882,7 @@ dependencies = [ "polars-json", "polars-schema", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "rand_distr", "rayon", "regex", @@ -1849,9 +1896,9 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c849c10edd9511ccd4ec4130e283ee3a8b3bb48a7d74ac6354c1c20add81065" +checksum = "93c2439d127c59e6bfc9d698419bdb45210068a6f501d44e6096429ad72c2eaa" dependencies = [ "async-stream", "base64", @@ -1860,7 +1907,7 @@ dependencies = [ "ethnum", "flate2", "futures", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "lz4", "num-traits", "polars-arrow", @@ -1887,9 +1934,9 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71fb4412c42bf637c2c02a617381c682ed425d9c8e4bd1fcb85cf352ed2a67c6" +checksum = "65b4619f5c7e9b91f18611c9ed82ebeee4b10052160825c1316ecf4dbd4d97e6" dependencies = [ "bitflags", "bytemuck", @@ -1898,13 +1945,14 @@ dependencies = [ "chrono-tz", "either", "futures", - "hashbrown 0.15.4", + "hashbrown 0.16.1", "memmap2", "num-traits", "percent-encoding", "polars-arrow", "polars-compute", "polars-core", + "polars-error", "polars-io", "polars-json", "polars-ops", @@ -1914,29 +1962,32 @@ dependencies = [ "rayon", "recursive", "regex", + "sha2", + "slotmap", "strum_macros", "version_check", ] [[package]] name = "polars-row" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08fb77ac1d37340d9cfe57cf58000cf3d9cce429e10d25066952c6145c684cc0" +checksum = "a18d232f25b83032e280a279a1f40beb8a6f8fc43907b13dc07b1c56f3b11eea" dependencies = [ "bitflags", "bytemuck", "polars-arrow", "polars-compute", + "polars-dtype", "polars-error", "polars-utils", ] [[package]] name = "polars-schema" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ada7c7e2fbbeffbdd67628cd8a89f02b0a8d21c71d34e297e2463a7c17575203" +checksum = "f73e21d429ae1c23f442b0220ccfe773a9734a44e997b5062a741842909d9441" dependencies = [ "indexmap", "polars-error", @@ -1947,9 +1998,9 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a8e512b1f05ffda9963fe8f6a7c62dcba86be85218bc033ecdad2802cc1b1a0" +checksum = "3e67ac1cbb0c972a57af3be12f19aa9803898863fe95c33cdd39df05f5738a75" dependencies = [ "bitflags", "hex", @@ -1960,7 +2011,7 @@ dependencies = [ "polars-plan", "polars-time", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "regex", "serde", "sqlparser", @@ -1968,14 +2019,15 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0a02d8050acd9b64ed7e36c5bc96f6d4f46a940220f9c0e34c96b51f830f8c" +checksum = "2ff19612074640a9d65e5928b7223db76ffee63e55b276f1e466d06719eb7362" dependencies = [ "async-channel", "async-trait", "atomic-waker", "bitflags", + "chrono-tz", "crossbeam-channel", "crossbeam-deque", "crossbeam-queue", @@ -1986,6 +2038,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "polars-arrow", + "polars-compute", "polars-core", "polars-error", "polars-expr", @@ -1994,8 +2047,9 @@ dependencies = [ "polars-ops", "polars-parquet", "polars-plan", + "polars-time", "polars-utils", - "rand 0.8.5", + "rand 0.9.1", "rayon", "recursive", "slotmap", @@ -2005,9 +2059,9 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e84a30110880ffede8d93c085fc429ab1b8bf1acf3d6d489143dd34be374c4" +checksum = "ddce7a9f81d5f47d981bcee4a8db004f9596bb51f0f4d9d93667a1a00d88166c" dependencies = [ "atoi_simd", "bytemuck", @@ -2028,31 +2082,34 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.49.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a05e033960552c47fc35afe14d5af5b29696acc97ae5d3c585ebc33c246cc15f" +checksum = "667c1bc2d2313f934d711f6e3b58d8d9f80351d14ea60af936a26b7dfb06e309" dependencies = [ "bincode", "bytemuck", "bytes", "compact_str", + "either", "flate2", - "foldhash", - "hashbrown 0.15.4", + "foldhash 0.2.0", + "hashbrown 0.16.1", "indexmap", "libc", "memmap2", "num-traits", "polars-error", - "rand 0.8.5", + "rand 0.9.1", "raw-cpuid", "rayon", "regex", "rmp-serde", "serde", "serde_json", + "serde_stacker", "slotmap", "stacker", + "uuid", "version_check", ] @@ -2233,12 +2290,12 @@ dependencies = [ [[package]] name = "rand_distr" -version = "0.4.3" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.8.5", + "rand 0.9.1", ] [[package]] @@ -2599,18 +2656,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -2630,6 +2697,16 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_stacker" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69c8defe6c780725cce4ec6ad3bd91e321baf6fa4e255df1f31e345d507ef01a" +dependencies = [ + "serde", + "stacker", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2642,6 +2719,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2669,12 +2757,11 @@ dependencies = [ [[package]] name = "simd-json" -version = "0.14.3" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa2bcf6c6e164e81bc7a5d49fc6988b3d515d9e8c07457d7b74ffb9324b9cd40" +checksum = "4255126f310d2ba20048db6321c81ab376f6a6735608bf11f0785c41f01f64e3" dependencies = [ "ahash", - "getrandom 0.2.16", "halfbrown", "once_cell", "ref-cast", @@ -2696,15 +2783,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" -[[package]] -name = "skiplist" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eec25f46463fcdc5e02f388c2780b1b58e01be81a8378e62ec60931beccc3f6" -dependencies = [ - "rand 0.8.5", -] - [[package]] name = "slab" version = "0.4.10" @@ -2799,14 +2877,13 @@ checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" [[package]] name = "strum_macros" -version = "0.26.4" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ "heck", "proc-macro2", "quote", - "rustversion", "syn", ] @@ -3068,6 +3145,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.4" @@ -3093,14 +3176,15 @@ checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "getrandom 0.3.3", "js-sys", + "serde", "wasm-bindgen", ] [[package]] name = "value-trait" -version = "0.10.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9170e001f458781e92711d2ad666110f153e4e50bfd5cbd02db6547625714187" +checksum = "8e80f0c733af0720a501b3905d22e2f97662d8eacfe082a75ed7ffb5ab08cb59" dependencies = [ "float-cmp", "halfbrown", @@ -3114,6 +3198,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/native/explorer/Cargo.toml b/native/explorer/Cargo.toml index 9d057111f..4a4007f9d 100644 --- a/native/explorer/Cargo.toml +++ b/native/explorer/Cargo.toml @@ -27,7 +27,8 @@ tokio = { version = "1.40", default-features = false, features = [ "rt", ], optional = true } object_store = { version = "0.12", default-features = false, optional = true } -polars-arrow = "0.49" +polars-arrow = "0.52" +polars-lazy = "0.52" # MiMalloc won´t compile on Windows with the GCC compiler. # On Linux with Musl it won´t load correctly. @@ -35,7 +36,7 @@ polars-arrow = "0.49" mimalloc = { version = "*", default-features = false } [dependencies.polars] -version = "0.49" +version = "0.52" default-features = false features = [ "abs", @@ -86,18 +87,13 @@ features = [ ] [dependencies.polars-ops] -version = "0.49" +version = "0.52" features = ["abs", "ewma", "cum_agg", "cov", "index_of"] [features] default = ["ndjson", "cloud", "nif_version_2_15"] -cloud = [ - "object_store", - "tokio", - "aws", - "polars/cloud", -] +cloud = ["object_store", "tokio", "aws", "polars/cloud"] ndjson = ["polars/json"] aws = ["object_store/aws", "polars/async", "polars/aws"] diff --git a/native/explorer/Cross.toml b/native/explorer/Cross.toml deleted file mode 100644 index 2c0e4bb4a..000000000 --- a/native/explorer/Cross.toml +++ /dev/null @@ -1,5 +0,0 @@ -[build.env] -passthrough = [ - "RUSTLER_NIF_VERSION", - "RUSTFLAGS" -] diff --git a/native/explorer/rust-toolchain.toml b/native/explorer/rust-toolchain.toml index 09db14359..c99756637 100644 --- a/native/explorer/rust-toolchain.toml +++ b/native/explorer/rust-toolchain.toml @@ -1,4 +1,6 @@ +# Analogous file in Polars for reference: +# https://github.com/pola-rs/polars/blob/main/rust-toolchain.toml [toolchain] -channel = "nightly-2025-06-23" +channel = "nightly-2025-10-24" components = ["rustfmt", "clippy"] profile = "minimal" diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs index 5f217b1b8..8e627cf4e 100644 --- a/native/explorer/src/dataframe.rs +++ b/native/explorer/src/dataframe.rs @@ -1,4 +1,5 @@ use polars::prelude::*; +use polars_lazy::frame::pivot::PivotExpr; use polars_ops::pivot::{pivot_stable, PivotAgg}; use polars_arrow::ffi; @@ -372,9 +373,10 @@ pub fn df_slice( #[rustler::nif(schedule = "DirtyCpu")] pub fn df_to_dummies(df: ExDataFrame, selection: Vec<&str>) -> Result { let drop_first = false; + let drop_nulls = false; let dummies = df .select(selection) - .and_then(|df| df.to_dummies(None, drop_first))?; + .and_then(|df| df.to_dummies(None, drop_first, drop_nulls))?; Ok(ExDataFrame::new(dummies)) } @@ -458,7 +460,7 @@ pub fn df_pivot_wider( Some(temp_id_names), Some(values_column), false, - Some(PivotAgg::First), + Some(PivotAgg(Arc::new(PivotExpr::from_expr(col("").first())))), None, )?; diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index 2d7564b26..f522bba26 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -446,7 +446,8 @@ pub fn df_load_ipc( fn decode_ipc_compression(compression: &str) -> Result { match compression { "lz4" => Ok(IpcCompression::LZ4), - "zstd" => Ok(IpcCompression::ZSTD), + // ZSTD with a compression level of 3 is the default. + "zstd" => Ok(IpcCompression::default()), other => Err(ExplorerError::Other(format!( "the algorithm {other} is not supported for IPC compression" ))), diff --git a/native/explorer/src/datatypes.rs b/native/explorer/src/datatypes.rs index e09770ab9..93c65f4ff 100644 --- a/native/explorer/src/datatypes.rs +++ b/native/explorer/src/datatypes.rs @@ -655,16 +655,23 @@ impl ExDecimal { .expect("cannot convert exponent (Elixir) to scale (Rust)") } } + + pub fn default_precision() -> usize { + // https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Decimal.html + // > If set to None (default), the precision is set to 38 (the maximum + // > supported by Polars). + 38 + } } impl Literal for ExDecimal { fn lit(self) -> Expr { - let coef = self.signed_coef().expect("decimal coefficient overflow"); + let integer = self.signed_coef().expect("decimal coefficient overflow"); + let precision = ExDecimal::default_precision(); let scale = self.scale(); - Expr::Literal(LiteralValue::Scalar(Scalar::new( - DataType::Decimal(Some(scale), Some(scale)), - AnyValue::Decimal(coef, scale), + DataType::Decimal(precision, scale), + AnyValue::Decimal(integer, precision, scale), ))) } } diff --git a/native/explorer/src/datatypes/ex_dtypes.rs b/native/explorer/src/datatypes/ex_dtypes.rs index b882f890b..71da56c09 100644 --- a/native/explorer/src/datatypes/ex_dtypes.rs +++ b/native/explorer/src/datatypes/ex_dtypes.rs @@ -1,7 +1,9 @@ use crate::ExplorerError; -use polars::datatypes::CategoricalOrdering; +use polars::datatypes::CategoricalPhysical; +use polars::datatypes::Categories; use polars::datatypes::DataType; use polars::datatypes::Field; +use polars::datatypes::PlSmallStr; use polars::datatypes::TimeUnit; use rustler::NifTaggedEnum; @@ -99,7 +101,9 @@ impl TryFrom<&DataType> for ExSeriesDtype { Ok(ExSeriesDtype::Struct(struct_fields)) } - DataType::Decimal(precision, scale) => Ok(ExSeriesDtype::Decimal(*precision, *scale)), + DataType::Decimal(precision, scale) => { + Ok(ExSeriesDtype::Decimal(Some(*precision), Some(*scale))) + } _ => Err(ExplorerError::Other(format!( "cannot cast to dtype: {value}" @@ -117,7 +121,12 @@ impl TryFrom<&ExSeriesDtype> for DataType { ExSeriesDtype::Binary => Ok(DataType::Binary), ExSeriesDtype::Boolean => Ok(DataType::Boolean), ExSeriesDtype::Category => { - Ok(DataType::Categorical(None, CategoricalOrdering::default())) + let cats = Categories::new( + PlSmallStr::EMPTY, + PlSmallStr::EMPTY, + CategoricalPhysical::U32, + ); + Ok(DataType::from_categories(cats.clone())) } ExSeriesDtype::Date => Ok(DataType::Date), ExSeriesDtype::F(64) => Ok(DataType::Float64), @@ -159,7 +168,12 @@ impl TryFrom<&ExSeriesDtype> for DataType { .map(|(k, v)| Ok(Field::new(k.into(), v.try_into()?))) .collect::, Self::Error>>()?, )), - ExSeriesDtype::Decimal(precision, scale) => Ok(DataType::Decimal(*precision, *scale)), + ExSeriesDtype::Decimal(Some(precision), Some(scale)) => { + Ok(DataType::Decimal(*precision, *scale)) + } + ExSeriesDtype::Decimal(_, _) => Err(ExplorerError::Other( + "Decimal type must have both precision and scale".to_string(), + )), } } } diff --git a/native/explorer/src/encoding.rs b/native/explorer/src/encoding.rs index d58cb2be7..f8f7271c2 100644 --- a/native/explorer/src/encoding.rs +++ b/native/explorer/src/encoding.rs @@ -1,6 +1,6 @@ use chrono::prelude::*; use polars::prelude::*; -use rustler::{Encoder, Env, NewBinary, OwnedBinary, ResourceArc, Term}; +use rustler::{Encoder, Env, OwnedBinary, ResourceArc, Term}; use std::collections::HashMap; use std::{mem, slice}; @@ -36,6 +36,15 @@ macro_rules! unsafe_iterator_series_to_list { }}; } +macro_rules! encode_chunked_array { + ($chunked_array: expr, $env: ident, $encode_fun: expr) => {{ + $chunked_array.physical().downcast_iter().flat_map(|iter| { + iter.into_iter() + .map(|opt_v| opt_v.copied().map($encode_fun).encode($env)) + }) + }}; +} + macro_rules! unsafe_encode_date { ($v: ident, $date_struct_keys: ident, $calendar_iso_module: ident, $date_module: ident, $env: ident) => {{ let dt = days_to_date($v); @@ -96,15 +105,13 @@ fn date_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, Explore Ok(unsafe_iterator_series_to_list!( env, - s.date()?.into_iter().map(|option| option - .map(|v| unsafe_encode_date!( - v, - date_struct_keys, - calendar_iso_module, - date_module, - env - )) - .encode(env)) + encode_chunked_array!(s.date()?, env, |date| unsafe_encode_date!( + date, + date_struct_keys, + calendar_iso_module, + date_module, + env + )) )) } @@ -192,18 +199,18 @@ fn naive_datetime_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result(s: &Series, env: Env<'b>) -> Result, Exp Ok(unsafe_iterator_series_to_list!( env, - s.datetime()?.into_iter().map(|opt_timestamp| opt_timestamp - .map(|timestamp| { - unsafe_encode_datetime!( - timestamp, - time_unit, - time_zone, - datetime_struct_keys, - calendar_iso_module, - datetime_module, - env - ) - }) - .encode(env)) + encode_chunked_array!(s.datetime()?, env, |timestamp| unsafe_encode_datetime!( + timestamp, + time_unit, + time_zone, + datetime_struct_keys, + calendar_iso_module, + datetime_module, + env + )) )) } @@ -397,11 +400,13 @@ fn duration_series_to_list<'b>( Ok(unsafe_iterator_series_to_list!( env, - s.duration()?.into_iter().map(|option| option - .map(|v| { - unsafe_encode_duration!(v, time_unit, duration_struct_keys, duration_module, env) - }) - .encode(env)) + encode_chunked_array!(s.duration()?, env, |duration| unsafe_encode_duration!( + duration, + time_unit, + duration_struct_keys, + duration_module, + env + )) )) } @@ -468,9 +473,13 @@ fn decimal_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, Expl Ok(unsafe_iterator_series_to_list!( env, - decimal_chunked.into_iter().map(|option| option - .map(|v| { unsafe_encode_decimal!(v, scale, struct_keys, module_atom, env) }) - .encode(env)) + encode_chunked_array!(decimal_chunked, env, |decimal| unsafe_encode_decimal!( + decimal, + scale, + struct_keys, + module_atom, + env + )) )) } @@ -547,17 +556,13 @@ fn time_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, Explore Ok(unsafe_iterator_series_to_list!( env, - s.time()?.into_iter().map(|option| option - .map(|v| { - unsafe_encode_time!( - v, - naive_time_struct_keys, - calendar_iso_module, - time_module, - env - ) - }) - .encode(env)) + encode_chunked_array!(s.time()?, env, |time| unsafe_encode_time!( + time, + naive_time_struct_keys, + calendar_iso_module, + time_module, + env + )) )) } @@ -590,40 +595,9 @@ fn generic_binary_series_to_list<'b>( Ok(unsafe { Term::new(env, list) }) } -fn categorical_series_to_list<'b>( - s: &Series, - env: Env<'b>, - mapping: &Arc, -) -> Result, ExplorerError> { - let env_as_c_arg = env.as_c_arg(); - let nil_as_c_arg = atom::nil().to_term(env).as_c_arg(); - let mut list = unsafe { list::make_list(env_as_c_arg, &[]) }; - - let logical = s.categorical()?.physical(); - let cat_size = mapping.len(); - let mut terms: Vec = vec![nil_as_c_arg; cat_size]; - - for (index, term) in terms.iter_mut().enumerate() { - if let Some(existing_str) = mapping.get_optional(index as u32) { - let mut binary = NewBinary::new(env, existing_str.len()); - binary.copy_from_slice(existing_str.as_bytes()); - - let binary_term: Term = binary.into(); - - *term = binary_term.as_c_arg(); - } - } - - for maybe_index in &logical.reverse() { - let term_ref = match maybe_index { - Some(index) if index < (cat_size as u32) => terms[index as usize], - _ => nil_as_c_arg, - }; - - list = unsafe { list::make_list_cell(env_as_c_arg, term_ref, list) } - } - - Ok(unsafe { Term::new(env, list) }) +// HELP WANTED: Make this more efficient. +fn categorical_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, ExplorerError> { + generic_string_series_to_list(&s.cast(&DataType::String).unwrap(), env) } // Convert f32 and f64 series taking into account NaN and Infinity floats (they are encoded as atoms). @@ -685,10 +659,10 @@ fn null_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, Explore } macro_rules! series_to_iovec { - ($resource:ident, $s:ident, $env:ident, $convert_function:ident, $in_type:ty) => {{ + ($resource:ident, $v:expr, $env:ident, $in_type:ty) => {{ Ok(unsafe_iterator_series_to_list!( $env, - $s.$convert_function()?.downcast_iter().map(|array| { + $v.downcast_iter().map(|array| { let slice: &[$in_type] = array.values().as_slice(); let aligned_slice = unsafe { @@ -735,7 +709,7 @@ pub fn resource_term_from_value<'b>( encode_datetime(v, time_unit, time_zone.parse::().unwrap(), env) } AnyValue::Duration(v, time_unit) => encode_duration(v, time_unit, env), - AnyValue::Categorical(idx, mapping, _) => Ok(mapping.get(idx).encode(env)), + AnyValue::Categorical(idx, mapping) => Ok(mapping.cat_to_str(idx).encode(env)), AnyValue::List(series) => list_from_series(ExSeries::new(series), env), AnyValue::Struct(_, _, fields) => v ._iter_struct_av() @@ -748,7 +722,7 @@ pub fn resource_term_from_value<'b>( }) .collect::, ExplorerError>>() .map(|map| map.encode(env)), - AnyValue::Decimal(number, scale) => encode_decimal(number, scale, env), + AnyValue::Decimal(value, _precision, scale) => encode_decimal(value, scale, env), dt => panic!("cannot encode value {dt:?} to term"), } } @@ -799,7 +773,7 @@ pub fn list_from_series(s: ExSeries, env: Env) -> Result { DataType::Binary => generic_binary_series_to_list(&s.resource, &s, env), DataType::String => generic_string_series_to_list(&s, env), - DataType::Categorical(Some(mapping), _) => categorical_series_to_list(&s, env, mapping), + DataType::Categorical(_, _) => categorical_series_to_list(&s, env), DataType::List(_inner_dtype) => s .list()? @@ -833,28 +807,26 @@ pub fn iovec_from_series(s: ExSeries, env: Env) -> Result { } Ok([bin.release(env)].encode(env)) } - DataType::Int8 => series_to_iovec!(resource, s, env, i8, i8), - DataType::Int16 => series_to_iovec!(resource, s, env, i16, i16), - DataType::Int32 => series_to_iovec!(resource, s, env, i32, i32), - DataType::Int64 => series_to_iovec!(resource, s, env, i64, i64), - DataType::UInt8 => series_to_iovec!(resource, s, env, u8, u8), - DataType::UInt16 => series_to_iovec!(resource, s, env, u16, u16), - DataType::UInt32 => series_to_iovec!(resource, s, env, u32, u32), - DataType::UInt64 => series_to_iovec!(resource, s, env, u64, u64), - DataType::Float32 => series_to_iovec!(resource, s, env, f32, f32), - DataType::Float64 => series_to_iovec!(resource, s, env, f64, f64), - DataType::Date => series_to_iovec!(resource, s, env, date, i32), - DataType::Time => series_to_iovec!(resource, s, env, time, i64), + DataType::Int8 => series_to_iovec!(resource, s.i8()?, env, i8), + DataType::Int16 => series_to_iovec!(resource, s.i16()?, env, i16), + DataType::Int32 => series_to_iovec!(resource, s.i32()?, env, i32), + DataType::Int64 => series_to_iovec!(resource, s.i64()?, env, i64), + DataType::UInt8 => series_to_iovec!(resource, s.u8()?, env, u8), + DataType::UInt16 => series_to_iovec!(resource, s.u16()?, env, u16), + DataType::UInt32 => series_to_iovec!(resource, s.u32()?, env, u32), + DataType::UInt64 => series_to_iovec!(resource, s.u64()?, env, u64), + DataType::Float32 => series_to_iovec!(resource, s.f32()?, env, f32), + DataType::Float64 => series_to_iovec!(resource, s.f64()?, env, f64), + DataType::Date => series_to_iovec!(resource, s.date()?.physical(), env, i32), + DataType::Time => series_to_iovec!(resource, s.time()?.physical(), env, i64), DataType::Datetime(_, None) => { - series_to_iovec!(resource, s, env, datetime, i64) + series_to_iovec!(resource, s.datetime()?.physical(), env, i64) } DataType::Duration(_) => { - series_to_iovec!(resource, s, env, duration, i64) + series_to_iovec!(resource, s.duration()?.physical(), env, i64) } - DataType::Categorical(Some(_), _) => { - let cat_series = s.cast(&DataType::UInt32)?; - - series_to_iovec!(resource, cat_series, env, u32, u32) + DataType::Categorical(_, _) => { + series_to_iovec!(resource, s.cast(&DataType::UInt32)?.u32()?, env, u32) } dt => panic!("to_iovec/1 not implemented for {dt:?}"), } diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index d230c7b14..abf80da20 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -317,8 +317,8 @@ pub fn expr_fill_missing_with_strategy(data: ExExpr, strategy: &str) -> ExExpr { let result_expr = match strategy { "backward" => expr.fill_null_with_strategy(FillNullStrategy::Backward(None)), "forward" => expr.fill_null_with_strategy(FillNullStrategy::Forward(None)), - "min" => expr.clone().fill_null(expr.min()), - "max" => expr.clone().fill_null(expr.max()), + "min" => expr.fill_null_with_strategy(FillNullStrategy::Min), + "max" => expr.fill_null_with_strategy(FillNullStrategy::Max), "mean" => expr.clone().fill_null(expr.mean()), _other => panic!("unknown strategy {strategy:?}"), }; @@ -408,15 +408,17 @@ pub fn expr_pow(left: ExExpr, right: ExExpr) -> ExExpr { #[rustler::nif] pub fn expr_log(left: ExExpr, base: f64) -> ExExpr { let left_expr = left.clone_inner(); + let base_expr = base.lit(); - ExExpr::new(left_expr.log(base)) + ExExpr::new(left_expr.log(base_expr)) } #[rustler::nif] pub fn expr_log_natural(left: ExExpr) -> ExExpr { let left_expr = left.clone_inner(); + let base_expr = std::f64::consts::E.lit(); - ExExpr::new(left_expr.log(std::f64::consts::E)) + ExExpr::new(left_expr.log(base_expr)) } #[rustler::nif] @@ -802,21 +804,13 @@ pub fn expr_sort( pub fn expr_argsort( expr: ExExpr, descending: bool, - maintain_order: bool, - multithreaded: bool, + _maintain_order: bool, + _multithreaded: bool, nulls_last: bool, ) -> ExExpr { let expr = expr.clone_inner(); - let opts = SortOptions { - descending, - maintain_order, - multithreaded, - nulls_last, - limit: None, - }; - - ExExpr::new(expr.arg_sort(opts)) + ExExpr::new(expr.arg_sort(descending, nulls_last)) } #[rustler::nif] @@ -1178,7 +1172,7 @@ pub fn expr_field(expr: ExExpr, name: &str) -> ExExpr { #[rustler::nif] pub fn expr_json_decode(expr: ExExpr, ex_dtype: ExSeriesDtype) -> ExExpr { let dtype = DataType::try_from(&ex_dtype).unwrap(); - let expr = expr.clone_inner().str().json_decode(Some(dtype), None); + let expr = expr.clone_inner().str().json_decode(dtype); ExExpr::new(expr) } diff --git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs index 9fb00d842..4e48c9f17 100644 --- a/native/explorer/src/lazyframe.rs +++ b/native/explorer/src/lazyframe.rs @@ -42,7 +42,9 @@ pub fn lf_compute(data: ExLazyFrame) -> Result { #[rustler::nif(schedule = "DirtyCpu")] pub fn lf_fetch(data: ExLazyFrame, n_rows: usize) -> Result { - Ok(ExDataFrame::new(data.clone_inner().fetch(n_rows)?)) + Ok(ExDataFrame::new( + data.clone_inner().slice(0, n_rows as u32).collect()?, + )) } #[rustler::nif] @@ -117,13 +119,15 @@ pub fn lf_dtypes(data: ExLazyFrame) -> Result, ExplorerError> #[rustler::nif] pub fn lf_select(data: ExLazyFrame, columns: Vec<&str>) -> Result { - let lf = data.clone_inner().select(&[cols(columns)]); + let lf = data.clone_inner().select([cols(columns).as_expr()]); Ok(ExLazyFrame::new(lf)) } #[rustler::nif] pub fn lf_drop(data: ExLazyFrame, columns: Vec<&str>) -> Result { - let lf = data.clone_inner().select(&[col("*").exclude(columns)]); + let lf = data + .clone_inner() + .select([all().exclude_cols(columns).as_expr()]); Ok(ExLazyFrame::new(lf)) } @@ -142,7 +146,7 @@ pub fn lf_slice( let groups_exprs: Vec = groups.iter().map(col).collect(); lf.group_by_opt_order(groups_exprs, stable_groups) .agg([col("*").slice(offset, length)]) - .explode([col("*").exclude(groups)]) + .explode(all().exclude_cols(groups)) }; Ok(ExLazyFrame::new(result_lf)) @@ -150,13 +154,13 @@ pub fn lf_slice( #[rustler::nif] pub fn lf_explode(data: ExLazyFrame, columns: Vec<&str>) -> Result { - let lf = data.clone_inner().explode(columns); + let lf = data.clone_inner().explode(cols(columns)); Ok(ExLazyFrame::new(lf)) } #[rustler::nif] pub fn lf_unnest(data: ExLazyFrame, columns: Vec<&str>) -> Result { - let lf = data.clone_inner().unnest(columns); + let lf = data.clone_inner().unnest(cols(columns), None); Ok(ExLazyFrame::new(lf)) } @@ -212,8 +216,7 @@ pub fn lf_distinct( columns_to_keep: Option>, ) -> Result { let df = data.clone_inner(); - let subset = subset.iter().map(|x| x.into()).collect::>(); - let new_df = df.unique_stable(Some(subset), UniqueKeepStrategy::First); + let new_df = df.unique_stable(Some(cols(subset)), UniqueKeepStrategy::First); match columns_to_keep { Some(columns) => Ok(ExLazyFrame::new(new_df.select(ex_expr_to_exprs(columns)))), @@ -257,7 +260,9 @@ pub fn lf_summarise_with( stable_groups, ) .agg(aggs) - .select(&[col("*").exclude(["__explorer_literal_for_group__"])]) + .select(&[all() + .exclude_cols(["__explorer_literal_for_group__"]) + .as_expr()]) } else { ldf.group_by_opt_order(groups, stable_groups).agg(aggs) }; @@ -279,10 +284,10 @@ pub fn lf_rename_columns( #[rustler::nif] pub fn lf_drop_nils( data: ExLazyFrame, - subset: Option>, + subset: Option>, ) -> Result { let ldf = data.clone_inner(); - let columns = subset.map(ex_expr_to_exprs); + let columns: Option = subset.map(cols); Ok(ExLazyFrame::new(ldf.drop_nulls(columns))) } @@ -297,8 +302,8 @@ pub fn lf_pivot_longer( ) -> Result { let ldf = data.clone_inner(); let unpivot_opts = polars::lazy::dsl::UnpivotArgsDSL { - index: to_lazy_selectors(id_vars), - on: to_lazy_selectors(value_vars), + index: by_name(id_vars, true), + on: by_name(value_vars, true), variable_name: Some(names_to.into()), value_name: Some(values_to.into()), }; @@ -306,13 +311,6 @@ pub fn lf_pivot_longer( Ok(ExLazyFrame::new(new_df)) } -fn to_lazy_selectors(values: Vec) -> Vec { - values - .into_iter() - .map(Selector::from) - .collect::>() -} - #[rustler::nif] pub fn lf_join( data: ExLazyFrame, @@ -397,7 +395,7 @@ pub fn lf_join_asof( .join_builder() .with(ldf1) .coalesce(JoinCoalesce::CoalesceColumns) - .how(JoinType::AsOf(AsOfOptions { + .how(JoinType::AsOf(Box::new(AsOfOptions { strategy, tolerance: None, // TODO: provide option @@ -408,7 +406,7 @@ pub fn lf_join_asof( allow_eq: true, // TODO: add a check? Note that Polars prints a warning if `check_sortedness=true` when `by` is provided check_sortedness: false, - })) + }))) .left_on(ex_expr_to_exprs(left_on)) .right_on(ex_expr_to_exprs(right_on)) .suffix(suffix) diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs index 08607ed49..6c81e5e1c 100644 --- a/native/explorer/src/lazyframe/io.rs +++ b/native/explorer/src/lazyframe/io.rs @@ -21,10 +21,11 @@ pub fn lf_from_parquet( let cols: Vec = if let Some(cols) = columns { cols.iter().map(col).collect() } else { - vec![all()] + vec![all().as_expr()] }; - let lf = LazyFrame::scan_parquet(filename, options)?.select(cols); + let path = PlPath::from_str(filename); + let lf = LazyFrame::scan_parquet(path, options)?.select(cols); Ok(ExLazyFrame::new(lf)) } @@ -45,9 +46,11 @@ pub fn lf_from_parquet_cloud( let cols: Vec = if let Some(cols) = columns { cols.iter().map(col).collect() } else { - vec![all()] + vec![all().as_expr()] }; - let lf = LazyFrame::scan_parquet(ex_entry.to_string(), options)? + + let path = PlPath::from_string(ex_entry.to_string()); + let lf = LazyFrame::scan_parquet(path, options)? .with_comm_subplan_elim(false) .with_new_streaming(true) .select(cols); @@ -79,14 +82,16 @@ pub fn lf_to_parquet( let lf = data.clone_inner(); if streaming { - let options = ParquetWriteOptions { + let parquet_write_options = ParquetWriteOptions { compression, statistics: StatisticsOptions::empty(), row_group_size: None, data_page_size: None, ..Default::default() }; - let target = std::path::PathBuf::from(filename); + + let sink_target = SinkTarget::Path(PlPath::from_str(filename)); + let sink_options = SinkOptions { maintain_order: false, ..Default::default() @@ -94,7 +99,7 @@ pub fn lf_to_parquet( let _ = lf .with_comm_subplan_elim(false) - .sink_parquet(SinkTarget::Path(target.into()), options, None, sink_options)? + .sink_parquet(sink_target, parquet_write_options, None, sink_options)? .collect(); Ok(()) } else { @@ -129,7 +134,8 @@ pub fn lf_to_parquet_cloud( data_page_size: None, ..Default::default() }; - let target = std::path::PathBuf::from(ex_entry.to_string()); + let sink_target = SinkTarget::Path(PlPath::from_string(ex_entry.to_string())); + let sink_options = SinkOptions { maintain_order: false, ..Default::default() @@ -137,12 +143,7 @@ pub fn lf_to_parquet_cloud( let _ = lf .with_comm_subplan_elim(false) - .sink_parquet( - SinkTarget::Path(target.into()), - options, - cloud_options, - sink_options, - )? + .sink_parquet(sink_target, options, cloud_options, sink_options)? .collect(); Ok(()) } @@ -161,7 +162,8 @@ pub fn lf_to_parquet_cloud( #[rustler::nif(schedule = "DirtyIo")] pub fn lf_from_ipc(filename: &str) -> Result { - let lf = LazyFrame::scan_ipc(filename, Default::default())?; + let sink_target = PlPath::from_str(filename); + let lf = LazyFrame::scan_ipc(sink_target, Default::default(), Default::default())?; Ok(ExLazyFrame::new(lf)) } @@ -176,7 +178,8 @@ pub fn lf_to_ipc( // Select the compression algorithm. let compression = match compression { Some("lz4") => Some(IpcCompression::LZ4), - Some("zstd") => Some(IpcCompression::ZSTD), + // ZSTD with a compression level of 3 is the default. + Some("zstd") => Some(IpcCompression::default()), _ => None, }; @@ -187,14 +190,14 @@ pub fn lf_to_ipc( compression, ..Default::default() }; - let target = std::path::PathBuf::from(filename); + let sink_target = SinkTarget::Path(PlPath::from_str(filename)); let sink_options = SinkOptions { maintain_order: false, ..Default::default() }; let _ = lf .with_comm_subplan_elim(false) - .sink_ipc(SinkTarget::Path(target.into()), options, None, sink_options)? + .sink_ipc(sink_target, options, None, sink_options)? .collect(); Ok(()) } else { @@ -220,7 +223,8 @@ pub fn lf_to_ipc_cloud( // Select the compression algorithm. let compression = match compression { Some("lz4") => Some(IpcCompression::LZ4), - Some("zstd") => Some(IpcCompression::ZSTD), + // ZSTD with a compression level of 3 is the default. + Some("zstd") => Some(IpcCompression::default()), _ => None, }; @@ -228,19 +232,14 @@ pub fn lf_to_ipc_cloud( compression, ..Default::default() }; - let target = std::path::PathBuf::from(ex_entry.to_string()); + let sink_target = SinkTarget::Path(PlPath::from_string(ex_entry.to_string())); let sink_options = SinkOptions { maintain_order: false, ..Default::default() }; let _ = lf .with_comm_subplan_elim(false) - .sink_ipc( - SinkTarget::Path(target.into()), - options, - cloud_options, - sink_options, - )? + .sink_ipc(sink_target, options, cloud_options, sink_options)? .collect(); Ok(()) @@ -269,7 +268,9 @@ pub fn lf_from_csv( _ => CsvEncoding::Utf8, }; - let df = LazyCsvReader::new(filename) + let path = PlPath::from_str(filename); + + let df = LazyCsvReader::new(path) .with_infer_schema_length(infer_schema_length) .with_has_header(has_header) .with_try_parse_dates(parse_dates) @@ -311,7 +312,7 @@ pub fn lf_to_csv( serialize_options, ..Default::default() }; - let target = std::path::PathBuf::from(filename); + let sink_target = SinkTarget::Path(PlPath::from_str(filename)); let sink_options = SinkOptions { maintain_order: true, mkdir: true, @@ -320,7 +321,7 @@ pub fn lf_to_csv( let _ = lf .with_comm_subplan_elim(false) - .sink_csv(SinkTarget::Path(target.into()), options, None, sink_options)? + .sink_csv(sink_target, options, None, sink_options)? .collect(); Ok(()) @@ -348,7 +349,10 @@ pub fn lf_from_ndjson( let batch_size = NonZeroUsize::new(batch_size).ok_or(ExplorerError::Other( "\"batch_size\" expected to be non zero.".to_string(), ))?; - let lf = LazyJsonLineReader::new(filename) + + let path = PlPath::from_str(&filename); + + let lf = LazyJsonLineReader::new(path) .with_infer_schema_length(infer_schema_length.and_then(NonZeroUsize::new)) .with_batch_size(Some(batch_size)) .finish()?; diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index cbb69123a..bafacb909 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -587,6 +587,7 @@ pub fn s_fill_missing_with_bin( pub fn s_fill_missing_with_date(series: ExSeries, date: ExDate) -> Result { let s = series .date()? + .physical() .fill_null_with_values(date.into())? .cast(&DataType::Date)? .into_series(); @@ -607,6 +608,7 @@ pub fn s_fill_missing_with_datetime( let s = series .datetime()? + .physical() .fill_null_with_values(timestamp)? .cast(series.dtype())? .into_series(); @@ -1098,20 +1100,22 @@ pub fn s_quantile<'a>( let dtype = s.dtype(); let strategy = parse_quantile_interpol_options(strategy); match dtype { - DataType::Date => match s.date()?.quantile(quantile, strategy)? { + DataType::Date => match s.date()?.physical().quantile(quantile, strategy)? { None => Ok(None::.encode(env)), Some(days) => Ok(ExDate::from(days as i32).encode(env)), }, - DataType::Time => match s.time()?.quantile(quantile, strategy)? { + DataType::Time => match s.time()?.physical().quantile(quantile, strategy)? { None => Ok(None::.encode(env)), Some(microseconds) => Ok(ExTime::from(microseconds as i64).encode(env)), }, - DataType::Datetime(unit, None) => match s.datetime()?.quantile(quantile, strategy)? { - None => Ok(None::.encode(env)), - Some(time) => Ok(encode_naive_datetime(time as i64, *unit, env) - .unwrap() - .encode(env)), - }, + DataType::Datetime(unit, None) => { + match s.datetime()?.physical().quantile(quantile, strategy)? { + None => Ok(None::.encode(env)), + Some(time) => Ok(encode_naive_datetime(time as i64, *unit, env) + .unwrap() + .encode(env)), + } + } _ => encoding::resource_term_from_value( &s.resource, s.quantile_reduce(quantile, strategy)? @@ -1126,24 +1130,33 @@ pub fn s_quantile<'a>( #[rustler::nif(schedule = "DirtyCpu")] pub fn s_peak_max(s: ExSeries) -> Result { let ca = match s.dtype() { - DataType::Int8 => peak_max(s.i8()?), - DataType::Int16 => peak_max(s.i16()?), - DataType::Int32 => peak_max(s.i32()?), - DataType::Int64 => peak_max(s.i64()?), - - DataType::UInt8 => peak_max(s.u8()?), - DataType::UInt16 => peak_max(s.u16()?), - DataType::UInt32 => peak_max(s.u32()?), - DataType::UInt64 => peak_max(s.u64()?), - - DataType::Float32 => peak_max(s.f32()?), - DataType::Float64 => peak_max(s.f64()?), - DataType::Decimal(_, _) => peak_max(s.decimal()?), - - DataType::Date => peak_max(s.date()?), - DataType::Time => peak_max(s.time()?), - DataType::Datetime(_unit, None) => peak_max(s.datetime()?), - DataType::Duration(_unit) => peak_max(s.duration()?), + DataType::Int8 => peak_max_with_start_end(s.i8()?, Some(0i8), Some(0i8)), + DataType::Int16 => peak_max_with_start_end(s.i16()?, Some(0i16), Some(0i16)), + DataType::Int32 => peak_max_with_start_end(s.i32()?, Some(0i32), Some(0i32)), + DataType::Int64 => peak_max_with_start_end(s.i64()?, Some(0i64), Some(0i64)), + + DataType::UInt8 => peak_max_with_start_end(s.u8()?, Some(0u8), Some(0u8)), + DataType::UInt16 => peak_max_with_start_end(s.u16()?, Some(0u16), Some(0u16)), + DataType::UInt32 => peak_max_with_start_end(s.u32()?, Some(0u32), Some(0u32)), + DataType::UInt64 => peak_max_with_start_end(s.u64()?, Some(0u64), Some(0u64)), + + DataType::Float32 => peak_max_with_start_end(s.f32()?, Some(0f32), Some(0f32)), + DataType::Float64 => peak_max_with_start_end(s.f64()?, Some(0f64), Some(0f64)), + + DataType::Decimal(_, _) => { + peak_max_with_start_end(s.decimal()?.physical(), Some(0.into()), Some(0.into())) + } + DataType::Date => peak_max_with_start_end(s.date()?.physical(), Some(0), Some(0)), + DataType::Time => { + peak_max_with_start_end(s.time()?.physical(), Some(0.into()), Some(0.into())) + } + DataType::Datetime(_unit, None) => { + peak_max_with_start_end(s.datetime()?.physical(), Some(0.into()), Some(0.into())) + } + DataType::Duration(_unit) => { + peak_max_with_start_end(s.duration()?.physical(), Some(0.into()), Some(0.into())) + } + dt => panic!("peak_max/1 not implemented for {dt:?}"), }; @@ -1153,24 +1166,31 @@ pub fn s_peak_max(s: ExSeries) -> Result { #[rustler::nif(schedule = "DirtyCpu")] pub fn s_peak_min(s: ExSeries) -> Result { let ca = match s.dtype() { - DataType::Int8 => peak_min(s.i8()?), - DataType::Int16 => peak_min(s.i16()?), - DataType::Int32 => peak_min(s.i32()?), - DataType::Int64 => peak_min(s.i64()?), - - DataType::UInt8 => peak_min(s.u8()?), - DataType::UInt16 => peak_min(s.u16()?), - DataType::UInt32 => peak_min(s.u32()?), - DataType::UInt64 => peak_min(s.u64()?), - - DataType::Float32 => peak_min(s.f32()?), - DataType::Float64 => peak_min(s.f64()?), - DataType::Decimal(_, _) => peak_min(s.decimal()?), - - DataType::Date => peak_min(s.date()?), - DataType::Time => peak_min(s.time()?), - DataType::Datetime(_unit, None) => peak_min(s.datetime()?), - DataType::Duration(_unit) => peak_min(s.duration()?), + DataType::Int8 => peak_min_with_start_end(s.i8()?, Some(0i8), Some(0i8)), + DataType::Int16 => peak_min_with_start_end(s.i16()?, Some(0i16), Some(0i16)), + DataType::Int32 => peak_min_with_start_end(s.i32()?, Some(0i32), Some(0i32)), + DataType::Int64 => peak_min_with_start_end(s.i64()?, Some(0i64), Some(0i64)), + + DataType::UInt8 => peak_min_with_start_end(s.u8()?, Some(0u8), Some(0u8)), + DataType::UInt16 => peak_min_with_start_end(s.u16()?, Some(0u16), Some(0u16)), + DataType::UInt32 => peak_min_with_start_end(s.u32()?, Some(0u32), Some(0u32)), + DataType::UInt64 => peak_min_with_start_end(s.u64()?, Some(0u64), Some(0u64)), + + DataType::Float32 => peak_min_with_start_end(s.f32()?, Some(0f32), Some(0f32)), + DataType::Float64 => peak_min_with_start_end(s.f64()?, Some(0f64), Some(0f64)), + + DataType::Decimal(_, _) => { + peak_min_with_start_end(s.decimal()?.physical(), Some(0.into()), Some(0.into())) + } + DataType::Date => peak_min_with_start_end(s.date()?.physical(), Some(0), Some(0)), + DataType::Time => peak_min_with_start_end(s.time()?.physical(), Some(0), Some(0)), + DataType::Datetime(_unit, None) => { + peak_min_with_start_end(s.datetime()?.physical(), Some(0.into()), Some(0.into())) + } + DataType::Duration(_unit) => { + peak_min_with_start_end(s.duration()?.physical(), Some(0.into()), Some(0.into())) + } + dt => panic!("peak_min/1 not implemented for {dt:?}"), }; @@ -1214,52 +1234,42 @@ pub fn cast_str_to_f64(atom: &str) -> f64 { #[rustler::nif(schedule = "DirtyCpu")] pub fn s_categories(s: ExSeries) -> Result { match s.dtype() { - DataType::Categorical(Some(mapping), _) => { - let size = mapping.len() as u32; - let categories: Vec<&str> = (0..size).map(|id| mapping.get(id)).collect(); - let series = Series::new("categories".into(), &categories); - Ok(ExSeries::new(series)) + DataType::Categorical(_, arc_mapping) => { + // https://github.com/narwhals-dev/narwhals/issues/3097 + let mapping = arc_mapping.as_ref(); + let categories = (0..mapping.num_cats_upper_bound()) + .flat_map(|i| mapping.cat_to_str(i as CatSize)) + .collect(); + Ok(ExSeries::new(categories)) } _ => panic!("Cannot get categories from non categorical series"), } } #[rustler::nif(schedule = "DirtyCpu")] -pub fn s_categorise(s: ExSeries, cat: ExSeries) -> Result { - match cat.dtype() { - DataType::Categorical(Some(mapping), _) => { - let categories = mapping.get_categories(); - let m = categories.len(); - - let chunks: ChunkedArray = if s.dtype() == &DataType::String { - let ca = s.str()?; - if m >= 10 { - let hash_map: std::collections::HashMap<&str, u32> = categories - .values_iter() - .enumerate() - .map(|(i, v)| (v, i as u32)) - .collect(); - ca.into_iter() - .map(|opt| opt.and_then(|slice| hash_map.get(slice).copied())) - .collect() - } else { - ca.into_iter() - .map(|opt| opt.and_then(|slice| mapping.find(slice))) - .collect() - } - } else { - s.cast(&DataType::UInt32)?.u32()?.clone() - }; - - let categorical_chunks = unsafe { - CategoricalChunked::from_cats_and_rev_map_unchecked( - chunks, - mapping.clone(), - false, - CategoricalOrdering::default(), - ) - }; - Ok(ExSeries::new(categorical_chunks.into_series())) +pub fn s_categorise( + to_categorise: ExSeries, + categories: ExSeries, +) -> Result { + let cat_dtype = categories.dtype(); + + match cat_dtype { + DataType::Categorical(_, arc_mapping) => { + // https://github.com/narwhals-dev/narwhals/issues/3097 + let mapping = arc_mapping.as_ref(); + let categories: Vec<&str> = (0..mapping.num_cats_upper_bound()) + .flat_map(|i| mapping.cat_to_str(i as CatSize)) + .collect(); + + // Build an analogous `Enum` dtype. As `Enum`s are static, this will ensure that any + // elements in `to_categorise` which don't belong to a category from `categories` get + // mapped to `null` instead of adding new category. + let fcat = FrozenCategories::new(categories)?; + let fmap = fcat.mapping(); + let enum_dtype = DataType::Enum(fcat.clone(), fmap.clone()); + + let categorised = to_categorise.cast(&enum_dtype)?.cast(cat_dtype)?; + Ok(ExSeries::new(categorised)) } _ => panic!("Cannot get categories from non categorical or string series"), } @@ -1926,7 +1936,7 @@ pub fn s_json_decode(s: ExSeries, ex_dtype: ExSeriesDtype) -> Result ExSeries { pub fn s_from_list_decimal( name: &str, val: Term, - precision: Option, - scale: Option, + precision: usize, + scale: usize, ) -> Result { let iterator = val .decode::() @@ -231,35 +232,41 @@ pub fn s_from_list_decimal( let values: Vec = iterator .map(|item| match item.get_type() { - TermType::Integer => { - let s = scale.unwrap_or(0); - item.decode::() - .map(|num| AnyValue::Decimal(num, s)) - .map_err(|err| { - ExplorerError::Other(format!("int number is too big for an i128: {err:?}")) - }) - } + TermType::Integer => item + .decode::() + .map(|num| AnyValue::Decimal(num, precision, scale)) + .map_err(|err| { + ExplorerError::Other(format!("int number is too big for an i128: {err:?}")) + }), TermType::Map => item .decode::() .map_err(|err| { ExplorerError::Other(format!( - "cannot decode a valid decimal from term; check that `coef` fits into an `i128`. error: {err:?}" + "cannot decode a valid decimal from term. error: {err:?}" )) }) - .and_then(|ex_decimal| Ok(AnyValue::Decimal(ex_decimal.signed_coef()?, ex_decimal.scale()))), + .and_then(|ex_decimal| { + let coef = ex_decimal.signed_coef()?; + let integer = decimal_integer(coef, ex_decimal.scale(), scale); + Ok(AnyValue::Decimal(integer, precision, scale)) + }), TermType::Atom => Ok(AnyValue::Null), TermType::Float => item .decode::() .map(|num| match native_float_to_decimal_parts(num) { - Some((integer, scale)) => AnyValue::Decimal(integer, scale), + Some((inferred_integer, inferred_scale)) => { + let integer = decimal_integer(inferred_integer, inferred_scale, scale); + AnyValue::Decimal(integer, precision, scale) + } None => AnyValue::Null, }) .map_err(|err| { ExplorerError::Other(format!("float number is too big f64: {err:?}")) }), + term_type => Err(ExplorerError::Other(format!( "from_list/2 for decimals not implemented for {term_type:?}" ))), @@ -268,27 +275,29 @@ pub fn s_from_list_decimal( let mut series = Series::from_any_values(name.into(), &values, true)?; - match series.dtype() { - DataType::Decimal(result_precision, result_scale) => { - let p: Option = Some(precision.unwrap_or(result_precision.unwrap_or(38))); - let s: Option = Some(scale.unwrap_or(result_scale.unwrap_or(0))); - - if *result_precision != p || *result_scale != s { - series = series.cast(&DataType::Decimal(p, s))?; - } - } + series = match series.dtype() { + DataType::Decimal(_precision, _scale) => series, // An empty list will result in the `Null` dtype. - DataType::Null => { - let p = Some(precision.unwrap_or(38)); - let s = Some(scale.unwrap_or(0)); - series = series.cast(&DataType::Decimal(p, s))?; - } + DataType::Null => series.cast(&DataType::Decimal(precision, scale))?, other_dtype => panic!("expected dtype to be Decimal. found: {other_dtype:?}"), - } + }; Ok(ExSeries::new(series)) } +// Move the decimal point to match the required scale. +fn decimal_integer(integer: i128, integer_scale: usize, required_scale: usize) -> i128 { + match integer_scale.cmp(&required_scale) { + cmp::Ordering::Less => { + integer * 10i128.pow((required_scale as u32) - (integer_scale as u32)) + } + cmp::Ordering::Greater => { + integer / 10i128.pow((integer_scale as u32) - (required_scale as u32)) + } + cmp::Ordering::Equal => integer, + } +} + fn native_float_to_decimal_parts(float: f64) -> Option<(i128, usize)> { let float_str = float.to_string(); @@ -394,9 +403,14 @@ pub fn s_from_list_binary(name: &str, val: Term) -> NifResult { #[rustler::nif(schedule = "DirtyCpu")] pub fn s_from_list_categories(name: &str, val: Term) -> NifResult { let decoded = val.decode::>>()?; + let categories = Categories::new( + PlSmallStr::EMPTY, + PlSmallStr::EMPTY, + CategoricalPhysical::U32, + ); Ok(ExSeries::new( Series::new(name.into(), decoded.as_slice()) - .cast(&DataType::Categorical(None, CategoricalOrdering::default())) + .cast(&DataType::from_categories(categories)) .map_err(|err| { let message = format!( "from_list/2 cannot cast a string series to categories series: {err:?}" diff --git a/native/explorer/src/series/log.rs b/native/explorer/src/series/log.rs index d2c6d1ee0..10f2c84fc 100644 --- a/native/explorer/src/series/log.rs +++ b/native/explorer/src/series/log.rs @@ -30,13 +30,15 @@ pub fn s_log(s: ExSeries, base: Term) -> Result { } }; - let s = s.log(float); + let float_series = Series::new("".into(), [float]); + let s = s.log(&float_series); Ok(ExSeries::new(s)) } #[rustler::nif(schedule = "DirtyCpu")] pub fn s_log_natural(s: ExSeries) -> Result { - Ok(ExSeries::new(s.log(E))) + let e = Series::new("".into(), [E]); + Ok(ExSeries::new(s.log(&e))) } #[rustler::nif(schedule = "DirtyCpu")] diff --git a/test/explorer/data_frame/grouped_test.exs b/test/explorer/data_frame/grouped_test.exs index 9175debe8..118b80513 100644 --- a/test/explorer/data_frame/grouped_test.exs +++ b/test/explorer/data_frame/grouped_test.exs @@ -621,11 +621,11 @@ defmodule Explorer.DataFrame.GroupedTest do assert DF.to_columns(df2, atom_keys: true) == %{ a: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - b: [1.0, 4.0, 6.0, 8.0, 10.0, 6, 14.0, 16.0, 18.0, 20.0], - c: [0.25, 1.75, 2.75, 3.75, 4.75, 1.5, 6.75, 7.75, 8.75, 9.75], + b: [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0], + c: [1.0, 1.75, 2.75, 3.75, 4.75, 6.0, 6.75, 7.75, 8.75, 9.75], d: [1.5, 1.5, 2.5, 3.5, 4.5, 6.5, 6.5, 7.5, 8.5, 9.5], - e: [1.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, 9.0], - f: [1.0, 5.0, 8.0, 11.0, 14.0, 6.0, 20.0, 23.0, 26.0, 29.0], + e: [2.0, 1.0, 2.0, 3.0, 4.0, 12.0, 6.0, 7.0, 8.0, 9.0], + f: [2.0, 5.0, 8.0, 11.0, 14.0, 12.0, 20.0, 23.0, 26.0, 29.0], g: [ nil, 0.7071067811865476, diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index a0c4393bf..ba60a2eb2 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -1386,11 +1386,11 @@ defmodule Explorer.DataFrameTest do assert DF.to_columns(df1, atom_keys: true) == %{ a: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - b: [1.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0], - c: [0.25, 1.75, 2.75, 3.75, 4.75, 5.75, 6.75, 7.75, 8.75, 9.75], + b: [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0], + c: [1.0, 1.75, 2.75, 3.75, 4.75, 5.75, 6.75, 7.75, 8.75, 9.75], d: [1.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5], - e: [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], - f: [1.0, 5.0, 8.0, 11.0, 14.0, 17.0, 20.0, 23.0, 26.0, 29.0], + e: [2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], + f: [2.0, 5.0, 8.0, 11.0, 14.0, 17.0, 20.0, 23.0, 26.0, 29.0], g: [ nil, 0.7071067811865476, @@ -1405,7 +1405,7 @@ defmodule Explorer.DataFrameTest do ], h: [ 1.0, - 1.6666666666666667, + 1.6666666666666665, 2.4285714285714284, 3.2666666666666666, 4.161290322580645, @@ -1558,9 +1558,9 @@ defmodule Explorer.DataFrameTest do assert DF.to_columns(df1, atom_keys: true) == %{ a: [1, 2, 3, 4, 5], - b: [5, 3, 2, 1, 4], - c: [1, 3, 2, 4, 5], - d: [4, 2, 5, 0, 0] + b: [4, 3, 5, 1, 2], + c: [1, 2, 4, 5, 3], + d: [4, 5, 2, 0, 0] } assert df1.dtypes == %{ @@ -1753,7 +1753,7 @@ defmodule Explorer.DataFrameTest do df2_e = df2[:e] assert Explorer.Series.to_list(df2_e) == [ - Decimal.new("3.1415"), + Decimal.new("3.1416"), Decimal.new("2.0000"), nil ] @@ -1771,7 +1771,7 @@ defmodule Explorer.DataFrameTest do df3_f = df3[:f] assert Explorer.Series.to_list(df3_f) == [ - Decimal.new("3.1415000"), + Decimal.new("3.1416000"), Decimal.new("2.0000000"), nil ] @@ -1781,7 +1781,7 @@ defmodule Explorer.DataFrameTest do df4_g = df4[:g] assert Explorer.Series.to_list(df4_g) == [ - Decimal.new("3.1415"), + Decimal.new("3.1416"), Decimal.new("2.0000"), nil ] @@ -4209,8 +4209,8 @@ defmodule Explorer.DataFrameTest do df1 = DF.sample(df, 3, seed: 100) assert DF.to_columns(df1, atom_keys: true) == %{ - letters: ["j", "f", "h"], - numbers: [10, 6, 8] + letters: ["h", "j", "i"], + numbers: [8, 10, 9] } end @@ -4219,10 +4219,7 @@ defmodule Explorer.DataFrameTest do df1 = DF.sample(df, 0.2, seed: 100) - assert DF.to_columns(df1, atom_keys: true) == %{ - letters: ["f", "g"], - numbers: [6, 7] - } + assert DF.to_columns(df1, atom_keys: true) == %{letters: ["h", "j"], numbers: [8, 10]} end test "sampling by integer with same size of the dataframe" do @@ -4243,8 +4240,8 @@ defmodule Explorer.DataFrameTest do df1 = DF.sample(df, 10, seed: 100, shuffle: true) assert DF.to_columns(df1, atom_keys: true) == %{ - letters: ["h", "j", "c", "a", "e", "b", "d", "i", "f", "g"], - numbers: [8, 10, 3, 1, 5, 2, 4, 9, 6, 7] + letters: ["d", "h", "i", "a", "f", "b", "c", "g", "e", "j"], + numbers: [4, 8, 9, 1, 6, 2, 3, 7, 5, 10] } end @@ -4266,8 +4263,8 @@ defmodule Explorer.DataFrameTest do df1 = DF.sample(df, 1.0, seed: 100, shuffle: true) assert DF.to_columns(df1, atom_keys: true) == %{ - letters: ["h", "j", "c", "a", "e", "b", "d", "i", "f", "g"], - numbers: [8, 10, 3, 1, 5, 2, 4, 9, 6, 7] + letters: ["d", "h", "i", "a", "f", "b", "c", "g", "e", "j"], + numbers: [4, 8, 9, 1, 6, 2, 3, 7, 5, 10] } end end @@ -4279,8 +4276,8 @@ defmodule Explorer.DataFrameTest do df1 = DF.shuffle(df, seed: 100) assert DF.to_columns(df1, atom_keys: true) == %{ - letters: ["h", "j", "c", "a", "e", "b", "d", "i", "f", "g"], - numbers: [8, 10, 3, 1, 5, 2, 4, 9, 6, 7] + letters: ["d", "h", "i", "a", "f", "b", "c", "g", "e", "j"], + numbers: [4, 8, 9, 1, 6, 2, 3, 7, 5, 10] } end end @@ -4862,7 +4859,8 @@ defmodule Explorer.DataFrameTest do property "should be able to create a DataFrame from valid rows" do check all( - dtypes <- Explorer.Generator.dtypes(), + # TODO: Remove `exclude: :category` after #1011 is resolved. + dtypes <- Explorer.Generator.dtypes(exclude: :category), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4872,7 +4870,8 @@ defmodule Explorer.DataFrameTest do property "should be able to create a DataFrame from valid columns" do check all( - dtypes <- Explorer.Generator.dtypes(), + # TODO: Remove `exclude: :category` after #1011 is resolved. + dtypes <- Explorer.Generator.dtypes(exclude: :category), cols <- Explorer.Generator.columns(dtypes), max_runs: 1_000 ) do diff --git a/test/explorer/series/duration_test.exs b/test/explorer/series/duration_test.exs index ec2489d9e..25effc008 100644 --- a/test/explorer/series/duration_test.exs +++ b/test/explorer/series/duration_test.exs @@ -10,7 +10,7 @@ defmodule Explorer.Series.DurationTest do @one_hour_us 3600 * 1_000_000 @one_hour_duration_ms %Duration{value: @one_hour_ms, precision: :millisecond} @one_hour_duration_us %Duration{value: @one_hour_us, precision: :microsecond} - @one_day_duration_ms %Duration{value: 24 * @one_hour_ms, precision: :millisecond} + @one_day_duration_us %Duration{value: 24 * @one_hour_us, precision: :microsecond} describe "list" do test "from a list of integers" do @@ -410,24 +410,24 @@ defmodule Explorer.Series.DurationTest do aug_21_s = Series.from_list([@aug_21]) diff_s = Series.subtract(aug_21_s, aug_20_s) - assert diff_s.dtype == {:duration, :millisecond} - assert Series.to_list(diff_s) == [@one_day_duration_ms] + assert diff_s.dtype == {:duration, :microsecond} + assert Series.to_list(diff_s) == [@one_day_duration_us] end test "Date - date" do aug_20_s = Series.from_list([@aug_20]) diff_s = Series.subtract(@aug_21, aug_20_s) - assert diff_s.dtype == {:duration, :millisecond} - assert Series.to_list(diff_s) == [@one_day_duration_ms] + assert diff_s.dtype == {:duration, :microsecond} + assert Series.to_list(diff_s) == [@one_day_duration_us] end test "date - Date" do aug_21_s = Series.from_list([@aug_21]) diff_s = Series.subtract(aug_21_s, @aug_20) - assert diff_s.dtype == {:duration, :millisecond} - assert Series.to_list(diff_s) == [@one_day_duration_ms] + assert diff_s.dtype == {:duration, :microsecond} + assert Series.to_list(diff_s) == [@one_day_duration_us] end test "Date - Date raises ArgumentError" do @@ -446,8 +446,8 @@ defmodule Explorer.Series.DurationTest do assert diff_s.dtype == :date assert Series.to_list(diff_s) == [@aug_20] - # Subtracting a duration at least a day results in the previous date. - one_day_s = Series.from_list([@one_day_duration_ms]) + # Subtracting a duration of at least a day results in the previous date. + one_day_s = Series.from_list([@one_day_duration_us]) diff_s = Series.subtract(aug_21_s, one_day_s) assert diff_s.dtype == :date diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index 97d4b3605..4d9adf194 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -508,10 +508,10 @@ defmodule Explorer.SeriesTest do end test "decimal precision boundary" do - # Biggest number representable by an i128. - max_i128 = 2 ** 127 - 1 - biggest = Decimal.new(max_i128) - too_big = Decimal.new(max_i128 + 1) + # Biggest number with 38 digits + biggest = Decimal.new(10 ** 38 - 1) + # Smallest number with 39 digits + too_big = Decimal.new(10 ** 38) # Can make a series using the biggest allowed decimal. s1 = Series.from_list([biggest]) @@ -519,8 +519,7 @@ defmodule Explorer.SeriesTest do # Can't make a series using a number bigger than the biggest allowed decimal. assert_raise RuntimeError, - "Generic Error: cannot decode a valid decimal from term;" <> - " check that `coef` fits into an `i128`. error: throw()", + "Polars Error: decimal precision 38 can't fit values with 39 digits", fn -> Series.from_list([too_big]) end end @@ -3826,7 +3825,7 @@ defmodule Explorer.SeriesTest do result = Series.sample(s, 10, seed: 100) assert Series.size(result) == 10 - assert Series.to_list(result) == [57, 9, 54, 62, 50, 77, 35, 88, 1, 69] + assert Series.to_list(result) == [80, 95, 78, 33, 84, 100, 23, 58, 21, 30] end test "sample taking 5% of elements" do @@ -3835,7 +3834,7 @@ defmodule Explorer.SeriesTest do result = Series.sample(s, 0.05, seed: 100) assert Series.size(result) == 5 - assert Series.to_list(result) == [9, 56, 79, 28, 54] + assert Series.to_list(result) == [85, 89, 82, 35, 88] end test "sample taking more than elements without replace" do @@ -3864,7 +3863,7 @@ defmodule Explorer.SeriesTest do result = Series.sample(s, 15, replace: true, seed: 100) assert Series.size(result) == 15 - assert Series.to_list(result) == [7, 1, 6, 7, 6, 8, 3, 6, 4, 9, 1, 7, 1, 1, 9] + assert Series.to_list(result) == [9, 10, 9, 4, 9, 4, 3, 6, 3, 3, 8, 5, 10, 8, 8] end test "sample taking more than elements with fraction and replace" do @@ -3873,7 +3872,7 @@ defmodule Explorer.SeriesTest do result = Series.sample(s, 1.2, replace: true, seed: 100) assert Series.size(result) == 12 - assert Series.to_list(result) == [7, 1, 6, 7, 6, 8, 3, 6, 4, 9, 1, 7] + assert Series.to_list(result) == [9, 10, 9, 4, 9, 4, 3, 6, 3, 3, 8, 5] end test "sample with the exact amount of elements, but shuffle off" do @@ -3891,7 +3890,7 @@ defmodule Explorer.SeriesTest do result = Series.sample(s, 1.0, seed: 100, shuffle: true) assert Series.size(result) == 10 - assert Series.to_list(result) == [7, 9, 2, 0, 4, 1, 3, 8, 5, 6] + assert Series.to_list(result) == [3, 7, 8, 0, 5, 1, 2, 6, 4, 9] end end @@ -3902,7 +3901,7 @@ defmodule Explorer.SeriesTest do result = Series.shuffle(s, seed: 100) assert Series.size(result) == 10 - assert Series.to_list(result) == [7, 9, 2, 0, 4, 1, 3, 8, 5, 6] + assert Series.to_list(result) == [3, 7, 8, 0, 5, 1, 2, 6, 4, 9] end end @@ -4663,7 +4662,7 @@ defmodule Explorer.SeriesTest do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_mean(s1) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 1.0, 1.6666666666666667, 2.4285714285714284, @@ -4674,14 +4673,14 @@ defmodule Explorer.SeriesTest do 7.031372549019608, 8.017612524461839, 9.009775171065494 - ] + ]) end test "returns calculated ewma with differernt smoothing factor if different alpha is passed" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_mean(s1, alpha: 0.8) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 1.0, 1.8333333333333335, 2.7741935483870965, @@ -4692,14 +4691,14 @@ defmodule Explorer.SeriesTest do 7.750020480052428, 8.75000460800236, 9.750001024000106 - ] + ]) end test "returns calculated ewma with nils for index less than min period size, if min_periods is set" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_mean(s1, min_periods: 5) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ nil, nil, nil, @@ -4710,14 +4709,14 @@ defmodule Explorer.SeriesTest do 7.031372549019608, 8.017612524461839, 9.009775171065494 - ] + ]) end test "ignores nil by default and calculates ewma" do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_mean(s1, ignore_nils: true) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 1.0, nil, 1.6666666666666667, @@ -4728,7 +4727,7 @@ defmodule Explorer.SeriesTest do 5.095238095238095, 6.05511811023622, 7.031372549019608 - ] + ]) end test "does not ignore nil if set ignore_nils option to false and calculates ewma" do @@ -4737,7 +4736,7 @@ defmodule Explorer.SeriesTest do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_mean(s1, ignore_nils: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 1.0, nil, 1.8, @@ -4748,14 +4747,14 @@ defmodule Explorer.SeriesTest do 5.1959183673469385, 6.1177644710578845, 7.069101678183613 - ] + ]) end test "returns calculated ewma without adjustment if adjust option is set to false" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_mean(s1, adjust: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 1.0, 1.5, 2.25, @@ -4766,7 +4765,7 @@ defmodule Explorer.SeriesTest do 7.0078125, 8.00390625, 9.001953125 - ] + ]) end end @@ -4775,7 +4774,7 @@ defmodule Explorer.SeriesTest do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_standard_deviation(s1) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.7071067811865476, 0.9636241116594314, @@ -4786,14 +4785,14 @@ defmodule Explorer.SeriesTest do 1.6224598916602895, 1.6634845490537977, 1.689976601128564 - ] + ]) end test "returns calculated ewm std with different smoothing factor if different alpha is passed" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_standard_deviation(s1, alpha: 0.8) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.7071067811865476, 0.8613567692141088, @@ -4804,14 +4803,14 @@ defmodule Explorer.SeriesTest do 0.9679969383076764, 0.9681825776281606, 0.9682301709724406 - ] + ]) end test "returns calculated ewm std with nils for index less than min period size, if min_periods is set" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_standard_deviation(s1, min_periods: 5) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ nil, nil, nil, @@ -4822,7 +4821,7 @@ defmodule Explorer.SeriesTest do 1.6224598916602895, 1.6634845490537977, 1.689976601128564 - ] + ]) end test "ignores nil by default and calculates ewm std" do @@ -4831,7 +4830,7 @@ defmodule Explorer.SeriesTest do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_standard_deviation(s1, ignore_nils: true) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, nil, 0.7071067811865476, @@ -4842,7 +4841,7 @@ defmodule Explorer.SeriesTest do 1.4709162008918397, 1.5607315639222439, 1.6224598916602895 - ] + ]) end test "does not ignore nil if set ignore_nils option to false and calculates ewm std" do @@ -4851,7 +4850,7 @@ defmodule Explorer.SeriesTest do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_standard_deviation(s1, ignore_nils: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, nil, 0.7071067811865476, @@ -4862,14 +4861,14 @@ defmodule Explorer.SeriesTest do 1.3067888637766594, 1.4363395171897309, 1.5336045526865307 - ] + ]) end test "returns calculated ewm std without adjustment if adjust option is set to false" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_standard_deviation(s1, adjust: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.7071067811865476, 1.0488088481701516, @@ -4880,14 +4879,14 @@ defmodule Explorer.SeriesTest do 1.6805652557493016, 1.7030595977801866, 1.7159083446458816 - ] + ]) end test "returns calculated ewm std with bias if bias option is set to true" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_standard_deviation(s1, bias: true) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.4714045207910317, 0.7284313590846835, @@ -4898,7 +4897,7 @@ defmodule Explorer.SeriesTest do 1.3221328870469677, 1.3568998042691014, 1.3791855333404945 - ] + ]) end end @@ -4907,7 +4906,7 @@ defmodule Explorer.SeriesTest do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_variance(s1) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.5, 0.9285714285714284, @@ -4918,14 +4917,14 @@ defmodule Explorer.SeriesTest do 2.632376100046318, 2.7671808449407167, 2.8560209123620535 - ] + ]) end test "returns calculated ewm var with different smoothing factor if different alpha is passed" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_variance(s1, alpha: 0.8) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.5, 0.7419354838709674, @@ -4936,14 +4935,14 @@ defmodule Explorer.SeriesTest do 0.9370180725730355, 0.9373775036227093, 0.9374696639813216 - ] + ]) end test "returns calculated ewm var with nils for index less than min period size, if min_periods is set" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_variance(s1, min_periods: 5) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ nil, nil, nil, @@ -4954,7 +4953,7 @@ defmodule Explorer.SeriesTest do 2.632376100046318, 2.7671808449407167, 2.8560209123620535 - ] + ]) end test "ignores nil by default and calculates ewm var" do @@ -4963,7 +4962,7 @@ defmodule Explorer.SeriesTest do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_variance(s1, ignore_nils: true) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, nil, 0.5, @@ -4974,7 +4973,7 @@ defmodule Explorer.SeriesTest do 2.163594470046083, 2.435883014623173, 2.632376100046318 - ] + ]) end test "does not ignore nil if set ignore_nils option to false and calculates ewm var" do @@ -4983,7 +4982,7 @@ defmodule Explorer.SeriesTest do s1 = Series.from_list([1, nil, 2, nil, 3, 4, 5, 6, 7, 8]) s2 = Series.ewm_variance(s1, ignore_nils: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, nil, 0.5, @@ -4994,14 +4993,14 @@ defmodule Explorer.SeriesTest do 1.7076971344906926, 2.0630712086408294, 2.3519429240208543 - ] + ]) end test "returns calculated ewm var without adjustment if adjust option is set to false" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_variance(s1, adjust: false) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.5, 1.1, @@ -5012,14 +5011,14 @@ defmodule Explorer.SeriesTest do 2.824299578831716, 2.9004119935912107, 2.9443414472253693 - ] + ]) end test "returns calculated ewm var with bias if bias option is set to true" do s1 = 1..10 |> Enum.to_list() |> Series.from_list() s2 = Series.ewm_variance(s1, bias: true) - assert Series.to_list(s2) == [ + assert all_close?(s2, [ 0.0, 0.2222222222222222, 0.5306122448979591, @@ -5030,7 +5029,7 @@ defmodule Explorer.SeriesTest do 1.7480353710111498, 1.8411770788255257, 1.9021527353757046 - ] + ]) end end @@ -5437,7 +5436,7 @@ defmodule Explorer.SeriesTest do test "rank of a series of floats (method: random)" do s = Series.from_list([3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1]) r = Series.rank(s, method: :random, seed: 4242) - assert Series.to_list(r) === [8, 2, 5, 4, 9, 10, 7, 6, 1, 3] + assert Series.to_list(r) === [8, 2, 5, 3, 9, 10, 7, 6, 1, 4] end test "rank of a float series with a nan" do @@ -6905,10 +6904,24 @@ defmodule Explorer.SeriesTest do end end - defp all_close?(a, b, tol \\ 1.0e-8) do - Series.subtract(a, b) - |> Series.abs() - |> Series.less_equal(tol) - |> Series.all?() + defp all_close?(a, b, tol \\ 1.0e-8) + + defp all_close?(%Series{} = a, %Series{} = b, tol) do + if Series.size(a) != Series.size(b) do + false + else + # Needs to be a value that doesn't appear in any series. + dummy_value = max(Series.max(a), Series.max(b)) + 1 + a = Series.fill_missing(a, dummy_value) + b = Series.fill_missing(b, dummy_value) + + Series.subtract(a, b) + |> Series.abs() + |> Series.less_equal(tol) + |> Series.all?() + end end + + defp all_close?(a, b, tol) when is_list(a), do: all_close?(Series.from_list(a), b, tol) + defp all_close?(a, b, tol) when is_list(b), do: all_close?(a, Series.from_list(b), tol) end