From d1709e805bc02e79a6ffeff0b0ef21169b6bf038 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Wed, 28 Jan 2026 19:11:49 -0500
Subject: [PATCH 1/8] Refactor nthreads parameter handling in fit.jl

Removed default value for nthreads and adjusted related logic.
---
 src/fit.jl | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index b4d6dad..6205070 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -53,7 +53,7 @@ function reg(df,
     weights::Union{Symbol, Nothing} = nothing,
     save::Union{Bool, Symbol} = :none,
     method::Symbol = :cpu,
-    nthreads::Integer = method == :cpu ? Threads.nthreads() : 256,
+    nthreads::Union{Integer, Nothing} = nothing,
     double_precision::Bool = method == :cpu,
     tol::Real = 1e-6,
     maxiter::Integer = 10000,
@@ -72,7 +72,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     @nospecialize(weights::Union{Symbol, Nothing} = nothing),
     @nospecialize(save::Union{Bool, Symbol} = :none),
     @nospecialize(method::Symbol = :cpu),
-    @nospecialize(nthreads::Integer = method == :cpu ? Threads.nthreads() : 256),
+    @nospecialize(nthreads::Union{Integer, Nothing} = nothing,
     @nospecialize(double_precision::Bool = true),
     @nospecialize(tol::Real = 1e-6),
     @nospecialize(maxiter::Integer = 10000),
@@ -105,11 +105,6 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     end
     save_residuals = (save == :residuals) | (save == :all)
 
-    if method == :cpu && nthreads > Threads.nthreads()
-        @warn "Keyword argument nthreads = $(nthreads) is ignored (Julia was started with only $(Threads.nthreads()) threads)."
-        nthreads = Threads.nthreads()
-    end
-
     ##############################################################################
     ##
     ## Parse formula
@@ -166,7 +161,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
 
     n_singletons = 0
     if drop_singletons
-        n_singletons = drop_singletons!(esample, fes, nthreads)
+        n_singletons = drop_singletons!(esample, fes)
     end
 
     nobs = sum(esample)

From e51c296267f0f2e7aeb8f1334e5ce9fd86fb859a Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Wed, 28 Jan 2026 19:13:07 -0500
Subject: [PATCH 2/8] Remove nthreads parameter from drop_singletons! function

---
 src/utils/fixedeffects.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/fixedeffects.jl b/src/utils/fixedeffects.jl
index 0e2a77e..ae2a8a4 100644
--- a/src/utils/fixedeffects.jl
+++ b/src/utils/fixedeffects.jl
@@ -4,7 +4,7 @@
 ##
 ##############################################################################
 
-function drop_singletons!(esample, fes::Vector{<:FixedEffect}, nthreads)
+function drop_singletons!(esample, fes::Vector{<:FixedEffect})
     nsingletons = 0
     ncleanpasses = 0
     caches = [Vector{UInt8}(undef, fes[i].n) for i in eachindex(fes)]

From 1506494b0a9ace684eb808521fbaff3bb8130f47 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Wed, 28 Jan 2026 19:18:59 -0500
Subject: [PATCH 3/8] Update default double_precision based on method

Change default value of double_precision based on method.
---
 src/fit.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index 6205070..e5dd428 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -11,7 +11,6 @@ Estimate a linear model with high dimensional categorical variables / instrument
 * `weights::Union{Nothing, Symbol}` A symbol to refer to a columns for weights
 * `save::Symbol`: Should residuals and eventual estimated fixed effects saved in a dataframe? Default to `:none` Use `save = :residuals` to only save residuals, `save = :fe` to only save fixed effects, `save = :all` for both. Once saved, they can then be accessed using `residuals(m)` or `fe(m)` where `m` is the object returned by the estimation. The returned DataFrame is automatically aligned with the original DataFrame.
 * `method::Symbol`: A symbol for the method. Default is :cpu. Alternatively,  use :CUDA or :Metal  (in this case, you need to import the respective package before importing FixedEffectModels)
-* `nthreads::Integer` Number of threads to use in the estimation. If `method = :cpu`, defaults to `Threads.nthreads()`. Otherwise, defaults to 256.
 * `double_precision::Bool`: Should the demeaning operation use Float64 rather than Float32? Default to true if `method =:cpu' and false if `method = :CUDA` or `method = :Metal`.
 * `tol::Real` Tolerance. Default to 1e-6.
 * `maxiter::Integer = 10000`: Maximum number of iterations
@@ -73,7 +72,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     @nospecialize(save::Union{Bool, Symbol} = :none),
     @nospecialize(method::Symbol = :cpu),
     @nospecialize(nthreads::Union{Integer, Nothing} = nothing,
-    @nospecialize(double_precision::Bool = true),
+    @nospecialize(double_precision::Bool = method == :cpu),
     @nospecialize(tol::Real = 1e-6),
     @nospecialize(maxiter::Integer = 10000),
     @nospecialize(drop_singletons::Bool = true),

From bfd473489a823e37d65b0c5874f0ae220b24442f Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Thu, 29 Jan 2026 21:15:55 -0500
Subject: [PATCH 4/8] Bump version to 1.13.0 and update FixedEffects
 compatibility

Updated version and dependency compatibility for FixedEffects.
---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 067a8c7..4f92e50 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedEffectModels"
 uuid = "9d5cd8c9-2029-5cab-9928-427838db53e3"
-version = "1.12.2"
+version = "1.13.0"
 
 [deps]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@@ -19,7 +19,7 @@ Vcov = "ec2bfdc2-55df-4fc9-b9ae-4958c2cf2486"
 
 [compat]
 DataFrames = "0.21, 0.22, 1"
-FixedEffects = "2.3"
+FixedEffects = "2.6"
 PrecompileTools = "1"
 Reexport = "0.1, 0.2, 1"
 Statistics = "1"

From b5fdc1fc3d11ea3f3a8e8b0b70a70b7c696faa35 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Thu, 29 Jan 2026 21:20:29 -0500
Subject: [PATCH 5/8] Remove deprecated nthreads argument and update solver

Updated fit.jl to remove deprecated nthreads argument and adjust AbstractFixedEffectSolver instantiation.
---
 src/fit.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index e5dd428..ea18e21 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -93,7 +93,9 @@ function StatsAPI.fit(::Type{FixedEffectModel},
         info("method = :gpu is deprecated. Use method = :CUDA or method = :Metal")
         method = :CUDA
     end
-
+    if nthreads !== nothing    
+        info("The keyword argument nthreads is deprecated. Multiple threads are now used by default.")
+    end
     if save == true
         save = :all
     elseif save == false
@@ -234,7 +236,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
         sumsquares_pre = [sum(abs2, x) for x in cols]
 
         # partial out fixed effects
-        feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(subfes, weights, Val{method}, nthreads)
+        feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(subfes, weights, Val{method})
 
         # partial out fixed effects
         _, iterations, convergeds = solve_residuals!(cols, feM; maxiter = maxiter, tol = tol, progress_bar = progress_bar)

From 7710989f805764d00e473344c1f8344ff9e715e9 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Thu, 29 Jan 2026 21:24:19 -0500
Subject: [PATCH 6/8] Update multi-threading explanation in README

Clarified the usage of multi-threading in FixedEffectModels.
---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 835cf8f..6808562 100755
--- a/README.md
+++ b/README.md
@@ -119,8 +119,7 @@ Finally, you can use [RegressionTables.jl](https://github.com/jmboehm/Regression
 
 ## Performances
 
-### MultiThreads
-`FixedEffectModels` is multi-threaded. Use the option `nthreads` to select the number of threads to use in the estimation (defaults to `Threads.nthreads()`).
+`FixedEffectModels` is multi-threaded by default. Launch Julia with multiple threads to benefit from the speedup. You can verify how many threads are available in your session with `Threads.nthreads()`.
 
 ### GPUs
 The package has an experimental support for GPUs. This can make the package 2x-5x faster for complicated problems.

From fbc9e309c0e08770dcf3b594ee329c0586391147 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Thu, 29 Jan 2026 21:35:40 -0500
Subject: [PATCH 7/8] Refactor fit function to remove nthreads parameter

Removed 'nthreads' from the fit function call parameters.
---
 src/fit.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index ea18e21..8df0e16 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -60,7 +60,7 @@ function reg(df,
     progress_bar::Bool = true,
     subset::Union{Nothing, AbstractVector} = nothing, 
     first_stage::Bool = true)
-    StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, nthreads = nthreads, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage)
+    StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage)
 end
     
 function StatsAPI.fit(::Type{FixedEffectModel},     
@@ -71,7 +71,7 @@ function StatsAPI.fit(::Type{FixedEffectModel},
     @nospecialize(weights::Union{Symbol, Nothing} = nothing),
     @nospecialize(save::Union{Bool, Symbol} = :none),
     @nospecialize(method::Symbol = :cpu),
-    @nospecialize(nthreads::Union{Integer, Nothing} = nothing,
+    @nospecialize(nthreads::Union{Integer, Nothing} = nothing),
     @nospecialize(double_precision::Bool = method == :cpu),
     @nospecialize(tol::Real = 1e-6),
     @nospecialize(maxiter::Integer = 10000),

From 0fbc200e31bff4243a5522333a4fcd3a9af122e4 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Thu, 29 Jan 2026 22:04:01 -0500
Subject: [PATCH 8/8] update benchmarks

---
 benchmark/benchmark.jl       | 28 +++++++++++++---------------
 benchmark/benchmark.md       |  2 +-
 benchmark/benchmark_Metal.jl | 20 ++++++++++++++++++++
 benchmark/result.jl          |  2 +-
 4 files changed, 35 insertions(+), 17 deletions(-)
 create mode 100755 benchmark/benchmark_Metal.jl

diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl
index b852052..f517866 100755
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@@ -1,6 +1,6 @@
 using DataFrames, Random, CategoricalArrays
 @time using  FixedEffectModels
-#  13s precompiling
+#  0.418712 seconds (742.49 k allocations: 45.500 MiB, 4.07% gc time, 1.07% compilation time)
 # Very simple setup
 N = 10000000
 K = 100
@@ -12,21 +12,21 @@ y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
 df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
 # first time
 @time reg(df, @formula(y ~ x1 + x2))
-# 1.823s
+#   1.810739 seconds (14.05 M allocations: 1.583 GiB, 3.76% gc time, 84.28% compilation time: 91% of which was recompilation)
 @time reg(df, @formula(y ~ x1 + x2))
-# 0.353469 seconds (441 allocations: 691.439 MiB, 3.65% gc time)
+#   0.288344 seconds (712 allocations: 920.405 MiB, 18.71% gc time)
 @time reg(df, @formula(y ~ x1 + x2),  Vcov.cluster(:id2))
-# 0.763999 seconds (2.96 M allocations: 967.418 MiB, 2.29% gc time, 54.39% compilation time: 5% of which was recompilation)
+#   0.528590 seconds (1.33 M allocations: 1.039 GiB, 8.76% gc time, 55.33% compilation time)
 @time reg(df, @formula(y ~ x1 + x2),  Vcov.cluster(:id2))
-# 0.401544 seconds (622 allocations: 768.943 MiB, 3.52% gc time)
+#   0.268619 seconds (879 allocations: 997.916 MiB, 15.08% gc time)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1)))
-# 0.893835 seconds (1.03 k allocations: 929.130 MiB, 54.19% gc time)
+# 0.824536 seconds (3.09 M allocations: 1.426 GiB, 5.37% gc time, 61.39% compilation time: 3% of which was recompilation)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1)))
-#   0.474160 seconds (1.13 k allocations: 933.340 MiB, 1.74% gc time)
+#    0.356793 seconds (1.41 k allocations: 1.276 GiB, 19.31% gc time)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1)), Vcov.cluster(:id1))
-#   0.598816 seconds (261.08 k allocations: 1.007 GiB, 8.29% gc time, 9.21% compilation time)
+#   0.435500 seconds (495.96 k allocations: 1.381 GiB, 15.97% gc time, 10.72% compilation time)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2)))
-# 1.584573 seconds (489.64 k allocations: 1.094 GiB, 2.10% gc time, 8.53% compilation time)
+#  1.367264 seconds (1.91 M allocations: 1.592 GiB, 5.74% gc time, 23.85% compilation time: 20% of which was recompilation)
 
 # More complicated setup
 N = 800000 # number of observations
@@ -39,9 +39,7 @@ x2 =  cos.(id1) +  sin.(id2) + randn(N)
 y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
 df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2)))
-# 2.504294 seconds (75.83 k allocations: 95.525 MiB, 0.23% gc time)
-# for some reason in 1.10 I now get worse time (iter 200)
-#  4.709078 seconds (108.98 k allocations: 101.417 MiB)
+#   1.546023 seconds (19.89 k allocations: 119.673 MiB, 1.70% gc time)
 
 
 
@@ -57,8 +55,8 @@ X1 = rand(n)
 ln_y = 3 .* X1 .+ rand(n) 
 df = DataFrame(X1 = X1, ln_y = ln_y, id1 = id1, id2 = id2, id3 = id3)
 @time reg(df, @formula(ln_y ~ X1 + fe(id1)), Vcov.cluster(:id1))
-#  0.543996 seconds (873 allocations: 815.677 MiB, 34.15% gc time)
+#   0.311420 seconds (1.35 k allocations: 1.052 GiB, 22.30% gc time)
 @time reg(df, @formula(ln_y ~ X1 + fe(id1) + fe(id2)), Vcov.cluster(:id1))
-#  1.301908 seconds (3.03 k allocations: 968.729 MiB, 25.84% gc time)
+#   0.808992 seconds (3.52 k allocations: 1.272 GiB, 8.68% gc time)
 @time reg(df, @formula(ln_y ~ X1 + fe(id1) + fe(id2) + fe(id3)), Vcov.cluster(:id1))
-# 1.658832 seconds (4.17 k allocations: 1.095 GiB, 29.78% gc time)
+# 0.950808 seconds (4.75 k allocations: 1.496 GiB, 7.48% gc time)
diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md
index 3217393..3b9fb49 100755
--- a/benchmark/benchmark.md
+++ b/benchmark/benchmark.md
@@ -6,7 +6,7 @@ Code to reproduce this graph:
   FixedEffectModels.jl v1.9.0 (Julia 1.9)
   ```julia
   using DataFrames, CategoricalArrays, FixedEffectModels
-  N = 10000000
+  N = 10_000_000
   K = 100
   id1 = rand(1:(N/K), N)
   id2 = rand(1:K, N)
diff --git a/benchmark/benchmark_Metal.jl b/benchmark/benchmark_Metal.jl
new file mode 100755
index 0000000..89dffad
--- /dev/null
+++ b/benchmark/benchmark_Metal.jl
@@ -0,0 +1,20 @@
+using DataFrames, Random, Metal, FixedEffectModels
+Random.seed!(1234)
+# More complicated setup
+N = 8_000_000 # number of observations
+M = 400_000 # number of workers
+O = 50_000 # number of firms
+id1 = rand(1:M, N)
+id2 = [rand(max(1, div(x, 8)-10):min(O, div(x, 8)+10)) for x in id1]
+x1 = 5 * cos.(id1) + 5 * sin.(id2) + randn(N)
+x2 =  cos.(id1) +  sin.(id2) + randn(N)
+x3 =  cos.(id1) +  sin.(id2) + randn(N)
+y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
+df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, x3 = x3, y = y)
+@time reg(df, @formula(y ~ x1 + x2 + x3 + fe(id1) + fe(id2)), maxiter = 200, double_precision = false)
+#  30.002852 seconds (66.42 M allocations: 4.917 GiB, 0.83% gc time, 16.25% compilation time: 6% of which was recompilation)
+@time reg(df, @formula(y ~ x1 + x2 + x3 + fe(id1) + fe(id2)), method = :Metal, maxiter = 200)
+#  16.634684 seconds (3.50 M allocations: 1.407 GiB, 0.69% gc time, 1.49% compilation time: <1% of which was recompilation)
+
+
+
diff --git a/benchmark/result.jl b/benchmark/result.jl
index 07ad850..bebb8a4 100644
--- a/benchmark/result.jl
+++ b/benchmark/result.jl
@@ -1 +1 @@
-using DataFrames, CSV, Gadflydf = CSV.read("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/benchmark.csv", DataFrame)df."fixest (R)" = df."fixest (R)" ./ df."FixedEffectModels.jl (Julia)"df."lfe (R)" = df."lfe (R)" ./ df."FixedEffectModels.jl (Julia)"df."reghdfe (Stata)" = df."reghdfe (Stata)" ./ df."FixedEffectModels.jl (Julia)"df."FixedEffectModels.jl (Julia)" = df."FixedEffectModels.jl (Julia)" ./ df."FixedEffectModels.jl (Julia)"mdf = stack(df, Not([:Command, :Order]))mdf = rename(mdf, :variable => :Language)p = plot(mdf, x = "Command", y = "value", color = "Language", Guide.ylabel("Time (Ratio to Julia)"), Guide.xlabel("Command"), Scale.y_log10)draw(PNG("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/fixedeffectmodels_benchmark.png", 8inch, 5inch, dpi=300), p)
\ No newline at end of file
+using DataFrames, CSV, StatsPlotsdf = CSV.read("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/benchmark2.csv", DataFrame)df."fixest (R)" = df."fixest (R)" ./ df."FixedEffectModels.jl (Julia)"df."lfe (R)" = df."lfe (R)" ./ df."FixedEffectModels.jl (Julia)"df."reg / reghdfe (Stata)" = df."reg / reghdfe (Stata)" ./ df."FixedEffectModels.jl (Julia)"df."FixedEffectModels.jl (Julia)" = df."FixedEffectModels.jl (Julia)" ./ df."FixedEffectModels.jl (Julia)"mdf = stack(df, Not([:Command, :Order]))mdf = rename(mdf, :variable => :Language)p = @df mdf plot(    :Command, :value,    group = :Language,    yaxis = :log10,    xlabel = "Command",    ylabel = "Time (Ratio to Julia)",    legend = :top,    seriestype = :scatter,   # or :line / :path depending on what you want    palette = :tol_light,    dpi = 200,    size=(8 * 100 * 2 /3, 5 * 100 * 2 /3))savefig("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/fixedeffectmodels_benchmark.png")
\ No newline at end of file