From a047c64d548861eb159111ff4d0536b58837af82 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Mon, 5 Jan 2026 21:45:00 +0100
Subject: [PATCH 1/6] add join statistics

---
 NEWS.md               |  2 ++
 R/bmerge.R            | 45 ++++++++++++++++++++++++++++++++++++++++++-
 R/data.table.R        |  2 +-
 R/merge.R             |  9 ++++++---
 inst/tests/tests.Rraw | 19 ++++++++++++++++++
 man/merge.Rd          |  2 +-
 6 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 23e8d5c873..83dac301dc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,6 +16,8 @@
 
 1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR.
 
+2. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display join statistics with `verbose=TRUE`, showing row counts, matched rows, and join columns used, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation.
+
 ### Notes
 
 1. {data.table} now depends on R 3.5.0 (2018).
diff --git a/R/bmerge.R b/R/bmerge.R
index 3c903ae354..730b38bfce 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -25,7 +25,7 @@ coerce_col = function(dt, col, from_type, to_type, from_name, to_name, from_deta
   set(dt, j=col, value=cast_with_attrs(dt[[col]], cast_fun))
 }
 
-bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
+bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose, notjoin=FALSE)
 {
   if (roll != 0.0 && length(icols)) {
     last_x_idx = tail(xcols, 1L)
@@ -224,6 +224,49 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
   # TO DO: xo could be moved inside Cbmerge
 
+  # join statistics
+  if (verbose) {
+    nrow_x = nrow(x)
+    nrow_i = nrow(i)
+    inner_join = is.null(nomatch) || identical(nomatch, 0L)
+    idx = if (inner_join) ans$starts != 0L else !is.na(ans$starts)
+    matched_i = sum(idx)
+
+    if (notjoin) {
+      # Anti-join: rows of x outside every matched range; a range can span several x rows (lens > 1) and ranges can repeat
+      hit = tabulate(ans$starts[idx], nrow_x + 1L) - tabulate(ans$starts[idx] + ans$lens[idx], nrow_x + 1L)
+      result_rows = nrow_x - sum(cumsum(hit)[seq_len(nrow_x)] > 0L)  # cumsum(hit)[p] = number of ranges covering position p
+    } else if (inner_join) {
+      result_rows = if (matched_i > 0L) sum(ans$lens[idx]) else 0L  # inner join: sum lengths for matched rows only
+    } else {
+      # Left join: sum all lengths (includes NAs for unmatched)
+      result_rows = sum(ans$lens)
+    }
+
+    join_desc = character(0L)
+    for (a in seq_along(icols)) {
+      xname = names(x)[xcols[a]]
+      iname = names(i)[icols[a]]
+      op_symbol = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops[a]] else "=="
+      join_desc = c(join_desc, sprintf("%s %s %s", xname, op_symbol, iname))
+    }
+
+    join_str = paste(join_desc, collapse=", ")
+    num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
+    label_width = 16L
+    line_width = label_width + num_width
+    separator = strrep("-", line_width - 2L)
+
+    catf("Join summary:\n")
+    catf("  rows in x:    %*d\n", num_width, nrow_x)
+    catf("  rows in i:    %*d\n", num_width, nrow_i)
+    catf("  matched rows: %*d\n", num_width, matched_i)
+    catf("  join columns: %s\n", join_str)
+    catf("  %s\n", separator)
+    catf("  result rows:  %*d\n", num_width, result_rows)
+    flush.console()
+  }
+
   ans$xo = xo  # for further use by [.data.table
   ans
 }
diff --git a/R/data.table.R b/R/data.table.R
index 27c985e44c..e2ac9b30bb 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -518,7 +518,7 @@ replace_dot_alias = function(e) {
         setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
       }
       i = .shallow(i, retain.key = TRUE)
-      ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
+      ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose, notjoin=notjoin)
       if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
       xo = ans$xo ## to make it available for further use.
       # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
diff --git a/R/merge.R b/R/merge.R
index 2484cd9a0f..274d388779 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -1,5 +1,5 @@
 merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) {
+               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, verbose=getOption("datatable.verbose", FALSE), ...) {
   if (!sort %in% c(TRUE, FALSE))
     stopf("Argument 'sort' should be logical TRUE/FALSE")
   if (!no.dups %in% c(TRUE, FALSE))
@@ -93,12 +93,15 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     x = x[xind]
     y = y[yind]
   }
-  dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
+  dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian, verbose=verbose]   # includes JIS columns (with a i. prefix if conflict with x names)
 
   if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
     # Perhaps not very commonly used, so not a huge deal that the join is redone here.
     missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
-    if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
+    if (length(missingyidx)) {
+      if (verbose) catf("Adding %d unmatched rows from y\n", length(missingyidx))
+      dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
+    }
   }
   # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
   newend = setdiff(nm_y, by.y)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index fcf78e9f36..7b31e28406 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21978,3 +21978,22 @@ local({
   test(2357.1, fread(f), DT)
   test(2357.2, fread(paste0("file://", f)), DT)
 })
+
+# join statistics #4677
+x = data.table(A = 1:5, B = 6:10)
+y = data.table(A = c(1L, 1L, 4L), C = LETTERS[c(1L, 2L, 4L)])
+test(2358.1, nrow(x[y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+test(2358.2, nrow(y[x, on = "A", verbose=TRUE]), 6L, output="result rows: \\s+6\n")
+test(2358.3, nrow(y[x, on = "A", nomatch=NULL, verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+test(2358.4, nrow(x[!y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+test(2358.5, nrow(y[y, on = "A", allow.cartesian=TRUE, verbose=TRUE]), 5L, output="result rows: \\s+5\n")
+test(2358.6, nrow(merge(x, y, by="A", verbose=TRUE)), 3L, output="result rows: \\s+3\n")
+x = data.table(id = c("A", "A", "A", "B", "B"), date = as.IDate(c("2010-01-01", "2012-01-01", "2014-01-01", "2010-01-01", "2012-01-01")))
+y = data.table(id = c("A", "B"), date = as.IDate(c("2013-01-01", "2013-01-01")))
+test(2358.7, nrow(x[y, on = .(id, date <= date), verbose=TRUE]), 4L, output="join columns: id == id, date <= date.*result rows: \\s+4\n")
+x = data.table(A = integer(0))
+y = data.table(A = 1:3)
+test(2358.8, nrow(x[y, on="A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+x = data.table(A = 1:3)
+y = data.table(A = 4:6)
+test(2358.9, nrow(x[y, on="A", nomatch=NULL, verbose=TRUE]), 0L, output="matched rows: \\s+0\n")
diff --git a/man/merge.Rd b/man/merge.Rd
index da5bb1efad..7ed07cf13a 100644
--- a/man/merge.Rd
+++ b/man/merge.Rd
@@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE,
 all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE,
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
-incomparables = NULL, \dots)
+incomparables = NULL, verbose=getOption("datatable.verbose", FALSE), \dots)
 }
 
 \arguments{

From 6a4edb64a0d5a3acc9fb35724fdef4bdf937b755 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 8 Jan 2026 16:25:17 +0100
Subject: [PATCH 2/6] fix verbose tests

---
 R/bmerge.R            | 11 ++---------
 R/merge.R             |  5 ++---
 inst/tests/tests.Rraw |  2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index 730b38bfce..cd72809008 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -243,15 +243,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       result_rows = sum(ans$lens)
     }
 
-    join_desc = character(0L)
-    for (a in seq_along(icols)) {
-      xname = names(x)[xcols[a]]
-      iname = names(i)[icols[a]]
-      op_symbol = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops[a]] else "=="
-      join_desc = c(join_desc, sprintf("%s %s %s", xname, op_symbol, iname))
-    }
-
-    join_str = paste(join_desc, collapse=", ")
+    op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else strrep("==", length(icols))
+    join_str = toString(sprintf("%s %s %s", names(x)[xcols], op_symbols, names(i)[icols]))
     num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
     label_width = 16L
     line_width = label_width + num_width
diff --git a/R/merge.R b/R/merge.R
index 274d388779..e723f95803 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -1,5 +1,5 @@
 merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
-               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, verbose=getOption("datatable.verbose", FALSE), ...) {
+               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) {
   if (!sort %in% c(TRUE, FALSE))
     stopf("Argument 'sort' should be logical TRUE/FALSE")
   if (!no.dups %in% c(TRUE, FALSE))
@@ -93,13 +93,12 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     x = x[xind]
     y = y[yind]
   }
-  dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian, verbose=verbose]   # includes JIS columns (with a i. prefix if conflict with x names)
+  dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
 
   if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
     # Perhaps not very commonly used, so not a huge deal that the join is redone here.
     missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
     if (length(missingyidx)) {
-      if (verbose) catf("Adding %d unmatched rows from y\n", length(missingyidx))
       dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
     }
   }
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7b31e28406..605e6083a4 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21987,7 +21987,7 @@ test(2358.2, nrow(y[x, on = "A", verbose=TRUE]), 6L, output="result rows: \\s+6\
 test(2358.3, nrow(y[x, on = "A", nomatch=NULL, verbose=TRUE]), 3L, output="result rows: \\s+3\n")
 test(2358.4, nrow(x[!y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
 test(2358.5, nrow(y[y, on = "A", allow.cartesian=TRUE, verbose=TRUE]), 5L, output="result rows: \\s+5\n")
-test(2358.6, nrow(merge(x, y, by="A", verbose=TRUE)), 3L, output="result rows: \\s+3\n")
+test(2358.6, options=c(datatable.verbose=TRUE), nrow(merge(x, y, by="A")), 3L, output="result rows: \\s+3\n")
 x = data.table(id = c("A", "A", "A", "B", "B"), date = as.IDate(c("2010-01-01", "2012-01-01", "2014-01-01", "2010-01-01", "2012-01-01")))
 y = data.table(id = c("A", "B"), date = as.IDate(c("2013-01-01", "2013-01-01")))
 test(2358.7, nrow(x[y, on = .(id, date <= date), verbose=TRUE]), 4L, output="join columns: id == id, date <= date.*result rows: \\s+4\n")

From 193e7870d51ce64e584620d0dd48ea7f22389ca8 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 8 Jan 2026 16:26:09 +0100
Subject: [PATCH 3/6] update NEWS

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 83dac301dc..4032e03a72 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,7 +16,7 @@
 
 1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR.
 
-2. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display join statistics with `verbose=TRUE`, showing row counts, matched rows, and join columns used, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation.
+2. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display join statistics in verbose mode (`options(datatable.verbose=TRUE)`, or `verbose=TRUE` for `[`), showing row counts, matched rows, result rows, and join columns used, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation.
 
 ### Notes
 

From 853dbcc37703eaac149636dbefc5008c4fddbc51 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 8 Jan 2026 16:29:31 +0100
Subject: [PATCH 4/6] improve comments

---
 R/bmerge.R | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index cd72809008..00113adc32 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -246,9 +246,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
-    op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else strrep("==", length(icols))
+    op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else rep("==", length(icols))  # rep(), not strrep(): one "==" per join column
     join_str = toString(sprintf("%s %s %s", names(x)[xcols], op_symbols, names(i)[icols]))
     num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
-    label_width = 16L
-    line_width = label_width + num_width
-    separator = strrep("-", line_width - 2L)
+    # nchar("rows in x:    ") == 14L
+    separator = strrep("-", 14L + num_width)
 
     catf("Join summary:\n")
     catf("  rows in x:    %*d\n", num_width, nrow_x)

From 6ae25908224eb0a55773df8e74965f38740e63f8 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 8 Jan 2026 16:30:17 +0100
Subject: [PATCH 5/6] restore change

---
 R/merge.R | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/R/merge.R b/R/merge.R
index e723f95803..2484cd9a0f 100644
--- a/R/merge.R
+++ b/R/merge.R
@@ -98,9 +98,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
     # Perhaps not very commonly used, so not a huge deal that the join is redone here.
     missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
-    if (length(missingyidx)) {
-      dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
-    }
+    if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
   }
   # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
   newend = setdiff(nm_y, by.y)

From 5d4c57d46ed5ef5336230387aa3e5642c1011eb9 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 8 Jan 2026 16:31:04 +0100
Subject: [PATCH 6/6] fix man

---
 man/merge.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/merge.Rd b/man/merge.Rd
index 7ed07cf13a..da5bb1efad 100644
--- a/man/merge.Rd
+++ b/man/merge.Rd
@@ -21,7 +21,7 @@ Use the \code{by}, \code{by.x} and \code{by.y} arguments explicitly to override
 \method{merge}{data.table}(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE,
 all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE,
 allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
-incomparables = NULL, verbose=getOption("datatable.verbose", FALSE), \dots)
+incomparables = NULL, \dots)
 }
 
 \arguments{