Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR.

2. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display join statistics when verbose output is enabled (`verbose=TRUE` in `[`, or `options(datatable.verbose=TRUE)`), showing the row counts of both tables, the number of matched rows, the join columns used, and the number of result rows, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation.

### Notes

1. {data.table} now depends on R 3.5.0 (2018).
Expand Down
37 changes: 36 additions & 1 deletion R/bmerge.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ coerce_col = function(dt, col, from_type, to_type, from_name, to_name, from_deta
set(dt, j=col, value=cast_with_attrs(dt[[col]], cast_fun))
}

bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose, notjoin=FALSE)
{
if (roll != 0.0 && length(icols)) {
last_x_idx = tail(xcols, 1L)
Expand Down Expand Up @@ -224,6 +224,41 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
# TO DO: xo could be moved inside Cbmerge

# join statistics
# Printed only in verbose mode. Summarises the merge result held in `ans`:
# as used below, ans$starts marks (per row of i) where its matches begin, with
# 0L / NA meaning "no match", and ans$lens holds the number of matches.
if (verbose) {
  nrow_x = nrow(x)
  nrow_i = nrow(i)
  inner_join = is.null(nomatch) || identical(nomatch, 0L)
  # unmatched rows of i are flagged by 0L for inner joins and by NA otherwise
  idx = if (inner_join) ans$starts != 0L else !is.na(ans$starts)
  matched_i = sum(idx)

  if (notjoin) {
    # Anti-join: count rows in x that were NOT matched.
    # One row of i may match several rows of x (lens > 1), and several rows of i
    # may hit the same rows of x, so count the distinct x positions covered by
    # the matched ranges [starts, starts + lens - 1] rather than the distinct starts.
    # NOTE(review): assumes starts/lens describe contiguous ranges, which is how
    # the inner-join branch below already interprets them.
    if (matched_i > 0L) {
      hit_starts = ans$starts[idx]
      hit_lens = ans$lens[idx]
      hit_rows = rep.int(hit_starts, hit_lens) + sequence(hit_lens) - 1L
      result_rows = nrow_x - length(unique(hit_rows))
    } else {
      result_rows = nrow_x
    }
  } else if (inner_join) {
    # Inner join: sum lengths for matched rows only
    result_rows = if (matched_i > 0L) sum(ans$lens[idx]) else 0L
  } else {
    # Left join: sum all lengths (includes NAs for unmatched)
    result_rows = sum(ans$lens)
  }

  # One operator per join column. rep() is required here: strrep() would paste
  # the copies together into a single string such as "====" for two columns.
  op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else rep("==", length(icols))
  join_str = toString(sprintf("%s %s %s", names(x)[xcols], op_symbols, names(i)[icols]))
  # width is shared by the numbers and the join description so the separator spans both
  num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
  # nchar("rows in x: ") == 14L
  # (keep 14L in sync with the label width used in the catf() calls below)
  separator = strrep("-", 14L + num_width)

  catf("Join summary:\n")
  catf(" rows in x: %*d\n", num_width, nrow_x)
  catf(" rows in i: %*d\n", num_width, nrow_i)
  catf(" matched rows: %*d\n", num_width, matched_i)
  catf(" join columns: %s\n", join_str)
  catf(" %s\n", separator)
  catf(" result rows: %*d\n", num_width, result_rows)
  flush.console()
}

ans$xo = xo # for further use by [.data.table
ans
}
2 changes: 1 addition & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ replace_dot_alias = function(e) {
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
i = .shallow(i, retain.key = TRUE)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose, notjoin=notjoin)
if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
Expand Down
19 changes: 19 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21978,3 +21978,22 @@ local({
test(2357.1, fread(f), DT)
test(2357.2, fread(paste0("file://", f)), DT)
})

# join statistics #4677
# Each test checks both the number of rows actually returned by the join and
# that the verbose "Join summary" reports the matching figure.
x = data.table(A = 1:5, B = 6:10)
y = data.table(A = c(1L, 1L, 4L), C = LETTERS[c(1L, 2L, 4L)])
# y is i: all three of its rows find exactly one match in x
test(2358.1, nrow(x[y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
# x is i: A=1 matches two rows of y, A=4 matches one, A=2,3,5 are kept as unmatched -> 2+1+3
test(2358.2, nrow(y[x, on = "A", verbose=TRUE]), 6L, output="result rows: \\s+6\n")
# same join with nomatch=NULL (inner join): unmatched rows of x are dropped -> 2+1
test(2358.3, nrow(y[x, on = "A", nomatch=NULL, verbose=TRUE]), 3L, output="result rows: \\s+3\n")
# anti-join: rows of x whose A is not present in y (A=2,3,5)
test(2358.4, nrow(x[!y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
# self-join with duplicated key: the two A=1 rows match each other (2*2) plus A=4 (1)
test(2358.5, nrow(y[y, on = "A", allow.cartesian=TRUE, verbose=TRUE]), 5L, output="result rows: \\s+5\n")
# merge() path, with verbosity switched on through the option rather than an argument
test(2358.6, options=c(datatable.verbose=TRUE), nrow(merge(x, y, by="A")), 3L, output="result rows: \\s+3\n")
# non-equi join: the summary must print the operator used for each join column
x = data.table(id = c("A", "A", "A", "B", "B"), date = as.IDate(c("2010-01-01", "2012-01-01", "2014-01-01", "2010-01-01", "2012-01-01")))
y = data.table(id = c("A", "B"), date = as.IDate(c("2013-01-01", "2013-01-01")))
test(2358.7, nrow(x[y, on = .(id, date <= date), verbose=TRUE]), 4L, output="join columns: id == id, date <= date.*result rows: \\s+4\n")
# edge case: x has zero rows, so every row of i is returned unmatched
x = data.table(A = integer(0))
y = data.table(A = 1:3)
test(2358.8, nrow(x[y, on="A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
# edge case: no keys in common and nomatch=NULL -> nothing matched, empty result
x = data.table(A = 1:3)
y = data.table(A = 4:6)
test(2358.9, nrow(x[y, on="A", nomatch=NULL, verbose=TRUE]), 0L, output="matched rows: \\s+0\n")