diff --git a/NEWS.md b/NEWS.md
index 2930f2311..256c7450a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -34,6 +34,8 @@
 
 3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR.
 
+4. `sum()` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results!
+
 ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025
 
 ### BREAKING CHANGE
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index fcf78e9f3..aba5720a6 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21978,3 +21978,55 @@ local({
   test(2357.1, fread(f), DT)
   test(2357.2, fread(paste0("file://", f)), DT)
 })
+
+#7571 issue for na.rm on int64
+if (test_bit64) local({
+  # integer64 + GForce grouped sum with na.rm = FALSE
+  # Example 1 from issue: ids 1:8, 9, 9; three leading NAs then 4:10
+  dt_short = data.table(
+    id = c(1:8, 9, 9),
+    value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
+  )
+  test(2358.1, options=c(datatable.optimize=2L),
+    dt_short[, sum(value, na.rm = FALSE), by = id]$V1,
+    as.integer64(c(NA, NA, NA, 4:8, 19))
+  )
+
+  # Example 2 from issue: ids in pairs, same values; checks multi-row groups
+  dt_short2 = data.table(
+    id = rep(1:5, each = 2L),
+    value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
+  )
+  test(2358.2, options=c(datatable.optimize=2L),
+    dt_short2[, sum(value, na.rm = FALSE), by = id]$V1,
+    as.integer64(c(NA, NA, 11, 15, 19))
+  )
+
+  # Test mean for integer64 with NA
+  dt_mean = data.table(
+    id = c(1,1,2,2,3,3),
+    value = as.integer64(c(NA, NA, NA, 20000000, 5, 3))
+  )
+  test(2358.3, options=c(datatable.optimize=2L),
+    dt_mean[, mean(value, na.rm=FALSE), by = id]$V1,
+    c(NA, NA, 4)
+  )
+
+  # GForce sum vs base::sum for integer64
+  DT = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
+  gforce = DT[, .(gforce_sum = sum(value)), by=id]
+  base = DT[, .(true_sum = base::sum(value)), by=id]
+  merged = merge(gforce, base, by="id", all=TRUE)
+  test(2358.4, options=c(datatable.optimize=2L),
+    merged$gforce_sum, merged$true_sum
+  )
+
+  # GForce mean vs base::mean for integer64; must read merged_m (the mean table), not merged (the sum table). NOTE(review): confirm base::mean() on integer64 yields the same type as the GForce mean
+  DTm = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
+  gforce_m = DTm[, .(gforce_mean = mean(value)), by=id]
+  base_m = DTm[, .(true_mean = base::mean(value)), by=id]
+  merged_m = merge(gforce_m, base_m, by="id", all=TRUE)
+  test(2358.5, options=c(datatable.optimize=2L),
+    merged_m$gforce_mean, merged_m$true_mean
+  )
+})
diff --git a/src/gsumm.c b/src/gsumm.c
index 5970f5919..ed0bc1b56 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -502,13 +502,13 @@ SEXP gsum(SEXP x, SEXP narmArg)
             const int64_t *my_gx = gx + b*batchSize + pos;
             const uint16_t *my_low = low + b*batchSize + pos;
             for (int i=0; i