Skip to content
Merged
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR.

4. `sum(<int64 column>)` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results!

## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025

### BREAKING CHANGE
Expand Down
52 changes: 52 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21978,3 +21978,55 @@ local({
test(2357.1, fread(f), DT)
test(2357.2, fread(paste0("file://", f)), DT)
})

# #7571: GForce sum() of an integer64 column gave wrong per-group results when NAs were present
if (test_bit64) local({
# integer64 + GForce grouped sum with na.rm = FALSE
# Example 1 from issue: ids 1:8, 9, 9; three leading NAs then 4:10
# ids 1-8 each own a single row; id 9 owns the last two rows (values 9 and 10).
dt_short = data.table(
id = c(1:8, 9, 9),
value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
)
# Expected per group: NA for ids 1-3 (their only value is NA), the lone value
# for ids 4-8, and 9 + 10 = 19 for id 9. An NA in one group must not affect
# the groups that follow it.
test(2358.1, options=c(datatable.optimize=2L),
dt_short[, sum(value, na.rm = FALSE), by = id]$V1,
as.integer64(c(NA, NA, NA, 4:8, 19))
)

# Example 2 from issue: ids in pairs, same values; checks multi-row groups
# Rows pair up as (NA, NA), (NA, 4), (5, 6), (7, 8), (9, 10) for ids 1-5.
dt_short2 = data.table(
id = rep(1:5, each = 2L),
value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
)
# Expected per group: ids 1 and 2 contain at least one NA so their sum is NA;
# ids 3-5 are NA-free and sum to 11, 15 and 19 respectively.
test(2358.2, options=c(datatable.optimize=2L),
dt_short2[, sum(value, na.rm = FALSE), by = id]$V1,
as.integer64(c(NA, NA, 11, 15, 19))
)

# Test mean for integer64 with NA
# Rows pair up as (NA, NA), (NA, 20000000), (5, 3) for ids 1-3.
dt_mean = data.table(
id = c(1,1,2,2,3,3),
value = as.integer64(c(NA, NA, NA, 20000000, 5, 3))
)
# Expected per group: NA for ids 1 and 2 (each contains an NA), and
# (5 + 3) / 2 = 4 for id 3. The expected vector is a plain numeric vector,
# not integer64.
test(2358.3, options=c(datatable.optimize=2L),
dt_mean[, mean(value, na.rm=FALSE), by = id]$V1,
c(NA, NA, 4)
)

# GForce sum vs base::sum for integer64
DT = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
gforce = DT[, .(gforce_sum = sum(value)), by=id]
base = DT[, .(true_sum = base::sum(value)), by=id]
merged = merge(gforce, base, by="id", all=TRUE)
test(2358.4, options=c(datatable.optimize=2L),
merged$gforce_sum, merged$true_sum
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can make this just

test(2358.4, options=c(datatable.optimize=2L),
     merged$gforce_sum, merged$true_sum)

)

# GForce mean vs a double-precision reference mean for integer64
DTm = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
gforce_m = DTm[, .(gforce_mean = mean(value)), by=id]
# The reference is computed on as.numeric(value) so that it is a plain double
# column, matching the type test 2358.3 expects from the grouped integer64 mean.
# NOTE(review): bit64's mean method appears to return integer64 (a truncated
# mean), which would not be comparable to the double result -- confirm.
base_m = DTm[, .(true_mean = base::mean(as.numeric(value))), by=id]
merged_m = merge(gforce_m, base_m, by="id", all=TRUE)
# Compare the columns of merged_m. The earlier `merged` table (from the sum
# check) has neither gforce_mean nor true_mean, so reading them from it yields
# NULL on both sides and the test would pass without checking anything.
test(2358.5, options=c(datatable.optimize=2L),
  merged_m$gforce_mean, merged_m$true_mean
)
})
12 changes: 6 additions & 6 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -502,13 +502,13 @@ SEXP gsum(SEXP x, SEXP narmArg)
const int64_t *my_gx = gx + b*batchSize + pos;
const uint16_t *my_low = low + b*batchSize + pos;
for (int i=0; i<howMany; i++) {
const int64_t elem = my_gx[i];
if (elem!=INT64_MIN) {
_ans[my_low[i]] += elem;
} else {
_ans[my_low[i]] = INT64_MIN;
break;
if (_ans[my_low[i]] == INT64_MIN) continue;
const int64_t b = my_gx[i];
if (b == INT64_MIN) {
if (!narm) _ans[my_low[i]] = INT64_MIN;
continue;
}
_ans[my_low[i]] += b;
}
}
}
Expand Down
Loading