diff --git a/DESCRIPTION b/DESCRIPTION index bba370c417..3860e0c480 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -106,5 +106,6 @@ Authors@R: c( person(given="@badasahog", role="ctb", comment="GitHub user"), person("Vinit", "Thakur", role="ctb"), person("Mukul", "Kumar", role="ctb"), - person("Ildikó", "Czeller", role="ctb") + person("Ildikó", "Czeller", role="ctb"), + person("Manmita", "Das", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index 23e8d5c873..1ecd21a510 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,10 @@ 1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR. +2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR. + +3. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `list_search=TRUE`, with `list_len_threshold` to avoid scanning extremely long lists, [#2606](https://github.com/Rdatatable/data.table/issues/2606). + ### Notes 1. {data.table} now depends on R 3.5.0 (2018). @@ -32,6 +36,8 @@ 3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR. +4. `sum()` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results! + ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 ### BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index 27c985e44c..1ce6b62cf5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -244,7 +244,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") - if (!missing(by) && !isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") + if (!missing(by) && !(isTRUEorFALSE(showProgress) || (is.numeric(showProgress) && length(showProgress)==1L && showProgress >= 0))) stopf("showProgress must be TRUE, FALSE, or a single non-negative number") # nocov irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -1972,7 +1972,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, as.integer(showProgress)) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 diff --git a/R/tables.R b/R/tables.R index 960c74343f..73adaddb8b 100644 --- a/R/tables.R +++ b/R/tables.R @@ -19,7 +19,8 @@ type_size = function(DT) { } tables = function(mb=type_size, order.col="NAME", width=80L, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE, + list_search=FALSE, list_len_threshold=100000L) { # Prints name, size and colnames of all data.tables in the calling environment by default mb_name = as.character(substitute(mb)) @@ -27,20 +28,89 @@ tables = function(mb=type_size, order.col="NAME", width=80L, names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197 w = which(vapply_1b(obj, is.data.table)) - if (!length(w)) { + + list_info = NULL + if (list_search) { + is_list = vapply_1b(obj, is.list) + dt_mask = logical(length(obj)) + if (length(w)) dt_mask[w] = TRUE + is_df = vapply_1b(obj, is.data.frame) + list_candidates = which(is_list & !dt_mask & !is_df) + + if (length(list_candidates)) { + n_extra = 0L + list_locations = vector("list", length(list_candidates)) + for (j in seq_along(list_candidates)) { + obj_idx = list_candidates[j] + L = obj[[obj_idx]] + if (!is.list(L)) next + lenL = length(L) + if (is.finite(list_len_threshold) && lenL > list_len_threshold) next + elem_is_dt = vapply_1b(L, is.data.table) + idx = which(elem_is_dt) + if (length(idx)) { + list_locations[[j]] = idx + n_extra = n_extra + length(idx) + } + } + + if (n_extra) { + list_info = data.table(NAME=character(n_extra), NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) + extra_row = 0L + for (j in seq_along(list_candidates)) { + obj_idx = list_candidates[j] + idx = list_locations[[j]] + if (is.null(idx) || !length(idx)) next + L = obj[[obj_idx]] + obj_name = names[obj_idx] + elem_names = names(L) + for (k in idx) { + extra_row = extra_row + 1L + DT = L[[k]] + if (!is.null(elem_names) && length(elem_names) >= k && nzchar(elem_names[k])) { + nm = paste0(obj_name, "$", elem_names[k]) + } else { + nm = paste0(obj_name, "[[", k, "]]") + } + set(list_info, extra_row, "NAME", nm) + set(list_info, extra_row, "NROW", nrow(DT)) + set(list_info, extra_row, "NCOL", ncol(DT)) + if (is.function(mb)) set(list_info, extra_row, "MB", as.integer(mb(DT)/1048576L)) + if (!is.null(tt<-names(DT))) set(list_info, extra_row, "COLS", tt) + if (!is.null(tt<-key(DT))) set(list_info, extra_row, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(list_info, extra_row, "INDICES", tt) + } + } + } + } + } + + info = NULL + if (length(w)) { + info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) + for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables + DT = obj[[w[i]]] + set(info, i, "NROW", nrow(DT)) + set(info, i, "NCOL", ncol(DT)) + if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2 + if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done + if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt) + } + } + + if (!is.null(list_info) && nrow(list_info)) { + if (is.null(info)) { + info = list_info + } else { + info = rbind(info, list_info) + } + } + + if (is.null(info) || !nrow(info)) { if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } - info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) - for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables - DT = obj[[w[i]]] - set(info, i, "NROW", nrow(DT)) - set(info, i, "NCOL", ncol(DT)) - if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2 - if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done - if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt) - if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt) - } if (!is.function(mb)) info[,MB:=NULL] if (!index) info[,INDICES:=NULL] if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fcf78e9f36..4cba4ef683 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21978,3 +21978,71 @@ local({ test(2357.1, fread(f), DT) test(2357.2, fread(paste0("file://", f)), DT) }) + +#7571 issue for na.rm on int64 +if (test_bit64) local({ + # integer64 + GForce grouped sum with na.rm = FALSE + # Example 1 from issue: ids 1:8, 9, 9; three leading NAs then 4:10 + dt_short = data.table( + id = c(1:8, 9, 9), + value = c(rep(NA_integer64_, 3L), as.integer64(4:10)) + ) + test(2358.1, options=c(datatable.optimize=2L), + dt_short[, sum(value, na.rm = FALSE), by = id]$V1, + as.integer64(c(NA, NA, NA, 4:8, 19)) + ) + + # Example 2 from issue: ids in pairs, same values; checks multi-row groups + dt_short2 = data.table( + id = rep(1:5, each = 2L), + value = c(rep(NA_integer64_, 3L), as.integer64(4:10)) + ) + test(2358.2, options=c(datatable.optimize=2L), + dt_short2[, sum(value, na.rm = FALSE), by = id]$V1, + as.integer64(c(NA, NA, 11, 15, 19)) + ) + + # Test mean for integer64 with NA + dt_mean = data.table( + id = c(1,1,2,2,3,3), + value = as.integer64(c(NA, NA, NA, 20000000, 5, 3)) + ) + test(2358.3, options=c(datatable.optimize=2L), + dt_mean[, mean(value, na.rm=FALSE), by = id]$V1, + c(NA, NA, 4) + ) + + # GForce sum vs base::sum for integer64 + DT = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE))) + gforce = DT[, .(gforce_sum = sum(value)), by=id] + base = DT[, .(true_sum = base::sum(value)), by=id] + merged = merge(gforce, base, by="id", all=TRUE) + test(2358.4, options=c(datatable.optimize=2L), + merged$gforce_sum, merged$true_sum + ) + + # GForce mean vs base::mean for integer64 + DTm = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE))) + gforce_m = DTm[, .(gforce_mean = mean(value)), by=id] + base_m = DTm[, .(true_mean = base::mean(value)), by=id] + merged_m = merge(gforce_m, base_m, by="id", all=TRUE) + test(2358.5, options=c(datatable.optimize=2L), + merged$gforce_mean, merged$true_mean + ) +}) + +#2606 tables() list_search finds nested data.tables in lists +xenv2 = new.env() +xenv2$L = list( + data.table(a = 1L), + data.table(a = 1:2) +) +test(2360.1, + tables(env = xenv2, silent = TRUE, list_search = TRUE)[, .(NAME, NROW, NCOL)], + data.table(NAME = c("L[[1]]", "L[[2]]"), NROW = c(1L, 2L), NCOL = c(1L, 1L)) +) +test(2360.2, + nrow(tables(env = xenv2, silent = TRUE, list_search = TRUE, list_len_threshold = 1L)), + 0L +) +rm(xenv2) diff --git a/man/data.table.Rd b/man/data.table.Rd index a674ecccb0..cfdcb27068 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -181,7 +181,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } - \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. } + \item{showProgress}{ \code{TRUE} (default when \code{interactive()}) shows a progress indicator with estimated time to completion for lengthy "by" operations, updating every 3 seconds. An integer value controls the update interval in seconds (minimum 3). \code{FALSE} disables the progress indicator. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/src/dogroups.c b/src/dogroups.c index 5200d33f36..d5b9f01bee 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -86,9 +86,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX SEXP SDall = PROTECT(findVar(install(".SDall"), env)); nprotect++; // PROTECT for rchk SEXP SD = PROTECT(findVar(install(".SD"), env)); nprotect++; - const bool showProgress = LOGICAL(showProgressArg)[0]==1 && ngrp > 1; // showProgress only if more than 1 group + int updateTime = INTEGER(showProgressArg)[0]; + const bool showProgress = updateTime > 0 && ngrp > 1; // showProgress only if more than 1 group double startTime = (showProgress) ? wallclock() : 0; // For progress printing, startTime is set at the beginning - double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress + double nextTime = (showProgress) ? startTime + MAX(updateTime, 3) : 0; // wait at least 3 seconds before starting to print progress hashtab * specials = hash_create(3 + ngrpcols + xlength(SDall)); // .I, .N, .GRP plus columns of .BY plus SDall PROTECT(specials->prot); nprotect++; @@ -451,17 +452,17 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity. double now; if (showProgress && (now=wallclock())>=nextTime) { + // # nocov start. Requires long-running test case double avgTimePerGroup = (now-startTime)/(i+1); int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); if (hasPrinted || ETA >= 0) { - // # nocov start. Requires long-running test case if (verbose && !hasPrinted) Rprintf(_("\n")); Rprintf("\r"); // # notranslate. \r is not internationalizable Rprintf(_("Processed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); - // # nocov end } - nextTime = now+1; + nextTime = now+updateTime; hasPrinted = true; + // # nocov end } ansloc += maxn; if (firstalloc) { diff --git a/src/gsumm.c b/src/gsumm.c index 5970f59194..ed0bc1b56c 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -502,13 +502,13 @@ SEXP gsum(SEXP x, SEXP narmArg) const int64_t *my_gx = gx + b*batchSize + pos; const uint16_t *my_low = low + b*batchSize + pos; for (int i=0; i