diff --git a/NAMESPACE b/NAMESPACE index 8381a14a7..7d51d3450 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%") export(timetaken) -export(truelength, setalloccol, alloc.col, ":=", let) +export(truelength, setalloccol, setallocrow, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) export(setorder, setorderv) export(setNumericRounding, getNumericRounding) diff --git a/R/data.table.R b/R/data.table.R index a989538b1..f516aba6b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2850,6 +2850,10 @@ setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbos ans } +setallocrow = function(DT, n=0L) { + invisible(.Call(Callocrowwrapper, DT, as.integer(n))) +} + selfrefok = function(DT,verbose=getOption("datatable.verbose")) { .Call(Cselfrefokwrapper,DT,verbose) } diff --git a/man/truelength.Rd b/man/truelength.Rd index a85f78b1b..ac19a54d4 100644 --- a/man/truelength.Rd +++ b/man/truelength.Rd @@ -2,6 +2,7 @@ \alias{truelength} \alias{setalloccol} \alias{alloc.col} +\alias{setallocrow} \title{ Over-allocation access } \description{ These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias to \code{setalloccol}. We recommend to use \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place. 
@@ -14,11 +15,14 @@ setalloccol(DT, alloc.col(DT, n = getOption("datatable.alloccol"), # default: 1024L verbose = getOption("datatable.verbose")) # default: FALSE +setallocrow(DT, n = 0L) } \arguments{ \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. } \item{DT}{ A \code{data.table}. } -\item{n}{ The number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. } +\item{n}{ For \code{setalloccol} and \code{alloc.col}: the number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. + + For \code{setallocrow}: the number of rows to over-allocate. If \code{n > 0}, allocates capacity for current rows plus \code{n} additional rows. If \code{n == 0} (default), shrinks columns to exact current size to free excess memory. } \item{verbose}{ Output status and information. } } \details{ @@ -34,6 +38,12 @@ alloc.col(DT, (perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}. Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy. + + \code{setallocrow} is a utility function that prepares columns for fast row operations (delete or insert) by reference and manages row capacity. (Note that 'insert' by reference is not yet implemented.) + Before deleting or inserting rows by reference, columns must be resizable. + \code{setallocrow} ensures all columns are in the appropriate state by converting ALTREP columns to materialized form and reallocating + columns to have the target capacity. 
When \code{n > 0}, columns are over-allocated with extra capacity for future row additions. + When \code{n == 0}, columns are shrunk to exact size to free unused memory. This operation modifies \code{DT} by reference. } \value{ \code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. Currently, it is just the list vector of column @@ -43,6 +53,8 @@ alloc.col(DT, \code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to going to add a lot of columns in a loop. It also returns the new \code{DT}, for convenience in compound queries. + + \code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable; the result is returned invisibly. } \seealso{ \code{\link{copy}} } \examples{ diff --git a/src/assign.c b/src/assign.c index 05a55cb5a..f979cc2fe 100644 --- a/src/assign.c +++ b/src/assign.c @@ -592,7 +592,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"), i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols)); } - thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below. + thisvalue = copyAsPlain(thisvalue, -1); // PROTECT not needed as assigned as element to protected list below. } else { if (verbose) Rprintf(_("Direct plonk of unnamed RHS, no copy. MAYBE_REFERENCED==%d, MAYBE_SHARED==%d\n"), MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue)); // e.g. DT[,a:=as.character(a)] as tested by 754.5 } diff --git a/src/coalesce.c b/src/coalesce.c index 10b7b7757..cd0758109 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -52,7 +52,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { error(_("Item %d is length %d but the first item is length %d. 
Only singletons are recycled."), i+2, length(item), nrow); } if (!inplace) { - first = PROTECT(copyAsPlain(first)); nprotect++; + first = PROTECT(copyAsPlain(first, -1)); nprotect++; if (verbose) Rprintf(_("coalesce copied first item (inplace=FALSE)\n")); } const void **valP = (const void **)R_alloc(nval, sizeof(*valP)); diff --git a/src/data.table.h b/src/data.table.h index e7ccc55d3..b970272b0 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -330,7 +330,8 @@ bool allNA(SEXP x, bool errorForBadType); SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); void copyVectorElements(SEXP dst, SEXP src, R_xlen_t n, bool deep_copy, const char *caller); -SEXP copyAsPlain(SEXP x); +SEXP copyAsPlain(SEXP x, R_xlen_t overalloc); +SEXP allocrow(SEXP dt, R_xlen_t n); void copySharedColumns(SEXP x); SEXP lock(SEXP x); SEXP unlock(SEXP x); @@ -406,6 +407,7 @@ SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP copy(SEXP); SEXP setdt_nrows(SEXP); SEXP alloccolwrapper(SEXP, SEXP, SEXP); +SEXP allocrowwrapper(SEXP, SEXP); SEXP selfrefokwrapper(SEXP, SEXP); SEXP truelength(SEXP); SEXP setcharvec(SEXP, SEXP, SEXP); diff --git a/src/dogroups.c b/src/dogroups.c index 06dfe84be..00480c9f9 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -346,7 +346,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX target = VECTOR_ELT(dt, colj); bool copied = false; if (isNewList(target) && anySpecialStatic(RHS, specials)) { // see comments in anySpecialStatic() - RHS = PROTECT(copyAsPlain(RHS)); + RHS = PROTECT(copyAsPlain(RHS, -1)); copied = true; } const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, ""); @@ -452,7 +452,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } bool copied = false; if (isNewList(target) && anySpecialStatic(source, specials)) { // see comments in anySpecialStatic() - source = PROTECT(copyAsPlain(source)); + source 
= PROTECT(copyAsPlain(source, -1)); copied = true; } memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, ""); diff --git a/src/fmelt.c b/src/fmelt.c index 287ba4d0d..7db6b0991 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -811,7 +811,7 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP // edge case no measure.vars if (!data.lmax) { SEXP tt = PROTECT(shallowwrapper(DT, data.idcols)); protecti++; - ans = PROTECT(copyAsPlain(tt)); protecti++; + ans = PROTECT(copyAsPlain(tt, -1)); protecti++; } else { ansvals = PROTECT(getvaluecols(DT, dtnames, LOGICAL(valfactor)[0], verbose, &data)); protecti++; ansvars = PROTECT(getvarcols(DT, dtnames, LOGICAL(varfactor)[0], verbose, &data)); protecti++; diff --git a/src/init.c b/src/init.c index 13421998b..23136f603 100644 --- a/src/init.c +++ b/src/init.c @@ -95,6 +95,7 @@ static const R_CallMethodDef callMethods[] = { {"CconvertNegAndZeroIdx", (DL_FUNC)&convertNegAndZeroIdx, -1}, {"Cfrank", (DL_FUNC)&frank, -1}, {"Cdt_na", (DL_FUNC)&dt_na, -1}, + {"Callocrowwrapper", (DL_FUNC)&allocrowwrapper, 2}, {"Clookup", (DL_FUNC)&lookup, -1}, {"Coverlaps", (DL_FUNC)&overlaps, -1}, {"Cwhichwrapper", (DL_FUNC)&whichwrapper, -1}, diff --git a/src/reorder.c b/src/reorder.c index 8fe682e86..61a65b20d 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -24,7 +24,7 @@ SEXP reorder(SEXP x, SEXP order) error(_("Column %d is length %d which differs from length of column 1 (%d). 
Invalid data.table."), i+1, length(v), nrow); if (RTYPE_SIZEOF(v) > maxSize) maxSize=RTYPE_SIZEOF(v); - if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v)); + if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v, -1)); } copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { @@ -40,7 +40,7 @@ SEXP reorder(SEXP x, SEXP order) if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order)); // # notranslate int nprotect = 0; - if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand + if (ALTREP(order)) { order=PROTECT(copyAsPlain(order, -1)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand const int *restrict idx = INTEGER_RO(order); int i=0; diff --git a/src/subset.c b/src/subset.c index d1381223b..ea6a402da 100644 --- a/src/subset.c +++ b/src/subset.c @@ -313,7 +313,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md for (int i=0; i= 0: resizable vector with capacity = length(x) + overalloc if (isNull(x)) { // deal with up front because isNewList(R_NilValue) is true @@ -262,7 +266,14 @@ SEXP copyAsPlain(SEXP x) { return duplicate(x); } const int64_t n = XLENGTH(x); - SEXP ans = PROTECT(allocVector(TYPEOF(x), n)); + SEXP ans; + if (overalloc == -1) { + ans = PROTECT(allocVector(TYPEOF(x), n)); + } else { + const R_xlen_t capacity = n + overalloc; + ans = PROTECT(R_allocResizableVector(TYPEOF(x), capacity)); + R_resizeVector(ans, n); + } // aside: unlike R's duplicate we do not copy truelength here; important for dogroups.c which uses negative truelenth to mark its specials if (ALTREP(ans)) internal_error(__func__, "copyAsPlain returning ALTREP for type '%s'", type2char(TYPEOF(x))); // # nocov
@@ -277,6 +288,62 @@ SEXP copyAsPlain(SEXP x) {
   return ans;
 }
 
+// allocrow: make every column of dt a resizable vector with room for n rows beyond its current length.
+// A column is replaced by copyAsPlain(col, n) when it is not resizable or its capacity differs from
+// length+n, so n==0 leaves exact-size resizable columns. Errors if dt is not a data.table, if n<0, or
+// if a column is not a vector. Column slots of dt are overwritten in place and dt itself is returned.
+SEXP allocrow(SEXP dt, R_xlen_t n) {
+  if (!INHERITS(dt, char_datatable))
+    error(_("input to allocrow is not a data.table")); // #nocov
+
+  if (n < 0)
+    error(_("n must be non-negative in allocrow")); // #nocov
+
+  if (!xlength(dt)) return dt; // zero-column data.table
+
+  const bool verbose = GetVerbose();
+  int n_modified = 0;
+
+  for (R_xlen_t i = 0; i < xlength(dt); i++) {
+    SEXP col = VECTOR_ELT(dt, i);
+    if (!isVector(col))
+      error(_("Cannot make non-vector column %lld resizable"), (long long)(i + 1)); // #nocov
+
+    // xlength (not length) so the R_xlen_t values below are not read through an int-returning accessor
+    const R_xlen_t currentLength = xlength(col);
+    const R_xlen_t currentCapacity = R_isResizable(col) ? R_maxLength(col) : currentLength;
+    const R_xlen_t targetCapacity = currentLength + n;
+
+    // Only reallocate if not resizable, or capacity differs from target
+    if (!R_isResizable(col) || currentCapacity != targetCapacity) {
+      SEXP newcol = PROTECT(copyAsPlain(col, n));
+      SET_VECTOR_ELT(dt, i, newcol);
+      UNPROTECT(1);
+      n_modified++;
+    }
+  }
+
+  if (verbose) {
+    if (n_modified > 0) {
+      if (n > 0) {
+        Rprintf(Pl_(n_modified,
+                    "Modified %d column (overallocated %lld rows)\n",
+                    "Modified %d columns (overallocated %lld rows)\n"),
+                n_modified, (long long)n);
+      } else {
+        Rprintf(Pl_(n_modified,
+                    "Modified %d column (shrunk to exact size)\n",
+                    "Modified %d columns (shrunk to exact size)\n"),
+                n_modified);
+      }
+    } else {
+      Rprintf(_("allocrow had no effect, all columns already at target size\n"));
+    }
+  }
+
+  return dt;
+}
+
 void copySharedColumns(SEXP x) {
   const int ncol = length(x);
   if (!isNewList(x) || ncol==1) return;
@@ -301,7 +368,7 @@ void copySharedColumns(SEXP x) {
   if (nShared) {
     for (int i=0; i