diff --git a/CODEOWNERS b/CODEOWNERS index bcd1ea095..f121172f7 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -87,3 +87,7 @@ # C code tricks /src/chmatch.c @aitap /src/fread.c @aitap + +# ordering +/src/topn.c @ben-schwen +/man/topn.Rd @ben-schwen diff --git a/NAMESPACE b/NAMESPACE index 361b706d3..7c1155c1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -217,6 +217,8 @@ S3method(format_list_item, data.frame) export(fdroplevels, setdroplevels) S3method(droplevels, data.table) + +export(topn) export(frev) export(.selfref.ok) diff --git a/NEWS.md b/NEWS.md index bec4ada86..03f739612 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,26 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T # user system elapsed # 0.028 0.000 0.005 ``` - 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +21. New function `topn(x,n)` [#3804](https://github.com/Rdatatable/data.table/issues/3804). It returns the indices of the `n` smallest/largest values of a vector `x`. Previously, one had to use `order(x)[1:n]` which produced a full sorting of `x`. Usage of `topn` is advised for large vectors where sorting takes long time. Thanks to Michael Chirico for requesting, and Benjamin Schwendinger for the PR. + + ```R + set.seed(123) + x = rnorm(1e8) + bm = bench::mark(check=FALSE, min_iterations=10, + topn(x, 5L, sorted=TRUE), + topn(x, 5L, sorted=FALSE), + order(x)[1:5] + ) + setDT(bm)[, .(expression, min, median, lapply(time, max), mem_alloc)] + # expression min median V4 mem_alloc + # + # 1: topn(x, 5L, sorted = TRUE) 151.65ms 155.21ms 171ms 0B + # 2: topn(x, 5L, sorted = FALSE) 151.59ms 158.77ms 176ms 0B + # 3: order(x)[1:5] 2.69s 2.77s 2.84s 381MB + ``` ### BUG FIXES diff --git a/R/wrappers.R b/R/wrappers.R index 7683b434f..5effb7e50 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -22,5 +22,7 @@ fitsInInt64 = function(x) .Call(CfitsInInt64R, x) coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy) +topn = function(x, n, na.last=TRUE, decreasing=FALSE, sorted=FALSE) .Call(Ctopn, x, as.integer(n), na.last, decreasing, sorted) +quickn = function(x, n, na.last=TRUE, decreasing=FALSE) .Call(Cquickn, x, as.integer(n), na.last, decreasing) frev = function(x) .Call(Cfrev, x, TRUE) setfrev = function(x) invisible(.Call(Cfrev, x, FALSE)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8b151609b..1cdc27fbf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -76,6 +76,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { .shallow = data.table:::.shallow stopf = data.table:::stopf test = data.table:::test + topn = data.table:::topn uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist warningf = data.table:::warningf @@ -17938,6 +17939,7 @@ DT = data.table(A=letters[1:3], B=4:6, key="A") test(2215.1, DT["b", B], 5L) # has worked forever test(2215.2, DT[factor("b"), B], 5L) # now works too, joining fact/fact, char/fact and fact/char have plenty of tests + # segfault on merge keyed all-NA_character_ due to is.sorted, #5070 DT1 = data.table(x1 = rep(letters[1:4], each=3), x2=NA_character_, key="x2") DT2 = data.table(x1 = letters[1:3]) @@ -21901,3 +21903,49 @@ rm(DT, strings) DT = unserialize(serialize(as.data.table(mtcars), NULL)) test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL]) rm(DT) + +# topn retrieving which.max/which.min for more than 1 element #3804 +set.seed(373L) +x=list(sample(c(-1:1, NA), 10, TRUE), + sample(c(rnorm(3), NA), 10, TRUE), + sample(c(letters[1:3],NA), 10, TRUE), + c(NaN, Inf, -Inf), + c(-Inf, 0, Inf), + c(-Inf, Inf), + c(Inf, -Inf), + c(0, NaN), + c(NaN, 0), + c(3+9i, 10+5i, 8+2i, 10+4i, 3+3i, 1+2i, 5+1i, 8+1i, 8+2i, 10+6i), + c(3, 10+5i, 8, 10+4i, NA, 3, 1+2i, 5+1i, 8+1i, 8, NA)) +testnum = 2352.0 +b = CJ(c(TRUE,FALSE), c(TRUE,FALSE)) +for (i in seq.int(x)) { + l = length(x[[i]]) + s = sample(l-1L, 1L) + for (j in nrow(b)) { + test(testnum, topn(x[[i]], l, na.last=b[[j,1]], decreasing=b[[j,2]], sorted=TRUE), + order(x[[i]], na.last=b[[j,1]], decreasing=b[[j,2]])) + testnum = testnum + 0.01 + test(testnum, topn(x[[i]], s, na.last=b[[j,1]], decreasing=b[[j,2]], sorted=TRUE), + order(x[[i]], na.last=b[[j,1]], decreasing=b[[j,2]])[1L:s]) + testnum = testnum + 0.01 + test(testnum, sort(topn(x[[i]], s, na.last=b[[j,1]], decreasing=b[[j,2]], sorted=FALSE)), + sort(order(x[[i]], na.last=b[[j,1]], decreasing=b[[j,2]])[1L:s])) + testnum = testnum + 0.01 + } +} +test(2352.79, topn(as.raw(1), 1), error="Type 'raw' not supported by topn.") +test(2352.80, topn(1L, 2L), 1L, warning="n should be smaller or equal than length(x) but provided n=2 and length(x)=1.\n Coercing n to length(x).") +test(2352.81, topn(1L, -1L), error="topn(x,n) only implemented for n > 0.") +if (test_bit64) { + x=as.integer64(c(-5, 5, 0, NA, -5, 5, 1e18, NA, 5, 1e-18)) + test(2352.82, topn(x, 10L, na.last=TRUE, decreasing=FALSE, sorted=TRUE), c(1L, 5L, 3L, 10L, 2L, 6L, 9L, 7L, 4L, 8L)) + test(2352.83, topn(x, 10L, na.last=FALSE, decreasing=FALSE, sorted=TRUE), c(4L, 8L, 1L, 5L, 3L, 10L, 2L, 6L, 9L, 7L)) + test(2352.84, topn(x, 10L, na.last=TRUE, decreasing=TRUE, sorted=TRUE), c(7L, 2L, 6L, 9L, 3L, 10L, 1L, 5L, 4L, 8L)) + test(2352.85, topn(x, 10L, na.last=FALSE, decreasing=TRUE, sorted=TRUE), c(4L, 8L, 7L, 2L, 6L, 9L, 3L, 10L, 1L, 5L)) + # answers are sort(head(x, 2)) of answer before + test(2352.86, sort(topn(x, 2L, na.last=TRUE, decreasing=FALSE, sorted=FALSE)), c(1L, 5L)) + test(2352.87, sort(topn(x, 2L, na.last=FALSE, decreasing=FALSE, sorted=FALSE)), c(4L, 8L)) + test(2352.88, sort(topn(x, 2L, na.last=TRUE, decreasing=TRUE, sorted=FALSE)), c(2L, 7L)) + test(2352.89, sort(topn(x, 2L, na.last=FALSE, decreasing=TRUE, sorted=FALSE)), c(4L, 8L)) +} diff --git a/man/topn.Rd b/man/topn.Rd new file mode 100644 index 000000000..442360ba2 --- /dev/null +++ b/man/topn.Rd @@ -0,0 +1,45 @@ +\name{topn} +\alias{topn} +\alias{which.max} +\alias{which.min} +\title{ Top n values index } +\description{ + \code{topn} returns the indices of the smallest (resp. largest) \code{n} values of x. This is an extension \code{\link{which.min}} (\code{\link{which.max}}) which only return the index of the minimum (resp. maximum). + + Especially, for large vectors this method will be faster and less memory intensive than \code{order} since no full sort is performed. + + \code{bit64::integer64} type is also supported. +} + +\usage{ +topn(x, n, na.last=TRUE, decreasing=FALSE, sorted=FALSE) +} +\arguments{ + \item{x}{ A numeric, complex, character or logical vector. } + \item{n}{ A numeric vector length 1. How many indices to select. } + \item{na.last}{ Control treatment of \code{NA}s. If \code{TRUE}, missing values in the data are put last; if \code{FALSE}, they are put first. } + \item{decreasing}{ Logical. Default is \code{FALSE}. Indicating whether the order should be increasing or decreasing. } + \item{sorted}{ Logical. Default is \code{FALSE}. Indicating whether order should be sorted with respect to decreasing. } +} + +\value{ + An integer vector giving the indicies of the \code{n} smallest (largest) for \code{decreasing=FALSE (TRUE)} elements of \code{x}. +} + +\examples{ +x = c(1:4, 0:5, 11) +# indices of smallest 3 values +topn(x, 3) +# indices of largest 3 values +topn(x, 3, decreasing = TRUE) + +## NA's can be put to front or back +x = c(NA, 1:4) +topn(x, 5) +topn(x, 5, na.last=FALSE) + +} +\seealso{ + \code{\link{data.table}}, \code{\link{order}}, \code{\link{which.max}}, \code{\link{which.min}} +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index 0bfcb09d8..47b44ad89 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -363,6 +363,10 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// topn.c +SEXP topn(SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP quickn(SEXP, SEXP, SEXP, SEXP); + // hash.c typedef struct { SEXP prot; // make sure to PROTECT() while the table is in use diff --git a/src/init.c b/src/init.c index ecdc64e39..d55afa209 100644 --- a/src/init.c +++ b/src/init.c @@ -147,6 +147,8 @@ R_CallMethodDef callMethods[] = { {"Cdt_has_zlib", (DL_FUNC)&dt_has_zlib, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, +{"Ctopn", (DL_FUNC)&topn, -1}, +{"Cquickn", (DL_FUNC)&quickn, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, {"Ccbindlist", (DL_FUNC) &cbindlist, -1}, diff --git a/src/topn.c b/src/topn.c new file mode 100644 index 000000000..a58c0bc41 --- /dev/null +++ b/src/topn.c @@ -0,0 +1,199 @@ +#include "data.table.h" + +static inline void swap(int *a, int *b) { int tmp=*a; *a=*b; *b=tmp; } + +static inline bool icmp(const int *x, int i, int j, bool min, bool nalast) { + if (x[i]==x[j]) return i > j; + if (x[i]==NA_INTEGER) return nalast; + if (x[j]==NA_INTEGER) return !nalast; + return min ? x[i] < x[j] : x[i] > x[j]; +} + +static inline bool dcmp(const double *x, int i, int j, bool min, bool nalast) { + if (x[i]==x[j] || (isnan(x[i]) && isnan(x[j]))) return i > j; + if (isnan(x[i])) return nalast; + if (isnan(x[j])) return !nalast; + return min ? x[i] < x[j] : x[i] > x[j]; +} + +static inline bool i64cmp(const int64_t *x, int i, int j, bool min, bool nalast) { + if (x[i]==x[j]) return i > j; + if (x[i]==NA_INTEGER64) return nalast; + if (x[j]==NA_INTEGER64) return !nalast; + return min ? x[i] < x[j] : x[i] > x[j]; +} + +static inline bool scmp(const SEXP *restrict x, int i, int j, bool min, bool nalast) { + if (strcmp(CHAR(x[i]), CHAR(x[j])) == 0) return i > j; + if (x[i]==NA_STRING) return nalast; + if (x[j]==NA_STRING) return !nalast; + return min ? strcmp(CHAR(x[i]),CHAR(x[j]))<0 : strcmp(CHAR(x[i]),CHAR(x[j]))>0; +} + +static inline bool ccmp(const Rcomplex *x, int i, int j, bool min, bool nalast) { + if (ISNAN_COMPLEX(x[i]) && ISNAN_COMPLEX(x[j])) return i > j; + if (x[i].r==x[j].r) { + if (x[i].i==x[j].i) return i > j; + return min ? x[i].i < x[j].i : x[i].i > x[j].i; + } + if (ISNAN_COMPLEX(x[i])) return nalast; + if (ISNAN_COMPLEX(x[j])) return !nalast; + return min ? x[i].r < x[j].r : x[i].r > x[j].r; +} + +// compare value of node with values of left/right child nodes and sift value down if value of child node is smaller than parent (for minheap) +#undef SIFT +#define SIFT(CMP) { \ + int smallest, l, r; \ + while(true) { \ + smallest = k; \ + l = (k << 1) + 1; \ + r = l+1; \ + if (l < len && CMP(VAL,INDEX[l],INDEX[smallest],min,nalast)) \ + smallest = l; \ + if (r < len && CMP(VAL,INDEX[r],INDEX[smallest],min,nalast)) \ + smallest = r; \ + if (smallest != k) { \ + swap(&INDEX[k], &INDEX[smallest]); \ + k = smallest; \ + } else { \ + break; \ + } \ + } \ +} + +// for finding decreasing topn build minheap and add values if they exceed +// minimum by overwriting minimum and following down sifting +#undef HEAPN +#define HEAPN(CTYPE, RTYPE, CMP, SORTED) { \ + const CTYPE *restrict VAL = (const CTYPE *)RTYPE(x); \ + for (int i=n/2; i>=0; --i) { k=i; len=n; SIFT(CMP); } \ + for (int i=n; i 0.")); + if (!IS_TRUE_OR_FALSE(ascArg)) error(_("%s must be TRUE or FALSE"), "decreasing"); + if (!IS_TRUE_OR_FALSE(naArg)) error(_("%s must be TRUE or FALSE"), "na.last"); + if (!IS_TRUE_OR_FALSE(sortedArg)) error(_("%s must be TRUE or FALSE"), "sorted"); + + const int xlen = LENGTH(x); + int n = INTEGER(nArg)[0]; + if (n > xlen) { + warning(_("n should be smaller or equal than length(x) but provided n=%d and length(x)=%d.\n Coercing n to length(x)."), n, xlen); + n = xlen; + } + + const bool min = LOGICAL(ascArg)[0]; + const bool nalast = LOGICAL(naArg)[0]; + const bool sorted = LOGICAL(sortedArg)[0]; + + SEXP ans; + int k, len; + ans = PROTECT(allocVector(INTSXP, n)); + int *restrict ians = INTEGER(ans); + int *restrict INDEX = malloc(n*sizeof(int)); + if (!INDEX) error(_("Internal error: Couldn't allocate memory for heap indices.")); // # nocov + for (int i=0; i> 1; \ + SWAP(ix+mid, ix+l + 1); \ + if (CMP(ix,l,ir,min,nalast)) { \ + SWAP(ix+l, ix+ir); \ + } \ + if (CMP(ix,l+1,ir,min,nalast)) { \ + SWAP(ix+l+1, ix+ir); \ + } \ + if (CMP(ix,l,l+1,min,nalast)) { \ + SWAP(ix+l, ix+l+1); \ + } \ + unsigned long i = l + 1, j = ir; \ + for (;;) { \ + do i++; while (CMP(ix,l+1,i,min,nalast)); \ + do j--; while (CMP(ix,j,l+1,min,nalast)); \ + if (j < i) break; \ + SWAP(ix+i, ix+j); \ + } \ + SWAP(ix+l+1, ix+j); \ + if (j >= n) ir = j - 1; \ + if (j <= n) l = i; \ + } \ + } \ + memcpy(ians, ix, n * sizeof(CTYPE)) + +static inline void iswap(int *a, int *b) {int tmp=*a; *a=*b; *b=tmp;} +static inline void dswap(double *a, double *b) {double tmp=*a; *a=*b; *b=tmp;} +static inline void i64swap(int64_t *a, int64_t *b) {int64_t tmp=*a; *a=*b; *b=tmp;} +static inline void cswap(Rcomplex *a, Rcomplex *b) {Rcomplex tmp=*a; *a=*b; *b=tmp;} +static inline void sswap(SEXP *a, SEXP *b) {SEXP tmp=*a; *a=*b; *b=tmp;} + +SEXP quickn(SEXP x, SEXP nArg, SEXP naArg, SEXP ascArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<=0 || INTEGER(nArg)[0]==NA_INTEGER) error(_("topn(x,n) only implemented for n > 0.")); + if (!IS_TRUE_OR_FALSE(ascArg)) error(_("%s must be TRUE or FALSE"), "decreasing"); + if (!IS_TRUE_OR_FALSE(naArg)) error(_("%s must be TRUE or FALSE"), "na.last"); + + const int xlen = LENGTH(x); + int n = INTEGER(nArg)[0]; + x = PROTECT(duplicate(x)); + + const bool min = LOGICAL(ascArg)[0]; + const bool nalast = LOGICAL(naArg)[0]; + + SEXP ans; + ans = PROTECT(allocVector(TYPEOF(x), n)); + switch(TYPEOF(x)) { + case LGLSXP: case INTSXP: { QUICKN(int, INTEGER, icmp, iswap); } break; + case REALSXP: { + if (INHERITS(x, char_integer64)) { QUICKN(int64_t, REAL, i64cmp, i64swap); } + else { QUICKN(double, REAL, dcmp, dswap); } break; } + case CPLXSXP: { QUICKN(Rcomplex, COMPLEX, ccmp, cswap); } break; + case STRSXP: { QUICKN(SEXP, STRING_PTR, scmp, sswap); } break; + default: + error(_("Type '%s' not supported by quickn."), type2char(TYPEOF(x))); + } + copyMostAttrib(x, ans); + UNPROTECT(2); + return(ans); +}