Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

### NEW FEATURES

1. `nafill()`, `setnafill()` extended to work on logical and factor vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR.
1. `nafill()`, `setnafill()` extended to work on logical and factor vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). `nafill()` also works on character vectors; `setnafill()` does not yet support them. Thanks @jangorecki for the request and @jangorecki and @MichaelChirico for the PRs.

2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR.

Expand Down
121 changes: 93 additions & 28 deletions inst/tests/nafill.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ x = 1:10
test(3.01, nafill(x, "locf", fill=0L), x)
test(3.02, setnafill(list(copy(x)), "locf", fill=0L), list(x))
test(3.03, setnafill(x, "locf"), error="in-place update is supported only for list")
test(3.04, nafill(letters[1:5], fill=0), error="must be logical/numeric type, or list/data.table")
test(3.05, setnafill(list(letters[1:5]), fill=0), error="must be logical/numeric type, or list/data.table")
test(3.04, nafill(as.raw(x), fill=0), error="not supported")
test(3.05, setnafill(list(as.raw(x)), fill=0), error="not supported")
test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1.*fcoalesce")
test(3.07, nafill(x, "locf", fill=1:2), error="fill must be a vector of length 1.*x\\.$")
test(3.08, nafill(x, fill="asd"), x, warning=c("Coercing.*character.*integer","NAs introduced by coercion"))
Expand Down Expand Up @@ -324,42 +324,107 @@ test(11.09, coerceAs(1L, a), error="must not be matrix or array")
x = c(NA, NA, TRUE, FALSE, NA, NA, FALSE, TRUE, NA, NA)
test(12.01, nafill(x, "locf"), c(NA, NA, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE))
test(12.02, nafill(x, "nocb"), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, NA, NA))
test(12.03, nafill(x, fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE))
test(12.04, nafill(x, fill=0L), c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE))
test(12.05, nafill(x, fill=5.0), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE")
test(12.06, nafill(x, fill=Inf), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE")
test(12.07, nafill(x, fill=NA), x)
test(12.08, nafill(x, fill=NA_integer_), x)
test(12.09, nafill(x, fill=NA_real_), x)
test(12.10, nafill(x, fill=NaN), x)
test(12.03, nafill(x, "locf", fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE))
test(12.04, nafill(x, "nocb", fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE))
test(12.05, nafill(x, fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE))
test(12.06, nafill(x, fill=0L), c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE))
test(12.07, nafill(x, fill=5.0), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE")
test(12.08, nafill(x, fill=Inf), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE")
test(12.09, nafill(x, fill=NA), x)
test(12.10, nafill(x, fill=NA_integer_), x)
test(12.11, nafill(x, fill=NA_real_), x)
test(12.12, nafill(x, fill=NaN), x)

## factor input
x = rep(NA_character_, 10L)
x[c(3:4, 7:8)] = c("a", "b", "a", "c")
x = as.factor(x)
test(13.01, nafill(x, "locf"), replace(replace(x, 5:6, "b"), 9:10, "c"))
test(13.02, nafill(x, "nocb"), replace(x, c(1:2, 5:6), "a"))
test(13.03, nafill(x, "locf", fill="b"), replace(replace(x, c(1:2, 5:6), "b"), 9:10, "c"))
test(13.04, nafill(x, "nocb", fill="a"), replace(x, c(1:2, 5:6, 9:10), "a"))
x_fill_a = replace(x, c(1:2, 5:6, 9:10), "a")
test(13.03, nafill(x, fill="a"), x_fill_a)
test(13.04, nafill(x, fill=1L), x_fill_a)
test(13.05, nafill(x, fill=1.0), x_fill_a)
test(13.06, nafill(x, fill=factor("a")), x_fill_a)
test(13.07, nafill(x, fill=factor("a", levels=levels(x))), x_fill_a)
test(13.08, nafill(x, fill=factor("a", levels=c("a", "b"))), x_fill_a)
test(13.09, nafill(x, fill=factor("a", levels=c("a", "d"))), factor(x_fill_a, levels=c("a", "b", "c", "d")))
test(13.05, nafill(x, fill="a"), x_fill_a)
test(13.06, nafill(x, fill=1L), x_fill_a)
test(13.07, nafill(x, fill=1.0), x_fill_a)
test(13.08, nafill(x, fill=factor("a")), x_fill_a)
test(13.09, nafill(x, fill=factor("a", levels=levels(x))), x_fill_a)
test(13.10, nafill(x, fill=factor("a", levels=c("a", "b"))), x_fill_a)
test(13.11, nafill(x, fill=factor("a", levels=c("a", "d"))), factor(x_fill_a, levels=c("a", "b", "c", "d")))
x_fill_d = replace(factor(x, levels = c(levels(x), "d")), c(1:2, 5:6, 9:10), "d")
test(13.10, nafill(x, fill="d"), x_fill_d)
test(13.11, nafill(x, fill=factor("d", levels=c("a", "b", "c", "d"))), x_fill_d)
test(13.12, nafill(x, fill=factor("d", levels=c("d", "a", "b", "c"))), x_fill_d)
test(13.13, nafill(x, fill=factor("d", levels=c("d", "c", "b", "a"))), x_fill_d)
test(13.14, nafill(x, fill=factor("d", levels=c("b", "c", "d"))), x_fill_d)
test(13.15, nafill(x, fill=NA), x)
test(13.16, nafill(x, fill=NA_integer_), x)
test(13.17, nafill(x, fill=NA_character_), x)
test(13.12, nafill(x, fill="d"), x_fill_d)
test(13.13, nafill(x, fill=factor("d", levels=c("a", "b", "c", "d"))), x_fill_d)
test(13.14, nafill(x, fill=factor("d", levels=c("d", "a", "b", "c"))), x_fill_d)
test(13.15, nafill(x, fill=factor("d", levels=c("d", "c", "b", "a"))), x_fill_d)
test(13.16, nafill(x, fill=factor("d", levels=c("b", "c", "d"))), x_fill_d)
test(13.17, nafill(x, fill=NA), x)
test(13.18, nafill(x, fill=NA_integer_), x)
test(13.19, nafill(x, fill=NA_real_), x)
test(13.20, nafill(x, fill=NA_character_), x)

## character input
x = c(NA, NA, "a", "b", NA, NA, "c","d", NA, NA)
test(14.01, nafill(x, fill="unknown"), c("unknown", "unknown", "a", "b", "unknown", "unknown", "c", "d", "unknown", "unknown"))
test(14.02, nafill(x, fill=NA), x)
test(14.03, nafill(x, "locf"), c(NA, NA, "a", "b", "b", "b", "c", "d", "d", "d"))
test(14.04, nafill(x, "nocb"), c("a", "a", "a", "b", "c", "c", "c", "d", NA, NA))
test(14.05, nafill(x, "locf", fill="unknown"), c("unknown", "unknown", "a", "b", "b", "b", "c", "d", "d", "d"))
test(14.06, nafill(x, "nocb", fill="unknown"), c("a", "a", "a", "b", "c", "c", "c", "d", "unknown", "unknown"))
test(14.07, nafill(x, fill=TRUE), c("TRUE", "TRUE", "a", "b", "TRUE", "TRUE", "c", "d", "TRUE", "TRUE"))
test(14.08, nafill(x, fill=1L), c("1", "1", "a", "b", "1", "1", "c", "d", "1", "1"))
test(14.09, nafill(x, fill=1.0), c("1", "1", "a", "b", "1", "1", "c", "d", "1", "1"))
test(14.10, nafill(x, fill=NA_integer_), x)
test(14.11, nafill(x, fill=NA_real_), x)
test(14.12, nafill(x, fill=NA_character_), x)
test(14.13, options=c(datatable.verbose=TRUE),
nafill(x, fill="z"), c("z", "z", "a", "b", "z", "z", "c", "d", "z", "z"),
output="nafillString: took")

## setnafill
DT = data.table(l1=c(NA, NA, TRUE, TRUE, NA, NA, FALSE, FALSE, NA, NA),
l2=c(NA, NA, FALSE, FALSE, NA, NA, TRUE, TRUE, NA, NA),
i1=c(NA, NA, 0:1, NA, NA, 2:3, NA, NA),
i2=c(NA, NA, 3:2, NA, NA, 1:0, NA, NA),
d1=c(NA, NA, 0.0, 1L, NA, NA, 2:3, NA, NA),
d2=c(NA, NA, 3.0, 2L, NA, NA, 1:0, NA, NA),
f1=as.factor(c(NA, NA, "a", "b", NA, NA, "b", "c", NA, NA)),
f2=as.factor(c(NA, NA, "c", "b", NA, NA, "b", "a", NA, NA)),
c1=c(NA, NA, "a", "b", NA, NA, "c", "d", NA, NA),
c2=c(NA, NA, "d", "c", NA, NA, "b", "a", NA, NA))
test(15.01, setnafill(copy(DT), fill=TRUE, cols='l1')$l1,
c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE))
test(15.02, setnafill(copy(DT), fill=TRUE, cols=c('l1', 'l2'))[, .(l1, l2)],
data.table(l1=c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE),
l2=c(TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)))
test(15.03, setnafill(copy(DT), fill=9L, cols='i1')$i1,
c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L))
test(15.04, setnafill(copy(DT), fill=9L, cols=c('i1', 'i2'))[, .(i1, i2)],
data.table(i1=c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L),
i2=c(9L, 9L, 3:2, 9L, 9L, 1:0, 9L, 9L)))
test(15.05, setnafill(copy(DT), fill=9.0, cols='d1')$d1,
c(9.0, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L))
test(15.06, setnafill(copy(DT), fill=9.0, cols=c('d1', 'd2'))[, .(d1, d2)],
data.table(d1=c(9.0, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L),
d2=c(9.0, 9L, 3:2, 9L, 9L, 1:0, 9L, 9L)))
test(15.07, setnafill(copy(DT), fill="a", cols='f1')$f1,
as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a")))
test(15.08, setnafill(copy(DT), fill="a", cols=c('f1', 'f2'))[, .(f1, f2)],
data.table(f1=as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a")),
f2=as.factor(c("a", "a", "c", "b", "a", "a", "b", "a", "a", "a"))))
test(15.09, setnafill(DT, fill="z", cols='c1'), error="not yet supported")
# test(15.10, setnafill(copy(DT), fill="z", cols=c('c1', 'c2'))[, .(c1, c2)],
# data.table(c1=c("z", "z", "a", "b", "z", "z", "c", "d", "z", "z"),
# c2=c("z", "z", "d", "c", "z", "z", "b", "a", "z", "z")))
test(15.11, setnafill(copy(DT), fill=list(TRUE, 9L, 9.0, "a"), cols=c("l1", "i1", "d1", "f1"))[, .(l1, i1, d1, f1)],
data.table(l1=c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE),
i1=c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L),
d1=c(9.0, 9L, 0L, 1L, 9L, 9L, 2:3, 9L, 9L),
f1=as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a"))))
test(15.12, setnafill(DT, cols=c("l1", "c1")), error="not yet supported")
DT = data.table(l=c(NA, FALSE), i=c(NA, 0L))
setnafill(DT, fill=list(TRUE, 1L))
test(15.13, DT, data.table(l=c(TRUE, FALSE), i=1:0))

## logical
## character
## factor
## Date
## POSIXct
## IDate
Expand Down
72 changes: 61 additions & 11 deletions src/nafill.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,31 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi
snprintf(ans->message[0], 500, _("%s: took %.3fs\n"), __func__, omp_get_wtime()-tic);
}

/*
 * Fill missing values (NA_STRING) of a character input, writing the result
 * into ans->char_v element by element with SET_STRING_ELT.
 *
 * x        read-only pointer to the nx string elements of the input
 * nx       number of elements in x; 0 is allowed and results in no writes
 * type     0: const - every NA is replaced by `fill`
 *          1: locf  - an NA takes the previous output element; a leading NA
 *                     takes `fill`
 *          2: nocb  - an NA takes the next output element; a trailing NA
 *                     takes `fill`
 *          any other value leaves ans->char_v untouched
 * fill     string element stored where no observation is available
 * ans      result holder; ans->char_v receives the filled values and
 *          ans->message[0] receives the timing line when verbose is true
 * verbose  when true, the elapsed time is written to ans->message[0]
 *
 * NOTE(review): ans->char_v is presumably a STRSXP of length >= nx allocated
 * by the caller - confirm against the call site.
 */
void nafillString(const SEXP *x, uint_fast64_t nx, unsigned int type, SEXP fill, ans_t *ans, bool verbose) {
  double tic=0.0;
  if (verbose)
    tic = omp_get_wtime();
  if (nx==0) {
    // empty input: nothing to fill. The locf/nocb branches below seed the
    // result from x[0] / x[nx-1], which do not exist here (and nx-1 would
    // wrap around because nx is unsigned), so they must be skipped.
  } else if (type==0) { // const
    for (uint_fast64_t i=0; i<nx; i++) {
      SET_STRING_ELT(ans->char_v, i, x[i]==NA_STRING ? fill : x[i]);
    }
  } else if (type==1) { // locf
    // seed the first element: there is no earlier observation to carry forward
    SET_STRING_ELT(ans->char_v, 0, x[0]==NA_STRING ? fill : x[0]);
    const SEXP* thisans = SEXPPTR_RO(ans->char_v); // takes out STRING_ELT from loop
    for (uint_fast64_t i=1; i<nx; i++) {
      // read the already filled previous output element, not x[i-1]
      SET_STRING_ELT(ans->char_v, i, x[i]==NA_STRING ? thisans[i-1] : x[i]);
    }
  } else if (type==2) { // nocb
    // seed the last element: there is no later observation to carry backward
    SET_STRING_ELT(ans->char_v, nx-1, x[nx-1]==NA_STRING ? fill : x[nx-1]);
    const SEXP* thisans = SEXPPTR_RO(ans->char_v); // takes out STRING_ELT from loop
    // signed index so the loop can terminate below zero; cast before the
    // subtraction so that nx==1 yields -1 instead of an unsigned wraparound
    for (int_fast64_t i=(int_fast64_t)nx-2; i>=0; i--) {
      SET_STRING_ELT(ans->char_v, i, x[i]==NA_STRING ? thisans[i+1] : x[i]);
    }
  }
  if (verbose)
    snprintf(ans->message[0], 500, _("%s: took %.3fs\n"), __func__, omp_get_wtime()-tic);
}

/*
OpenMP is being used here to parallelize the loop that fills missing values
over columns of the input data. This includes handling different data types
Expand All @@ -113,28 +138,31 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
if (obj_scalar) {
if (binplace)
error(_("'x' argument is atomic vector, in-place update is supported only for list/data.table"));
else if (!isReal(obj) && TYPEOF(obj) != INTSXP && !isLogical(obj))
error(_("'x' argument must be logical/numeric type, or list/data.table of logical/numeric types"));
else if (!isReal(obj) && TYPEOF(obj) != INTSXP && !isLogical(obj) && !isString(obj))
error(_("'x' argument (type %s) not supported."), type2char(TYPEOF(obj)));
SEXP obj1 = obj;
obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list
SET_VECTOR_ELT(obj, 0, obj1);
}
SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj)
x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++;
int *icols = INTEGER(ricols);
bool any_char = false;
for (int i=0; i<length(ricols); i++) {
SEXP this_col = VECTOR_ELT(obj, icols[i]-1);
if (!isReal(this_col) && TYPEOF(this_col) != INTSXP && !isLogical(this_col))
error(_("'x' argument must be logical/numeric type, or list/data.table of logical/numeric types"));
if (isString(this_col)) {
any_char = true;
} else if (!isReal(this_col) && TYPEOF(this_col) != INTSXP && !isLogical(this_col))
error(_("'x' argument (type %s) not supported."), type2char(TYPEOF(this_col)));
SET_VECTOR_ELT(x, i, this_col);
}
R_len_t nx = length(x);

double **dx = (double**)R_alloc(nx, sizeof(*dx));
int32_t **ix = (int32_t**)R_alloc(nx, sizeof(*ix));
const SEXP **sx = (const SEXP**)R_alloc(nx, sizeof(SEXP*));
int64_t **i64x = (int64_t**)R_alloc(nx, sizeof(*i64x));
uint_fast64_t *inx = (uint_fast64_t*)R_alloc(nx, sizeof(*inx));
SEXP ans = R_NilValue;
ans_t *vans = (ans_t *)R_alloc(nx, sizeof(*vans));
for (R_len_t i=0; i<nx; i++) {
const SEXP xi = VECTOR_ELT(x, i);
Expand All @@ -143,21 +171,40 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
if (isReal(xi)) {
dx[i] = REAL(xi);
i64x[i] = (int64_t *)REAL(xi);
ix[i] = NULL;
ix[i] = NULL; sx[i] = NULL;
} else if (isString(xi)) {
sx[i] = STRING_PTR_RO(xi);
ix[i] = NULL; dx[i] = NULL; i64x[i] = NULL;
} else {
ix[i] = INTEGER(xi);
dx[i] = NULL;
i64x[i] = NULL;
dx[i] = NULL; sx[i] = NULL; i64x[i] = NULL;
}
}
SEXP ans = R_NilValue;
if (!binplace) {
ans = PROTECT(allocVector(VECSXP, nx)); protecti++;
for (R_len_t i=0; i<nx; i++) {
SET_VECTOR_ELT(ans, i, allocVector(TYPEOF(VECTOR_ELT(x, i)), inx[i]));
const SEXP ansi = VECTOR_ELT(ans, i);
const void *p = isReal(ansi) ? (void *)REAL(ansi) : (void *)INTEGER(ansi);
vans[i] = ((ans_t) { .dbl_v=(double *)p, .int_v=(int *)p, .int64_v=(int64_t *)p, .status=0, .message={"\0","\0","\0","\0"} });
const void *p;
switch (TYPEOF(ansi)) {
case LGLSXP:
p = LOGICAL(ansi);
break;
case INTSXP:
p = INTEGER(ansi);
break;
case REALSXP:
p = REAL(ansi);
break;
default:
p = ansi;
break;
}
vans[i] = ((ans_t) { .dbl_v=(double *)p, .int_v=(int *)p, .int64_v=(int64_t *)p, .char_v=(SEXP)p, .status=0, .message={"\0","\0","\0","\0"} });
}
} else if (any_char) {
error(_("In-place filling of character columns is not yet supported."));
} else {
for (R_len_t i=0; i<nx; i++) {
vans[i] = ((ans_t) { .dbl_v=dx[i], .int_v=ix[i], .int64_v=i64x[i], .status=0, .message={"\0","\0","\0","\0"} });
Expand Down Expand Up @@ -200,7 +247,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
fillp[i] = SEXPPTR_RO(VECTOR_ELT(fill, i)); // do like this so we can use in parallel region
}
}
#pragma omp parallel for if (nx>1) num_threads(getDTthreads(nx, true))
#pragma omp parallel for if (nx>1 && !any_char) num_threads(getDTthreads(nx, true))
for (R_len_t i=0; i<nx; i++) {
switch (TYPEOF(VECTOR_ELT(x, i))) {
case REALSXP : {
Expand All @@ -213,6 +260,9 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
case LGLSXP: case INTSXP : {
nafillInteger(ix[i], inx[i], itype, hasFill ? ((int32_t *)fillp[i])[0] : NA_INTEGER, &vans[i], verbose);
} break;
case STRSXP : {
nafillString(sx[i], inx[i], itype, hasFill ? ((SEXP *)fillp[i])[0] : NA_STRING, &vans[i], verbose);
} break;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ typedef struct ans_t {
int32_t *int_v; // used in nafill
double *dbl_v; // used in froll, nafill
int64_t *int64_v; // used in nafill
//void *char_v; // to be used in nafill but then must escape parallelism
void *char_v; // ineligible for filling in parallel!
uint8_t status; // 0:ok, 1:message, 2:warning, 3:error; unix return signal: {0,1,2}=0, {3}=1
char message[4][ANS_MSG_SIZE]; // STDOUT: output, STDERR: message, warning, error
// implicit n_message limit discussed here: https://github.com/Rdatatable/data.table/issues/3423#issuecomment-487722586
Expand Down
Loading