diff --git a/NAMESPACE b/NAMESPACE index 8381a14a73..7d51d3450c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%") export(timetaken) -export(truelength, setalloccol, alloc.col, ":=", let) +export(truelength, setalloccol, setallocrow, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) export(setorder, setorderv) export(setNumericRounding, getNumericRounding) diff --git a/R/data.table.R b/R/data.table.R index a989538b14..5c1bb6ed8c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2850,6 +2850,10 @@ setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbos ans } +setallocrow = function(DT, n=-1L) { + invisible(.Call(Callocrowwrapper, DT, as.integer(n))) +} + selfrefok = function(DT,verbose=getOption("datatable.verbose")) { .Call(Cselfrefokwrapper,DT,verbose) } diff --git a/man/truelength.Rd b/man/truelength.Rd index a85f78b1b6..4520ccae73 100644 --- a/man/truelength.Rd +++ b/man/truelength.Rd @@ -2,6 +2,7 @@ \alias{truelength} \alias{setalloccol} \alias{alloc.col} +\alias{setallocrow} \title{ Over-allocation access } \description{ These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias to \code{setalloccol}. We recommend to use \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place. 
@@ -14,11 +15,14 @@ setalloccol(DT, alloc.col(DT, n = getOption("datatable.alloccol"), # default: 1024L verbose = getOption("datatable.verbose")) # default: FALSE +setallocrow(DT, n = -1L) } \arguments{ \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. } \item{DT}{ A \code{data.table}. } -\item{n}{ The number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. } +\item{n}{ For \code{setalloccol} and \code{alloc.col}: the number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. + + For \code{setallocrow}: the total number of rows to allocate. If \code{n >= 0}, allocates capacity for \code{n} rows in total, or for the current number of rows when \code{n} is smaller than that (existing rows are never dropped). If \code{n == -1} (default), shrinks columns to exact current size to free excess memory. } \item{verbose}{ Output status and information. } } \details{ @@ -34,6 +38,9 @@ alloc.col(DT, (perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}. Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy. + + \code{setallocrow} prepares columns for fast row operations (delete or insert) by reference. (Note that 'insert' by reference is not yet implemented.) + Before such operations, columns must be resizable: ALTREP columns are materialized and all columns are made resizable (this might trigger reallocation). } \value{ \code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. 
Currently, it is just the list vector of column @@ -43,6 +50,8 @@ alloc.col(DT, \code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to going to add a lot of columns in a loop. It also returns the new \code{DT}, for convenience in compound queries. + + \code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable. Note that, unlike typical by-reference operations, the underlying memory of a column vector is reallocated (and therefore relocated) whenever that column is not already resizable with the requested capacity. This means the memory addresses of the column vectors themselves may change, even though the \code{data.table} object \code{DT} is modified in place. } \seealso{ \code{\link{copy}} } \examples{ diff --git a/src/assign.c b/src/assign.c index 1b474072c9..edd856dabc 100644 --- a/src/assign.c +++ b/src/assign.c @@ -592,7 +592,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"), i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols)); } - thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below. + thisvalue = copyAsPlain(thisvalue, -1); // PROTECT not needed as assigned as element to protected list below. } else { if (verbose) Rprintf(_("Direct plonk of unnamed RHS, no copy. MAYBE_REFERENCED==%d, MAYBE_SHARED==%d\n"), MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue)); // e.g. DT[,a:=as.character(a)] as tested by 754.5 } diff --git a/src/coalesce.c b/src/coalesce.c index 10b7b77576..cd07581093 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -52,7 +52,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { error(_("Item %d is length %d but the first item is length %d. 
Only singletons are recycled."), i+2, length(item), nrow); } if (!inplace) { - first = PROTECT(copyAsPlain(first)); nprotect++; + first = PROTECT(copyAsPlain(first, -1)); nprotect++; if (verbose) Rprintf(_("coalesce copied first item (inplace=FALSE)\n")); } const void **valP = (const void **)R_alloc(nval, sizeof(*valP)); diff --git a/src/data.table.h b/src/data.table.h index e7ccc55d38..b970272b00 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -330,7 +330,8 @@ bool allNA(SEXP x, bool errorForBadType); SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); void copyVectorElements(SEXP dst, SEXP src, R_xlen_t n, bool deep_copy, const char *caller); -SEXP copyAsPlain(SEXP x); +SEXP copyAsPlain(SEXP x, R_xlen_t overalloc); +SEXP allocrow(SEXP dt, R_xlen_t n); void copySharedColumns(SEXP x); SEXP lock(SEXP x); SEXP unlock(SEXP x); @@ -406,6 +407,7 @@ SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP copy(SEXP); SEXP setdt_nrows(SEXP); SEXP alloccolwrapper(SEXP, SEXP, SEXP); +SEXP allocrowwrapper(SEXP, SEXP); SEXP selfrefokwrapper(SEXP, SEXP); SEXP truelength(SEXP); SEXP setcharvec(SEXP, SEXP, SEXP); diff --git a/src/dogroups.c b/src/dogroups.c index 06dfe84bec..00480c9f99 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -346,7 +346,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX target = VECTOR_ELT(dt, colj); bool copied = false; if (isNewList(target) && anySpecialStatic(RHS, specials)) { // see comments in anySpecialStatic() - RHS = PROTECT(copyAsPlain(RHS)); + RHS = PROTECT(copyAsPlain(RHS, -1)); copied = true; } const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, ""); @@ -452,7 +452,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } bool copied = false; if (isNewList(target) && anySpecialStatic(source, specials)) { // see comments in anySpecialStatic() - source = PROTECT(copyAsPlain(source)); + 
source = PROTECT(copyAsPlain(source, -1)); copied = true; } memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, ""); diff --git a/src/fmelt.c b/src/fmelt.c index 287ba4d0d4..7db6b09916 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -811,7 +811,7 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP // edge case no measure.vars if (!data.lmax) { SEXP tt = PROTECT(shallowwrapper(DT, data.idcols)); protecti++; - ans = PROTECT(copyAsPlain(tt)); protecti++; + ans = PROTECT(copyAsPlain(tt, -1)); protecti++; } else { ansvals = PROTECT(getvaluecols(DT, dtnames, LOGICAL(valfactor)[0], verbose, &data)); protecti++; ansvars = PROTECT(getvarcols(DT, dtnames, LOGICAL(varfactor)[0], verbose, &data)); protecti++; diff --git a/src/init.c b/src/init.c index 13421998b4..23136f6036 100644 --- a/src/init.c +++ b/src/init.c @@ -95,6 +95,7 @@ static const R_CallMethodDef callMethods[] = { {"CconvertNegAndZeroIdx", (DL_FUNC)&convertNegAndZeroIdx, -1}, {"Cfrank", (DL_FUNC)&frank, -1}, {"Cdt_na", (DL_FUNC)&dt_na, -1}, + {"Callocrowwrapper", (DL_FUNC)&allocrowwrapper, 2}, {"Clookup", (DL_FUNC)&lookup, -1}, {"Coverlaps", (DL_FUNC)&overlaps, -1}, {"Cwhichwrapper", (DL_FUNC)&whichwrapper, -1}, diff --git a/src/reorder.c b/src/reorder.c index 8fe682e861..61a65b20dc 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -24,7 +24,7 @@ SEXP reorder(SEXP x, SEXP order) error(_("Column %d is length %d which differs from length of column 1 (%d). 
Invalid data.table."), i+1, length(v), nrow); if (RTYPE_SIZEOF(v) > maxSize) maxSize=RTYPE_SIZEOF(v); - if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v)); + if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v, -1)); } copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { @@ -40,7 +40,7 @@ SEXP reorder(SEXP x, SEXP order) if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order)); // # notranslate int nprotect = 0; - if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand + if (ALTREP(order)) { order=PROTECT(copyAsPlain(order, -1)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand const int *restrict idx = INTEGER_RO(order); int i=0; diff --git a/src/subset.c b/src/subset.c index d1381223b6..ea6a402daf 100644 --- a/src/subset.c +++ b/src/subset.c @@ -313,7 +313,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md for (int i=0; i= 0: resizable vector with capacity = length(x) + overalloc if (isNull(x)) { // deal with up front because isNewList(R_NilValue) is true @@ -262,7 +266,14 @@ SEXP copyAsPlain(SEXP x) { return duplicate(x); } const int64_t n = XLENGTH(x); - SEXP ans = PROTECT(allocVector(TYPEOF(x), n)); + SEXP ans; + if (overalloc == -1) { + ans = PROTECT(allocVector(TYPEOF(x), n)); + } else { + const R_xlen_t capacity = n + overalloc; + ans = PROTECT(R_allocResizableVector(TYPEOF(x), capacity)); + R_resizeVector(ans, n); + } // aside: unlike R's duplicate we do not copy truelength here; important for dogroups.c which uses negative truelenth to mark its specials if (ALTREP(ans)) internal_error(__func__, "copyAsPlain returning ALTREP for type '%s'", type2char(TYPEOF(x))); // # nocov @@ -277,6 +288,60 @@ SEXP copyAsPlain(SEXP x) { return ans; } +SEXP 
allocrow(SEXP dt, R_xlen_t n) { + if (!INHERITS(dt, char_datatable)) + error(_("input to allocrow is not a data.table")); // #nocov + + if (n < -1) + error(_("n must be >= -1 in allocrow")); // #nocov + + if (!xlength(dt)) return dt; // zero-column data.table + + const bool verbose = GetVerbose(); + int n_modified = 0; + + for (R_xlen_t i = 0; i < length(dt); i++) { + SEXP col = VECTOR_ELT(dt, i); + if (!isVector(col)) + error(_("Cannot make non-vector column %lld resizable"), (long long)(i + 1)); // #nocov + + const R_xlen_t currentLength = length(col); + const R_xlen_t currentCapacity = R_isResizable(col) ? R_maxLength(col) : currentLength; + // n == -1: shrink to exact size; n >= 0: set total capacity to n + const R_xlen_t targetCapacity = n < 0 ? currentLength : n; + const R_xlen_t overalloc = targetCapacity > currentLength ? targetCapacity - currentLength : 0; + + // Only reallocate if not resizable, or capacity differs from target + if (!R_isResizable(col) || currentCapacity != targetCapacity) { + SEXP newcol = PROTECT(copyAsPlain(col, overalloc)); + SET_VECTOR_ELT(dt, i, newcol); + UNPROTECT(1); + n_modified++; + } + } + + if (verbose) { + if (n_modified > 0) { + const R_xlen_t nrow = length(VECTOR_ELT(dt, 0)); + if (n > nrow) { + Rprintf(Pl_(n_modified, + "Modified %d column (allocated %lld rows total)\n", + "Modified %d columns (allocated %lld rows total)\n"), + n_modified, (long long)n); + } else { + Rprintf(Pl_(n_modified, + "Modified %d column (shrunk to exact size)\n", + "Modified %d columns (shrunk to exact size)\n"), + n_modified); + } + } else { + Rprintf(_("allocrow had no effect, all columns already at target size\n")); + } + } + + return dt; +} + void copySharedColumns(SEXP x) { const int ncol = length(x); if (!isNewList(x) || ncol==1) return; @@ -301,7 +366,7 @@ void copySharedColumns(SEXP x) { if (nShared) { for (int i=0; i= -1 and non-NA")); // #nocov + return allocrow(dt, (R_xlen_t)INTEGER(n)[0]); +} + SEXP dim(SEXP x) { // fast 
implementation of dim.data.table