diff --git a/NEWS.md b/NEWS.md index c02d0cc4e..d87656c9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -52,6 +52,8 @@ 9. Grouping operations on empty (0-row, 0-column) data.tables work as intended, [#7749](https://github.com/Rdatatable/data.table/issues/7749). Thanks @rickhelmus for the report and @MichaelChirico for the fix. +10. `setkey()`, `setindex()`, `CJ()`, and `setnames()` now prevent creating ambiguous keys from duplicated column names, and keyed joins now error on existing duplicated key columns rather than silently giving incorrect results, [#4888](https://github.com/Rdatatable/data.table/issues/4888) and [#4891](https://github.com/Rdatatable/data.table/issues/4891). Thanks @magerton and @MichaelChirico for the reports, and @ben-schwen for the fix. + ### Notes 1. {data.table} now depends on R 3.5.0 (2018). diff --git a/R/data.table.R b/R/data.table.R index a989538b1..fac54b06d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -841,9 +841,10 @@ replace_dot_alias = function(e) { if (is.data.frame(i)) { if (missing(on)) { - if (!haskey(x)) { - stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") - } + if (haskey(x)) check_duplicate_key(x) + else stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.") + + if (haskey(i)) check_duplicate_key(i) } else if (identical(substitute(on), as.name(".NATURAL"))) { naturaljoin = TRUE } @@ -1871,10 +1872,10 @@ replace_dot_alias = function(e) { } shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x)) if (is.null(irows) && !is.null(shared_keys)) { - setattr(jval, 'sorted', shared_keys) + if (!any(shared_keys %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', shared_keys) # potentially inefficient backup -- check if jval is sorted by key(x) } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) { - setattr(jval, 'sorted', key(x)) + if (!any(key(x) %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', key(x)) } if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov } @@ -2188,7 +2189,8 @@ replace_dot_alias = function(e) { setkeyv(ans,names(ans)[seq_along(byval)]) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) { - setattr(ans, "sorted", names(ans)[seq_along(grpcols)]) + grpnames = names(ans)[seq_along(grpcols)] + if (!any(grpnames %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", grpnames) } setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } @@ -2610,6 +2612,7 @@ subset.data.table = function(x, subset, select, ...) } else { setkey(ans,NULL) } + if (haskey(ans) && any(key(ans) %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", NULL) ans } @@ -2946,6 +2949,15 @@ setnames = function(x,old,new,skip_absent=FALSE) { # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) w = which(!is.na(m)) + if (haskey(x)) { + check_duplicate_key(x) + k = key(x) + if (length(w)) k[m[w]] = new[w] + newnames = names(x) + newnames[i] = new + duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(newnames)])) + if (length(duplicate_key)) stopf("The new names would result in duplicated key columns: %s", brackify(duplicate_key)) + } if (length(w)) .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w]) diff --git a/R/setkey.R b/R/setkey.R index afaf06293..14aa5b9b8 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -44,6 +44,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU cols = gsub("`", "", cols, fixed = TRUE) miss = !(cols %chin% colnames(x)) if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]), class = "dt_missing_column_error") + if (anyDuplicated(cols)) stopf("cols contains duplicate column names: %s", brackify(duplicated_values(cols))) + if (any(cols %chin% (dups <- duplicated_values(names(x))))) + stopf("x has duplicated column names in the columns to key by: %s", brackify(cols[cols %chin% dups])) if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387 ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. @@ -309,6 +312,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) l = list(...) vnames = name_dots(...)$vnames if (any(tt <- !nzchar(vnames))) vnames[tt] = paste0("V", which(tt)) + if (sorted && anyDuplicated(vnames)) stopf("CJ() cannot create a keyed data.table with duplicated column names: %s", brackify(duplicated_values(vnames))) dups = FALSE # fix for #1513 for (i in seq_along(l)) { y = l[[i]] diff --git a/R/utils.R b/R/utils.R index 9d89f6f0a..27df81183 100644 --- a/R/utils.R +++ b/R/utils.R @@ -35,6 +35,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) { table_name, brackify(duplicate_names), domain=NA) } +check_duplicate_key = function(x) { + k = key(x) + duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(names(x))])) + if (length(duplicate_key)) + stopf(ngettext(length(duplicate_key), + "%s has duplicated key column %s. Please remove or rename the duplicate and try again.", + "%s has duplicated key columns %s. Please remove or rename the duplicates and try again."), + deparse(substitute(x)), brackify(duplicate_key), domain=NA) + invisible() +} + duplicated_values = function(x) { # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case if (!anyDuplicated(x)) return(vector(typeof(x))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 483638099..59c2218e6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2405,7 +2405,8 @@ b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID') test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) -setnames(DT2,"b","a") +DT2 = data.table(a=letters[1:5], a=6L) +setattr(DT2, "sorted", "a") test(828, setcolorder(DT2,c("a","b")), error="x has duplicated column name [a]. Please remove or rename") test(829, merge(DT1,DT2), error="y has duplicated column name [a]. Please remove or rename") test(830, merge(DT2,DT1), error="x has duplicated column name [a]. Please remove or rename") @@ -12364,9 +12365,9 @@ children = data.table(parent=c("Sarah", "Max", "Max"), sex=c("M", "M", "F"), age=c(5,8,7)) joined = merge(parents, children, by.x="name", by.y="parent") test(1880.1, length(names(joined)), length(unique(names(joined)))) -test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L, +test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""), sort=FALSE)), 3L, warning = "column names.*are duplicated in the result") -joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE)) +joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE, sort=FALSE)) test(1880.3, anyDuplicated(names(joined)) > 0L, TRUE) # out-of-sample quote rule bump, #2265 @@ -12384,7 +12385,7 @@ unlink(f) test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good) test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291) v = seq_len(1291L) -test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") +test(1882.3, CJ(v, v, v, sorted=FALSE), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") # no re-read for particular file, #2509 if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),], @@ -13599,7 +13600,7 @@ test(1967.26, foverlaps(x, y, by.x = c('start', 'END')), error = "Elements listed in 'by.x' must be valid names") test(1967.27, foverlaps(x, y, by.x = c('start', 'start')), error = 'Duplicate columns are not allowed') -setkey(y, start, start) +setattr(y, 'sorted', c('start', 'start')) test(1967.28, foverlaps(x, y, by.y = c('start', 'start')), error = 'Duplicate columns are not allowed') setkey(y, start, end) @@ -13693,7 +13694,7 @@ test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOu # test 1967.69 subsumes 1967.69 and 1967.70 test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") -setkey(x) +setattr(x, 'sorted', c('a', 'a')) test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") @@ -16102,8 +16103,9 @@ test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a", set.seed(2) ids = sample(letters, 10) # reduced from 20 to 10 dates = 1:10 # and 40 to 10 to save ram, #5517 -dt = data.table(CJ(dates, ids, ids)) +dt = data.table(CJ(dates, ids, ids, sorted=FALSE)) setnames(dt, c("date", "id1", "id2")) +setorder(dt, date, id1, id2) dt[, value := rnorm(length(date))] dt = dt[!(date == 1 & (id1 == "a" | id2 == "a"))] dt = dt[!(date == 4 & (id1 == "e" | id2 == "e"))] @@ -21620,3 +21622,22 @@ test(2373.1, empty_dt[, 1, by = numeric()], data.table(numeric=numeric(), V1=num test(2373.2, empty_dt[, 1L, by = numeric()], data.table(numeric=numeric(), V1=integer())) test(2373.3, empty_dt[, .(x=1), by = .(b=numeric())], data.table(b=numeric(), x=numeric())) test(2373.4, data.table()[, x := 1, by=numeric()], data.table(x=numeric())) + +# prohibit duplicated column names as key columns, #4888 and #4891 +DT = data.table(a=1:3, a=4:6) +test(2374.01, setkey(DT, a, a), error="cols contains duplicate column names") +test(2374.02, setindex(DT, a, a), error="cols contains duplicate column names") +test(2374.03, setkey(DT, a), error="duplicated column names in the columns to key by") +DT = data.table(a=1:2, b=3:4, key=c("a", "b")) +test(2374.04, setnames(DT, "b", "a"), error="duplicated key columns") +test(2374.05, key(DT), c("a", "b")) +test(2374.06, CJ(a=1:3, a=4:5), error="duplicated column names") +DT1 = CJ(a=1:3, b=c('a','b'), c=6) +DT2 = data.table(a=rep(1:3, each=2L), a=rep(4:5, 3L), d=6) +setattr(DT2, "sorted", c("a", "a", "d")) +test(2374.07, DT1[DT2], error="i has duplicated key column") +DT = data.table(a=1:2, b=3:4, key="a") +test(2374.08, key(DT[, .(a, a)]), NULL) +test(2374.09, key(subset(DT, select=c(a, a))), NULL) +DT = data.table(a=1:2, a.1=3:4, val=10:11) +test(2374.10, key(DT[, .(a.1, sum(val)), keyby=.(a, a)]), NULL)