diff --git a/NEWS.md b/NEWS.md index 7cf63f0b7..85c1792d9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 4. `dcast()` and `melt()` "just work" when passed a data.frame, not just data.tables, with no need for coercion, [#7614](https://github.com/Rdatatable/data.table/issues/7614). Thanks @MichaelChirico for the suggestion and @manmita for the PR. Note that to avoid potential conflicts with {reshape2}'s data.frame methods, we do the dispatch to the data.table method manually. +5. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/data.table.R b/R/data.table.R
index a989538b1..c6240eb5d 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -2943,13 +2943,23 @@ setnames = function(x,old,new,skip_absent=FALSE) {
     if (!length(new)) return(invisible(x)) # no changes
     if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov
   }
+
+  # Apply the 'datatable.unique.names' policy (#4044) to the names as they will be after renaming.
+  # Untouched columns are placed first so that under "rename" make.unique() puts the suffix on the
+  # columns being renamed here: only full_names[i] is kept, so a suffix on any other column is lost.
+  full_names = names(x)
+  full_names[i] = new
+  ord = c(setdiff(seq_along(full_names), i), i)
+  full_names[ord] = process_name_policy(full_names[ord])
+  new = full_names[i]
+
   # update the key if the column name being change is in the key
   m = chmatch(names(x)[i], key(x))
   w = which(!is.na(m))
   if (length(w))
     .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w])
 
   # update secondary keys
   idx = attr(x, "index", exact=TRUE)
   for (k in names(attributes(idx))) {
     tt = strsplit(k,split="__")[[1L]][-1L]
diff --git a/R/onLoad.R b/R/onLoad.R
index b72fee4d1..ba404305b 100644
--- a/R/onLoad.R
+++ b/R/onLoad.R
@@ -98,7 +98,8 @@
     datatable.auto.index=TRUE, # DT[col=="val"] to auto add index so 2nd time faster
     datatable.use.index=TRUE, # global switch to address #1422
     datatable.prettyprint.char=NULL, # FR #1091
-    datatable.old.matrix.autoname=FALSE # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+    datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+    datatable.unique.names="off" # #4044: policy for duplicate column names: "off", "warn", "error" or "rename"
   )
   opts = opts[!names(opts) %chin% names(options())]
   options(opts)
diff --git a/R/utils.R b/R/utils.R
index 9d89f6f0a..e16107449 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -35,6 +35,32 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
     table_name, brackify(duplicate_names), domain=NA)
 }
 
+# Enforce the 'datatable.unique.names' option (#4044) on a vector of column names.
+#   names_vec: character vector of column names to check for duplicates.
+# Returns names_vec as is under "off" and "warn" (the latter warns about duplicates), stops under
+# "error", and returns make.unique(names_vec) under "rename". A NULL option is treated as "off";
+# any other unsupported value warns and is treated as "off".
+process_name_policy = function(names_vec) {
+  policy = getOption("datatable.unique.names", "off")
+  if (is.null(policy)) return(names_vec)
+
+  # validate the shape first: a non-character, non-scalar or NA value would make if() itself error
+  allowed = c("off", "warn", "error", "rename")
+  if (!is.character(policy) || length(policy) != 1L || is.na(policy) || !policy %chin% allowed) {
+    warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", toString(policy))
+    return(names_vec)
+  }
+  if (policy == "off" || !anyDuplicated(names_vec)) return(names_vec)
+
+  if (policy == "rename") return(make.unique(names_vec))
+  dups = unique(names_vec[duplicated(names_vec)])
+  # pass the names as an argument to a literal format: a column name containing '%' must not be
+  # read as a conversion specification, and a literal format string stays translatable
+  if (policy == "error") stopf("Duplicate column names created: %s. This may cause ambiguity.", brackify(dups))
+  warningf("Duplicate column names created: %s. This may cause ambiguity.", brackify(dups))
+  names_vec
+}
+
 duplicated_values = function(x) {
   # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
   if (!anyDuplicated(x)) return(vector(typeof(x)))
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 508bf6aa0..13d26d22d 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21520,3 +21520,17 @@ test(2365.1, melt(df_melt, id.vars=1:2), melt(dt_melt, id.vars=1:2))
 df_dcast = data.frame(a = c("x", "y"), b = 1:2, v = 3:4)
 dt_dcast = data.table(a = c("x", "y"), b = 1:2, v = 3:4)
 test(2365.2, dcast(df_dcast, a ~ b, value.var = "v"), dcast(dt_dcast, a ~ b, value.var = "v"))
+
+# setnames() honours the datatable.unique.names policy, #4044; options= keeps the setting local to each test
+DT = as.data.table(iris)
+dup_names = c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")
+test(2366.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), dup_names, options=c(datatable.unique.names="off"))
+test(2366.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), dup_names, warning="Duplicate column names created", options=c(datatable.unique.names="warn"))
+test(2366.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), error="Duplicate column names created", options=c(datatable.unique.names="error"))
+test(2366.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), options=c(datatable.unique.names="rename"))
+# the column being renamed takes the suffix even when it precedes the existing column of that name
+test(2366.5, names(setnames(copy(DT), "Sepal.Length", "Petal.Length")), c("Petal.Length.1", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"), options=c(datatable.unique.names="rename"))
+# an unsupported option value warns and behaves like "off"
+test(2366.6, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), dup_names, warning="Invalid value for 'datatable.unique.names'", options=c(datatable.unique.names="bogus"))
+# a '%' in the duplicated name is reported verbatim rather than treated as a format specification
+test(2366.7, names(setnames(data.table(`a%d`=1, b=2), "b", "a%d")), c("a%d", "a%d"), warning="Duplicate column names created", options=c(datatable.unique.names="warn"))
diff --git a/man/data.table-options.Rd
b/man/data.table-options.Rd index 439e88ef2..4bb477af6 100644 --- a/man/data.table-options.Rd +++ b/man/data.table-options.Rd @@ -105,6 +105,12 @@ \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. If set to a function (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then be "enlisted" into columns in the result.} + \item{\code{datatable.unique.names}}{A character string, default \code{"off"}. + Controls the behavior when operations (like \code{setnames}) would result in + duplicate column names. Can be \code{"off"} (silently allow duplicates), + \code{"warn"} (issue a warning), \code{"error"} (halt with an error), + or \code{"rename"} (automatically fix duplicates using \code{make.unique}). + Invalid values will trigger a warning and fall back to \code{"off"}.} } }