From ca5b8c99a56e9a376faadd6a73d6bbcc0f308f2d Mon Sep 17 00:00:00 2001 From: venom1204 Date: Mon, 16 Feb 2026 01:37:05 +0000 Subject: [PATCH 1/2] changes applied --- R/data.table.R | 6 ++++++ R/onLoad.R | 3 ++- R/utils.R | 10 ++++++++++ inst/tests/tests.Rraw | 9 +++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 85d623d392..cca9538749 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2940,6 +2940,12 @@ setnames = function(x,old,new,skip_absent=FALSE) { if (!length(new)) return(invisible(x)) # no changes if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov } + + # NEW: Check for duplicates using the centralized helper in utils.R + full_names = names(x) + full_names[i] = new + warn_if_duplicate_names(full_names) + # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) w = which(!is.na(m)) diff --git a/R/onLoad.R b/R/onLoad.R index b72fee4d1b..e5f54ca794 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -98,7 +98,8 @@ datatable.auto.index=TRUE, # DT[col=="val"] to auto add index so 2nd time faster datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 - datatable.old.matrix.autoname=FALSE # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.warn.duplicate.names=FALSE # ADD THIS LINE FOR ISSUE #4044 ) opts = opts[!names(opts) %chin% names(options())] options(opts) diff --git a/R/utils.R b/R/utils.R index 9d89f6f0a4..83e1ef3a3b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -35,6 +35,16 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) { table_name, brackify(duplicate_names), domain=NA) } +warn_if_duplicate_names = function(names_vec) { + # Use FALSE as the second argument so it defaults to OFF if not set + if 
(isTRUE(getOption("datatable.warn.duplicate.names", FALSE))) { + if (anyDuplicated(names_vec)) { + dups = unique(names_vec[duplicated(names_vec)]) + warningf("Duplicate column names created: %s. This may cause ambiguity in future operations.", brackify(dups)) + } + } +} + duplicated_values = function(x) { # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case if (!anyDuplicated(x)) return(vector(typeof(x))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f30467dae7..99a433f651 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21515,3 +21515,12 @@ test(2365.1, melt(df_melt, id.vars=1:2), melt(dt_melt, id.vars=1:2)) df_dcast = data.frame(a = c("x", "y"), b = 1:2, v = 3:4) dt_dcast = data.table(a = c("x", "y"), b = 1:2, v = 3:4) test(2365.2, dcast(df_dcast, a ~ b, value.var = "v"), dcast(dt_dcast, a ~ b, value.var = "v")) + +DT = as.data.table(iris) +options(datatable.warn.duplicate.names = FALSE) +test(2366.1,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")) +options(datatable.warn.duplicate.names = TRUE) +test(2366.2,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"),warning = "Duplicate column names created") +test(2366.3,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "New.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "New.Length", "Petal.Width", "Species")) +options(datatable.warn.duplicate.names = FALSE) +test(2366.4,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")) From 86993168f24bb5465da8a326df444199f1418657 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Wed, 25 Feb 2026 08:50:25 +0000 Subject: [PATCH 2/2] added news and doc --- NEWS.md | 2 ++ R/data.table.R | 6 
++---- R/onLoad.R | 2 +- R/utils.R | 28 ++++++++++++++++++++++------ inst/tests/tests.Rraw | 16 +++++++++------- man/data.table-options.Rd | 6 ++++++ 6 files changed, 42 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index fd0ee8bf16..6f80c2b410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 4. `dcast()` and `melt()` "just work" when passed a data.frame, not just data.tables, with no need for coercion, [#7614](https://github.com/Rdatatable/data.table/issues/7614). Thanks @MichaelChirico for the suggestion and @manmita for the PR. Note that to avoid potential conflicts with {reshape2}'s data.frame methods, we do the dispatch to the data.table method manually. +5. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/data.table.R b/R/data.table.R index cca9538749..e2a5731e1b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2941,18 +2941,16 @@ setnames = function(x,old,new,skip_absent=FALSE) { if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov } - # NEW: Check for duplicates using the centralized helper in utils.R full_names = names(x) full_names[i] = new - warn_if_duplicate_names(full_names) + full_names = process_name_policy(full_names) + new = full_names[i] - # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) w = which(!is.na(m)) if (length(w)) .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w]) - # update secondary keys idx = attr(x, "index", exact=TRUE) for (k in names(attributes(idx))) { tt = strsplit(k,split="__")[[1L]][-1L] diff --git a/R/onLoad.R b/R/onLoad.R index e5f54ca794..ba404305b4 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -99,7 +99,7 @@ datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change - datatable.warn.duplicate.names=FALSE # ADD THIS LINE FOR ISSUE #4044 + datatable.unique.names = "off" ) opts = opts[!names(opts) %chin% names(options())] options(opts) diff --git a/R/utils.R b/R/utils.R index 83e1ef3a3b..e161074497 100644 --- a/R/utils.R +++ b/R/utils.R @@ -35,14 +35,32 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) { table_name, brackify(duplicate_names), domain=NA) } -warn_if_duplicate_names = function(names_vec) { - # Use FALSE as the second argument so it defaults to OFF if not set - if (isTRUE(getOption("datatable.warn.duplicate.names", FALSE))) { - if (anyDuplicated(names_vec)) { - dups = unique(names_vec[duplicated(names_vec)]) - warningf("Duplicate column names created: %s. 
This may cause ambiguity in future operations.", brackify(dups)) +# Apply the 'datatable.unique.names' option to a candidate vector of column names. +# "off" (or unset) and "warn" return names_vec as-is (after a warning for "warn"), "error" stops, "rename" returns make.unique(names_vec); any other setting warns and is treated as "off". +process_name_policy = function(names_vec) { + policy = getOption("datatable.unique.names", "off") + + if (is.null(policy) || identical(policy, "off")) return(names_vec) + + allowed = c("warn", "error", "rename") + if (!is.character(policy) || length(policy) != 1L || !policy %in% allowed) { + warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", toString(policy)) + return(names_vec) + } + + if (anyDuplicated(names_vec)) { + dups = brackify(unique(names_vec[duplicated(names_vec)])) + # hand the format to warningf()/stopf() directly so that a '%' inside a column name is never interpreted as a format specification + + if (policy == "warn") { + warningf("Duplicate column names created: %s. This may cause ambiguity.", dups) + } else if (policy == "error") { + stopf("Duplicate column names created: %s. This may cause ambiguity.", dups) + } else if (policy == "rename") { + return(make.unique(names_vec)) } } + return(names_vec) } duplicated_values = function(x) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 99a433f651..f481334d37 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21516,11 +21516,16 @@ df_dcast = data.frame(a = c("x", "y"), b = 1:2, v = 3:4) dt_dcast = data.table(a = c("x", "y"), b = 1:2, v = 3:4) test(2365.2, dcast(df_dcast, a ~ b, value.var = "v"), dcast(dt_dcast, a ~ b, value.var = "v")) +#4044 DT = as.data.table(iris) -options(datatable.warn.duplicate.names = FALSE) -test(2366.1,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")) -options(datatable.warn.duplicate.names = TRUE) -test(2366.2,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"),warning = "Duplicate column names created") -test(2366.3,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "New.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "New.Length", "Petal.Width", "Species")) -options(datatable.warn.duplicate.names = FALSE) 
-test(2366.4,names({ tmp = copy(DT); setnames(tmp, "Petal.Length", "Sepal.Length"); tmp }),c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")) +old = options(datatable.unique.names = "off") +test(2366.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species")) +options(datatable.unique.names = "warn") +test(2366.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), warning = "Duplicate column names created") +options(datatable.unique.names = "error") +test(2366.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), error = "Duplicate column names created") +options(datatable.unique.names = "rename") +test(2366.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species")) +options(datatable.unique.names = "warn") +test(2366.5, names(setnames(data.table("a%b"=1, x=2), "x", "a%b")), c("a%b", "a%b"), warning = "Duplicate column names created") +options(old) diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd index 439e88ef2f..4bb477af64 100644 --- a/man/data.table-options.Rd +++ b/man/data.table-options.Rd @@ -105,6 +105,12 @@ \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. If set to a function (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then be "enlisted" into columns in the result.} + \item{\code{datatable.unique.names}}{A character string, default \code{"off"}. + Controls the behavior when operations (like \code{setnames}) would result in + duplicate column names. Can be \code{"off"} (silently allow duplicates), + \code{"warn"} (issue a warning), \code{"error"} (halt with an error), + or \code{"rename"} (automatically fix duplicates using \code{make.unique}). + Invalid values will trigger a warning and fall back to \code{"off"}.} } }