diff --git a/NEWS.md b/NEWS.md index 7cf63f0b7..5f1bee817 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 4. `dcast()` and `melt()` "just work" when passed a data.frame, not just data.tables, with no need for coercion, [#7614](https://github.com/Rdatatable/data.table/issues/7614). Thanks @MichaelChirico for the suggestion and @manmita for the PR. Note that to avoid potential conflicts with {reshape2}'s data.frame methods, we do the dispatch to the data.table method manually. +5. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `depth=1L`, [#2606](https://github.com/Rdatatable/data.table/issues/2606). Thanks @MichaelChirico for the report and @manmita for the PR + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. diff --git a/R/tables.R b/R/tables.R index 960c74343..7c805a6ed 100644 --- a/R/tables.R +++ b/R/tables.R @@ -19,7 +19,8 @@ type_size = function(DT) { } tables = function(mb=type_size, order.col="NAME", width=80L, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE, + depth=0L) { # Prints name, size and colnames of all data.tables in the calling environment by default mb_name = as.character(substitute(mb)) @@ -27,11 +28,78 @@ tables = function(mb=type_size, order.col="NAME", width=80L, names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .) obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197 w = which(vapply_1b(obj, is.data.table)) - if (!length(w)) { - if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) - return(invisible(data.table(NULL))) + + info = NULL + # we check if depth=1L is requested and add found tables to w + if (depth==1L) { + is_list = vapply_1b(obj, is.list) + is_df = vapply_1b(obj, is.data.frame) + is_dt = vapply_1b(obj, is.data.table) + # list_index is a index of list which is not data.frame or data.table + list_index = which(is_list & !is_dt & !is_df) + # obj_list is a list of lists of data.tables found inside lists + obj_list = vector("list", length(list_index)) + # make a list of size list_index and add wl in it + total_dt = 0L + # filling obj_list and counting total_dt + for (i in seq_along(list_index)) { + L = obj[[list_index[i]]] + wl = which(vapply_1b(L, is.data.table)) + total_dt = total_dt + length(wl) + obj_list[[i]] = L[wl] + } + name_count = length(w) + total_dt + # initialize info data.table with total number of data.tables found + if (name_count==0L) { + # nocov start. Requires long-running test case + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) + # nocov end + return(invisible(data.table(NULL))) + } + # create info data.table with total rows equal to number of data.tables found + info = data.table(NAME=character(name_count), NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) + # fill in the names of data.tables found in w + for (i in seq_along(w)) { # names of w items + set(info, i, "NAME", names[w[i]]) + } + # now fill in the data.tables found inside lists + cnt = 1L + if (total_dt>0L) { + for (i in seq_along(list_index)) { + if (length(obj_list[[i]])==0L) next + # get the parent list name + parent_name = names[list_index[i]] + for (j in seq_along(obj_list[[i]])) { + elem_names = names(obj[[list_index[i]]]) + if (!is.null(elem_names) && nzchar(elem_names[j])) { + new_name = paste0(parent_name, "$", elem_names[j]) + } else { + new_name = paste0(parent_name, "[[", j, "]]") + } + DT = obj_list[[i]][[j]] + k = cnt + length(w) # row number in info data.table + cnt = cnt + 1L + set(info, k, "NAME", new_name) + set(info, k, "NROW", nrow(DT)) + set(info, k, "NCOL", ncol(DT)) + if (is.function(mb)) set(info, k, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2 + if (!is.null(tt<-names(DT))) set(info, k, "COLS", tt) # TODO: don't need these if()s when #5526 is done + if (!is.null(tt<-key(DT))) set(info, k, "KEY", tt) + if (index && !is.null(tt<-indices(DT))) set(info, k, "INDICES", tt) + } + } + } + } else if (depth==0L) { + # if depth is 0 + if (!length(w)) { + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) + return(invisible(data.table(NULL))) + } + info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) + } else { + # for depth greater than 1L,recursion is not implemented yet + stopf("depth>1L is not implemented yet") } - info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list()) for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables DT = obj[[w[i]]] set(info, i, "NROW", nrow(DT)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 508bf6aa0..95423470e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21520,3 +21520,31 @@ test(2365.1, melt(df_melt, id.vars=1:2), melt(dt_melt, id.vars=1:2)) df_dcast = data.frame(a = c("x", "y"), b = 1:2, v = 3:4) dt_dcast = data.table(a = c("x", "y"), b = 1:2, v = 3:4) test(2365.2, dcast(df_dcast, a ~ b, value.var = "v"), dcast(dt_dcast, a ~ b, value.var = "v")) + +#2606 tables() depth=1 finds nested data.tables in lists +# creating env so that the names are within it +xenv = new.env() +xenv$DT = data.table(a=1L) +xenv$L = list(data.table(a=1, b=4:6), data.table(a=2, b=7:10)) +xenv$LL = list(list(data.table(a=1L, b=4:6))) +xenv$M = list(b=data.table(a=1, b=4:6), a=1:5) +xenv$N = list(a=1:5) +test(2366.1, tables(env=xenv, depth=1L)[, .(NAME, NROW, NCOL)], + data.table( + NAME = c("DT", "L[[1]]", "L[[2]]", "M$b"), + NROW = c(1L, 3L, 4L, 3L), + NCOL = c(1L, 2L, 2L, 2L) +)) +setindex(xenv$M$b, b) +test(2366.2, tables(env=xenv, depth=1L, index=TRUE)$INDICES, list(NULL, NULL, NULL, "b")) +setkey(xenv$M$b, a) +test(2366.3, tables(env=xenv, depth=1L, index=TRUE)$KEY, list(NULL, NULL, NULL, "a")) +# test for depth>1L +test(2366.4, tables(env=xenv, depth=2L), error="depth>1L is not implemented yet") +rm(xenv) + +# no data.table test +xenv_empty = new.env() +test(2366.5, tables(env=xenv_empty, depth=1L), invisible(data.table(NULL))) +test(2366.6, tables(env=xenv_empty), invisible(data.table(NULL))) +rm(xenv_empty) diff --git a/man/tables.Rd b/man/tables.Rd index 5898cd730..9a7d308c4 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -6,7 +6,7 @@ } \usage{ tables(mb=type_size, order.col="NAME", width=80, - env=parent.frame(), silent=FALSE, index=FALSE) + env=parent.frame(), silent=FALSE, index=FALSE, depth=0L) } \arguments{ \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. } @@ -15,6 +15,7 @@ tables(mb=type_size, order.col="NAME", width=80, \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. } \item{silent}{ \code{logical}; should the output be printed? } \item{index}{ \code{logical}; if \code{TRUE}, the column \code{INDICES} is added to indicate the indices assorted with each object, see \code{\link{indices}}. } + \item{depth}{\code{integer}, default \code{0L}. Larger values govern the depth of search in recursive objects like lists or environments. For example, \code{depth=1L} will find all data.tables in the global environment as well as all data.tables in lists (but in lists of lists). NB: \code{depth > 1L} is not yet supported.} } \details{ Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt.