diff --git a/AGENTS.md b/AGENTS.md index 2b0ad10c..4b42e822 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,3 +35,11 @@ binary 0/1 in an underlying `raw` object. - Increment the `.900X` dev version suffix in `DESCRIPTION` with each `NEWS.md` update. - Check that existing tests cover all new code. (The GHA test suite uses codecov.) + +## Optimization notes + +- `descendant_edges_single()`: the O(n_edge) linear scan per node looks + theoretically O(n²), but benchmarking (CSR index, vector-of-vectors) + showed the original is faster at all practical sizes (up to 50k tips) + due to cache-friendly sequential access over contiguous Rcpp memory. + Not worth optimizing. diff --git a/DESCRIPTION b/DESCRIPTION index 772b6ca2..f3c54e4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeTools Title: Create, Modify and Analyse Phylogenetic Trees -Version: 2.1.0.9005 +Version: 2.1.0.9006 Authors@R: c( person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), email = "martin.smith@durham.ac.uk", diff --git a/NEWS.md b/NEWS.md index 76e7bcb7..4f6ce9cc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# TreeTools 2.1.0.9006 (2026-03-13) # + +- `NodeDepth()` for unrooted trees rewritten as O(n) two-pass C++ algorithm, + replacing iterative R while-loop. + # TreeTools 2.1.0.9005 (2026-03-13) # - `SplitFrequency(reference = NULL)` performance improvements: diff --git a/R/RcppExports.R b/R/RcppExports.R index b5e6e8f7..27743d54 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -85,6 +85,10 @@ n_cherries_wrapper <- function(parent, child, nTip) { .Call(`_TreeTools_n_cherries_wrapper`, parent, child, nTip) } +node_depth_unrooted <- function(parent, child, postorder, shortest) { + .Call(`_TreeTools_node_depth_unrooted`, parent, child, postorder, shortest) +} + path_lengths <- function(edge, weight, init_nas) { .Call(`_TreeTools_path_lengths`, edge, weight, init_nas) } diff --git a/R/tree_properties.R b/R/tree_properties.R index 004b8c72..feef5a7b 100644 --- a/R/tree_properties.R +++ b/R/tree_properties.R @@ -79,81 +79,13 @@ NodeDepth.phylo <- function(x, shortest = FALSE, includeTips = TRUE) { #' @export NodeDepth.matrix <- function(x, shortest = FALSE, includeTips = TRUE) { - - - .NodeDepth.short <- function() { - - depths <- c(leaf0s, vapply(minVertex:nVertex, function(node) - if (any(!is.na(leaf0s[child[parent == node]]))) 1L else NA_integer_ - , 0L)) - maxDepth <- 1L - - while(any(is.na(depths))) { - for (node in rev(which(is.na(depths)))) { - incident <- c(depths[child[parent == node]], - depths[parent[child == node]]) - na <- is.na(incident) - nNa <- sum(na) - if (nNa == 0L) { - depths[node] <- min(incident) + 1L - } else if (nNa == 1L) { - aIncident <- incident[!na] - if (all(aIncident <= maxDepth)) { - depths[node] <- min(aIncident) + 1L - } - } - } - maxDepth <- maxDepth + 1L - } - - #Return: - depths - } - - .NodeDepth.long <- function() { - - depths <- c(leaf0s, vapply(minVertex:nVertex, function(node) - if (any(is.na(leaf0s[child[parent == node]]))) NA_integer_ else 1L - , 0L)) - maxDepth <- 1L - - while(any(is.na(depths))) { - for (node in rev(which(is.na(depths)))) { - incident <- c(depths[child[parent == node]], - depths[parent[child == node]]) - na <- is.na(incident) - nNa <- sum(na) - if (nNa == 0L) { - depths[node] <- sort(incident, decreasing = TRUE)[2] + 1L - } else if (nNa == 1L) { - aIncident <- incident[!na] - if (all(aIncident <= maxDepth)) { - depths[node] <- max(aIncident) + 1L - } - } - } - maxDepth <- maxDepth + 1L - } - - #Return: - depths - } - - parent <- x[, 1] child <- x[, 2] - minVertex <- min(parent) - nVertex <- max(parent) - - nLeaf <- minVertex - 1L - nNode <- nVertex - nLeaf - leaf0s <- integer(nLeaf) - - depths <- if (shortest) .NodeDepth.short() else .NodeDepth.long() + postorder <- PostorderOrder(x) + depths <- node_depth_unrooted(parent, child, postorder, shortest) # Return: - if (includeTips) depths else depths[minVertex:nVertex] - + if (includeTips) depths else depths[(min(parent)):max(parent)] } .NodeDepth.rooted <- function(x, shortest = FALSE, includeTips = TRUE) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d08ceb81..4cf5bd0f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -271,6 +271,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// node_depth_unrooted +IntegerVector node_depth_unrooted(const IntegerVector parent, const IntegerVector child, const IntegerVector postorder, const bool shortest); +RcppExport SEXP _TreeTools_node_depth_unrooted(SEXP parentSEXP, SEXP childSEXP, SEXP postorderSEXP, SEXP shortestSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const IntegerVector >::type parent(parentSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type child(childSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type postorder(postorderSEXP); + Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); + rcpp_result_gen = Rcpp::wrap(node_depth_unrooted(parent, child, postorder, shortest)); + return rcpp_result_gen; +END_RCPP +} // path_lengths NumericMatrix path_lengths(const IntegerMatrix edge, const DoubleVector weight, const LogicalVector init_nas); RcppExport SEXP _TreeTools_path_lengths(SEXP edgeSEXP, SEXP weightSEXP, SEXP init_nasSEXP) { @@ -516,6 +530,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeTools_kept_vertices", (DL_FUNC) &_TreeTools_kept_vertices, 2}, {"_TreeTools_minimum_spanning_tree", (DL_FUNC) &_TreeTools_minimum_spanning_tree, 1}, {"_TreeTools_n_cherries_wrapper", (DL_FUNC) &_TreeTools_n_cherries_wrapper, 3}, + {"_TreeTools_node_depth_unrooted", (DL_FUNC) &_TreeTools_node_depth_unrooted, 4}, {"_TreeTools_path_lengths", (DL_FUNC) &_TreeTools_path_lengths, 3}, {"_TreeTools_cpp_edge_to_splits", (DL_FUNC) &_TreeTools_cpp_edge_to_splits, 3}, {"_TreeTools_duplicated_splits", (DL_FUNC) &_TreeTools_duplicated_splits, 2}, diff --git a/src/node_depth.cpp b/src/node_depth.cpp new file mode 100644 index 00000000..2225c351 --- /dev/null +++ b/src/node_depth.cpp @@ -0,0 +1,141 @@ +#include +#include /* for min_element, max_element, sort */ +#include +using namespace Rcpp; + +// Compute node depth for unrooted trees in O(n) via two-pass traversal. +// depth[v] = second-largest neighbour distance (shortest=false) +// or smallest neighbour distance (shortest=true) +// Tips have depth 0. +// Returns depths for all vertices 1..max_node. +// [[Rcpp::export]] +IntegerVector node_depth_unrooted( + const IntegerVector parent, + const IntegerVector child, + const IntegerVector postorder, // 1-based edge indices in postorder + const bool shortest +) { + const int n_edge = parent.size(); + const int root = *std::min_element(parent.begin(), parent.end()); + const int n_tip = root - 1; + const int max_node = *std::max_element(parent.begin(), parent.end()); + + // subtree_h[v]: max (or min) depth below v in the rooted representation + // Tips: 0. Internal: func(children's subtree_h + 1) + std::vector subtree_h(max_node + 1, 0); + + // For longest path: store top-2 subtree heights per node + // For shortest path: store bottom-2 + // best1[v] >= best2[v] (longest) or best1[v] <= best2[v] (shortest) + // best1_child[v] = which child contributed best1 + const int INF = 1000000; + const int NEG_INF = -1; + std::vector best1(max_node + 1, shortest ? INF : NEG_INF); + std::vector best2(max_node + 1, shortest ? INF : NEG_INF); + std::vector best1_child(max_node + 1, -1); + + // Postorder: compute subtree_h and best1/best2 + for (int i = 0; i < n_edge; ++i) { + const int e = postorder[i] - 1; // 0-based + const int p = parent[e]; + const int c = child[e]; + const int ch = subtree_h[c] + 1; // height through this child + + if (shortest) { + if (ch <= best1[p]) { + best2[p] = best1[p]; + best1[p] = ch; + best1_child[p] = c; + } else if (ch < best2[p]) { + best2[p] = ch; + } + subtree_h[p] = best1[p]; + } else { + if (ch >= best1[p]) { + best2[p] = best1[p]; + best1[p] = ch; + best1_child[p] = c; + } else if (ch > best2[p]) { + best2[p] = ch; + } + subtree_h[p] = best1[p]; + } + } + + // from_above[v]: longest (or shortest) path from v going through parent + std::vector from_above(max_node + 1, NEG_INF); + // Root has no parent; from_above[root] stays at NEG_INF + + // Preorder: compute from_above (reverse postorder) + for (int i = n_edge - 1; i >= 0; --i) { + const int e = postorder[i] - 1; + const int p = parent[e]; + const int c = child[e]; + + // Best alternative at p excluding path through c: + // If c contributed best1[p], use best2[p]; otherwise use best1[p] + int best_sibling; + if (best1_child[p] == c) { + best_sibling = best2[p]; + } else { + best_sibling = best1[p]; + } + + if (shortest) { + // from_above[c] = 1 + min(from_above[p], best_sibling) + if (from_above[p] == NEG_INF) { + // p is root (or from_above not set): only siblings matter + from_above[c] = (best_sibling < INF) ? 1 + best_sibling : INF; + } else { + from_above[c] = 1 + std::min(from_above[p], best_sibling); + } + } else { + if (from_above[p] == NEG_INF) { + from_above[c] = (best_sibling > NEG_INF) ? 1 + best_sibling : NEG_INF; + } else { + from_above[c] = 1 + std::max(from_above[p], best_sibling); + } + } + } + + // Combine: depth[v] for internal nodes + IntegerVector depths(max_node, 0); // 1-indexed via [v-1]; tips stay 0 + + for (int v = n_tip + 1; v <= max_node; ++v) { + // Collect all neighbour distances + // Children contribute subtree_h[child] + 1 (already in best1/best2) + // Parent direction contributes from_above[v] + + if (shortest) { + int d = best1[v]; // smallest child path + if (from_above[v] != NEG_INF && from_above[v] < d) { + d = from_above[v]; + } + depths[v - 1] = d; + } else { + // Need second-largest among all neighbour distances + // Neighbours: best1[v], best2[v], from_above[v] + // (best1 >= best2 for longest) + // Sort descending and take [1] (second largest) + int a = best1[v]; + int b = best2[v]; + int c_val = from_above[v]; + // Ensure we handle cases where from_above is undefined (root) + if (c_val == NEG_INF) { + // Root node: only children matter; second largest = best2 + depths[v - 1] = (b > NEG_INF) ? b : a; + } else { + // Three candidates: a >= b, c_val unknown + if (c_val >= a) { + depths[v - 1] = a; // c_val >= a >= b, second = a + } else if (c_val >= b) { + depths[v - 1] = c_val; // a > c_val >= b, second = c_val + } else { + depths[v - 1] = b; // a >= b > c_val, second = b + } + } + } + } + + return depths; +}