From a688e95df06263fe2209b1fd21edbd56e33ec35d Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:45:26 +0000
Subject: [PATCH 1/8] c++ for normalize splits.

---
 .positai/settings.json |  4 ++++
 AGENTS.md              |  7 +++++++
 DESCRIPTION            |  2 +-
 NEWS.md                |  5 +++--
 R/RcppExports.R        |  4 ++++
 R/Support.R            | 28 ++++----------------------
 src/RcppExports.cpp    | 13 ++++++++++++
 src/splits.cpp         | 45 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/.positai/settings.json b/.positai/settings.json
index 510265efd..500757dea 100644
--- a/.positai/settings.json
+++ b/.positai/settings.json
@@ -11,5 +11,9 @@
     "executeCode": {
       "*": "allow"
     }
+  },
+  "model": {
+    "id": "claude-opus-4-6",
+    "provider": "positai"
   }
 }
\ No newline at end of file
diff --git a/AGENTS.md b/AGENTS.md
index cc81b9576..2f0146acc 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -27,3 +27,10 @@ of internal nodes for any topologically identical tree.
 
 Splits objects are defined in `as.Splits()`, and denote split membership as
 binary 0/1 in an underlying `raw` object.
+
+## Workflow requirements
+
+- After completing each optimization or user-visible change, update `NEWS.md`
+  before moving on to the next task.
+- Increment the `.900X` dev version suffix in `DESCRIPTION` with each
+  `NEWS.md` update.
diff --git a/DESCRIPTION b/DESCRIPTION
index 04beda37a..772b6ca22 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.1.0.9004
+Version: 2.1.0.9005
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
diff --git a/NEWS.md b/NEWS.md
index bf4bb119a..854812580 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
-# TreeTools 2.1.0.9004 (2026-03-12) #
+# TreeTools 2.1.0.9005 (2026-03-13) #
 
-- Rewrite popcount calculation for more efficient `TipsInSplits()`.
+- `SplitFrequency(reference = NULL)` split normalization moved to C++,
+  eliminating an R-level per-split loop.
 
 # TreeTools 2.1.0.9003 (2026-03-09) #
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 8117f6562..b5e6e8f72 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -137,6 +137,10 @@ cpp_count_splits <- function(edge, nTip) {
     .Call(`_TreeTools_cpp_count_splits`, edge, nTip)
 }
 
+normalize_splits <- function(splits, n_tip) {
+    .Call(`_TreeTools_normalize_splits`, splits, n_tip)
+}
+
 splits_to_edge <- function(splits, nTip) {
     .Call(`_TreeTools_splits_to_edge`, splits, nTip)
 }
diff --git a/R/Support.R b/R/Support.R
index b105f4167..e080f9cc2 100644
--- a/R/Support.R
+++ b/R/Support.R
@@ -58,30 +58,10 @@ SplitFrequency <- function(reference, forest = NULL) {
       return(structure(splits, nTip = nTip, tip.label = tipLabels,              # nocov
                        count = integer(), class = "Splits"))                    # nocov
     }
-    # The ClusterTable outputs clusters (clades); normalize so bit 0 (tip 1)
-    # is not in the set (matching as.Splits convention)
-    nTipMod <- nTip %% 8L
-    lastByteMask <- if (nTipMod == 0L) as.raw(0xff) else as.raw(bitwShiftL(1L, nTipMod) - 1L)
-    keep <- logical(nrow(splits))
-    for (i in seq_len(nrow(splits))) {
-      val <- splits[i, ]
-      # Count bits set (to filter trivial splits)
-      nBits <- sum(vapply(as.integer(val), function(b) sum(as.integer(intToBits(b))), integer(1)))
-      if (nBits < 2L || nBits > nTip - 2L) next # trivial split
-      # Normalize: if bit 0 is NOT set, complement to match as.Splits format
-      if (!as.logical(as.integer(val[1]) %% 2L)) {
-        for (j in seq_along(val)) {
-          splits[i, j] <- as.raw(bitwXor(as.integer(val[j]), 0xffL))
-        }
-        # Mask last byte
-        if (nTipMod > 0L) {
-          splits[i, nbin] <- as.raw(bitwAnd(as.integer(splits[i, nbin]),
-                                             as.integer(lastByteMask)))
-        }
-      }
-      keep[i] <- TRUE
-    }
-    splits <- splits[keep, , drop = FALSE]
+    # Normalize splits: ensure bit 0 is set, filter trivial splits
+    normalized <- normalize_splits(splits, nTip)
+    keep <- normalized[["keep"]]
+    splits <- normalized[["splits"]][keep, , drop = FALSE]
     counts <- counts[keep]
     ret <- structure(splits,
                      nTip = nTip,
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index ccf10bbeb..d08ceb813 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -426,6 +426,18 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// normalize_splits
+Rcpp::List normalize_splits(Rcpp::RawMatrix splits, const int n_tip);
+RcppExport SEXP _TreeTools_normalize_splits(SEXP splitsSEXP, SEXP n_tipSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< Rcpp::RawMatrix >::type splits(splitsSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    rcpp_result_gen = Rcpp::wrap(normalize_splits(splits, n_tip));
+    return rcpp_result_gen;
+END_RCPP
+}
 // splits_to_edge
 IntegerMatrix splits_to_edge(const RawMatrix splits, const IntegerVector nTip);
 RcppExport SEXP _TreeTools_splits_to_edge(SEXP splitsSEXP, SEXP nTipSEXP) {
@@ -517,6 +529,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeTools_pack_splits_logical", (DL_FUNC) &_TreeTools_pack_splits_logical, 1},
     {"_TreeTools_pack_splits_logical_vec", (DL_FUNC) &_TreeTools_pack_splits_logical_vec, 1},
     {"_TreeTools_cpp_count_splits", (DL_FUNC) &_TreeTools_cpp_count_splits, 2},
+    {"_TreeTools_normalize_splits", (DL_FUNC) &_TreeTools_normalize_splits, 2},
     {"_TreeTools_splits_to_edge", (DL_FUNC) &_TreeTools_splits_to_edge, 2},
     {"_TreeTools_tips_in_splits", (DL_FUNC) &_TreeTools_tips_in_splits, 1},
     {"_TreeTools_edge_to_rooted_shape", (DL_FUNC) &_TreeTools_edge_to_rooted_shape, 3},
diff --git a/src/splits.cpp b/src/splits.cpp
index 096dc2d70..2024caa37 100644
--- a/src/splits.cpp
+++ b/src/splits.cpp
@@ -714,3 +714,48 @@ int cpp_count_splits(const Rcpp::IntegerMatrix& edge, const int nTip) {
   
   return (n_internal - n_singles) - 1 - (is_rooted ? 1 : 0);
 }
+
+// Normalize splits from ClusterTable output for as.Splits convention:
+// - Filter trivial splits (fewer than 2 or more than nTip-2 bits set)
+// - Ensure bit 0 is set (complement if not)
+// - Mask unused trailing bits in last byte
+// Returns a List with "splits" (RawMatrix) and "keep" (LogicalVector)
+// [[Rcpp::export]]
+Rcpp::List normalize_splits(Rcpp::RawMatrix splits, const int n_tip) {
+  const int n_split = splits.nrow();
+  const int n_bin = splits.ncol();
+  const int n_spare = n_tip % BIN_SIZE;
+  const Rbyte last_mask = n_spare == 0
+    ? Rbyte(0xff)
+    : static_cast<Rbyte>((1 << n_spare) - 1);
+
+  Rcpp::LogicalVector keep(n_split, false);
+
+  for (int i = 0; i < n_split; ++i) {
+    // Count bits set
+    int n_bits = 0;
+    for (int j = 0; j < n_bin; ++j) {
+      n_bits += __builtin_popcount(static_cast<unsigned>(splits(i, j)));
+    }
+
+    if (n_bits < 2 || n_bits > n_tip - 2) continue; // trivial
+
+    // Normalize: if bit 0 is NOT set, complement
+    if (!(splits(i, 0) & Rbyte(1))) {
+      for (int j = 0; j < n_bin; ++j) {
+        splits(i, j) = static_cast<Rbyte>(~splits(i, j));
+      }
+      // Mask trailing bits in last byte
+      if (n_spare > 0) {
+        splits(i, n_bin - 1) &= last_mask;
+      }
+    }
+
+    keep[i] = true;
+  }
+
+  return Rcpp::List::create(
+    Rcpp::Named("splits") = splits,
+    Rcpp::Named("keep") = keep
+  );
+}

From 789a418456277e5f01da10038efbfd6d11559595 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:52:19 +0000
Subject: [PATCH 2/8]  hash map

---
 AGENTS.md                     |  1 +
 NEWS.md                       |  5 +++--
 src/consensus.cpp             | 24 ++++++++++++++----------
 tests/testthat/test-Support.R | 13 +++++++++++++
 4 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 2f0146acc..2b0ad10cd 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -34,3 +34,4 @@ binary 0/1 in an underlying `raw` object.
   before moving on to the next task.
 - Increment the `.900X` dev version suffix in `DESCRIPTION` with each
   `NEWS.md` update.
+- Check that existing tests cover all new code. (The GHA test suite uses codecov.)
diff --git a/NEWS.md b/NEWS.md
index 854812580..785605773 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,8 @@
 # TreeTools 2.1.0.9005 (2026-03-13) #
 
-- `SplitFrequency(reference = NULL)` split normalization moved to C++,
-  eliminating an R-level per-split loop.
+- `SplitFrequency(reference = NULL)` performance improvements:
+  split normalization moved to C++; internal split deduplication uses
+  hash map instead of ordered map.
 
 # TreeTools 2.1.0.9003 (2026-03-09) #
 
diff --git a/src/consensus.cpp b/src/consensus.cpp
index 93a3b02b6..e6585df11 100644
--- a/src/consensus.cpp
+++ b/src/consensus.cpp
@@ -6,7 +6,8 @@ using namespace Rcpp;
 
 #include <algorithm> /* for fill */
 #include <array> /* for array */
-#include <map> /* for map */
+#include <string> /* for string (hash key) */
+#include <unordered_map> /* for unordered_map */
 
 using TreeTools::ct_stack_threshold;
 using TreeTools::ct_max_leaves_heap;
@@ -178,12 +179,15 @@ List calc_split_frequencies(
 
   StackEntry *const S_start = S.data();
   
-  // Use a map to store unique splits and their counts
-  // Key: split bit pattern; Value: index in output
-  std::map<std::vector<Rbyte>, int32> split_map;
+  // Hash map for O(1) amortized split deduplication
+  std::unordered_map<std::string, int32> split_map;
+  split_map.reserve(ntip_3 * 2);
   std::vector<std::vector<Rbyte>> split_patterns;
   std::vector<int32> counts;
 
+  // Reusable key buffer — avoids per-split heap allocation
+  std::string key(nbin, '\0');
+
   for (int32 i = 0; i < n_trees; ++i) {
     if (tables[i].NOSWX(ntip_3)) {
       continue;
@@ -247,21 +251,21 @@ List calc_split_frequencies(
       const int32 end   = tables[i].X_right(k + 1);
       if (start == 0 && end == 0) continue; // No valid cluster at this position
       
-      // Build the bit pattern for this split
-      std::vector<Rbyte> pattern(nbin, 0);
+      // Build the bit pattern into the reusable key buffer
+      std::fill(key.begin(), key.end(), '\0');
       for (int32 j = start; j <= end; ++j) {
         const int32 leaf_idx = tables[i].DECODE(j) - 1; // 0-based
         const int32 byte_idx = leaf_idx >> 3;
         const int32 bit_idx  = leaf_idx & 7;
-        pattern[byte_idx] |= (Rbyte(1) << bit_idx);
+        key[byte_idx] |= static_cast<char>(1 << bit_idx);
       }
       
-      auto it = split_map.find(pattern);
+      auto it = split_map.find(key);
       if (it == split_map.end()) {
         // New split: record it with count from this reference tree
         const int32 idx = split_patterns.size();
-        split_map[pattern] = idx;
-        split_patterns.push_back(std::move(pattern));
+        split_map.emplace(key, idx);
+        split_patterns.emplace_back(key.begin(), key.end());
         counts.push_back(split_count[k]);
       }
       // If already found, the first reference tree that found it has the
diff --git a/tests/testthat/test-Support.R b/tests/testthat/test-Support.R
index 7b2495274..2fa0e3421 100644
--- a/tests/testthat/test-Support.R
+++ b/tests/testthat/test-Support.R
@@ -59,6 +59,19 @@ test_that("Node support colours consistent", {
                c("oor", "1", "34", "67", "101"))
 })
 
+test_that("normalize_splits() covers all branches", {
+  # n_spare == 0 (nTip multiple of 8): complement without trailing-bit mask
+  trees16 <- c(PectinateTree(16), PectinateTree(16))
+  freq16 <- SplitFrequency(trees16)
+  expect_equal(length(freq16), NSplits(trees16[[1]]))
+  expect_true(all(attr(freq16, "count") == 2L))
+
+  # Trivial splits are filtered: a star tree produces no non-trivial splits
+  star <- c(StarTree(10), StarTree(10))
+  freq_star <- SplitFrequency(star)
+  expect_equal(length(freq_star), 0)
+})
+
 test_that("SplitFrequency() handles four-split trees", {
   trees <- AddTipEverywhere(BalancedTree(3))
   trees <- c(trees[1], trees)

From 38ee89ac7947dd407e485ae8074232fc1ab920e2 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:56:06 +0000
Subject: [PATCH 3/8]      branches: ["*"]

---
 .github/workflows/R-CMD-check.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml
index b9472d538..85b28879f 100644
--- a/.github/workflows/R-CMD-check.yml
+++ b/.github/workflows/R-CMD-check.yml
@@ -21,9 +21,7 @@ on:
       - "**.R[dD]ata"
       - "**.Rpro*"
   pull_request:
-    branches:
-      - main
-      - master
+    branches: ["*"]
     paths-ignore:
       - "Meta**"
       - "memcheck**"

From 9968668e5115c284a5d14cd1aea287ce170ec493 Mon Sep 17 00:00:00 2001
From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:56:23 +0000
Subject: [PATCH 4/8]      branches: ["*"]

---
 .github/workflows/R-CMD-check.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml
index b9472d538..85b28879f 100644
--- a/.github/workflows/R-CMD-check.yml
+++ b/.github/workflows/R-CMD-check.yml
@@ -21,9 +21,7 @@ on:
       - "**.R[dD]ata"
       - "**.Rpro*"
   pull_request:
-    branches:
-      - main
-      - master
+    branches: ["*"]
     paths-ignore:
       - "Meta**"
       - "memcheck**"

From de0f4ae5c6d060f3973892940dadd8e22e264513 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 10:35:16 +0000
Subject: [PATCH 5/8] NodeDepth c++

---
 AGENTS.md           |   8 +++
 DESCRIPTION         |   2 +-
 NEWS.md             |   4 +-
 R/RcppExports.R     |   4 ++
 R/tree_properties.R |  74 +----------------------
 src/RcppExports.cpp |  15 +++++
 src/node_depth.cpp  | 141 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 175 insertions(+), 73 deletions(-)
 create mode 100644 src/node_depth.cpp

diff --git a/AGENTS.md b/AGENTS.md
index 2b0ad10cd..4b42e8220 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -35,3 +35,11 @@ binary 0/1 in an underlying `raw` object.
 - Increment the `.900X` dev version suffix in `DESCRIPTION` with each
   `NEWS.md` update.
 - Check that existing tests cover all new code. (The GHA test suite uses codecov.)
+
+## Optimization notes
+
+- `descendant_edges_single()`: the O(n_edge) linear scan per node looks
+  theoretically O(n²), but benchmarking (CSR index, vector-of-vectors)
+  showed the original is faster at all practical sizes (up to 50k tips)
+  due to cache-friendly sequential access over contiguous Rcpp memory.
+  Not worth optimizing.
diff --git a/DESCRIPTION b/DESCRIPTION
index 772b6ca22..f3c54e4dc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.1.0.9005
+Version: 2.1.0.9006
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
diff --git a/NEWS.md b/NEWS.md
index 785605773..bbc1ad01d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,8 +1,10 @@
-# TreeTools 2.1.0.9005 (2026-03-13) #
+# TreeTools 2.1.0.9006 (2026-03-13) #
 
 - `SplitFrequency(reference = NULL)` performance improvements:
   split normalization moved to C++; internal split deduplication uses
   hash map instead of ordered map.
+- `NodeDepth()` for unrooted trees rewritten as O(n) two-pass C++ algorithm,
+  replacing iterative R while-loop.
 
 # TreeTools 2.1.0.9003 (2026-03-09) #
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index b5e6e8f72..27743d54a 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -85,6 +85,10 @@ n_cherries_wrapper <- function(parent, child, nTip) {
     .Call(`_TreeTools_n_cherries_wrapper`, parent, child, nTip)
 }
 
+node_depth_unrooted <- function(parent, child, postorder, shortest) {
+    .Call(`_TreeTools_node_depth_unrooted`, parent, child, postorder, shortest)
+}
+
 path_lengths <- function(edge, weight, init_nas) {
     .Call(`_TreeTools_path_lengths`, edge, weight, init_nas)
 }
diff --git a/R/tree_properties.R b/R/tree_properties.R
index 004b8c724..feef5a7b9 100644
--- a/R/tree_properties.R
+++ b/R/tree_properties.R
@@ -79,81 +79,13 @@ NodeDepth.phylo <- function(x, shortest = FALSE, includeTips = TRUE) {
 
 #' @export
 NodeDepth.matrix <- function(x, shortest = FALSE, includeTips = TRUE) {
-
-
-  .NodeDepth.short <- function() {
-
-    depths <- c(leaf0s, vapply(minVertex:nVertex, function(node)
-      if (any(!is.na(leaf0s[child[parent == node]]))) 1L else NA_integer_
-      , 0L))
-    maxDepth <- 1L
-
-    while(any(is.na(depths))) {
-      for (node in rev(which(is.na(depths)))) {
-        incident <- c(depths[child[parent == node]],
-                      depths[parent[child == node]])
-        na <- is.na(incident)
-        nNa <- sum(na)
-        if (nNa == 0L) {
-          depths[node] <- min(incident) + 1L
-        } else if (nNa == 1L) {
-          aIncident <- incident[!na]
-          if (all(aIncident <= maxDepth)) {
-            depths[node] <- min(aIncident) + 1L
-          }
-        }
-      }
-      maxDepth <- maxDepth + 1L
-    }
-
-    #Return:
-    depths
-  }
-
-  .NodeDepth.long <- function() {
-
-    depths <- c(leaf0s, vapply(minVertex:nVertex, function(node)
-      if (any(is.na(leaf0s[child[parent == node]]))) NA_integer_ else 1L
-      , 0L))
-    maxDepth <- 1L
-
-    while(any(is.na(depths))) {
-      for (node in rev(which(is.na(depths)))) {
-        incident <- c(depths[child[parent == node]],
-                      depths[parent[child == node]])
-        na <- is.na(incident)
-        nNa <- sum(na)
-        if (nNa == 0L) {
-          depths[node] <- sort(incident, decreasing = TRUE)[2] + 1L
-        } else if (nNa == 1L) {
-          aIncident <- incident[!na]
-          if (all(aIncident <= maxDepth)) {
-            depths[node] <- max(aIncident) + 1L
-          }
-        }
-      }
-      maxDepth <- maxDepth + 1L
-    }
-
-    #Return:
-    depths
-  }
-
-
   parent <- x[, 1]
   child <- x[, 2]
-  minVertex <- min(parent)
-  nVertex <- max(parent)
-
-  nLeaf <- minVertex - 1L
-  nNode <- nVertex - nLeaf
-  leaf0s <- integer(nLeaf)
-
-  depths <- if (shortest) .NodeDepth.short() else .NodeDepth.long()
+  postorder <- PostorderOrder(x)
+  depths <- node_depth_unrooted(parent, child, postorder, shortest)
 
   # Return:
-  if (includeTips) depths else depths[minVertex:nVertex]
-
+  if (includeTips) depths else depths[(min(parent)):max(parent)]
 }
 
 .NodeDepth.rooted <- function(x, shortest = FALSE, includeTips = TRUE) {
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index d08ceb813..4cf5bd0f0 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -271,6 +271,20 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// node_depth_unrooted
+IntegerVector node_depth_unrooted(const IntegerVector parent, const IntegerVector child, const IntegerVector postorder, const bool shortest);
+RcppExport SEXP _TreeTools_node_depth_unrooted(SEXP parentSEXP, SEXP childSEXP, SEXP postorderSEXP, SEXP shortestSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const IntegerVector >::type parent(parentSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector >::type child(childSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector >::type postorder(postorderSEXP);
+    Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
+    rcpp_result_gen = Rcpp::wrap(node_depth_unrooted(parent, child, postorder, shortest));
+    return rcpp_result_gen;
+END_RCPP
+}
 // path_lengths
 NumericMatrix path_lengths(const IntegerMatrix edge, const DoubleVector weight, const LogicalVector init_nas);
 RcppExport SEXP _TreeTools_path_lengths(SEXP edgeSEXP, SEXP weightSEXP, SEXP init_nasSEXP) {
@@ -516,6 +530,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeTools_kept_vertices", (DL_FUNC) &_TreeTools_kept_vertices, 2},
     {"_TreeTools_minimum_spanning_tree", (DL_FUNC) &_TreeTools_minimum_spanning_tree, 1},
     {"_TreeTools_n_cherries_wrapper", (DL_FUNC) &_TreeTools_n_cherries_wrapper, 3},
+    {"_TreeTools_node_depth_unrooted", (DL_FUNC) &_TreeTools_node_depth_unrooted, 4},
     {"_TreeTools_path_lengths", (DL_FUNC) &_TreeTools_path_lengths, 3},
     {"_TreeTools_cpp_edge_to_splits", (DL_FUNC) &_TreeTools_cpp_edge_to_splits, 3},
     {"_TreeTools_duplicated_splits", (DL_FUNC) &_TreeTools_duplicated_splits, 2},
diff --git a/src/node_depth.cpp b/src/node_depth.cpp
new file mode 100644
index 000000000..2225c351b
--- /dev/null
+++ b/src/node_depth.cpp
@@ -0,0 +1,141 @@
+#include <Rcpp/Lightest>
+#include <algorithm> /* for min_element, max_element, sort */
+#include <vector>
+using namespace Rcpp;
+
+// Compute node depth for unrooted trees in O(n) via two-pass traversal.
+// depth[v] = second-largest neighbour distance (shortest=false)
+//         or smallest neighbour distance (shortest=true)
+// Tips have depth 0.
+// Returns depths for all vertices 1..max_node.
+// [[Rcpp::export]]
+IntegerVector node_depth_unrooted(
+    const IntegerVector parent,
+    const IntegerVector child,
+    const IntegerVector postorder, // 1-based edge indices in postorder
+    const bool shortest
+) {
+  const int n_edge = parent.size();
+  const int root = *std::min_element(parent.begin(), parent.end());
+  const int n_tip = root - 1;
+  const int max_node = *std::max_element(parent.begin(), parent.end());
+
+  // subtree_h[v]: max (or min) depth below v in the rooted representation
+  // Tips: 0.  Internal: func(children's subtree_h + 1)
+  std::vector<int> subtree_h(max_node + 1, 0);
+
+  // For longest path: store top-2 subtree heights per node
+  // For shortest path: store bottom-2
+  // best1[v] >= best2[v] (longest) or best1[v] <= best2[v] (shortest)
+  // best1_child[v] = which child contributed best1
+  const int INF = 1000000;
+  const int NEG_INF = -1;
+  std::vector<int> best1(max_node + 1, shortest ? INF : NEG_INF);
+  std::vector<int> best2(max_node + 1, shortest ? INF : NEG_INF);
+  std::vector<int> best1_child(max_node + 1, -1);
+
+  // Postorder: compute subtree_h and best1/best2
+  for (int i = 0; i < n_edge; ++i) {
+    const int e = postorder[i] - 1; // 0-based
+    const int p = parent[e];
+    const int c = child[e];
+    const int ch = subtree_h[c] + 1; // height through this child
+
+    if (shortest) {
+      if (ch <= best1[p]) {
+        best2[p] = best1[p];
+        best1[p] = ch;
+        best1_child[p] = c;
+      } else if (ch < best2[p]) {
+        best2[p] = ch;
+      }
+      subtree_h[p] = best1[p];
+    } else {
+      if (ch >= best1[p]) {
+        best2[p] = best1[p];
+        best1[p] = ch;
+        best1_child[p] = c;
+      } else if (ch > best2[p]) {
+        best2[p] = ch;
+      }
+      subtree_h[p] = best1[p];
+    }
+  }
+
+  // from_above[v]: longest (or shortest) path from v going through parent
+  std::vector<int> from_above(max_node + 1, NEG_INF);
+  // Root has no parent; from_above[root] stays at NEG_INF
+
+  // Preorder: compute from_above (reverse postorder)
+  for (int i = n_edge - 1; i >= 0; --i) {
+    const int e = postorder[i] - 1;
+    const int p = parent[e];
+    const int c = child[e];
+
+    // Best alternative at p excluding path through c:
+    // If c contributed best1[p], use best2[p]; otherwise use best1[p]
+    int best_sibling;
+    if (best1_child[p] == c) {
+      best_sibling = best2[p];
+    } else {
+      best_sibling = best1[p];
+    }
+
+    if (shortest) {
+      // from_above[c] = 1 + min(from_above[p], best_sibling)
+      if (from_above[p] == NEG_INF) {
+        // p is root (or from_above not set): only siblings matter
+        from_above[c] = (best_sibling < INF) ? 1 + best_sibling : INF;
+      } else {
+        from_above[c] = 1 + std::min(from_above[p], best_sibling);
+      }
+    } else {
+      if (from_above[p] == NEG_INF) {
+        from_above[c] = (best_sibling > NEG_INF) ? 1 + best_sibling : NEG_INF;
+      } else {
+        from_above[c] = 1 + std::max(from_above[p], best_sibling);
+      }
+    }
+  }
+
+  // Combine: depth[v] for internal nodes
+  IntegerVector depths(max_node, 0); // 1-indexed via [v-1]; tips stay 0
+
+  for (int v = n_tip + 1; v <= max_node; ++v) {
+    // Collect all neighbour distances
+    // Children contribute subtree_h[child] + 1 (already in best1/best2)
+    // Parent direction contributes from_above[v]
+
+    if (shortest) {
+      int d = best1[v]; // smallest child path
+      if (from_above[v] != NEG_INF && from_above[v] < d) {
+        d = from_above[v];
+      }
+      depths[v - 1] = d;
+    } else {
+      // Need second-largest among all neighbour distances
+      // Neighbours: best1[v], best2[v], from_above[v]
+      // (best1 >= best2 for longest)
+      // Sort descending and take [1] (second largest)
+      int a = best1[v];
+      int b = best2[v];
+      int c_val = from_above[v];
+      // Ensure we handle cases where from_above is undefined (root)
+      if (c_val == NEG_INF) {
+        // Root node: only children matter; second largest = best2
+        depths[v - 1] = (b > NEG_INF) ? b : a;
+      } else {
+        // Three candidates: a >= b, c_val unknown
+        if (c_val >= a) {
+          depths[v - 1] = a; // c_val >= a >= b, second = a
+        } else if (c_val >= b) {
+          depths[v - 1] = c_val; // a > c_val >= b, second = c_val
+        } else {
+          depths[v - 1] = b; // a >= b > c_val, second = b
+        }
+      }
+    }
+  }
+
+  return depths;
+}

From edc75123b6a9965cdf9cdc63e1de474d7a6779ca Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 10:59:50 +0000
Subject: [PATCH 6/8] Fix benchmark: do.call() broke NSE expression capture

_init.R: do.call(bench::mark, args) evaluates expressions before
passing them, so bench stored the evaluated result (e.g. a phylo
tree) instead of the call. Restored direct ... passthrough.

_compare_results.R: Use deparse1() instead of as.character(unlist())
for expression extraction (the latter decomposes call objects into
components). Also iterate by index to guard against NA names, and
use res$change instead of outer-scope percentage_change.
---
 benchmark/_compare_results.R | 25 ++++++++++++++++---------
 benchmark/_init.R            | 11 ++++++++---
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/benchmark/_compare_results.R b/benchmark/_compare_results.R
index b0aded68c..a719d1a02 100644
--- a/benchmark/_compare_results.R
+++ b/benchmark/_compare_results.R
@@ -24,8 +24,12 @@ for (pr_file in pr_files) {
   # Prepare a report
   report <- list()
   
+  # Use deparse1 for reliable expression-to-string conversion;
+  # as.character(unlist()) decomposes call objects into their components.
+  expr_names <- vapply(pr1[["expression"]], deparse1, "")
+  
   # Iterate over each function benchmarked
-  for (fn_name in unique(as.character(unlist(pr1[["expression"]])))) {
+  for (fn_name in unique(expr_names)) {
     pr1_times <-  as.numeric(pr1[["time"]][[1]])
     pr2_times <-  as.numeric(pr2[["time"]][[1]])
     pr_times <- if (rep_exists) c(pr1_times, pr2_times) else pr1_times
@@ -79,18 +83,21 @@ for (pr_file in pr_files) {
   # Create a markdown-formatted message
   has_significant_regression <- FALSE
   
-  for (fn_name in names(report)) {
-    res <- report[[fn_name]]
-    status <- if (res$matched) {
+  for (i in seq_along(report)) {
+    fn_name <- names(report)[[i]]
+    res <- report[[i]]
+    if (is.null(res) || is.null(res$matched)) next
+    
+    status <- if (isTRUE(res$matched)) {
       if (res$slower) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           has_significant_regression <- TRUE
           "\U1F7E0 Slower \U1F641"
         } else {
           "\U1F7E3 ~same"
         }
       } else if (res$faster) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           "\U1F7E2 Faster!"
         } else {
           "\U1F7E3 ~same"
@@ -111,14 +118,14 @@ for (pr_file in pr_files) {
       signif(res$median_pr   * 1e3, 3), ",  ",
       signif(res$median_cf   * 1e3, 3), " |\n"
     )
+    
+    cat(message)
+    output <- paste0(output, message)
   }
   
   if (has_significant_regression) {
     regressions <- TRUE
   }
-  
-  cat(message)
-  output <- paste0(output, message)
 }
 
 cat(paste0(output, "\nEOF"), file = Sys.getenv("GITHUB_OUTPUT"), append = TRUE)
diff --git a/benchmark/_init.R b/benchmark/_init.R
index 11af7c836..216945c98 100644
--- a/benchmark/_init.R
+++ b/benchmark/_init.R
@@ -1,9 +1,14 @@
 library("TreeTools")
 
 Benchmark <- function(..., min_iterations = NULL, min_time = NULL) {
-  args <- list(..., min_iterations = min_iterations %||% 3, time_unit = "us")
-  if (!is.null(min_time)) args[["min_time"]] <- min_time
-  result <- do.call(bench::mark, args)
+  # Pass ... directly to bench::mark to preserve non-standard evaluation;
+  # do.call() would evaluate expressions first, breaking expression capture.
+  result <- if (is.null(min_time)) {
+    bench::mark(..., min_iterations = min_iterations %||% 3, time_unit = "us")
+  } else {
+    bench::mark(..., min_iterations = min_iterations %||% 3,
+                min_time = min_time, time_unit = "us")
+  }
   if (interactive()) {
     print(result)
   } else {

From bf1473398eaa642a738cb07f004bf3f705a1bebe Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 11:00:55 +0000
Subject: [PATCH 7/8] sp

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 785605773..76e7bcb7b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,7 @@
 # TreeTools 2.1.0.9005 (2026-03-13) #
 
 - `SplitFrequency(reference = NULL)` performance improvements:
-  split normalization moved to C++; internal split deduplication uses
+  split normalization moved to C++; internal split de-duplication uses
   hash map instead of ordered map.
 
 # TreeTools 2.1.0.9003 (2026-03-09) #

From 54484ce1c5aa1b36f81705248193044911edf117 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 10:59:50 +0000
Subject: [PATCH 8/8] Fix benchmark: do.call() broke NSE expression capture

_init.R: do.call(bench::mark, args) evaluates expressions before
passing them, so bench stored the evaluated result (e.g. a phylo
tree) instead of the call. Restored direct ... passthrough.

_compare_results.R: Use deparse1() instead of as.character(unlist())
for expression extraction (the latter decomposes call objects into
components). Also iterate by index to guard against NA names, and
use res$change instead of outer-scope percentage_change.
---
 benchmark/_compare_results.R | 25 ++++++++++++++++---------
 benchmark/_init.R            | 11 ++++++++---
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/benchmark/_compare_results.R b/benchmark/_compare_results.R
index b0aded68c..a719d1a02 100644
--- a/benchmark/_compare_results.R
+++ b/benchmark/_compare_results.R
@@ -24,8 +24,12 @@ for (pr_file in pr_files) {
   # Prepare a report
   report <- list()
   
+  # Use deparse1 for reliable expression-to-string conversion;
+  # as.character(unlist()) decomposes call objects into their components.
+  expr_names <- vapply(pr1[["expression"]], deparse1, "")
+  
   # Iterate over each function benchmarked
-  for (fn_name in unique(as.character(unlist(pr1[["expression"]])))) {
+  for (fn_name in unique(expr_names)) {
     pr1_times <-  as.numeric(pr1[["time"]][[1]])
     pr2_times <-  as.numeric(pr2[["time"]][[1]])
     pr_times <- if (rep_exists) c(pr1_times, pr2_times) else pr1_times
@@ -79,18 +83,21 @@ for (pr_file in pr_files) {
   # Create a markdown-formatted message
   has_significant_regression <- FALSE
   
-  for (fn_name in names(report)) {
-    res <- report[[fn_name]]
-    status <- if (res$matched) {
+  for (i in seq_along(report)) {
+    fn_name <- names(report)[[i]]
+    res <- report[[i]]
+    if (is.null(res) || is.null(res$matched)) next
+    
+    status <- if (isTRUE(res$matched)) {
       if (res$slower) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           has_significant_regression <- TRUE
           "\U1F7E0 Slower \U1F641"
         } else {
           "\U1F7E3 ~same"
         }
       } else if (res$faster) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           "\U1F7E2 Faster!"
         } else {
           "\U1F7E3 ~same"
@@ -111,14 +118,14 @@ for (pr_file in pr_files) {
       signif(res$median_pr   * 1e3, 3), ",  ",
       signif(res$median_cf   * 1e3, 3), " |\n"
     )
+    
+    cat(message)
+    output <- paste0(output, message)
   }
   
   if (has_significant_regression) {
     regressions <- TRUE
   }
-  
-  cat(message)
-  output <- paste0(output, message)
 }
 
 cat(paste0(output, "\nEOF"), file = Sys.getenv("GITHUB_OUTPUT"), append = TRUE)
diff --git a/benchmark/_init.R b/benchmark/_init.R
index 11af7c836..216945c98 100644
--- a/benchmark/_init.R
+++ b/benchmark/_init.R
@@ -1,9 +1,14 @@
 library("TreeTools")
 
 Benchmark <- function(..., min_iterations = NULL, min_time = NULL) {
-  args <- list(..., min_iterations = min_iterations %||% 3, time_unit = "us")
-  if (!is.null(min_time)) args[["min_time"]] <- min_time
-  result <- do.call(bench::mark, args)
+  # Pass ... directly to bench::mark to preserve non-standard evaluation;
+  # do.call() would evaluate expressions first, breaking expression capture.
+  result <- if (is.null(min_time)) {
+    bench::mark(..., min_iterations = min_iterations %||% 3, time_unit = "us")
+  } else {
+    bench::mark(..., min_iterations = min_iterations %||% 3,
+                min_time = min_time, time_unit = "us")
+  }
   if (interactive()) {
     print(result)
   } else {