From a688e95df06263fe2209b1fd21edbd56e33ec35d Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:45:26 +0000
Subject: [PATCH 1/2] c++ for normalize splits.

---
 .positai/settings.json |  4 ++++
 AGENTS.md              |  7 +++++++
 DESCRIPTION            |  2 +-
 NEWS.md                |  5 +++--
 R/RcppExports.R        |  4 ++++
 R/Support.R            | 28 ++++----------------------
 src/RcppExports.cpp    | 13 ++++++++++++
 src/splits.cpp         | 45 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/.positai/settings.json b/.positai/settings.json
index 510265efd..500757dea 100644
--- a/.positai/settings.json
+++ b/.positai/settings.json
@@ -11,5 +11,9 @@
     "executeCode": {
       "*": "allow"
     }
+  },
+  "model": {
+    "id": "claude-opus-4-6",
+    "provider": "positai"
   }
 }
\ No newline at end of file
diff --git a/AGENTS.md b/AGENTS.md
index cc81b9576..2f0146acc 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -27,3 +27,10 @@ of internal nodes for any topologically identical tree.
 
 Splits objects are defined in `as.Splits()`, and denote split membership as
 binary 0/1 in an underlying `raw` object.
+
+## Workflow requirements
+
+- After completing each optimization or user-visible change, update `NEWS.md`
+  before moving on to the next task.
+- Increment the `.900X` dev version suffix in `DESCRIPTION` with each
+  `NEWS.md` update.
diff --git a/DESCRIPTION b/DESCRIPTION
index 04beda37a..772b6ca22 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.1.0.9004
+Version: 2.1.0.9005
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
diff --git a/NEWS.md b/NEWS.md
index bf4bb119a..854812580 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
-# TreeTools 2.1.0.9004 (2026-03-12) #
+# TreeTools 2.1.0.9005 (2026-03-13) #
 
-- Rewrite popcount calculation for more efficient `TipsInSplits()`.
+- `SplitFrequency(reference = NULL)` split normalization moved to C++,
+  eliminating an R-level per-split loop.
 
 # TreeTools 2.1.0.9003 (2026-03-09) #
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 8117f6562..b5e6e8f72 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -137,6 +137,10 @@ cpp_count_splits <- function(edge, nTip) {
     .Call(`_TreeTools_cpp_count_splits`, edge, nTip)
 }
 
+normalize_splits <- function(splits, n_tip) {
+    .Call(`_TreeTools_normalize_splits`, splits, n_tip)
+}
+
 splits_to_edge <- function(splits, nTip) {
     .Call(`_TreeTools_splits_to_edge`, splits, nTip)
 }
diff --git a/R/Support.R b/R/Support.R
index b105f4167..e080f9cc2 100644
--- a/R/Support.R
+++ b/R/Support.R
@@ -58,30 +58,10 @@ SplitFrequency <- function(reference, forest = NULL) {
       return(structure(splits, nTip = nTip, tip.label = tipLabels,              # nocov
                        count = integer(), class = "Splits"))                    # nocov
     }
-    # The ClusterTable outputs clusters (clades); normalize so bit 0 (tip 1)
-    # is not in the set (matching as.Splits convention)
-    nTipMod <- nTip %% 8L
-    lastByteMask <- if (nTipMod == 0L) as.raw(0xff) else as.raw(bitwShiftL(1L, nTipMod) - 1L)
-    keep <- logical(nrow(splits))
-    for (i in seq_len(nrow(splits))) {
-      val <- splits[i, ]
-      # Count bits set (to filter trivial splits)
-      nBits <- sum(vapply(as.integer(val), function(b) sum(as.integer(intToBits(b))), integer(1)))
-      if (nBits < 2L || nBits > nTip - 2L) next # trivial split
-      # Normalize: if bit 0 is NOT set, complement to match as.Splits format
-      if (!as.logical(as.integer(val[1]) %% 2L)) {
-        for (j in seq_along(val)) {
-          splits[i, j] <- as.raw(bitwXor(as.integer(val[j]), 0xffL))
-        }
-        # Mask last byte
-        if (nTipMod > 0L) {
-          splits[i, nbin] <- as.raw(bitwAnd(as.integer(splits[i, nbin]),
-                                             as.integer(lastByteMask)))
-        }
-      }
-      keep[i] <- TRUE
-    }
-    splits <- splits[keep, , drop = FALSE]
+    # Normalize splits: ensure bit 0 is set, filter trivial splits
+    normalized <- normalize_splits(splits, nTip)
+    keep <- normalized[["keep"]]
+    splits <- normalized[["splits"]][keep, , drop = FALSE]
     counts <- counts[keep]
     ret <- structure(splits,
                      nTip = nTip,
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index ccf10bbeb..d08ceb813 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -426,6 +426,18 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// normalize_splits
+Rcpp::List normalize_splits(Rcpp::RawMatrix splits, const int n_tip);
+RcppExport SEXP _TreeTools_normalize_splits(SEXP splitsSEXP, SEXP n_tipSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< Rcpp::RawMatrix >::type splits(splitsSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    rcpp_result_gen = Rcpp::wrap(normalize_splits(splits, n_tip));
+    return rcpp_result_gen;
+END_RCPP
+}
 // splits_to_edge
 IntegerMatrix splits_to_edge(const RawMatrix splits, const IntegerVector nTip);
 RcppExport SEXP _TreeTools_splits_to_edge(SEXP splitsSEXP, SEXP nTipSEXP) {
@@ -517,6 +529,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeTools_pack_splits_logical", (DL_FUNC) &_TreeTools_pack_splits_logical, 1},
     {"_TreeTools_pack_splits_logical_vec", (DL_FUNC) &_TreeTools_pack_splits_logical_vec, 1},
     {"_TreeTools_cpp_count_splits", (DL_FUNC) &_TreeTools_cpp_count_splits, 2},
+    {"_TreeTools_normalize_splits", (DL_FUNC) &_TreeTools_normalize_splits, 2},
     {"_TreeTools_splits_to_edge", (DL_FUNC) &_TreeTools_splits_to_edge, 2},
     {"_TreeTools_tips_in_splits", (DL_FUNC) &_TreeTools_tips_in_splits, 1},
     {"_TreeTools_edge_to_rooted_shape", (DL_FUNC) &_TreeTools_edge_to_rooted_shape, 3},
diff --git a/src/splits.cpp b/src/splits.cpp
index 096dc2d70..2024caa37 100644
--- a/src/splits.cpp
+++ b/src/splits.cpp
@@ -714,3 +714,48 @@ int cpp_count_splits(const Rcpp::IntegerMatrix& edge, const int nTip) {
   
   return (n_internal - n_singles) - 1 - (is_rooted ? 1 : 0);
 }
+
+// Normalize splits from ClusterTable output for as.Splits convention:
+// - Filter trivial splits (fewer than 2 or more than nTip-2 bits set)
+// - Ensure bit 0 is set (complement if not)
+// - Mask unused trailing bits in last byte
+// Returns a List with "splits" (RawMatrix) and "keep" (LogicalVector)
+// [[Rcpp::export]]
+Rcpp::List normalize_splits(Rcpp::RawMatrix splits, const int n_tip) {
+  const int n_split = splits.nrow();
+  const int n_bin = splits.ncol();
+  const int n_spare = n_tip % BIN_SIZE;
+  const Rbyte last_mask = n_spare == 0
+    ? Rbyte(0xff)
+    : static_cast<Rbyte>((1 << n_spare) - 1);
+
+  Rcpp::LogicalVector keep(n_split, false);
+
+  for (int i = 0; i < n_split; ++i) {
+    // Count bits set
+    int n_bits = 0;
+    for (int j = 0; j < n_bin; ++j) {
+      n_bits += __builtin_popcount(static_cast<unsigned>(splits(i, j)));
+    }
+
+    if (n_bits < 2 || n_bits > n_tip - 2) continue; // trivial
+
+    // Normalize: if bit 0 is NOT set, complement
+    if (!(splits(i, 0) & Rbyte(1))) {
+      for (int j = 0; j < n_bin; ++j) {
+        splits(i, j) = static_cast<Rbyte>(~splits(i, j));
+      }
+      // Mask trailing bits in last byte
+      if (n_spare > 0) {
+        splits(i, n_bin - 1) &= last_mask;
+      }
+    }
+
+    keep[i] = true;
+  }
+
+  return Rcpp::List::create(
+    Rcpp::Named("splits") = splits,
+    Rcpp::Named("keep") = keep
+  );
+}

From 54484ce1c5aa1b36f81705248193044911edf117 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Fri, 13 Mar 2026 10:59:50 +0000
Subject: [PATCH 2/2] Fix benchmark: do.call() broke NSE expression capture

_init.R: do.call(bench::mark, args) evaluates expressions before
passing them, so bench stored the evaluated result (e.g. a phylo
tree) instead of the call. Restored direct ... passthrough.

_compare_results.R: Use deparse1() instead of as.character(unlist())
for expression extraction (the latter decomposes call objects into
components). Also iterate by index to guard against NA names, and
use res$change instead of outer-scope percentage_change.
---
 benchmark/_compare_results.R | 25 ++++++++++++++++---------
 benchmark/_init.R            | 11 ++++++++---
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/benchmark/_compare_results.R b/benchmark/_compare_results.R
index b0aded68c..a719d1a02 100644
--- a/benchmark/_compare_results.R
+++ b/benchmark/_compare_results.R
@@ -24,8 +24,12 @@ for (pr_file in pr_files) {
   # Prepare a report
   report <- list()
   
+  # Use deparse1 for reliable expression-to-string conversion;
+  # as.character(unlist()) decomposes call objects into their components.
+  expr_names <- vapply(pr1[["expression"]], deparse1, "")
+  
   # Iterate over each function benchmarked
-  for (fn_name in unique(as.character(unlist(pr1[["expression"]])))) {
+  for (fn_name in unique(expr_names)) {
     pr1_times <-  as.numeric(pr1[["time"]][[1]])
     pr2_times <-  as.numeric(pr2[["time"]][[1]])
     pr_times <- if (rep_exists) c(pr1_times, pr2_times) else pr1_times
@@ -79,18 +83,21 @@ for (pr_file in pr_files) {
   # Create a markdown-formatted message
   has_significant_regression <- FALSE
   
-  for (fn_name in names(report)) {
-    res <- report[[fn_name]]
-    status <- if (res$matched) {
+  for (i in seq_along(report)) {
+    fn_name <- names(report)[[i]]
+    res <- report[[i]]
+    if (is.null(res) || is.null(res$matched)) next
+    
+    status <- if (isTRUE(res$matched)) {
       if (res$slower) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           has_significant_regression <- TRUE
           "\U1F7E0 Slower \U1F641"
         } else {
           "\U1F7E3 ~same"
         }
       } else if (res$faster) {
-        if (abs(percentage_change) > threshold_percent) {
+        if (abs(res$change) > threshold_percent) {
           "\U1F7E2 Faster!"
         } else {
           "\U1F7E3 ~same"
@@ -111,14 +118,14 @@ for (pr_file in pr_files) {
       signif(res$median_pr   * 1e3, 3), ",  ",
       signif(res$median_cf   * 1e3, 3), " |\n"
     )
+    
+    cat(message)
+    output <- paste0(output, message)
   }
   
   if (has_significant_regression) {
     regressions <- TRUE
   }
-  
-  cat(message)
-  output <- paste0(output, message)
 }
 
 cat(paste0(output, "\nEOF"), file = Sys.getenv("GITHUB_OUTPUT"), append = TRUE)
diff --git a/benchmark/_init.R b/benchmark/_init.R
index 11af7c836..216945c98 100644
--- a/benchmark/_init.R
+++ b/benchmark/_init.R
@@ -1,9 +1,14 @@
 library("TreeTools")
 
 Benchmark <- function(..., min_iterations = NULL, min_time = NULL) {
-  args <- list(..., min_iterations = min_iterations %||% 3, time_unit = "us")
-  if (!is.null(min_time)) args[["min_time"]] <- min_time
-  result <- do.call(bench::mark, args)
+  # Pass ... directly to bench::mark to preserve non-standard evaluation;
+  # do.call() would evaluate expressions first, breaking expression capture.
+  result <- if (is.null(min_time)) {
+    bench::mark(..., min_iterations = min_iterations %||% 3, time_unit = "us")
+  } else {
+    bench::mark(..., min_iterations = min_iterations %||% 3,
+                min_time = min_time, time_unit = "us")
+  }
   if (interactive()) {
     print(result)
   } else {