diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index b9472d53..85b28879 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -21,9 +21,7 @@ on: - "**.R[dD]ata" - "**.Rpro*" pull_request: - branches: - - main - - master + branches: ["*"] paths-ignore: - "Meta**" - "memcheck**" diff --git a/AGENTS.md b/AGENTS.md index 2f0146ac..2b0ad10c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,3 +34,4 @@ binary 0/1 in an underlying `raw` object. before moving on to the next task. - Increment the `.900X` dev version suffix in `DESCRIPTION` with each `NEWS.md` update. +- Check that existing tests cover all new code. (The GHA test suite uses codecov.) diff --git a/NEWS.md b/NEWS.md index 85481258..76e7bcb7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # TreeTools 2.1.0.9005 (2026-03-13) # -- `SplitFrequency(reference = NULL)` split normalization moved to C++, - eliminating an R-level per-split loop. +- `SplitFrequency(reference = NULL)` performance improvements: + split normalization moved to C++; internal split de-duplication uses + hash map instead of ordered map. # TreeTools 2.1.0.9003 (2026-03-09) # diff --git a/src/consensus.cpp b/src/consensus.cpp index 93a3b02b..e6585df1 100644 --- a/src/consensus.cpp +++ b/src/consensus.cpp @@ -6,7 +6,8 @@ using namespace Rcpp; #include /* for fill */ #include /* for array */ -#include /* for map */ +#include /* for string (hash key) */ +#include /* for unordered_map */ using TreeTools::ct_stack_threshold; using TreeTools::ct_max_leaves_heap; @@ -178,12 +179,15 @@ List calc_split_frequencies( StackEntry *const S_start = S.data(); - // Use a map to store unique splits and their counts - // Key: split bit pattern; Value: index in output - std::map, int32> split_map; + // Hash map for O(1) amortized split deduplication + std::unordered_map split_map; + split_map.reserve(ntip_3 * 2); std::vector> split_patterns; std::vector counts; + // Reusable key buffer — avoids per-split heap allocation + std::string key(nbin, '\0'); + for (int32 i = 0; i < n_trees; ++i) { if (tables[i].NOSWX(ntip_3)) { continue; @@ -247,21 +251,21 @@ List calc_split_frequencies( const int32 end = tables[i].X_right(k + 1); if (start == 0 && end == 0) continue; // No valid cluster at this position - // Build the bit pattern for this split - std::vector pattern(nbin, 0); + // Build the bit pattern into the reusable key buffer + std::fill(key.begin(), key.end(), '\0'); for (int32 j = start; j <= end; ++j) { const int32 leaf_idx = tables[i].DECODE(j) - 1; // 0-based const int32 byte_idx = leaf_idx >> 3; const int32 bit_idx = leaf_idx & 7; - pattern[byte_idx] |= (Rbyte(1) << bit_idx); + key[byte_idx] |= static_cast(1 << bit_idx); } - auto it = split_map.find(pattern); + auto it = split_map.find(key); if (it == split_map.end()) { // New split: record it with count from this reference tree const int32 idx = split_patterns.size(); - split_map[pattern] = idx; - split_patterns.push_back(std::move(pattern)); + split_map.emplace(key, idx); + split_patterns.emplace_back(key.begin(), key.end()); counts.push_back(split_count[k]); } // If already found, the first reference tree that found it has the diff --git a/tests/testthat/test-Support.R b/tests/testthat/test-Support.R index 7b249527..2fa0e342 100644 --- a/tests/testthat/test-Support.R +++ b/tests/testthat/test-Support.R @@ -59,6 +59,19 @@ test_that("Node support colours consistent", { c("oor", "1", "34", "67", "101")) }) +test_that("normalize_splits() covers all branches", { + # n_spare == 0 (nTip multiple of 8): complement without trailing-bit mask + trees16 <- c(PectinateTree(16), PectinateTree(16)) + freq16 <- SplitFrequency(trees16) + expect_equal(length(freq16), NSplits(trees16[[1]])) + expect_true(all(attr(freq16, "count") == 2L)) + + # Trivial splits are filtered: a star tree produces no non-trivial splits + star <- c(StarTree(10), StarTree(10)) + freq_star <- SplitFrequency(star) + expect_equal(length(freq_star), 0) +}) + test_that("SplitFrequency() handles four-split trees", { trees <- AddTipEverywhere(BalancedTree(3)) trees <- c(trees[1], trees)