diff --git a/R/RcppExports.R b/R/RcppExports.R index f70456fda..8fbc5db73 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -172,10 +172,7 @@ ts_xss_search <- function(edge, contrast, tip_data, weight, levels, nPartitions .Call(`_TreeSearch_ts_xss_search`, edge, contrast, tip_data, weight, levels, nPartitions, xssRounds, acceptEqual, ratchetCycles, maxHits, min_steps, concavity) } -# Thin wrapper for the C++ ts_driven_search (10 grouped args). -# Production callers (MaximizeParsimony, .ResampleHierarchy) call this directly. -# Tests use ts_driven_search() in ts-driven-compat.R for backward compatibility. -.ts_driven_search_raw <- function(contrast, tip_data, weight, levels, searchControl, runtimeConfig, scoringConfig, constraintConfig = NULL, hsjConfig = NULL, xformConfig = NULL) { +ts_driven_search <- function(contrast, tip_data, weight, levels, searchControl, runtimeConfig, scoringConfig, constraintConfig = NULL, hsjConfig = NULL, xformConfig = NULL) { .Call(`_TreeSearch_ts_driven_search`, contrast, tip_data, weight, levels, searchControl, runtimeConfig, scoringConfig, constraintConfig, hsjConfig, xformConfig) } @@ -215,3 +212,7 @@ ts_test_strategy_tracker <- function(seed, n_draws) { .Call(`_TreeSearch_ts_test_strategy_tracker`, seed, n_draws) } +ts_tbr_diagnostics <- function(edge, contrast, tip_data, weight, levels, maxHits = 1L, acceptEqual = FALSE, maxChanges = 0L, min_steps = integer(), concavity = -1.0, clipOrder = 0L) { + .Call(`_TreeSearch_ts_tbr_diagnostics`, edge, contrast, tip_data, weight, levels, maxHits, acceptEqual, maxChanges, min_steps, concavity, clipOrder) +} + diff --git a/R/SearchControl.R b/R/SearchControl.R index 4a4c4bde3..26fed429a 100644 --- a/R/SearchControl.R +++ b/R/SearchControl.R @@ -14,6 +14,14 @@ #' #' @param tbrMaxHits Integer; number of equally-scoring trees to accept #' before stopping a TBR pass. +#' @param clipOrder Integer (experimental); clip-ordering strategy for TBR +#' search. 
Determines the order in which edges are tried as clip points. +#' 0 = random (default); 1 = inverse-weight (fewest descendant taxa first); +#' 2 = tips-first (terminal edges before internal); 3 = bucket ordering; +#' 4 = anti-tip (internal before terminal); 5 = large-first (most descendant +#' taxa first). On datasets with \eqn{\ge}65 tips, \code{clipOrder = 2L} +#' (tips-first) typically increases replicate throughput by 5--15\% by +#' evaluating higher-probability improvement candidates earlier. #' @param nniFirst Logical; run an NNI pass before SPR/TBR in each replicate? #' At small tree sizes (\eqn{\le}88 tips) overhead is negligible; at \eqn{\ge}100 tips #' this significantly accelerates the initial descent from the Wagner tree. @@ -201,6 +209,11 @@ SearchControl <- function( # TBR tbrMaxHits = 1L, + # TBR clip ordering strategy (experimental). + # 0L=RANDOM (default), 1L=INV_WEIGHT (w=1/(1+s)), 2L=TIPS_FIRST, + # 3L=BUCKET (tips/small/large), 4L=ANTI_TIP (non-tips first), + # 5L=LARGE_FIRST (large then small then tips) + clipOrder = 0L, nniFirst = TRUE, sprFirst = FALSE, tabuSize = 100L, @@ -274,6 +287,7 @@ SearchControl <- function( structure( list( tbrMaxHits = as.integer(tbrMaxHits), + clipOrder = as.integer(clipOrder), nniFirst = as.logical(nniFirst), sprFirst = as.logical(sprFirst), tabuSize = as.integer(tabuSize), @@ -331,7 +345,7 @@ SearchControl <- function( #' @export print.SearchControl <- function(x, ...) 
{ groups <- list( - "TBR" = c("tbrMaxHits", "nniFirst", "sprFirst", "tabuSize", + "TBR" = c("tbrMaxHits", "clipOrder", "nniFirst", "sprFirst", "tabuSize", "wagnerStarts", "wagnerBias", "wagnerBiasTemp", "outerCycles", "maxOuterResets"), "Ratchet" = c("ratchetCycles", "ratchetPerturbProb", "ratchetPerturbMode", diff --git a/dev/benchmarks/bench_clip_ordering.R b/dev/benchmarks/bench_clip_ordering.R new file mode 100644 index 000000000..daa376a64 --- /dev/null +++ b/dev/benchmarks/bench_clip_ordering.R @@ -0,0 +1,215 @@ +# bench_clip_ordering.R +# +# Benchmark comparison of TBR clip ordering strategies. +# +# Compares six clip_order variants: +# 0 = RANDOM (current default) +# 1 = INV_WEIGHT (w = 1/(1+s)) +# 2 = TIPS_FIRST (tips first, then rest; shuffled within) +# 3 = BUCKET (tips / small / large buckets; shuffled within) +# 4 = ANTI_TIP (non-tips first, then tips) +# 5 = LARGE_FIRST (large > √n first, then small, then tips) +# +# Metric: time-adjusted expected best (TAEB) score — the expected minimum +# score from k independent replicates where k = floor(budget / rep_time). +# Estimated via bootstrap resampling of per-replicate scores. 
+# +# Usage: Rscript dev/benchmarks/bench_clip_ordering.R [lib_path] [n_seeds] +# lib_path defaults to ".agent-wc" +# n_seeds defaults to 10 + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-wc" +n_seeds <- if (length(args) >= 2) as.integer(args[2]) else 10L + +library(TreeSearch, lib.loc = lib_path) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") +BUDGETS <- c(30, 60) # seconds +SEEDS <- seq_len(n_seeds) * 1000L + 847L + +ORDERS <- c( + RANDOM = 0L, + INV_WEIGHT = 1L, + TIPS_FIRST = 2L, + BUCKET = 3L, + ANTI_TIP = 4L, + LARGE_FIRST= 5L +) + +# Expected-best bootstrap: given per-replicate scores and total wall time, +# estimate E[min score] at each time budget. +taeb <- function(scores, times_ms, budgets_s, n_boot = 2000L) { + stopifnot(length(scores) == length(times_ms)) + n <- length(scores) + if (n == 0) return(setNames(rep(NA_real_, length(budgets_s)), budgets_s)) + + # Mean time per replicate (use median to be robust to outliers) + med_time_s <- median(times_ms) / 1000 + if (med_time_s <= 0) med_time_s <- 1 + + vapply(budgets_s, function(budget) { + k <- max(1L, floor(budget / med_time_s)) + if (k >= n) { + # Can't bootstrap more than n replicates; just return min + return(min(scores)) + } + boot_mins <- replicate(n_boot, min(sample(scores, k, replace = TRUE))) + mean(boot_mins) + }, numeric(1L)) +} + +# --------------------------------------------------------------------------- +# Prepare datasets +# --------------------------------------------------------------------------- + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + name = name, + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + 
weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# --------------------------------------------------------------------------- +# Build a default SearchControl with preset based on n_tip +# --------------------------------------------------------------------------- + +make_sc <- function(n_tip, clip_order_int = 0L) { + # Mirror the "default" preset for datasets in 31-119 tip range, + # with only the clip_order changed. This gives a realistic context + # (same ratchet/XSS/RSS settings as normal use) for the comparison. + # + # NOTE: maxSeconds is set per run; runtimeConfig controls the budget. + # Here we only set SearchControl parameters. + SearchControl( + tbrMaxHits = 1L, + clipOrder = clip_order_int, + nniFirst = TRUE, + sprFirst = FALSE, + wagnerStarts = 1L, + wagnerBias = 0L, + outerCycles = 1L, + maxOuterResets = 0L, + ratchetCycles = 12L, + ratchetPerturbProb = 0.25, + ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = FALSE, + nniPerturbCycles = 0L, + driftCycles = 0L, + xssRounds = 3L, xssPartitions = 4L, + rssRounds = 1L, + cssRounds = 0L, + fuseInterval = 3L, + adaptiveLevel = TRUE, + consensusStableReps = 0L + ) +} + +make_runtime <- function(max_seconds) { + list(maxReplicates = 9999L, targetHits = 9999L, + maxSeconds = max_seconds, verbosity = 0L, nThreads = 1L) +} + +# --------------------------------------------------------------------------- +# Data collection +# --------------------------------------------------------------------------- + +cat(sprintf("Benchmark: %d datasets × %d ordering variants × %d seeds\n", + length(DATASETS), length(ORDERS), n_seeds)) +cat(sprintf("Budgets: %s seconds\n\n", paste(BUDGETS, collapse = ", "))) + +all_results <- list() + +for (dname in DATASETS) { + d <- prepare(dname) + + cat(sprintf("=== %s (n_tip=%d) ===\n", dname, d$n_taxa)) + + ds_results <- list() + + for (oname in names(ORDERS)) { + oint <- ORDERS[[oname]] + sc <- make_sc(d$n_taxa, oint) + + rep_scores <- numeric(n_seeds) + 
rep_times <- numeric(n_seeds) # ms per replicate + + for (i in seq_along(SEEDS)) { + set.seed(SEEDS[i]) + res <- TreeSearch:::ts_driven_search( + d$contrast, d$tip_data, d$weight, d$levels, + searchControl = sc, + runtimeConfig = make_runtime(max(BUDGETS)), + scoringConfig = list(min_steps = integer(), concavity = -1.0, + xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, + obs_count = integer(), infoAmounts = NULL) + ) + rep_scores[i] <- res$best_score + # Estimate per-replicate time from timings + t_total_ms <- sum(unlist(res$timings)) + n_reps <- max(1L, res$n_replicates) + rep_times[i] <- t_total_ms / n_reps + } + + taeb_vals <- taeb(rep_scores, rep_times, BUDGETS) + cat(sprintf(" %-12s: scores %s med_rep %.1fs TAEB@%ds=%.1f @%ds=%.1f\n", + oname, + paste(sprintf("%.0f", range(rep_scores)), collapse = "-"), + median(rep_times)/1000, + BUDGETS[1], taeb_vals[1], + BUDGETS[2], taeb_vals[2])) + + ds_results[[oname]] <- list( + order = oname, order_int = oint, + dataset = dname, n_tip = d$n_taxa, + scores = rep_scores, times_ms = rep_times, + taeb = taeb_vals + ) + } + + all_results[[dname]] <- ds_results + cat("\n") +} + +# --------------------------------------------------------------------------- +# Summary: Δ vs RANDOM baseline for each variant, averaged across datasets +# --------------------------------------------------------------------------- + +cat("=== Summary: TAEB Δ vs RANDOM (lower = better) ===\n\n") + +for (budget in BUDGETS) { + cat(sprintf("--- Budget: %ds ---\n", budget)) + cat(sprintf(" %-15s", "")) + for (dname in DATASETS) cat(sprintf(" %13s", dname)) + cat(sprintf(" %13s\n", "mean_delta")) + + for (oname in names(ORDERS)) { + if (oname == "RANDOM") next + cat(sprintf(" %-15s", oname)) + deltas <- numeric(length(DATASETS)) + for (j in seq_along(DATASETS)) { + dname <- DATASETS[j] + ref <- all_results[[dname]][["RANDOM"]]$taeb[[which(BUDGETS == budget)]] + this_val <- all_results[[dname]][[oname]]$taeb[[which(BUDGETS == budget)]] + delta <- 
this_val - ref # positive = worse (more steps) + deltas[j] <- delta + cat(sprintf(" %+13.2f", delta)) + } + cat(sprintf(" %+13.2f\n", mean(deltas))) + } + cat("\n") +} + +cat("Positive Δ = worse than RANDOM; negative Δ = better than RANDOM.\n") +cat("Done.\n") diff --git a/dev/benchmarks/diag_clip_ordering.R b/dev/benchmarks/diag_clip_ordering.R new file mode 100644 index 000000000..1a3722b19 --- /dev/null +++ b/dev/benchmarks/diag_clip_ordering.R @@ -0,0 +1,286 @@ +# diag_clip_ordering.R +# +# Diagnostic script for the size-weighted TBR clip ordering experiment. +# +# Purpose: Characterise baseline (random) TBR clip ordering behaviour to test +# whether the small-clip-first hypothesis holds empirically. +# +# For each dataset and seed, builds a random Wagner starting tree, runs +# ts_tbr_diagnostics() to convergence, and accumulates per-pass records. +# Produces three summary tables: +# +# 1. Accepted clip size breakdown by bucket (tips / small / large). +# Key question: are tip clips over-represented in accepted moves +# relative to their uniform expectation? +# +# 2. Clips tried before acceptance (productive passes). +# Key question: is n_clips_tried typically large enough that a +# small-first ordering could meaningfully reduce it? +# +# 3. Evaluation budget split: productive vs null passes. +# Key question: what fraction of TBR work is "wasted" in null passes? 
+# +# Usage: Rscript dev/benchmarks/diag_clip_ordering.R [lib_path] +# lib_path defaults to ".agent-wc" + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-wc" + +library(TreeSearch, lib.loc = lib_path) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") +SEEDS <- c(1847L, 2956L, 3712L, 4519L, 5823L, 6401L, 7238L, 8145L, 9032L, 9871L) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + name = name, + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# Bucket label for a clip of subtree size s given n_tip. +# Tip: s == 1 +# Small: 2 <= s <= floor(sqrt(n_tip)) +# Large: s > floor(sqrt(n_tip)) +clip_bucket <- function(s, n_tip) { + sq <- floor(sqrt(n_tip)) + ifelse(s == 1, "tip", + ifelse(s <= sq, "small", "large")) +} + +# Expected fraction of clips in each bucket for a binary rooted tree with +# n_tip leaves. Total clips = 2*(n_tip-1). +# Tip clips (s==1) : exactly n_tip +# Non-tip clips : n_tip - 2 +# Among non-tip, sizes 2..n_tip-1. Approximate uniform distribution: +# small (2..floor(sqrt)) : floor(sqrt)-1 sizes +# large (floor(sqrt)+1..n_tip-1): n_tip-1-floor(sqrt) sizes +# This is approximate (not all sizes appear equally often), but adequate +# for comparison against observed acceptance fractions. 
+expected_bucket_fracs <- function(n_tip) { + n_clips <- 2L * (n_tip - 1L) + sq <- floor(sqrt(n_tip)) + n_tip_c <- n_tip # tip clips + n_nontip <- n_tip - 2L # non-tip clips + n_small_c <- sq - 1L # sizes 2..sq (approximate, may be 0) + n_large_c <- n_nontip - n_small_c + list( + tip = n_tip_c / n_clips, + small = max(0, n_small_c) / n_clips, + large = max(0, n_large_c) / n_clips + ) +} + +# --------------------------------------------------------------------------- +# Data collection +# --------------------------------------------------------------------------- + +cat("Collecting TBR pass diagnostics (", length(SEEDS), "seeds per dataset)...\n\n", + sep = "") + +all_records <- list() + +for (dname in DATASETS) { + d <- prepare(dname) + n_tip <- d$n_taxa + sq <- floor(sqrt(n_tip)) + exp <- expected_bucket_fracs(n_tip) + + cat(sprintf("Dataset: %-15s n_tip=%d sqrt_n=%d total_clips=%d\n", + dname, n_tip, sq, 2L*(n_tip-1L))) + + ds_records <- vector("list", length(SEEDS)) + + for (i in seq_along(SEEDS)) { + set.seed(SEEDS[i]) + + # Random Wagner starting tree + wag <- TreeSearch:::ts_random_wagner_tree( + d$contrast, d$tip_data, d$weight, d$levels + ) + + # TBR to convergence with per-pass diagnostics (default clip_order = RANDOM) + res <- TreeSearch:::ts_tbr_diagnostics( + wag$edge, d$contrast, d$tip_data, d$weight, d$levels + ) + + passes <- res$passes + passes$dataset <- dname + passes$seed <- SEEDS[i] + passes$n_tip <- n_tip + passes$n_clips <- 2L * (n_tip - 1L) + passes$final_score <- res$score + passes$bucket <- clip_bucket(passes$accepted_clip_size, n_tip) + # bucket is only meaningful for productive passes; set NA for null passes + passes$bucket[!passes$productive] <- NA_character_ + + ds_records[[i]] <- passes + } + + all_records[[dname]] <- do.call(rbind, ds_records) + recs <- all_records[[dname]] + prod <- recs[recs$productive, ] + null <- recs[!recs$productive, ] + + cat(sprintf(" Passes: %d productive=%d (%.0f%%) null=%d (%.0f%%)\n", + nrow(recs), 
nrow(prod), 100*nrow(prod)/nrow(recs), + nrow(null), 100*nrow(null)/nrow(recs))) + + if (nrow(prod) > 0) { + tip_obs <- mean(prod$accepted_clip_size == 1) + tip_exp <- exp$tip + enrich <- tip_obs / tip_exp + cat(sprintf(" Tip-clip acceptance: observed=%.0f%% expected=%.0f%% enrichment=%.2fx\n", + 100*tip_obs, 100*tip_exp, enrich)) + cat(sprintf(" Clips tried before accept: median=%d mean=%.1f (out of %d clips)\n", + median(prod$n_clips_tried), mean(prod$n_clips_tried), 2L*(n_tip-1L))) + cat(sprintf(" Final score range: %.0f – %.0f\n", + min(recs$final_score), max(recs$final_score))) + } + cat("\n") +} + +combined <- do.call(rbind, all_records) +prod_all <- combined[combined$productive, ] + +# --------------------------------------------------------------------------- +# Table 1: Accepted clip size bucket breakdown +# --------------------------------------------------------------------------- + +cat("=== Table 1: Accepted clip size breakdown (productive passes only) ===\n\n") + +fmt_pct <- function(x) sprintf("%.1f%%", 100 * x) + +bucket_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + p <- prod_all[prod_all$dataset == dname, ] + n_tip <- p$n_tip[1] + exp <- expected_bucket_fracs(n_tip) + tot <- nrow(p) + + tip_obs <- mean(p$accepted_clip_size == 1) + small_obs <- mean(p$accepted_clip_size > 1 & + p$accepted_clip_size <= floor(sqrt(n_tip))) + large_obs <- mean(p$accepted_clip_size > floor(sqrt(n_tip))) + + data.frame( + dataset = dname, + n_tip = n_tip, + n_prod_passes = tot, + tip_obs = fmt_pct(tip_obs), + tip_exp = fmt_pct(exp$tip), + tip_enrichment = round(tip_obs / exp$tip, 2), + small_obs = fmt_pct(small_obs), + small_exp = fmt_pct(exp$small), + large_obs = fmt_pct(large_obs), + large_exp = fmt_pct(exp$large) + ) +})) + +print(bucket_tbl, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Table 2: Clips tried before acceptance +# 
--------------------------------------------------------------------------- + +cat("\n=== Table 2: Clips tried in productive passes ===\n") +cat("(n_clips_tried includes the accepted clip itself; 1 = first clip accepted)\n\n") + +tried_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + p <- prod_all[prod_all$dataset == dname, ] + n_clips <- p$n_clips[1] + tried <- p$n_clips_tried + + data.frame( + dataset = dname, + n_clips = n_clips, + n_prod_passes = nrow(p), + pct_first_clip = fmt_pct(mean(tried == 1)), + pct_within_5 = fmt_pct(mean(tried <= 5)), + pct_within_10pct = fmt_pct(mean(tried <= 0.1 * n_clips)), + median_tried = median(tried), + mean_tried = round(mean(tried), 1), + median_position = round(median(tried) / n_clips, 2) + ) +})) + +print(tried_tbl, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Table 3: Evaluation budget — productive vs null passes +# --------------------------------------------------------------------------- + +cat("\n=== Table 3: Evaluation budget by pass type ===\n\n") + +eval_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + d <- combined[combined$dataset == dname, ] + prod <- d[d$productive, ] + null <- d[!d$productive, ] + tot <- sum(d$n_candidates_evaluated) + + data.frame( + dataset = dname, + n_prod_passes = nrow(prod), + n_null_passes = nrow(null), + pct_evals_prod = fmt_pct(sum(prod$n_candidates_evaluated) / tot), + pct_evals_null = fmt_pct(sum(null$n_candidates_evaluated) / tot), + med_evals_prod = if (nrow(prod) > 0) median(prod$n_candidates_evaluated) else NA_real_, + med_evals_null = if (nrow(null) > 0) median(null$n_candidates_evaluated) else NA_real_ + ) +})) + +print(eval_tbl, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Hypothesis assessment +# --------------------------------------------------------------------------- + +cat("\n=== Hypothesis assessment ===\n") +cat("H: small clips (s=1) 
are over-represented in accepted moves,\n") +cat(" AND n_clips_tried is large enough that ordering would help.\n\n") + +for (dname in DATASETS) { + p <- prod_all[prod_all$dataset == dname, ] + n_clips <- p$n_clips[1] + n_tip <- p$n_tip[1] + enrich <- (mean(p$accepted_clip_size == 1)) / + (expected_bucket_fracs(n_tip)$tip) + med_pos <- median(p$n_clips_tried) / n_clips # fraction of clips needed + + # Potential saving if tips-first: E[position of accepted tip clip in random + # order] - E[position in tips-first order]. Very roughly: + # random E[pos] ≈ n_clips/2; tips-first E[pos] ≈ n_tip/2. + # saving_fraction ≈ (n_clips/2 - n_tip/2) / n_clips = (1 - n_tip/n_clips)/2 ≈ 0.25 + # But only beneficial if tip clips ARE more commonly accepted (enrich > 1). + + verdict <- if (enrich >= 2.0 && med_pos >= 0.25) { + "STRONGLY SUPPORTS ordering (high enrichment + late acceptance)" + } else if (enrich >= 1.5 && med_pos >= 0.15) { + "SUPPORTS ordering (moderate enrichment + moderate position)" + } else if (enrich >= 1.5) { + "PARTIAL (enrichment, but acceptance mostly in first few clips)" + } else if (enrich < 0.8) { + "CONTRADICTS hypothesis (large clips accepted more often)" + } else { + "NEUTRAL (no consistent tip-clip enrichment)" + } + + cat(sprintf(" %-15s: tip enrichment=%.2fx median_pos=%.2f -> %s\n", + dname, enrich, med_pos, verdict)) +} + +cat("\nDone.\n") diff --git a/man/SearchControl.Rd b/man/SearchControl.Rd index c9e23986a..c7d682e6c 100644 --- a/man/SearchControl.Rd +++ b/man/SearchControl.Rd @@ -6,6 +6,7 @@ \usage{ SearchControl( tbrMaxHits = 1L, + clipOrder = 0L, nniFirst = TRUE, sprFirst = FALSE, tabuSize = 100L, @@ -61,6 +62,15 @@ SearchControl( \item{tbrMaxHits}{Integer; number of equally-scoring trees to accept before stopping a TBR pass.} +\item{clipOrder}{Integer (experimental); clip-ordering strategy for TBR +search. Determines the order in which edges are tried as clip points. 
+0 = random (default); 1 = inverse-weight (fewest descendant taxa first); +2 = tips-first (terminal edges before internal); 3 = bucket ordering; +4 = anti-tip (internal before terminal); 5 = large-first (most descendant +taxa first). On datasets with \eqn{\ge}65 tips, \code{clipOrder = 2L} +(tips-first) typically increases replicate throughput by 5--15\\% by +evaluating higher-probability improvement candidates earlier.} + \item{nniFirst}{Logical; run an NNI pass before SPR/TBR in each replicate? At small tree sizes (\eqn{\le}88 tips) overhead is negligible; at \eqn{\ge}100 tips this significantly accelerates the initial descent from the Wagner tree.} @@ -215,9 +225,19 @@ cycle. Default 0.10 (10\%). Always drops at least 3 tips and keeps at least 4.} \item{pruneReinsertSelection}{Integer; tip selection strategy for choosing -which tips to drop. 0 = random (default), 1 = instability-weighted -(tips whose placement varies across pool trees are preferentially -dropped).} +which tips to drop: +\itemize{ +\item \code{0} = random (default). +\item \code{1} = instability-weighted: tips whose parent-edge split is rare across +pool trees are preferentially dropped. Requires \eqn{\ge}2 pool trees; +falls back to random otherwise. +\item \code{2} = missing-data-weighted: tips with more ambiguous or inapplicable +characters are preferentially dropped. High-missingness taxa are +hardest to score correctly and most likely to be trapped in suboptimal +positions. +\item \code{3} = combined: weight = instability × (1 + normalised missingness). +Targets taxa that are both unstably placed and data-poor. +}} \item{pruneReinsertTbrMoves}{Integer; maximum number of TBR moves accepted during the reduced-tree backbone optimisation phase of each @@ -231,12 +251,11 @@ you prefer thorough backbone optimisation over replicate throughput.} full-tree polish after each prune-reinsert cycle. 0 (default) runs to convergence. 
Has no effect when \code{pruneReinsertNni = TRUE}.} -\item{pruneReinsertNni}{Logical; if \code{TRUE}, use NNI -(nearest-neighbour interchange) instead of TBR for the full-tree polish -step of each prune-reinsert cycle. NNI converges roughly 5x faster than -TBR at large tip counts (\eqn{\ge}120), substantially reducing per-cycle -cost while still reaching a local optimum before the outer-loop TBR -polish. Default \code{FALSE}.} +\item{pruneReinsertNni}{Logical; if \code{TRUE}, use NNI (nearest-neighbour +interchange) instead of TBR for the full-tree polish step. NNI +converges roughly 5x faster than TBR at large tip counts (\eqn{\ge}120), +substantially reducing per-cycle cost while still reaching a local +optimum before the outer-loop TBR polish. Default \code{FALSE}.} \item{annealCycles}{Integer; number of simulated annealing perturbation cycles (PCSA) per replicate. Each cycle perturbs the current best tree diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8967bf315..a903fbc6c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -804,3 +804,24 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// ts_tbr_diagnostics +List ts_tbr_diagnostics(IntegerMatrix edge, NumericMatrix contrast, IntegerMatrix tip_data, IntegerVector weight, CharacterVector levels, int maxHits, bool acceptEqual, int maxChanges, IntegerVector min_steps, double concavity, int clipOrder); +RcppExport SEXP _TreeSearch_ts_tbr_diagnostics(SEXP edgeSEXP, SEXP contrastSEXP, SEXP tip_dataSEXP, SEXP weightSEXP, SEXP levelsSEXP, SEXP maxHitsSEXP, SEXP acceptEqualSEXP, SEXP maxChangesSEXP, SEXP min_stepsSEXP, SEXP concavitySEXP, SEXP clipOrderSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerMatrix >::type edge(edgeSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type contrast(contrastSEXP); + Rcpp::traits::input_parameter< IntegerMatrix >::type tip_data(tip_dataSEXP); + Rcpp::traits::input_parameter< 
IntegerVector >::type weight(weightSEXP); + Rcpp::traits::input_parameter< CharacterVector >::type levels(levelsSEXP); + Rcpp::traits::input_parameter< int >::type maxHits(maxHitsSEXP); + Rcpp::traits::input_parameter< bool >::type acceptEqual(acceptEqualSEXP); + Rcpp::traits::input_parameter< int >::type maxChanges(maxChangesSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type min_steps(min_stepsSEXP); + Rcpp::traits::input_parameter< double >::type concavity(concavitySEXP); + Rcpp::traits::input_parameter< int >::type clipOrder(clipOrderSEXP); + rcpp_result_gen = Rcpp::wrap(ts_tbr_diagnostics(edge, contrast, tip_data, weight, levels, maxHits, acceptEqual, maxChanges, min_steps, concavity, clipOrder)); + return rcpp_result_gen; +END_RCPP +} diff --git a/src/TreeSearch-init.c b/src/TreeSearch-init.c index 5a0976ad2..79350ab7d 100644 --- a/src/TreeSearch-init.c +++ b/src/TreeSearch-init.c @@ -62,6 +62,7 @@ extern SEXP _TreeSearch_mc_fitch_scores(SEXP, SEXP); extern SEXP _TreeSearch_ts_wagner_bias_bench(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); /* ts_stochastic_tbr and ts_parallel_temper removed — on feature/parallel-temper */ extern SEXP _TreeSearch_ts_test_strategy_tracker(SEXP, SEXP); +extern SEXP _TreeSearch_ts_tbr_diagnostics(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef callMethods[] = { {"_R_wrap_mpl_new_Morphy", (DL_FUNC) &_R_wrap_mpl_new_Morphy, 0}, @@ -141,6 +142,7 @@ static const R_CallMethodDef callMethods[] = { {"RANDOM_TREE", (DL_FUNC) &RANDOM_TREE, 1}, {"RANDOM_TREE_SCORE", (DL_FUNC) &RANDOM_TREE_SCORE, 2}, {"_TreeSearch_ts_test_strategy_tracker", (DL_FUNC) &_TreeSearch_ts_test_strategy_tracker, 2}, + {"_TreeSearch_ts_tbr_diagnostics", (DL_FUNC) &_TreeSearch_ts_tbr_diagnostics, 11}, {NULL, NULL, 0} }; diff --git a/src/ts_driven.cpp b/src/ts_driven.cpp index 447ecbb29..2bfa01ded 100644 --- a/src/ts_driven.cpp +++ b/src/ts_driven.cpp @@ -167,6 +167,7 @@ ReplicateResult 
run_single_replicate( { TBRParams tp; tp.tabu_size = params.tabu_size; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); tbr_search(result.tree, ds, tp, cd, nullptr, nullptr, check_timeout); } result.timings.tbr_ms = ph_lap(); @@ -227,6 +228,7 @@ ReplicateResult run_single_replicate( sp.max_sector_size = params.sector_max_size; sp.internal_ratchet_cycles = 0; sp.internal_max_hits = 1; + sp.clip_order = params.clip_order; // XSS: systematic partitioning sp.n_partitions = params.xss_partitions; @@ -272,6 +274,7 @@ ReplicateResult run_single_replicate( css_sp.n_partitions = params.css_partitions; css_sp.xss_rounds = params.css_rounds; css_sp.internal_max_hits = 1; + css_sp.clip_order = params.clip_order; css_search(result.tree, ds, css_sp, cd); result.timings.css_ms += ph_lap(); @@ -305,6 +308,7 @@ ReplicateResult run_single_replicate( rp.perturb_max_moves = params.ratchet_perturb_max_moves; rp.adaptive = params.ratchet_adaptive; rp.tabu_size = params.tabu_size; + rp.clip_order = params.clip_order; ratchet_search(result.tree, ds, rp, cd, check_timeout); } result.timings.ratchet_ms += ph_lap(); @@ -330,6 +334,7 @@ ReplicateResult run_single_replicate( sp.max_sector_size = params.sector_max_size; sp.internal_ratchet_cycles = 0; sp.internal_max_hits = 1; + sp.clip_order = params.clip_order; if (params.xss_rounds > 0) { sp.n_partitions = params.xss_partitions; @@ -362,6 +367,7 @@ ReplicateResult run_single_replicate( css_sp.n_partitions = params.css_partitions; css_sp.xss_rounds = params.css_rounds; css_sp.internal_max_hits = 1; + css_sp.clip_order = params.clip_order; css_search(result.tree, ds, css_sp, cd); result.timings.css_ms += ph_lap(); @@ -450,6 +456,7 @@ ReplicateResult run_single_replicate( { TBRParams tp; tp.tabu_size = params.tabu_size; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); tbr_search(result.tree, ds, tp, cd, nullptr, nullptr, check_timeout); } @@ -517,6 +524,7 @@ ReplicateResult run_single_replicate( { TBRParams tp; tp.tabu_size = 
params.tabu_size; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); tbr_search(result.tree, ds, tp, cd, nullptr, nullptr, check_timeout); } result.timings.final_tbr_ms += ph_lap(); diff --git a/src/ts_driven.h index 99dd6f568..7611aeeb1 100644 --- a/src/ts_driven.h +++ b/src/ts_driven.h @@ -137,6 +137,11 @@ struct DrivenParams { // the shallower initial optimum gives ratchet/drift more room to explore. bool spr_first = false; + // TBR clip ordering strategy (see ClipOrder enum in ts_tbr.h). + // 0=RANDOM (default), 1=INV_WEIGHT, 2=TIPS_FIRST, 3=BUCKET, + // 4=ANTI_TIP, 5=LARGE_FIRST + int clip_order = 0; + // Number of random Wagner trees per replicate (keep best-scoring) int wagner_starts = 1; diff --git a/src/ts_ratchet.cpp index 8cac02f23..97565a9c1 100644 --- a/src/ts_ratchet.cpp +++ b/src/ts_ratchet.cpp @@ -148,6 +148,7 @@ RatchetResult ratchet_search(TreeState& tree, DataSet& ds, search_params.max_accepted_changes = 0; search_params.max_hits = params.max_hits; search_params.tabu_size = params.tabu_size; + search_params.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult initial = tbr_search(tree, ds, search_params, cd, nullptr, nullptr, check_timeout); @@ -169,6 +170,7 @@ RatchetResult ratchet_search(TreeState& tree, DataSet& ds, perturb_params.max_accepted_changes = actual_max_moves; perturb_params.max_hits = 1; perturb_params.tabu_size = params.tabu_size; + perturb_params.clip_order = static_cast<ClipOrder>(params.clip_order); // Seed RNG (from R in serial mode, from thread-local in parallel mode) std::mt19937 rng = ts::make_rng(); diff --git a/src/ts_ratchet.h index af953b1b0..3cfb2ed0b 100644 --- a/src/ts_ratchet.h +++ b/src/ts_ratchet.h @@ -41,6 +41,7 @@ struct RatchetParams { double adapt_max_prob = 0.50; // Upper bound for adaptive tuning int tabu_size = 0; // Tabu list size for TBR calls (0 = disabled) + int clip_order = 0; // ClipOrder cast to int (RANDOM = 0) }; struct RatchetResult { diff --git 
a/src/ts_rcpp.cpp b/src/ts_rcpp.cpp index 85b6ef99b..7d1793920 100644 --- a/src/ts_rcpp.cpp +++ b/src/ts_rcpp.cpp @@ -1191,6 +1191,7 @@ static void extract_info_amounts( static void unpack_search_control(List ctrl, ts::DrivenParams& params) { // TBR / initial descent params.tbr_max_hits = as(ctrl["tbrMaxHits"]); + params.clip_order = as(ctrl["clipOrder"]); params.nni_first = as(ctrl["nniFirst"]); params.spr_first = as(ctrl["sprFirst"]); params.tabu_size = as(ctrl["tabuSize"]); @@ -2656,3 +2657,73 @@ List ts_test_strategy_tracker(int seed, int n_draws) { ); } +// Run TBR to convergence with per-pass diagnostic instrumentation. +// Returns the final tree, scalar summary statistics, and a data frame +// of TBRPassRecords (one row per pass of the outer while loop). +// clipOrder: 0=RANDOM (baseline), 1=INV_WEIGHT, 2=TIPS_FIRST, 3=BUCKET +// [[Rcpp::export]] +List ts_tbr_diagnostics( + IntegerMatrix edge, + NumericMatrix contrast, + IntegerMatrix tip_data, + IntegerVector weight, + CharacterVector levels, + int maxHits = 1, + bool acceptEqual = false, + int maxChanges = 0, + IntegerVector min_steps = IntegerVector(), + double concavity = -1.0, + int clipOrder = 0) +{ + ts::DataSet ds = make_dataset(contrast, tip_data, weight, levels, + min_steps, concavity); + + ts::TreeState tree; + tree.init_from_edge( + &edge(0, 0), &edge(0, 1), + edge.nrow(), ds); + + ts::TBRParams params; + params.max_hits = maxHits; + params.accept_equal = acceptEqual; + params.max_accepted_changes = maxChanges; + params.diagnostics = true; + params.clip_order = static_cast(clipOrder); + + ts::TBRResult result = ts::tbr_search(tree, ds, params); + + // Unpack pass records into parallel vectors for a data frame + int n_passes = static_cast(result.pass_records.size()); + IntegerVector pass_index(n_passes); + LogicalVector productive(n_passes); + IntegerVector accepted_clip_size(n_passes); + IntegerVector n_clips_tried(n_passes); + IntegerVector n_candidates_evaluated(n_passes); + + for (int i = 0; i 
< n_passes; ++i) { + const ts::TBRPassRecord& rec = result.pass_records[i]; + pass_index[i] = rec.pass_index; + productive[i] = rec.productive; + accepted_clip_size[i] = rec.accepted_clip_size; + n_clips_tried[i] = rec.n_clips_tried; + n_candidates_evaluated[i]= rec.n_candidates_evaluated; + } + + DataFrame passes = DataFrame::create( + Named("pass_index") = pass_index, + Named("productive") = productive, + Named("accepted_clip_size") = accepted_clip_size, + Named("n_clips_tried") = n_clips_tried, + Named("n_candidates_evaluated") = n_candidates_evaluated + ); + + return List::create( + Named("edge") = tree_to_edge(tree), + Named("score") = result.best_score, + Named("n_accepted") = result.n_accepted, + Named("n_evaluated") = result.n_evaluated, + Named("n_zero_skipped")= result.n_zero_skipped, + Named("converged") = result.converged, + Named("passes") = passes + ); +} diff --git a/src/ts_sector.cpp b/src/ts_sector.cpp index dadff7056..e23ea65de 100644 --- a/src/ts_sector.cpp +++ b/src/ts_sector.cpp @@ -497,7 +497,7 @@ ReducedDataset build_reduced_dataset(const TreeState& tree, // Search the sector with TBR. The internal_ratchet_cycles parameter is // reserved for future use (ratchet perturbation within sectors). 
static double search_sector(ReducedDataset& rd, int /*internal_ratchet_cycles*/, - int max_hits) { + int max_hits, int clip_order = 0) { int htu_idx = rd.n_real_tips; int root = rd.subtree.n_tip; int sr_mapped = rd.full_to_sector[rd.sector_root]; @@ -511,6 +511,7 @@ static double search_sector(ReducedDataset& rd, int /*internal_ratchet_cycles*/, TBRParams tp; tp.max_hits = max_hits; + tp.clip_order = static_cast<ClipOrder>(clip_order); TBRResult tr = tbr_search(rd.subtree, rd.data, tp); // Verify root structure: HTU and sector_root_mapped must remain @@ -675,6 +676,7 @@ SectorResult rss_search(TreeState& tree, DataSet& ds, // No sectors of appropriate size; run global TBR and return TBRParams tp; tp.max_hits = params.internal_max_hits; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult tr = tbr_search(tree, ds, tp); result.best_score = tr.best_score; result.total_steps_saved = @@ -726,7 +728,7 @@ SectorResult rss_search(TreeState& tree, DataSet& ds, // Search the sector double sector_best = search_sector(rd, params.internal_ratchet_cycles, - params.internal_max_hits); + params.internal_max_hits, params.clip_order); ++result.n_sectors_searched; bool improved = sector_best < sector_current; @@ -813,6 +815,7 @@ SectorResult rss_search(TreeState& tree, DataSet& ds, { TBRParams tp; tp.max_hits = params.internal_max_hits; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult tr = tbr_search(tree, ds, tp, cd); if (tr.best_score < result.best_score) { result.total_steps_saved += @@ -870,7 +873,8 @@ SectorResult xss_search(TreeState& tree, DataSet& ds, double sector_current = score_tree(rd.subtree, rd.data); double sector_best = search_sector( - rd, params.internal_ratchet_cycles, params.internal_max_hits); + rd, params.internal_ratchet_cycles, params.internal_max_hits, + params.clip_order); ++result.n_sectors_searched; bool improved = sector_best < sector_current; @@ -928,6 +932,7 @@ SectorResult xss_search(TreeState& tree, DataSet& ds, { TBRParams tp; tp.max_hits = 
params.internal_max_hits; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult tr = tbr_search(tree, ds, tp, cd); if (tr.best_score < result.best_score) { result.total_steps_saved += @@ -996,6 +1001,7 @@ SectorResult css_search(TreeState& tree, DataSet& ds, TBRParams tp; tp.max_hits = params.internal_max_hits; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult tr = tbr_search(tree, ds, tp, cd, &sector_mask); ++result.n_sectors_searched; @@ -1014,6 +1020,7 @@ SectorResult css_search(TreeState& tree, DataSet& ds, { TBRParams tp; tp.max_hits = params.internal_max_hits; + tp.clip_order = static_cast<ClipOrder>(params.clip_order); TBRResult tr = tbr_search(tree, ds, tp, cd); if (tr.best_score < result.best_score) { result.total_steps_saved += diff --git a/src/ts_sector.h b/src/ts_sector.h index e75e491e0..51a10d5c9 100644 --- a/src/ts_sector.h +++ b/src/ts_sector.h @@ -32,6 +32,8 @@ struct SectorParams { // sectors containing splits absent from the pool consensus. // Owned externally (by driven_search); never freed by sector code. const SplitFrequencyTable* split_freq = nullptr; + + int clip_order = 0; // ClipOrder cast to int (RANDOM = 0) }; struct SectorResult { diff --git a/src/ts_tbr.cpp b/src/ts_tbr.cpp index bd8ae5a68..170d3fff2 100644 --- a/src/ts_tbr.cpp +++ b/src/ts_tbr.cpp @@ -5,6 +5,7 @@ #include "ts_tabu.h" #include "ts_splits.h" #include +#include <cmath> #include #include #include @@ -447,6 +448,110 @@ static void precompute_vroot_cache( } +// --- Clip ordering --- + +// Apply the selected clip ordering strategy to clip_candidates. +// subtree_sizes must be up-to-date. n_tip is the number of tips. +static void order_clips( + std::vector<int>& clips, + const std::vector<int>& subtree_sizes, + int n_tip, + ClipOrder order, + std::mt19937& rng) +{ + switch (order) { + case ClipOrder::RANDOM: + std::shuffle(clips.begin(), clips.end(), rng); + break; + + case ClipOrder::INV_WEIGHT: { + // Weighted random: w = 1/(1+s). Full Fisher-Yates with weighted draw. 
+ int n = static_cast<int>(clips.size()); + std::vector<double> w(n); + for (int i = 0; i < n; ++i) { + w[i] = 1.0 / (1.0 + subtree_sizes[clips[i]]); + } + for (int i = 0; i < n - 1; ++i) { + double total = 0.0; + for (int j = i; j < n; ++j) total += w[j]; + if (total <= 0.0) break; + std::uniform_real_distribution<double> dist(0.0, total); + double r = dist(rng); + double cumul = 0.0; + int pick = i; + for (int j = i; j < n; ++j) { + cumul += w[j]; + if (cumul >= r) { pick = j; break; } + } + std::swap(clips[i], clips[pick]); + std::swap(w[i], w[pick]); + } + break; + } + + case ClipOrder::TIPS_FIRST: { + // Partition: tips first, then internal. Shuffle within each group. + auto mid = std::partition(clips.begin(), clips.end(), + [n_tip](int node) { return node < n_tip; }); + std::shuffle(clips.begin(), mid, rng); + std::shuffle(mid, clips.end(), rng); + break; + } + + case ClipOrder::BUCKET: { + // Three buckets: tips (s=1), small (2 <= s <= sqrt(n)), large (s > sqrt(n)) + int sqrt_n = static_cast<int>(std::sqrt(static_cast<double>(n_tip))); + if (sqrt_n < 2) sqrt_n = 2; + + auto large_start = std::partition(clips.begin(), clips.end(), + [&subtree_sizes, sqrt_n](int node) { + return subtree_sizes[node] <= sqrt_n; + }); + auto small_start = std::partition(clips.begin(), large_start, + [n_tip](int node) { return node < n_tip; }); + + // clips = [tips | small internal | large] + std::shuffle(clips.begin(), small_start, rng); + std::shuffle(small_start, large_start, rng); + std::shuffle(large_start, clips.end(), rng); + break; + } + + case ClipOrder::ANTI_TIP: { + // Non-tip clips (shuffled) first, tip clips (shuffled) last. + // Hypothesis: tips are under-productive; deprioritise them. + // Inverse of TIPS_FIRST. 
+ auto tip_start = std::partition(clips.begin(), clips.end(), + [n_tip](int node) { return node >= n_tip; }); // non-tips first + std::shuffle(clips.begin(), tip_start, rng); + std::shuffle(tip_start, clips.end(), rng); + break; + } + + case ClipOrder::LARGE_FIRST: { + // Large (>√n) clips first, then small (2..√n), then tips; random within. + // Hypothesis: large clips are enriched relative to their clip-fraction. + int sqrt_n = static_cast<int>(std::sqrt(static_cast<double>(n_tip))); + if (sqrt_n < 2) sqrt_n = 2; + + // Partition: [large | rest] + auto rest_start = std::partition(clips.begin(), clips.end(), + [&subtree_sizes, sqrt_n](int node) { + return subtree_sizes[node] > sqrt_n; + }); + // Partition rest: [small-internal | tips] + auto tip_start = std::partition(rest_start, clips.end(), + [n_tip](int node) { return node >= n_tip; }); // non-tip non-large = small + + // clips = [large | small-internal | tips] + std::shuffle(clips.begin(), rest_start, rng); + std::shuffle(rest_start, tip_start, rng); + std::shuffle(tip_start, clips.end(), rng); + break; + } + } +} + // --- Main TBR search --- TBRResult tbr_search(TreeState& tree, const DataSet& ds, @@ -574,6 +679,13 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, int clips_since_timeout_check = 0; bool timed_out = false; + // Per-pass diagnostic counters + int pass_index = 0; + int pass_clips_tried = 0; + int pass_candidates_evaluated = 0; + int accepted_clip_size = 0; + std::vector<TBRPassRecord> diag_records; + while (keep_going && !timed_out) { keep_going = false; @@ -586,16 +698,23 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, state_snap.save(tree); states_valid = true; - // Optimization #6: only reshuffle when the previous pass found no + // Reset per-pass diagnostic counters + pass_clips_tried = 0; + pass_candidates_evaluated = 0; + accepted_clip_size = 0; + + // Recompute subtree sizes (needed for smaller-subtree filtering + // and for clip ordering strategies) + compute_subtree_sizes(tree, 
subtree_sizes); + + // Optimization #6: only reorder when the previous pass found no // improvement. After an accepted move, retry with the same ordering // (the topology changed, so previously-failing clips may now succeed). if (need_shuffle) { - std::shuffle(clip_candidates.begin(), clip_candidates.end(), rng); + order_clips(clip_candidates, subtree_sizes, tree.n_tip, + params.clip_order, rng); } - need_shuffle = true; // default: reshuffle next time (unless we accept) - - // Recompute subtree sizes for smaller-subtree filtering - compute_subtree_sizes(tree, subtree_sizes); + need_shuffle = true; // default: reorder next time (unless we accept) for (int clip_node : clip_candidates) { if (tree.parent[clip_node] == tree.n_tip) continue; @@ -616,6 +735,9 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, continue; } + ++pass_clips_tried; + int clip_evals_before = n_evaluated; + // --- Phase 1: Clip + indirect evaluation --- // Save clip subtree's actives before clipping (needed for NA indirect) @@ -1090,6 +1212,8 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, } if (keep_going) { + accepted_clip_size = clip_size; + pass_candidates_evaluated += (n_evaluated - clip_evals_before); // Recompute collapsed regions after the accepted move (states are // valid from full_rescore in the accept path above). 
if (!collapsed.empty()) { @@ -1106,6 +1230,8 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, break; } + pass_candidates_evaluated += (n_evaluated - clip_evals_before); + if (ts::check_interrupt()) break; ++clips_since_timeout_check; if (check_timeout && clips_since_timeout_check >= timeout_interval) { @@ -1114,6 +1240,18 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, } } + // Record per-pass diagnostics + if (params.diagnostics) { + diag_records.push_back({ + pass_index, + /*productive=*/ accepted_clip_size > 0, + accepted_clip_size, + pass_clips_tried, + pass_candidates_evaluated + }); + } + ++pass_index; + if (params.max_accepted_changes > 0 && n_accepted >= params.max_accepted_changes) { break; @@ -1132,7 +1270,7 @@ TBRResult tbr_search(TreeState& tree, const DataSet& ds, && n_accepted >= params.max_accepted_changes); return TBRResult{best_score, n_accepted, n_evaluated, n_zero_skipped, - converged}; + converged, std::move(diag_records)}; } } // namespace ts diff --git a/src/ts_tbr.h b/src/ts_tbr.h index 23ca78e51..c9c6bd2ee 100644 --- a/src/ts_tbr.h +++ b/src/ts_tbr.h @@ -15,14 +15,37 @@ #include "ts_constraint.h" #include "ts_pool.h" #include +#include <vector> namespace ts { +// Clip ordering strategy for TBR search. +enum class ClipOrder { + RANDOM = 0, // Current default: uniform random shuffle + INV_WEIGHT = 1, // Weighted random, w = 1/(1+s) where s = subtree size + TIPS_FIRST = 2, // All tip clips first (shuffled), then rest (shuffled) + BUCKET = 3, // Three size buckets: tips/small/large, random within each + ANTI_TIP = 4, // Non-tip clips (shuffled) first, tip clips (shuffled) last + LARGE_FIRST = 5 // Large (>√n) first, then small (2..√n), then tips; random within each +}; + struct TBRParams { bool accept_equal = false; // accept Δ=0 moves? 
int max_accepted_changes = 0; // 0 = no limit (run to convergence) int max_hits = 1; // equal-score hits before stopping int tabu_size = 0; // tabu list capacity (0 = disabled) + ClipOrder clip_order = ClipOrder::RANDOM; + bool diagnostics = false; // collect per-pass diagnostic records +}; + +// Per-pass diagnostic record (populated only when TBRParams::diagnostics +// is true). One record per pass of the outer while loop. +struct TBRPassRecord { + int pass_index; + bool productive; // true if a move was accepted + int accepted_clip_size; // subtree size of accepted clip (0 if null) + int n_clips_tried; // clips evaluated before acceptance (or total) + int n_candidates_evaluated; // total regraft×reroot evaluations this pass }; struct TBRResult { @@ -31,6 +54,7 @@ struct TBRResult { int n_evaluated; int n_zero_skipped; // clips skipped due to zero-length edge (opt #7) bool converged; // true if stopped due to no improvement + std::vector<TBRPassRecord> pass_records; // populated when diagnostics=true }; // Run TBR hill-climbing search on `tree` with dataset `ds`.