From ff8661f434312de893c40a2543dceb3253018817 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 11:19:16 +0300 Subject: [PATCH 1/7] *: added debugging scripts and skills --- .claude/skills/cluster-config/SKILL.md | 34 ++ .claude/skills/consensus-leader/SKILL.md | 63 +++ .claude/skills/duty-timeline/SKILL.md | 146 +++++++ .claude/skills/grafana-datasources/SKILL.md | 20 + .gitignore | 4 +- scripts/README.md | 91 +++- scripts/cluster-config.sh | 115 +++++ scripts/consensus-leader.sh | 134 ++++++ scripts/duty-timeline.sh | 455 ++++++++++++++++++++ scripts/grafana-datasources.sh | 34 ++ 10 files changed, 1094 insertions(+), 2 deletions(-) create mode 100644 .claude/skills/cluster-config/SKILL.md create mode 100644 .claude/skills/consensus-leader/SKILL.md create mode 100644 .claude/skills/duty-timeline/SKILL.md create mode 100644 .claude/skills/grafana-datasources/SKILL.md create mode 100755 scripts/cluster-config.sh create mode 100755 scripts/consensus-leader.sh create mode 100644 scripts/duty-timeline.sh create mode 100755 scripts/grafana-datasources.sh diff --git a/.claude/skills/cluster-config/SKILL.md b/.claude/skills/cluster-config/SKILL.md new file mode 100644 index 0000000000..b2b196a765 --- /dev/null +++ b/.claude/skills/cluster-config/SKILL.md @@ -0,0 +1,34 @@ +--- +name: cluster-config +description: Fetch cluster configuration metrics (version, operators, threshold, validators) from Prometheus +user-invokable: true +--- + +# Cluster Config + +Fetch cluster configuration metrics from Prometheus for a given cluster name and optional network. + +## Arguments + +The user must provide: +- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf` +- **network** (optional, default: `mainnet`): e.g. 
`mainnet`, `hoodi`
+
+## Execution
+
+Run the script with the cluster name and network:
+```bash
+bash scripts/cluster-config.sh "<cluster_name>" "<network>"
+```
+
+## Output
+
+Present the results to the user in a readable format:
+- **Cluster**: name and network
+- **App Version**: charon version running
+- **Operators**: number of operators in the cluster
+- **Threshold**: signature threshold
+- **Active Validators**: currently active validators
+- **Total Validators**: total validators in the cluster
+
+If the script exits with an error (cluster not found), relay the error and suggest the user double-check the cluster name spelling or try a different network.
diff --git a/.claude/skills/consensus-leader/SKILL.md b/.claude/skills/consensus-leader/SKILL.md
new file mode 100644
index 0000000000..ce22c926ca
--- /dev/null
+++ b/.claude/skills/consensus-leader/SKILL.md
@@ -0,0 +1,63 @@
+---
+name: consensus-leader
+description: Calculate consensus leader sequence for a given slot and cluster
+user-invokable: true
+---
+
+# Consensus Leader
+
+Calculate the consensus leader sequence for a given slot number using the QBFT leader election formula.
+
+## Arguments
+
+The user must provide:
+- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf`
+- **slot number** (required): e.g. `13813408`
+- **network** (optional, default: `mainnet`): e.g. `mainnet`, `hoodi`
+- **duty type** (optional, default: `proposer`): e.g. 
`proposer`, `attester`, `randao`, `sync_message`
+
+## Duty Types
+
+Valid duty types (from `core/types.go`):
+- `proposer` (1) - block proposal
+- `attester` (2) - attestation
+- `signature` (3) - generic signature
+- `exit` (4) - voluntary exit
+- `builder_registration` (6) - MEV builder registration
+- `randao` (7) - RANDAO reveal
+- `prepare_aggregator` (8) - aggregator preparation
+- `aggregator` (9) - attestation aggregation
+- `sync_message` (10) - sync committee message
+- `prepare_sync_contribution` (11) - sync contribution preparation
+- `sync_contribution` (12) - sync committee contribution
+- `info_sync` (13) - info sync
+
+## Execution
+
+Run the script with the required arguments:
+```bash
+bash scripts/consensus-leader.sh "<cluster_name>" <slot> [network] [duty_type]
+```
+
+## Leader Election Formula
+
+The consensus leader for each round is calculated as:
+```
+leader_index = (slot + duty_type + round) % num_nodes
+```
+
+Where:
+- `slot` is the beacon chain slot number
+- `duty_type` is the numeric value of the duty type
+- `round` is the QBFT consensus round (1, 2, or 3)
+- `num_nodes` is the number of operators in the cluster
+
+## Output
+
+Present the results to the user including:
+- **Slot Info**: slot number, epoch, slot within epoch, absolute time (UTC)
+- **Network**: the Ethereum network
+- **Duty**: the duty type being calculated
+- **Leaders**: table showing round number, leader index, and peer name for rounds 1-3
+
+This helps diagnose consensus issues by identifying which node was responsible for leading each round.
diff --git a/.claude/skills/duty-timeline/SKILL.md b/.claude/skills/duty-timeline/SKILL.md
new file mode 100644
index 0000000000..b022d23977
--- /dev/null
+++ b/.claude/skills/duty-timeline/SKILL.md
@@ -0,0 +1,146 @@
+---
+name: duty-timeline
+description: Generate a comprehensive timeline of events for a duty across all peers
+user-invokable: true
+---
+
+# Duty Timeline
+
+Generate a detailed timeline showing the complete lifecycle of a validator duty (block proposal, attestation, etc.) across all cluster peers by analyzing Loki logs.
+
+## Arguments
+
+The user must provide:
+- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf`
+- **slot number** (required): e.g. `13813408`
+- **network** (optional, default: `mainnet`): e.g. `mainnet`, `hoodi`
+- **duty type** (optional, default: `proposer`): e.g. `proposer`, `attester`, `randao`, `sync_message`, `aggregator`
+
+## Execution
+
+Run the script with the required arguments:
+```bash
+bash scripts/duty-timeline.sh "<cluster_name>" <slot> [network] [duty_type]
+```
+
+## What It Does
+
+1. Calculates expected consensus leaders for rounds 1, 2, and 3
+2. Queries Loki for all logs related to the duty across the time window
+3. Parses and sorts events chronologically
+4. Shows timing offset relative to slot start for each event
+5. 
Tracks events across all workflow components: + - Scheduler: slot ticks, duty resolution + - Fetcher: beacon node calls and latency + - QBFT: consensus start, round changes, decisions + - ValidatorAPI: block proposals received + - SigAgg: threshold signature aggregation + - Broadcast: submission to beacon node + - Tracker: participation and inclusion status + +## Key Events Tracked + +| Component | Event | Meaning | +|-----------|-------|---------| +| SCHED | Slot ticked | Slot started | +| SCHED | Resolved duty | Duty assigned to validator | +| FETCHER | Calling beacon node | Fetching unsigned duty data | +| FETCHER | Beacon node call finished | Data fetched successfully | +| FETCHER | SLOW beacon node call | Call took longer than expected | +| QBFT | Consensus started | QBFT instance initialized | +| QBFT | Round TIMEOUT | Round failed, moving to next | +| QBFT | Consensus DECIDED | Agreement reached | +| VAPI | Block proposal received | VC submitted proposal | +| SIGAGG | Threshold signatures aggregated | Enough partial sigs collected | +| BCAST | Broadcast SUCCESS | Submitted to beacon node | +| BCAST | TIMEOUT | Duty expired before broadcast | +| TRACKER | All peers participated | Full participation | +| TRACKER | Not all peers participated | Some peers missing | +| TRACKER | BLOCK MISSED | Block never included on-chain | + +## Output + +The script provides: + +1. **Duty Info**: slot, epoch, time, network, duty type +2. **Expected Consensus Leaders**: who should lead rounds 1, 2, 3 +3. **Event Timeline**: chronological sequence with timing offsets +4. 
**Summary**: + - Consensus status (success/failure, round count) + - Broadcast status (success/timeout) + - Inclusion status (for proposer duties) + - Participation status + +### Example Output + +``` +=== Duty Info === +Slot: 13813408 +Epoch: 431669 (slot 0 of 32) +Time: 2026-03-04T00:41:36Z +Network: mainnet +Duty: proposer + +=== Expected Consensus Leaders === +Round 1: peer0 (index 0) +Round 2: peer1 (index 1) +Round 3: peer2 (index 2) + +=== Event Timeline === +(Offset relative to slot start time: 2026-03-04T00:41:36Z) + + +0.005s [SCHED] Slot 13813408 started + +0.010s [SCHED] Resolved proposer duty (vidx=123456, pubkey=0x...) + +0.015s [FETCHER] Calling beacon node: /eth/v3/validator/blocks/13813408 + +0.150s [FETCHER] Beacon node call finished: /eth/v3/validator/blocks/13813408 + +0.155s [QBFT] Consensus started + +1.200s [QBFT] ✓ Consensus DECIDED in round 1 + Leader: peer0 (index 0) + +1.500s [SIGAGG] ✓ Threshold signatures aggregated + +1.600s [BCAST] ✓ Broadcast SUCCESS (delay=1.6s) + +8.000s [TRACKER] ✓ All peers participated + +=== Summary === +Consensus: ✓ Completed in round 1 (optimal) +Broadcast: ✓ Successfully submitted to beacon node +Inclusion: ✓ Block included on-chain +Participation: ✓ All peers participated +``` + +## Common Failure Patterns + +### Slow Beacon Node +``` + +2.500s [FETCHER] ⚠️ SLOW beacon node call: /eth/v3/validator/blocks/... +``` +Indicates the beacon node took too long to respond, potentially causing downstream timeouts. + +### Consensus Timeouts +``` + +4.000s [QBFT] ⚠️ Round 1 TIMEOUT -> Round 2 + Reason: leader not proposing +``` +Round 1 leader failed to propose, consensus moved to round 2. + +### Missed Block +``` + +480.0s [TRACKER] ❌ BLOCK MISSED: never included on-chain + Pubkey: 0x..., Broadcast delay: 3.5s +``` +Block was broadcast but not included on-chain (possibly late or network issues). 
+ +## Troubleshooting + +If no logs are found: +- Verify the cluster name spelling is exact +- Check the network is correct +- Confirm the slot had a duty (not all slots have all duty types) +- Logs may have been rotated if the slot is old + +## Dependencies + +This skill uses: +- `cluster-config.sh` - to get cluster info and peer names +- `grafana-datasources.sh` - to discover Loki URL +- Loki API - to query logs +- Requires `OBOL_GRAFANA_API_TOKEN` environment variable diff --git a/.claude/skills/grafana-datasources/SKILL.md b/.claude/skills/grafana-datasources/SKILL.md new file mode 100644 index 0000000000..58af8f0526 --- /dev/null +++ b/.claude/skills/grafana-datasources/SKILL.md @@ -0,0 +1,20 @@ +--- +name: grafana-datasources +description: Discover Prometheus and Loki datasource proxy URLs from Grafana +user-invokable: true +--- + +# Grafana Datasources + +Run the following script to discover Prometheus and Loki datasource proxy URLs from Grafana. The script requires the `OBOL_GRAFANA_API_TOKEN` environment variable. + +Execute this command: +```bash +bash scripts/grafana-datasources.sh +``` + +Present the two output URLs to the user: +- **Prometheus**: for querying metrics via the Prometheus HTTP API (e.g., `query`, `query_range`) +- **Loki**: for querying logs via the Loki HTTP API (e.g., `query`, `query_range`) + +This is a non-interactive skill. Do not ask the user any questions — just run the script and display the results. 
diff --git a/.gitignore b/.gitignore index cc89875eb4..5c847ce7d3 100644 --- a/.gitignore +++ b/.gitignore @@ -156,7 +156,9 @@ keys/ coverage.out cli-reference.txt changelog.md -.claude/ +.claude/* +!.claude/skills/ + .charon** .anthropic/ .serena/ diff --git a/scripts/README.md b/scripts/README.md index 61109bdd8b..f981f2568a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -37,11 +37,100 @@ It helps expand an existing cluster with new validators, given the same operator The script will execute `node_merge.sh` for each `nodeX` subfolder found in the source cluster. +## Monitoring and Diagnostics Scripts + +The following scripts query Obol's Grafana/Prometheus/Loki observability stack and require the `OBOL_GRAFANA_API_TOKEN` environment variable to be set: + +```bash +export OBOL_GRAFANA_API_TOKEN= +``` + +### `grafana-datasources.sh` + +Discovers Prometheus and Loki datasource proxy URLs from Grafana. Used internally by the other monitoring scripts. + +#### Usage + +```bash +./grafana-datasources.sh +``` + +Outputs two lines: +``` +PROMETHEUS_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//api/v1/ +LOKI_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//loki/api/v1/ +``` + +### `cluster-config.sh` + +Fetches cluster configuration metrics (version, operators, threshold, validators, and per-peer info) from Prometheus via Grafana proxy. + +#### Usage + +```bash +./cluster-config.sh [network] +``` + +- **: Human-readable cluster name (e.g., `"Lido x Obol: Ethereal Elf"`). +- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. + +#### Example + +```bash +./cluster-config.sh "Lido x Obol: Ethereal Elf" mainnet +``` + +### `consensus-leader.sh` + +Calculates the consensus leader sequence for a given slot and cluster using the QBFT leader election formula: `(slot + dutyType + round) % nodes`. + +#### Usage + +```bash +./consensus-leader.sh [network] [duty_type] +``` + +- **: Human-readable cluster name. 
+- **: Beacon chain slot number (e.g., `13813408`). +- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. +- *[duty_type]*: Duty type — `proposer` (default), `attester`, `randao`, etc. + +#### Example + +```bash +./consensus-leader.sh "Lido x Obol: Ethereal Elf" 13813408 mainnet proposer +``` + +### `duty-timeline.sh` + +Generates a comprehensive chronological timeline of events for a specific duty across all peers, pulling logs from Loki and cluster metrics from Prometheus. Useful for post-mortem analysis of missed blocks or attestations. + +#### Usage + +```bash +./duty-timeline.sh [network] [duty_type] +``` + +- **: Human-readable cluster name. +- **: Beacon chain slot number (e.g., `13813408`). +- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. +- *[duty_type]*: Duty type — `proposer` (default), `attester`, `randao`, etc. + +#### Example + +```bash +./duty-timeline.sh "Lido x Obol: Ethereal Elf" 13813408 mainnet proposer +``` + +The script outputs duty info, expected consensus leaders, a chronological event timeline with offsets relative to slot start, and a summary covering consensus outcome, broadcast status, block inclusion, and peer participation. + ## Requirements -Both scripts require **bash** (standard on Linux/macOS) and **jq** (version 1.5+). +All scripts require **bash** (standard on Linux/macOS) and **jq** (version 1.5+). Install via `sudo apt-get install jq` (Debian/Ubuntu) or `brew install jq` (macOS). +The monitoring and diagnostics scripts additionally require **curl** and **bc**. + ## Important Warnings - Always back up your `cluster-lock.json`, node folders, and `validator_keys` folders before use. diff --git a/scripts/cluster-config.sh b/scripts/cluster-config.sh new file mode 100755 index 0000000000..e1c8a078f3 --- /dev/null +++ b/scripts/cluster-config.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +# Fetches cluster configuration metrics from Prometheus via Grafana proxy. 
+# Requires OBOL_GRAFANA_API_TOKEN environment variable. +# Usage: bash scripts/cluster-config.sh [network] +# cluster_name: e.g. "Lido x Obol: Ethereal Elf" +# network: mainnet (default), hoodi, sepolia, etc. + +set -euo pipefail + +CLUSTER_NAME="${1:-}" +NETWORK="${2:-mainnet}" + +if [ -z "$CLUSTER_NAME" ]; then + echo "Error: cluster name is required" >&2 + echo "Usage: bash scripts/cluster-config.sh [network]" >&2 + exit 1 +fi + +if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then + echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Discover Prometheus proxy URL +PROM_URL=$("$SCRIPT_DIR/grafana-datasources.sh" | grep '^PROMETHEUS_URL=' | cut -d= -f2-) + +if [ -z "$PROM_URL" ]; then + echo "Error: could not discover Prometheus URL" >&2 + exit 1 +fi + +AUTH="Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" + +prom_query() { + local metric="$1" + curl -sf -G \ + -H "$AUTH" \ + --data-urlencode "query=${metric}{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"}" \ + "${PROM_URL}query" +} + +query_metric() { + local metric="$1" + local result + result=$(prom_query "$metric") + + if [ "$metric" = "app_version" ]; then + echo "$result" | jq -r '[.data.result[].metric.version] | unique | sort | join(", ") | if . == "" then "NOT_FOUND" else . end' + else + echo "$result" | jq -r 'if .data.result | length == 0 then "NOT_FOUND" else .data.result[0].value[1] end' + fi +} + +# Query cluster-level metrics; reuse operators raw result to extract common labels. 
+operators_raw=$(prom_query "cluster_operators") +operators=$(echo "$operators_raw" | jq -r 'if .data.result | length == 0 then "NOT_FOUND" else .data.result[0].value[1] end') +cluster_hash=$(echo "$operators_raw" | jq -r '.data.result[0].metric.cluster_hash // "NOT_FOUND"') + +version=$(query_metric "app_version") +threshold=$(query_metric "cluster_threshold") +active_validators=$(query_metric "core_scheduler_validators_active") +total_validators=$(query_metric "cluster_validators") + +# Check if cluster was found +all_not_found=true +for val in "$version" "$operators" "$threshold" "$active_validators" "$total_validators"; do + if [ -n "$val" ] && [ "$val" != "NOT_FOUND" ]; then + all_not_found=false + break + fi +done + +if $all_not_found; then + echo "Error: no cluster found for name=\"${CLUSTER_NAME}\" network=\"${NETWORK}\"" >&2 + echo "Please double-check the cluster name and network." >&2 + exit 1 +fi + +# Query per-peer info metrics for the peer table. +# app_peerinfo_* metrics use 'peer' label (= cluster_peer value of the described peer). +# app_peer_name uses 'cluster_peer' as key and 'peer_name' as the human-readable name. +# Multiple nodes report peerinfo for all peers, so results are deduplicated by peer. 
+idx_raw=$(prom_query "app_peerinfo_index") +nick_raw=$(prom_query "app_peerinfo_nickname") +ver_raw=$(prom_query "app_peerinfo_version") + +echo "=== Cluster Info ===" +echo "Name: ${CLUSTER_NAME}" +echo "Hash: ${cluster_hash}" +echo "Version: ${version}" +echo "Network: ${NETWORK}" +echo "Nodes: ${operators} (threshold: ${threshold})" +echo "Validators: ${active_validators} active / ${total_validators} total" +echo "" +echo "=== Peers Info ===" +jq -rn \ + --argjson idx "$idx_raw" \ + --argjson nicks "$nick_raw" \ + --argjson vers "$ver_raw" \ + ' + ($nicks.data.result | map({(.metric.peer): (.metric.peer_nickname // "?")}) | add // {}) as $nick_map | + ($vers.data.result | map({(.metric.peer): (.metric.version // "?")}) | add // {}) as $ver_map | + ["INDEX", "PEER", "NICKNAME", "VERSION"], + ( + $idx.data.result + | map({peer: .metric.peer, index: (.value[1] | tonumber)}) + | unique_by(.peer) + | sort_by(.index) + | .[] + | [(.index | tostring), .peer, ($nick_map[.peer] // "?"), ($ver_map[.peer] // "?")] + ) + | @tsv + ' | column -t -s $'\t' diff --git a/scripts/consensus-leader.sh b/scripts/consensus-leader.sh new file mode 100755 index 0000000000..9de1229421 --- /dev/null +++ b/scripts/consensus-leader.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +# Calculates consensus leader sequence for a given slot number. +# Requires OBOL_GRAFANA_API_TOKEN environment variable (passed to cluster-config.sh). +# Usage: bash scripts/consensus-leader.sh [network] [duty_type] +# cluster_name: e.g. "Lido x Obol: Ethereal Elf" +# slot: slot number (e.g. 13813408) +# network: mainnet (default), hoodi, sepolia, etc. +# duty_type: proposer (default), attester, randao, etc. 
+ +set -euo pipefail + +CLUSTER_NAME="${1:-}" +SLOT="${2:-}" +NETWORK="${3:-mainnet}" +DUTY_TYPE="${4:-proposer}" + +if [ -z "$CLUSTER_NAME" ] || [ -z "$SLOT" ]; then + echo "Error: cluster name and slot are required" >&2 + echo "Usage: bash scripts/consensus-leader.sh [network] [duty_type]" >&2 + exit 1 +fi + +# Duty type name to numeric value mapping (from core/types.go) +declare -A DUTY_MAP=( + [unknown]=0 + [proposer]=1 + [attester]=2 + [signature]=3 + [exit]=4 + [builder_proposer]=5 + [builder_registration]=6 + [randao]=7 + [prepare_aggregator]=8 + [aggregator]=9 + [sync_message]=10 + [prepare_sync_contribution]=11 + [sync_contribution]=12 + [info_sync]=13 +) + +DUTY_VALUE="${DUTY_MAP[$DUTY_TYPE]:-}" +if [ -z "$DUTY_VALUE" ]; then + echo "Error: unknown duty type '$DUTY_TYPE'" >&2 + echo "Valid types: ${!DUTY_MAP[*]}" >&2 + exit 1 +fi + +# Network genesis timestamps and slots per epoch +declare -A GENESIS_TIME=( + [mainnet]=1606824023 + [hoodi]=1742212800 + [sepolia]=1655733600 +) + +SLOTS_PER_EPOCH=32 +SECONDS_PER_SLOT=12 + +# Get genesis time for the network +GENESIS="${GENESIS_TIME[$NETWORK]:-}" +if [ -z "$GENESIS" ]; then + echo "Warning: unknown genesis time for network '$NETWORK', skipping time calculation" >&2 +fi + +# Fetch cluster config using cluster-config.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CLUSTER_OUTPUT=$("$SCRIPT_DIR/cluster-config.sh" "$CLUSTER_NAME" "$NETWORK") + +# Extract number of nodes from "Nodes: N (threshold: T)" +NODES=$(echo "$CLUSTER_OUTPUT" | grep '^Nodes:' | sed -E 's/^Nodes:[[:space:]]*([0-9]+).*/\1/') + +if [ -z "$NODES" ] || [ "$NODES" -eq 0 ]; then + echo "Error: could not determine number of nodes from cluster config" >&2 + exit 1 +fi + +# Extract peer info lines (INDEX PEER NICKNAME VERSION) +# Skip header line, capture peers in order +declare -a PEERS +while IFS= read -r line; do + # Skip header and empty lines + if [[ "$line" =~ ^INDEX ]] || [ -z "$line" ]; then + continue + fi + # Extract peer 
name (second column)
+  PEER=$(echo "$line" | awk '{print $2}')
+  PEERS+=("$PEER")
+done < <(echo "$CLUSTER_OUTPUT" | sed -n '/=== Peers/,$ p' | tail -n +2)
+
+# If we couldn't parse peers, create placeholder names
+# NOTE(review): the following region was reconstructed from the parallel code
+# in scripts/duty-timeline.sh after tag-stripping corruption - verify against
+# the original commit ff8661f434312de893c40a2543dceb3253018817.
+if [ ${#PEERS[@]} -eq 0 ]; then
+  for ((i=0; i<NODES; i++)); do
+    PEERS+=("peer${i}")
+  done
+fi
+
+# Calculate epoch and slot time
+EPOCH=$((SLOT / SLOTS_PER_EPOCH))
+SLOT_IN_EPOCH=$((SLOT % SLOTS_PER_EPOCH))
+SLOT_TIME=""
+if [ -n "$GENESIS" ]; then
+  SLOT_TIMESTAMP=$((GENESIS + SLOT * SECONDS_PER_SLOT))
+  SLOT_TIME=$(TZ=UTC date -r "$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || TZ=UTC date -d "@$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo "")
+fi
+
+# Calculate leader indices for rounds 1, 2, 3
+# Formula: (slot + dutyType + round) % nodes
+calc_leader() {
+  local round=$1
+  echo $(( (SLOT + DUTY_VALUE + round) % NODES ))
+}
+
+LEADER_R1=$(calc_leader 1)
+LEADER_R2=$(calc_leader 2)
+LEADER_R3=$(calc_leader 3)
+
+# Output results
+echo "=== Slot Info ==="
+echo "Slot: ${SLOT}"
+echo "Epoch: ${EPOCH} (slot ${SLOT_IN_EPOCH} of ${SLOTS_PER_EPOCH})"
+if [ -n "$SLOT_TIME" ]; then
+  echo "Time: ${SLOT_TIME}"
+fi
+echo "Network: ${NETWORK}"
+echo "Duty: ${DUTY_TYPE} (value: ${DUTY_VALUE})"
+echo ""
+echo "=== Consensus Leaders ==="
+echo "Cluster: ${CLUSTER_NAME} (${NODES} nodes)"
+echo ""
+printf "%-8s %-5s %-20s\n" "ROUND" "INDEX" "PEER"
+printf "%-8s %-5s %-20s\n" "1" "$LEADER_R1" "${PEERS[$LEADER_R1]:-unknown}"
+printf "%-8s %-5s %-20s\n" "2" "$LEADER_R2" "${PEERS[$LEADER_R2]:-unknown}"
+printf "%-8s %-5s %-20s\n" "3" "$LEADER_R3" "${PEERS[$LEADER_R3]:-unknown}"
diff --git a/scripts/duty-timeline.sh b/scripts/duty-timeline.sh
new file mode 100644
index 0000000000..6b12653e87
--- /dev/null
+++ b/scripts/duty-timeline.sh
@@ -0,0 +1,455 @@
+#!/usr/bin/env bash
+# Generates a comprehensive timeline of events for a duty across all peers.
+# Requires OBOL_GRAFANA_API_TOKEN environment variable.
+# Usage: bash scripts/duty-timeline.sh <cluster_name> <slot> [network] [duty_type]
+# cluster_name: e.g. "Lido x Obol: Ethereal Elf"
+# slot: slot number (e.g. 13813408)
+# network: mainnet (default), hoodi, sepolia, etc.
+# duty_type: proposer (default), attester, randao, etc. 
+ +set -euo pipefail + +CLUSTER_NAME="${1:-}" +SLOT="${2:-}" +NETWORK="${3:-mainnet}" +DUTY_TYPE="${4:-proposer}" + +if [ -z "$CLUSTER_NAME" ] || [ -z "$SLOT" ]; then + echo "Error: cluster name and slot are required" >&2 + echo "Usage: bash scripts/duty-timeline.sh [network] [duty_type]" >&2 + exit 1 +fi + +if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then + echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Duty type name to numeric value mapping (from core/types.go) +declare -A DUTY_MAP=( + [unknown]=0 + [proposer]=1 + [attester]=2 + [signature]=3 + [exit]=4 + [builder_proposer]=5 + [builder_registration]=6 + [randao]=7 + [prepare_aggregator]=8 + [aggregator]=9 + [sync_message]=10 + [prepare_sync_contribution]=11 + [sync_contribution]=12 + [info_sync]=13 +) + +DUTY_VALUE="${DUTY_MAP[$DUTY_TYPE]:-}" +if [ -z "$DUTY_VALUE" ]; then + echo "Error: unknown duty type '$DUTY_TYPE'" >&2 + echo "Valid types: ${!DUTY_MAP[*]}" >&2 + exit 1 +fi + +# Network genesis timestamps +declare -A GENESIS_TIME=( + [mainnet]=1606824023 + [hoodi]=1742212800 + [sepolia]=1655733600 +) + +SLOTS_PER_EPOCH=32 +SECONDS_PER_SLOT=12 + +GENESIS="${GENESIS_TIME[$NETWORK]:-}" +if [ -z "$GENESIS" ]; then + echo "Error: unknown genesis time for network '$NETWORK'" >&2 + exit 1 +fi + +# Calculate time window for the slot +# Start from 15 seconds before slot (to catch scheduling), end 20 seconds after + 8 minutes for tracker +SLOT_TIMESTAMP=$((GENESIS + SLOT * SECONDS_PER_SLOT)) +START_NS=$(( (SLOT_TIMESTAMP - 15) * 1000000000 )) +END_NS=$(( (SLOT_TIMESTAMP + 500) * 1000000000 )) # ~8 minutes for tracker inclusion checks + +# Discover Loki URL +DATASOURCES=$("$SCRIPT_DIR/grafana-datasources.sh") +LOKI_URL=$(echo "$DATASOURCES" | grep '^LOKI_URL=' | cut -d= -f2-) + +if [ -z "$LOKI_URL" ]; then + echo "Error: could not discover Loki URL" >&2 + exit 1 +fi + +AUTH="Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" + +# Fetch 
cluster config to get peer info +CLUSTER_OUTPUT=$("$SCRIPT_DIR/cluster-config.sh" "$CLUSTER_NAME" "$NETWORK" 2>/dev/null) || { + echo "Error: failed to fetch cluster config" >&2 + exit 1 +} + +NODES=$(echo "$CLUSTER_OUTPUT" | grep '^Nodes:' | sed -E 's/^Nodes:[[:space:]]*([0-9]+).*/\1/') + +# Extract peers +declare -a PEERS +while IFS= read -r line; do + if [[ "$line" =~ ^INDEX ]] || [ -z "$line" ]; then + continue + fi + PEER=$(echo "$line" | awk '{print $2}') + PEERS+=("$PEER") +done < <(echo "$CLUSTER_OUTPUT" | sed -n '/=== Peers/,$ p' | tail -n +2) + +# Calculate leaders for rounds 1, 2, 3 +calc_leader() { + local round=$1 + echo $(( (SLOT + DUTY_VALUE + round) % NODES )) +} + +LEADER_R1=$(calc_leader 1) +LEADER_R2=$(calc_leader 2) +LEADER_R3=$(calc_leader 3) + +LEADER_PEER_R1="${PEERS[$LEADER_R1]:-unknown}" +LEADER_PEER_R2="${PEERS[$LEADER_R2]:-unknown}" +LEADER_PEER_R3="${PEERS[$LEADER_R3]:-unknown}" + +# Calculate epoch and slot time +EPOCH=$((SLOT / SLOTS_PER_EPOCH)) +SLOT_IN_EPOCH=$((SLOT % SLOTS_PER_EPOCH)) +SLOT_TIME=$(TZ=UTC date -r "$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || TZ=UTC date -d "@$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo "") + +echo "=== Duty Info ===" +echo "Slot: ${SLOT}" +echo "Epoch: ${EPOCH} (slot ${SLOT_IN_EPOCH} of ${SLOTS_PER_EPOCH})" +echo "Time: ${SLOT_TIME}" +echo "Network: ${NETWORK}" +echo "Duty: ${DUTY_TYPE}" +echo "" +echo "=== Expected Consensus Leaders ===" +echo "Round 1: ${LEADER_PEER_R1} (index ${LEADER_R1})" +echo "Round 2: ${LEADER_PEER_R2} (index ${LEADER_R2})" +echo "Round 3: ${LEADER_PEER_R3} (index ${LEADER_R3})" +echo "" + +# Query Loki for all logs related to this slot and duty +# Match various log formats for the duty +DUTY_PATTERN="${SLOT}/${DUTY_TYPE}" +LOGQL="{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"} |~ \`${DUTY_PATTERN}|duty=${DUTY_TYPE}.*slot=${SLOT}|slot.*${SLOT}.*${DUTY_TYPE}\`" + +loki_query() { + local query="$1" + curl -sf -G \ + -H "$AUTH" \ + 
--data-urlencode "query=${query}" \ + --data-urlencode "start=${START_NS}" \ + --data-urlencode "end=${END_NS}" \ + --data-urlencode "limit=1000" \ + "${LOKI_URL}query_range" +} + +echo "=== Fetching Logs ===" +LOGS_RAW=$(loki_query "$LOGQL") + +if [ -z "$LOGS_RAW" ] || [ "$(echo "$LOGS_RAW" | jq -r '.data.result | length')" = "0" ]; then + echo "" + echo "ERROR: No logs found for ${DUTY_PATTERN}" + echo "This could mean:" + echo " - The cluster did not have this duty in slot ${SLOT}" + echo " - Logs have been rotated/deleted" + echo " - The cluster name or network is incorrect" + exit 1 +fi + +LOG_COUNT=$(echo "$LOGS_RAW" | jq '[.data.result[].values[]] | length') +echo "Found ${LOG_COUNT} log entries" +echo "" + +# Helper function to extract value from logfmt line +extract_logfmt() { + local line="$1" + local field="$2" + echo "$line" | grep -oE "${field}=\"[^\"]*\"|${field}=[^ ]*" | head -1 | sed -E "s/${field}=\"?([^\"]*)\"?/\1/" || true +} + +# Calculate offset from slot start using Loki nanosecond timestamp +SLOT_TIMESTAMP_NS=$((SLOT_TIMESTAMP * 1000000000)) +calc_offset() { + local loki_ts_ns="$1" + local offset_ms=$(( (loki_ts_ns - SLOT_TIMESTAMP_NS) / 1000000 )) + local offset_s=$(echo "scale=3; $offset_ms / 1000" | bc) + printf "%+.3fs" "$offset_s" +} + +# Parse logs and extract key events +# Each stream has labels and values; values are [timestamp, log_line] +PARSED_LOGS=$(echo "$LOGS_RAW" | jq -r ' + .data.result[] | + .stream as $labels | + .values[] | + { + ts: .[0], + peer: $labels.cluster_peer, + line: .[1] + } +' | jq -s 'sort_by(.ts)') + +echo "=== Event Timeline ===" +echo "(Offset relative to slot start time: ${SLOT_TIME})" +echo "" + +# Track key events +declare -A SEEN_EVENTS +CONSENSUS_STARTED=false +CONSENSUS_DECIDED=false +DECIDED_ROUND="" +DECIDED_LEADER="" +declare -A ROUND_TIMEOUT_REASONS + +# Process each log line and extract key events +while IFS= read -r entry; do + [ -z "$entry" ] && continue + + PEER=$(echo "$entry" | jq -r '.peer') 
+ LINE=$(echo "$entry" | jq -r '.line') + LOKI_TS=$(echo "$entry" | jq -r '.ts') + + # Extract fields from log line + MSG=$(extract_logfmt "$LINE" "msg") + LEVEL=$(extract_logfmt "$LINE" "level") + CALLER=$(extract_logfmt "$LINE" "caller") + + # Determine component from caller (e.g., qbft/qbft.go -> qbft) + COMPONENT=$(echo "$CALLER" | cut -d/ -f1) + + if [ -z "$MSG" ]; then + continue + fi + + # Calculate offset from slot start using Loki nanosecond timestamp + OFFSET=$(calc_offset "$LOKI_TS" 2>/dev/null || echo "+?.???s") + + # Create unique event key to avoid duplicate output + EVENT_KEY="${MSG}:${PEER}" + + # Process different event types + case "$MSG" in + "Slot ticked") + if [ -z "${SEEN_EVENTS[slot_ticked]:-}" ]; then + echo " ${OFFSET} [SCHED] Slot ${SLOT} started" + SEEN_EVENTS[slot_ticked]=1 + fi + ;; + + "Resolved proposer duty"|"Resolved attester duty") + PUBKEY=$(extract_logfmt "$LINE" "pubkey") + VIDX=$(extract_logfmt "$LINE" "vidx") + EVENT_KEY="resolved:${PUBKEY}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [SCHED] Resolved ${DUTY_TYPE} duty (vidx=${VIDX}, pubkey=${PUBKEY})" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Calling beacon node endpoint...") + ENDPOINT=$(extract_logfmt "$LINE" "endpoint") + EVENT_KEY="fetch_start:${ENDPOINT}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [FETCHER] Calling beacon node: ${ENDPOINT}" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Beacon node call finished") + ENDPOINT=$(extract_logfmt "$LINE" "endpoint") + EVENT_KEY="fetch_done:${ENDPOINT}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [FETCHER] Beacon node call finished: ${ENDPOINT}" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Beacon node call took longer than expected") + ENDPOINT=$(extract_logfmt "$LINE" "endpoint") + RTT=$(extract_logfmt "$LINE" "rtt") + EVENT_KEY="fetch_slow:${ENDPOINT}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [FETCHER] ⚠️ SLOW beacon node call: 
${ENDPOINT} (RTT=${RTT})" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "QBFT consensus instance starting") + if [ "$CONSENSUS_STARTED" = false ]; then + CONSENSUS_STARTED=true + echo " ${OFFSET} [QBFT] Consensus started" + fi + ;; + + "QBFT round changed") + OLD_ROUND=$(extract_logfmt "$LINE" "round") + NEW_ROUND=$(extract_logfmt "$LINE" "new_round") + REASON=$(extract_logfmt "$LINE" "timeout_reason") + if [ -z "${ROUND_TIMEOUT_REASONS[$OLD_ROUND]:-}" ]; then + echo " ${OFFSET} [QBFT] ⚠️ Round ${OLD_ROUND} TIMEOUT -> Round ${NEW_ROUND}" + echo " Reason: ${REASON}" + ROUND_TIMEOUT_REASONS[$OLD_ROUND]="$REASON" + fi + ;; + + "QBFT consensus decided") + if [ "$CONSENSUS_DECIDED" = false ]; then + CONSENSUS_DECIDED=true + DECIDED_ROUND=$(extract_logfmt "$LINE" "round") + DECIDED_LEADER=$(extract_logfmt "$LINE" "leader_name") + DECIDED_INDEX=$(extract_logfmt "$LINE" "leader_index") + echo " ${OFFSET} [QBFT] ✓ Consensus DECIDED in round ${DECIDED_ROUND}" + echo " Leader: ${DECIDED_LEADER} (index ${DECIDED_INDEX})" + fi + ;; + + "Successfully aggregated partial signatures to reach threshold") + VAPI_ENDPOINT=$(extract_logfmt "$LINE" "vapi_endpoint") + if [ -n "$VAPI_ENDPOINT" ]; then + EVENT_KEY="sigagg:${VAPI_ENDPOINT}" + else + EVENT_KEY="sigagg:${DUTY_TYPE}" + fi + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + if [ -n "$VAPI_ENDPOINT" ]; then + echo " ${OFFSET} [SIGAGG] ✓ Threshold signatures aggregated (${VAPI_ENDPOINT})" + else + echo " ${OFFSET} [SIGAGG] ✓ Threshold signatures aggregated" + fi + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Beacon block proposal received from validator client") + BLOCK_VERSION=$(extract_logfmt "$LINE" "block_version") + EVENT_KEY="vapi_proposal" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [VAPI] Block proposal received (version=${BLOCK_VERSION})" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Successfully submitted v2 attestations to beacon node"|"Successfully submitted proposal to beacon node") + 
DELAY=$(extract_logfmt "$LINE" "delay") + EVENT_KEY="bcast_success:${MSG}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + if [ -n "$DELAY" ]; then + echo " ${OFFSET} [BCAST] ✓ Broadcast SUCCESS (delay=${DELAY})" + else + echo " ${OFFSET} [BCAST] ✓ Broadcast SUCCESS" + fi + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Timeout calling bcast/broadcast, duty expired") + VAPI_ENDPOINT=$(extract_logfmt "$LINE" "vapi_endpoint") + EVENT_KEY="bcast_timeout:${VAPI_ENDPOINT}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [BCAST] ❌ TIMEOUT: duty expired (${VAPI_ENDPOINT})" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "All peers participated in duty") + EVENT_KEY="tracker_all" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [TRACKER] ✓ All peers participated" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Not all peers participated in duty") + ABSENT=$(extract_logfmt "$LINE" "absent") + EVENT_KEY="tracker_partial" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [TRACKER] ⚠️ Not all peers participated" + echo " Absent: ${ABSENT}" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + "Broadcasted block never included on-chain") + PUBKEY=$(extract_logfmt "$LINE" "pubkey") + BLOCK_SLOT=$(extract_logfmt "$LINE" "block_slot") + BROADCAST_DELAY=$(extract_logfmt "$LINE" "broadcast_delay") + EVENT_KEY="tracker_missed" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [TRACKER] ❌ BLOCK MISSED: never included on-chain" + echo " Pubkey: ${PUBKEY}, Broadcast delay: ${BROADCAST_DELAY}" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + ;; + + *"consensus timeout"*|*"duty expired"*) + if [[ "$LEVEL" == "error" ]]; then + EVENT_KEY="error:${MSG:0:50}" + if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then + echo " ${OFFSET} [ERROR] ❌ ${MSG}" + SEEN_EVENTS[$EVENT_KEY]=1 + fi + fi + ;; + esac +done < <(echo "$PARSED_LOGS" | jq -c '.[]') + +echo "" +echo "=== Summary ===" + +# Consensus summary +if [ "$CONSENSUS_STARTED" = true ]; then + if [ 
"$CONSENSUS_DECIDED" = true ]; then + NUM_TIMEOUTS=${#ROUND_TIMEOUT_REASONS[@]} + if [ "$NUM_TIMEOUTS" -eq 0 ]; then + echo "Consensus: ✓ Completed in round 1 (optimal)" + else + echo "Consensus: ✓ Completed in round ${DECIDED_ROUND} after ${NUM_TIMEOUTS} timeout(s)" + echo " Leader: ${DECIDED_LEADER} (index ${DECIDED_INDEX})" + if [ -n "${ROUND_TIMEOUT_REASONS[1]:-}" ]; then + echo " ⚠️ Round 1 leader ${LEADER_PEER_R1} failed" + fi + fi + else + echo "Consensus: ❌ Did NOT complete" + fi +else + echo "Consensus: ⚠️ Not started (logs may be incomplete)" +fi + +# Broadcast summary +if [ -n "${SEEN_EVENTS[bcast_timeout:submit_proposal_v2]:-}" ] || [ -n "${SEEN_EVENTS[bcast_timeout:submit_attestation]:-}" ]; then + echo "Broadcast: ❌ TIMEOUT - duty expired before broadcast" +elif [ -n "${SEEN_EVENTS[bcast_success:Successfully submitted v2 attestations to beacon node]:-}" ] || \ + [ -n "${SEEN_EVENTS[bcast_success:Successfully submitted proposal to beacon node]:-}" ]; then + echo "Broadcast: ✓ Successfully submitted to beacon node" +else + echo "Broadcast: ⚠️ No broadcast event found in logs" +fi + +# Inclusion summary (for proposer) +if [ "$DUTY_TYPE" = "proposer" ]; then + if [ -n "${SEEN_EVENTS[tracker_missed]:-}" ]; then + echo "Inclusion: ❌ MISSED - block never included on-chain" + elif [ -n "${SEEN_EVENTS[tracker_all]:-}" ]; then + echo "Inclusion: ✓ Block included on-chain" + else + echo "Inclusion: ⚠️ Unknown (tracker event not found)" + fi +fi + +# Participation summary +if [ -n "${SEEN_EVENTS[tracker_partial]:-}" ]; then + echo "Participation: ⚠️ Not all peers participated" +elif [ -n "${SEEN_EVENTS[tracker_all]:-}" ]; then + echo "Participation: ✓ All peers participated" +fi + +echo "" diff --git a/scripts/grafana-datasources.sh b/scripts/grafana-datasources.sh new file mode 100755 index 0000000000..06a83566c7 --- /dev/null +++ b/scripts/grafana-datasources.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Fetches Prometheus and Loki datasource proxy URLs from 
Grafana. +# Requires OBOL_GRAFANA_API_TOKEN environment variable. +# Output: two lines in KEY=URL format, e.g.: +# PROMETHEUS_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//api/v1/ +# LOKI_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//loki/api/v1/ + +set -euo pipefail + +GRAFANA_BASE="https://grafana.monitoring.gcp.obol.tech" + +if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then + echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 + exit 1 +fi + +response=$(curl -sf -H "Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" "$GRAFANA_BASE/api/datasources") + +# Extract the main Prometheus (name="prometheus") and Loki datasource numeric IDs. +# Grafana datasource proxy requires numeric ID, not UID. +prom_id=$(echo "$response" | jq -r '.[] | select(.type=="prometheus" and .name=="prometheus") | .id') +loki_id=$(echo "$response" | jq -r '.[] | select(.type=="loki" and .name=="Loki") | .id') + +if [ -z "$prom_id" ]; then + echo "Error: Prometheus datasource not found" >&2 + exit 1 +fi +if [ -z "$loki_id" ]; then + echo "Error: Loki datasource not found" >&2 + exit 1 +fi + +echo "PROMETHEUS_URL=${GRAFANA_BASE}/api/datasources/proxy/${prom_id}/api/v1/" +echo "LOKI_URL=${GRAFANA_BASE}/api/datasources/proxy/${loki_id}/loki/api/v1/" From 89bb75cb6fce6ab614f55f9782664335d02c82f0 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 13:51:16 +0300 Subject: [PATCH 2/7] Improved duty-timeline --- .claude/skills/duty-timeline/SKILL.md | 160 ++++--- scripts/duty-timeline.sh | 655 ++++++++++++++------------ 2 files changed, 467 insertions(+), 348 deletions(-) diff --git a/.claude/skills/duty-timeline/SKILL.md b/.claude/skills/duty-timeline/SKILL.md index b022d23977..01a02be073 100644 --- a/.claude/skills/duty-timeline/SKILL.md +++ b/.claude/skills/duty-timeline/SKILL.md @@ -6,7 +6,7 @@ user-invokable: true # Duty Timeline -Generate a detailed timeline showing the complete lifecycle of a validator duty (block proposal, attestation, etc.) 
across all cluster peers by analyzing Loki logs. +Generate a detailed timeline showing the complete lifecycle of a validator duty (block proposal, attestation, etc.) across all cluster peers by analyzing Loki logs. Shows **per-peer details** for key events like BN calls, broadcasts, and errors to support root cause analysis. ## Arguments @@ -27,36 +27,46 @@ bash scripts/duty-timeline.sh "" [network] [duty_type] 1. Calculates expected consensus leaders for rounds 1, 2, and 3 2. Queries Loki for all logs related to the duty across the time window -3. Parses and sorts events chronologically +3. Parses logs with Python (handles nanosecond timestamps correctly) 4. Shows timing offset relative to slot start for each event -5. Tracks events across all workflow components: +5. Displays **per-peer rows** for events where peer-level detail matters +6. Tracks events across all workflow components: - Scheduler: slot ticks, duty resolution - - Fetcher: beacon node calls and latency + - Fetcher: beacon node calls per peer with RTT - QBFT: consensus start, round changes, decisions - - ValidatorAPI: block proposals received - - SigAgg: threshold signature aggregation - - Broadcast: submission to beacon node + - ValidatorAPI: block/blinded block proposals per peer + - SigAgg: threshold signature aggregation per peer + - Broadcast: submission per peer with delay + - SSE: block_gossip/block/head events, "too late" warnings per peer - Tracker: participation and inclusion status + - Errors: consensus timeout / permanent failure per peer ## Key Events Tracked -| Component | Event | Meaning | -|-----------|-------|---------| -| SCHED | Slot ticked | Slot started | -| SCHED | Resolved duty | Duty assigned to validator | -| FETCHER | Calling beacon node | Fetching unsigned duty data | -| FETCHER | Beacon node call finished | Data fetched successfully | -| FETCHER | SLOW beacon node call | Call took longer than expected | -| QBFT | Consensus started | QBFT instance initialized | -| QBFT | 
Round TIMEOUT | Round failed, moving to next | -| QBFT | Consensus DECIDED | Agreement reached | -| VAPI | Block proposal received | VC submitted proposal | -| SIGAGG | Threshold signatures aggregated | Enough partial sigs collected | -| BCAST | Broadcast SUCCESS | Submitted to beacon node | -| BCAST | TIMEOUT | Duty expired before broadcast | -| TRACKER | All peers participated | Full participation | -| TRACKER | Not all peers participated | Some peers missing | -| TRACKER | BLOCK MISSED | Block never included on-chain | +| Component | Event | Per-peer? | Meaning | +|-----------|-------|-----------|---------| +| SCHED | Slot ticked | first | Slot started | +| SCHED | Resolved duty | first per pubkey | Duty assigned to validator | +| FETCHER | BN call start | yes | Fetching unsigned duty data | +| FETCHER | BN call done | yes | Data fetched (with RTT) | +| FETCHER | SLOW BN call | yes | Call took longer than expected | +| QBFT | Consensus started | first | QBFT instance initialized | +| QBFT | Round TIMEOUT | first per round | Round failed, moving to next | +| QBFT | Consensus DECIDED | first | Agreement reached | +| VAPI | Block proposal received | yes | VC submitted unblinded proposal | +| VAPI | Blinded block received | yes | VC submitted blinded proposal | +| SIGAGG | Threshold reached | yes | Enough partial sigs collected | +| BCAST | Broadcast SUCCESS | yes | Submitted to beacon node (with delay) | +| BCAST | TIMEOUT | yes | Duty expired before broadcast | +| SSE | block_gossip TOO LATE | yes | Late gossip event per peer | +| SSE | block event TOO LATE | yes | Late block event per peer | +| SSE | SSE block gossip/head/block event | first | Normal SSE events | +| TRACKER | All peers participated | first | Full participation | +| TRACKER | Not all peers participated | first | Some peers missing | +| TRACKER | BLOCK MISSED | first | Block never included on-chain | +| TRACKER | BLINDED BLOCK MISSED | first | Blinded block never included | +| ERROR | consensus 
timeout | yes | Per-peer consensus timeout | +| ERROR | permanent failure | yes | Per-peer permanent failure | ## Output @@ -64,70 +74,105 @@ The script provides: 1. **Duty Info**: slot, epoch, time, network, duty type 2. **Expected Consensus Leaders**: who should lead rounds 1, 2, 3 -3. **Event Timeline**: chronological sequence with timing offsets +3. **Event Timeline**: chronological sequence with timing offsets and per-peer detail 4. **Summary**: - Consensus status (success/failure, round count) - - Broadcast status (success/timeout) - - Inclusion status (for proposer duties) - - Participation status + - Block type (blinded/unblinded) + - Broadcast status with delay range (min-max across peers) + - BN call RTT range across peers + - Inclusion status (for proposer duties, with broadcast_delay) + - Participation status (with absent peers listed) + - Error summary per peer ### Example Output ``` === Duty Info === -Slot: 13813408 -Epoch: 431669 (slot 0 of 32) -Time: 2026-03-04T00:41:36Z +Slot: 13810452 +Epoch: 431576 (slot 20 of 32) +Time: 2026-03-03T21:31:00Z Network: mainnet Duty: proposer === Expected Consensus Leaders === -Round 1: peer0 (index 0) -Round 2: peer1 (index 1) -Round 3: peer2 (index 2) +Round 1: curious-cat (index 2) +Round 2: daring-dog (index 3) +Round 3: eager-elk (index 4) + +=== Fetching Logs === +Found 87 log entries === Event Timeline === -(Offset relative to slot start time: 2026-03-04T00:41:36Z) +(Offset relative to slot start time: 2026-03-03T21:31:00Z) - +0.005s [SCHED] Slot 13813408 started + +0.005s [SCHED] Slot 13810452 started +0.010s [SCHED] Resolved proposer duty (vidx=123456, pubkey=0x...) 
- +0.015s [FETCHER] Calling beacon node: /eth/v3/validator/blocks/13813408 - +0.150s [FETCHER] Beacon node call finished: /eth/v3/validator/blocks/13813408 - +0.155s [QBFT] Consensus started - +1.200s [QBFT] ✓ Consensus DECIDED in round 1 - Leader: peer0 (index 0) - +1.500s [SIGAGG] ✓ Threshold signatures aggregated - +1.600s [BCAST] ✓ Broadcast SUCCESS (delay=1.6s) - +8.000s [TRACKER] ✓ All peers participated + +0.015s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [alpha-ant] + +0.016s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [brave-bee] + +0.018s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [curious-cat] + +0.920s [FETCHER] BN call done: /eth/v3/validator/blocks/13810452 [alpha-ant] (RTT=0.9s) + +1.800s [FETCHER] BN call done: /eth/v3/validator/blocks/13810452 [brave-bee] (RTT=1.8s) + +2.100s [FETCHER] SLOW BN call: /eth/v3/validator/blocks/13810452 [curious-cat] (RTT=2.1s) + +2.110s [QBFT] Consensus started + +6.200s [QBFT] Round 1 TIMEOUT -> Round 2 + Reason: leader not proposing + +8.500s [QBFT] Consensus DECIDED in round 2 + Leader: daring-dog (index 3) + +8.600s [VAPI] Blinded block received [alpha-ant] (version=deneb) + +8.620s [VAPI] Blinded block received [brave-bee] (version=deneb) + +8.900s [SIGAGG] Threshold reached [alpha-ant] (submit_blinded_block) + +8.920s [SIGAGG] Threshold reached [brave-bee] (submit_blinded_block) + +9.000s [BCAST] Broadcast SUCCESS [alpha-ant] (delay=3.5s) + +9.020s [BCAST] Broadcast SUCCESS [brave-bee] (delay=3.52s) + +9.100s [SSE] block_gossip TOO LATE [alpha-ant] (delay=9.1s) + +9.150s [SSE] block event TOO LATE [brave-bee] (delay=9.15s) + +12.00s [ERROR] consensus timeout [average-road] + +480.0s [TRACKER] BLINDED BLOCK MISSED: never included on-chain + Pubkey: 0x..., Broadcast delay: 3.5s + +480.1s [TRACKER] Not all peers participated + Absent: average-road === Summary === -Consensus: ✓ Completed in round 1 (optimal) -Broadcast: ✓ Successfully submitted to beacon node 
-Inclusion: ✓ Block included on-chain -Participation: ✓ All peers participated +Consensus: Completed in round 2 after 1 timeout(s) + Leader: daring-dog (index 3) + Round 1 leader curious-cat failed +Block type: blinded +Broadcast: Successfully submitted (delay range: 3.5s-3.5s) +BN call RTT: 0.9s-2.1s across 3 peers +Inclusion: MISSED - block never included on-chain (broadcast_delay=3.5s) +Participation: Not all peers participated (absent: average-road) +Errors: + - [average-road] consensus timeout ``` ## Common Failure Patterns -### Slow Beacon Node +### Slow Beacon Node (per-peer) ``` - +2.500s [FETCHER] ⚠️ SLOW beacon node call: /eth/v3/validator/blocks/... + +0.920s [FETCHER] BN call done: /eth/v3/... [alpha-ant] (RTT=0.9s) + +2.100s [FETCHER] SLOW BN call: /eth/v3/... [curious-cat] (RTT=2.1s) ``` -Indicates the beacon node took too long to respond, potentially causing downstream timeouts. +Shows which specific peers have slow BN calls and the RTT spread. ### Consensus Timeouts ``` - +4.000s [QBFT] ⚠️ Round 1 TIMEOUT -> Round 2 - Reason: leader not proposing + +6.200s [QBFT] Round 1 TIMEOUT -> Round 2 + Reason: leader not proposing ``` Round 1 leader failed to propose, consensus moved to round 2. -### Missed Block +### Missed Block with Broadcast Delay +``` + +480.0s [TRACKER] BLINDED BLOCK MISSED: never included on-chain + Pubkey: 0x..., Broadcast delay: 3.5s +``` +Block was broadcast but not included on-chain. Summary includes broadcast_delay for correlation. + +### Per-peer Errors ``` - +480.0s [TRACKER] ❌ BLOCK MISSED: never included on-chain - Pubkey: 0x..., Broadcast delay: 3.5s + +12.00s [ERROR] consensus timeout [average-road] ``` -Block was broadcast but not included on-chain (possibly late or network issues). +Shows which peer(s) experienced errors, helping identify the failing node. 
## Troubleshooting @@ -143,4 +188,5 @@ This skill uses: - `cluster-config.sh` - to get cluster info and peer names - `grafana-datasources.sh` - to discover Loki URL - Loki API - to query logs +- `python3` - to parse Loki JSON (handles nanosecond timestamps) - Requires `OBOL_GRAFANA_API_TOKEN` environment variable diff --git a/scripts/duty-timeline.sh b/scripts/duty-timeline.sh index 6b12653e87..ced6cf9c1c 100644 --- a/scripts/duty-timeline.sh +++ b/scripts/duty-timeline.sh @@ -154,302 +154,375 @@ loki_query() { echo "=== Fetching Logs ===" LOGS_RAW=$(loki_query "$LOGQL") -if [ -z "$LOGS_RAW" ] || [ "$(echo "$LOGS_RAW" | jq -r '.data.result | length')" = "0" ]; then - echo "" - echo "ERROR: No logs found for ${DUTY_PATTERN}" - echo "This could mean:" - echo " - The cluster did not have this duty in slot ${SLOT}" - echo " - Logs have been rotated/deleted" - echo " - The cluster name or network is incorrect" - exit 1 -fi - -LOG_COUNT=$(echo "$LOGS_RAW" | jq '[.data.result[].values[]] | length') -echo "Found ${LOG_COUNT} log entries" -echo "" - -# Helper function to extract value from logfmt line -extract_logfmt() { - local line="$1" - local field="$2" - echo "$line" | grep -oE "${field}=\"[^\"]*\"|${field}=[^ ]*" | head -1 | sed -E "s/${field}=\"?([^\"]*)\"?/\1/" || true -} - -# Calculate offset from slot start using Loki nanosecond timestamp -SLOT_TIMESTAMP_NS=$((SLOT_TIMESTAMP * 1000000000)) -calc_offset() { - local loki_ts_ns="$1" - local offset_ms=$(( (loki_ts_ns - SLOT_TIMESTAMP_NS) / 1000000 )) - local offset_s=$(echo "scale=3; $offset_ms / 1000" | bc) - printf "%+.3fs" "$offset_s" -} - -# Parse logs and extract key events -# Each stream has labels and values; values are [timestamp, log_line] -PARSED_LOGS=$(echo "$LOGS_RAW" | jq -r ' - .data.result[] | - .stream as $labels | - .values[] | - { - ts: .[0], - peer: $labels.cluster_peer, - line: .[1] - } -' | jq -s 'sort_by(.ts)') - -echo "=== Event Timeline ===" -echo "(Offset relative to slot start time: 
${SLOT_TIME})" -echo "" - -# Track key events -declare -A SEEN_EVENTS -CONSENSUS_STARTED=false -CONSENSUS_DECIDED=false -DECIDED_ROUND="" -DECIDED_LEADER="" -declare -A ROUND_TIMEOUT_REASONS - -# Process each log line and extract key events -while IFS= read -r entry; do - [ -z "$entry" ] && continue - - PEER=$(echo "$entry" | jq -r '.peer') - LINE=$(echo "$entry" | jq -r '.line') - LOKI_TS=$(echo "$entry" | jq -r '.ts') - - # Extract fields from log line - MSG=$(extract_logfmt "$LINE" "msg") - LEVEL=$(extract_logfmt "$LINE" "level") - CALLER=$(extract_logfmt "$LINE" "caller") - - # Determine component from caller (e.g., qbft/qbft.go -> qbft) - COMPONENT=$(echo "$CALLER" | cut -d/ -f1) - - if [ -z "$MSG" ]; then - continue - fi - - # Calculate offset from slot start using Loki nanosecond timestamp - OFFSET=$(calc_offset "$LOKI_TS" 2>/dev/null || echo "+?.???s") - - # Create unique event key to avoid duplicate output - EVENT_KEY="${MSG}:${PEER}" - - # Process different event types - case "$MSG" in - "Slot ticked") - if [ -z "${SEEN_EVENTS[slot_ticked]:-}" ]; then - echo " ${OFFSET} [SCHED] Slot ${SLOT} started" - SEEN_EVENTS[slot_ticked]=1 - fi - ;; - - "Resolved proposer duty"|"Resolved attester duty") - PUBKEY=$(extract_logfmt "$LINE" "pubkey") - VIDX=$(extract_logfmt "$LINE" "vidx") - EVENT_KEY="resolved:${PUBKEY}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [SCHED] Resolved ${DUTY_TYPE} duty (vidx=${VIDX}, pubkey=${PUBKEY})" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Calling beacon node endpoint...") - ENDPOINT=$(extract_logfmt "$LINE" "endpoint") - EVENT_KEY="fetch_start:${ENDPOINT}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [FETCHER] Calling beacon node: ${ENDPOINT}" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Beacon node call finished") - ENDPOINT=$(extract_logfmt "$LINE" "endpoint") - EVENT_KEY="fetch_done:${ENDPOINT}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [FETCHER] Beacon node 
call finished: ${ENDPOINT}" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Beacon node call took longer than expected") - ENDPOINT=$(extract_logfmt "$LINE" "endpoint") - RTT=$(extract_logfmt "$LINE" "rtt") - EVENT_KEY="fetch_slow:${ENDPOINT}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [FETCHER] ⚠️ SLOW beacon node call: ${ENDPOINT} (RTT=${RTT})" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "QBFT consensus instance starting") - if [ "$CONSENSUS_STARTED" = false ]; then - CONSENSUS_STARTED=true - echo " ${OFFSET} [QBFT] Consensus started" - fi - ;; - - "QBFT round changed") - OLD_ROUND=$(extract_logfmt "$LINE" "round") - NEW_ROUND=$(extract_logfmt "$LINE" "new_round") - REASON=$(extract_logfmt "$LINE" "timeout_reason") - if [ -z "${ROUND_TIMEOUT_REASONS[$OLD_ROUND]:-}" ]; then - echo " ${OFFSET} [QBFT] ⚠️ Round ${OLD_ROUND} TIMEOUT -> Round ${NEW_ROUND}" - echo " Reason: ${REASON}" - ROUND_TIMEOUT_REASONS[$OLD_ROUND]="$REASON" - fi - ;; - - "QBFT consensus decided") - if [ "$CONSENSUS_DECIDED" = false ]; then - CONSENSUS_DECIDED=true - DECIDED_ROUND=$(extract_logfmt "$LINE" "round") - DECIDED_LEADER=$(extract_logfmt "$LINE" "leader_name") - DECIDED_INDEX=$(extract_logfmt "$LINE" "leader_index") - echo " ${OFFSET} [QBFT] ✓ Consensus DECIDED in round ${DECIDED_ROUND}" - echo " Leader: ${DECIDED_LEADER} (index ${DECIDED_INDEX})" - fi - ;; - - "Successfully aggregated partial signatures to reach threshold") - VAPI_ENDPOINT=$(extract_logfmt "$LINE" "vapi_endpoint") - if [ -n "$VAPI_ENDPOINT" ]; then - EVENT_KEY="sigagg:${VAPI_ENDPOINT}" - else - EVENT_KEY="sigagg:${DUTY_TYPE}" - fi - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - if [ -n "$VAPI_ENDPOINT" ]; then - echo " ${OFFSET} [SIGAGG] ✓ Threshold signatures aggregated (${VAPI_ENDPOINT})" - else - echo " ${OFFSET} [SIGAGG] ✓ Threshold signatures aggregated" - fi - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Beacon block proposal received from validator client") - BLOCK_VERSION=$(extract_logfmt "$LINE" 
"block_version") - EVENT_KEY="vapi_proposal" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [VAPI] Block proposal received (version=${BLOCK_VERSION})" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Successfully submitted v2 attestations to beacon node"|"Successfully submitted proposal to beacon node") - DELAY=$(extract_logfmt "$LINE" "delay") - EVENT_KEY="bcast_success:${MSG}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - if [ -n "$DELAY" ]; then - echo " ${OFFSET} [BCAST] ✓ Broadcast SUCCESS (delay=${DELAY})" - else - echo " ${OFFSET} [BCAST] ✓ Broadcast SUCCESS" - fi - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Timeout calling bcast/broadcast, duty expired") - VAPI_ENDPOINT=$(extract_logfmt "$LINE" "vapi_endpoint") - EVENT_KEY="bcast_timeout:${VAPI_ENDPOINT}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [BCAST] ❌ TIMEOUT: duty expired (${VAPI_ENDPOINT})" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "All peers participated in duty") - EVENT_KEY="tracker_all" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [TRACKER] ✓ All peers participated" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Not all peers participated in duty") - ABSENT=$(extract_logfmt "$LINE" "absent") - EVENT_KEY="tracker_partial" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [TRACKER] ⚠️ Not all peers participated" - echo " Absent: ${ABSENT}" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - "Broadcasted block never included on-chain") - PUBKEY=$(extract_logfmt "$LINE" "pubkey") - BLOCK_SLOT=$(extract_logfmt "$LINE" "block_slot") - BROADCAST_DELAY=$(extract_logfmt "$LINE" "broadcast_delay") - EVENT_KEY="tracker_missed" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [TRACKER] ❌ BLOCK MISSED: never included on-chain" - echo " Pubkey: ${PUBKEY}, Broadcast delay: ${BROADCAST_DELAY}" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - ;; - - *"consensus timeout"*|*"duty expired"*) - if [[ "$LEVEL" == "error" ]]; then - 
EVENT_KEY="error:${MSG:0:50}" - if [ -z "${SEEN_EVENTS[$EVENT_KEY]:-}" ]; then - echo " ${OFFSET} [ERROR] ❌ ${MSG}" - SEEN_EVENTS[$EVENT_KEY]=1 - fi - fi - ;; - esac -done < <(echo "$PARSED_LOGS" | jq -c '.[]') - -echo "" -echo "=== Summary ===" +# Save raw Loki JSON to temp file for Python processing +LOKI_TMPFILE=$(mktemp) +trap 'rm -f "$LOKI_TMPFILE"' EXIT +echo "$LOGS_RAW" > "$LOKI_TMPFILE" + +# Process logs with Python (handles nanosecond timestamps correctly) +python3 - "$LOKI_TMPFILE" "$SLOT" "$SLOT_TIMESTAMP" "$DUTY_TYPE" \ + "$LEADER_PEER_R1" "$LEADER_R1" \ + "$LEADER_PEER_R2" "$LEADER_R2" \ + "$LEADER_PEER_R3" "$LEADER_R3" \ + "$SLOT_TIME" <<'PYTHON_SCRIPT' +import json +import re +import sys +from collections import defaultdict + +loki_file = sys.argv[1] +slot = int(sys.argv[2]) +slot_timestamp = int(sys.argv[3]) +duty_type = sys.argv[4] +leader_peer_r1, leader_idx_r1 = sys.argv[5], sys.argv[6] +leader_peer_r2, leader_idx_r2 = sys.argv[7], sys.argv[8] +leader_peer_r3, leader_idx_r3 = sys.argv[9], sys.argv[10] +slot_time = sys.argv[11] + +slot_timestamp_ns = slot_timestamp * 1_000_000_000 + +with open(loki_file) as f: + data = json.load(f) + +results = data.get("data", {}).get("result", []) +if not results: + print() + print(f"ERROR: No logs found for {slot}/{duty_type}") + print("This could mean:") + print(f" - The cluster did not have this duty in slot {slot}") + print(" - Logs have been rotated/deleted") + print(" - The cluster name or network is incorrect") + sys.exit(1) + +# Parse all log entries +entries = [] +for stream in results: + peer = stream.get("stream", {}).get("cluster_peer", "unknown") + for ts_str, line in stream.get("values", []): + entries.append((int(ts_str), peer, line)) + +entries.sort(key=lambda x: x[0]) +print(f"Found {len(entries)} log entries") +print() + + +def extract_logfmt(line, field): + """Extract a field value from a logfmt-formatted line.""" + # Try quoted value first + m = re.search(rf'{field}="([^"]*)"', line) + if m: 
+ return m.group(1) + # Try unquoted value + m = re.search(rf'{field}=(\S+)', line) + if m: + return m.group(1) + return "" + + +def calc_offset(ts_ns): + """Calculate offset from slot start in seconds.""" + offset_ms = (ts_ns - slot_timestamp_ns) / 1_000_000 + offset_s = offset_ms / 1000 + return f"{offset_s:+.3f}s" + + +def fmt(offset, tag, msg, indent_continuation=None): + """Format a timeline row.""" + line = f" {offset} [{tag}]{' ' * max(1, 10 - len(tag))} {msg}" + if indent_continuation: + line += f"\n{'':24s} {indent_continuation}" + return line + + +# --- Collect events --- +# We'll build a list of (ts_ns, sort_priority, formatted_line) tuples +# sort_priority breaks ties: lower = earlier in output for same timestamp +timeline = [] + +# Track state for summary +consensus_started = False +consensus_decided = False +decided_round = "" +decided_leader = "" +decided_index = "" +round_timeout_reasons = {} +seen_first = set() # for first-only events + +# Per-peer tracking for summary +bn_call_rtts = {} # peer -> rtt string +broadcast_delays = {} # peer -> delay string +block_type = None # "blinded" or "unblinded" +broadcast_success = False +broadcast_timeout = False +tracker_all = False +tracker_partial = False +tracker_absent = "" +tracker_missed = False +tracker_broadcast_delay = "" +error_peers = defaultdict(list) # peer -> [error messages] + +for ts_ns, peer, line in entries: + msg = extract_logfmt(line, "msg") + level = extract_logfmt(line, "level") + if not msg: + continue + + offset = calc_offset(ts_ns) + + # --- SCHEDULER --- + if msg == "Slot ticked": + if "slot_ticked" not in seen_first: + seen_first.add("slot_ticked") + timeline.append((ts_ns, 0, fmt(offset, "SCHED", f"Slot {slot} started"))) + + elif msg in ("Resolved proposer duty", "Resolved attester duty"): + pubkey = extract_logfmt(line, "pubkey") + vidx = extract_logfmt(line, "vidx") + key = f"resolved:{pubkey}" + if key not in seen_first: + seen_first.add(key) + timeline.append((ts_ns, 1, 
fmt(offset, "SCHED", + f"Resolved {duty_type} duty (vidx={vidx}, pubkey={pubkey})"))) + + # --- FETCHER (per-peer) --- + elif msg == "Calling beacon node endpoint...": + endpoint = extract_logfmt(line, "endpoint") + timeline.append((ts_ns, 10, fmt(offset, "FETCHER", + f"BN call start: {endpoint} [{peer}]"))) + + elif msg == "Beacon node call finished": + endpoint = extract_logfmt(line, "endpoint") + rtt = extract_logfmt(line, "rtt") + rtt_part = f" (RTT={rtt})" if rtt else "" + timeline.append((ts_ns, 11, fmt(offset, "FETCHER", + f"BN call done: {endpoint} [{peer}]{rtt_part}"))) + if rtt: + bn_call_rtts[peer] = rtt + + elif msg == "Beacon node call took longer than expected": + endpoint = extract_logfmt(line, "endpoint") + rtt = extract_logfmt(line, "rtt") + timeline.append((ts_ns, 12, fmt(offset, "FETCHER", + f"SLOW BN call: {endpoint} [{peer}] (RTT={rtt})"))) + if rtt: + bn_call_rtts[peer] = rtt + + # --- CONSENSUS --- + elif msg == "QBFT consensus instance starting": + if not consensus_started: + consensus_started = True + timeline.append((ts_ns, 20, fmt(offset, "QBFT", "Consensus started"))) + + elif msg == "QBFT round changed": + old_round = extract_logfmt(line, "round") + new_round = extract_logfmt(line, "new_round") + reason = extract_logfmt(line, "timeout_reason") + if old_round not in round_timeout_reasons: + round_timeout_reasons[old_round] = reason + timeline.append((ts_ns, 21, fmt(offset, "QBFT", + f"Round {old_round} TIMEOUT -> Round {new_round}", + f"Reason: {reason}"))) + + elif msg == "QBFT consensus decided": + if not consensus_decided: + consensus_decided = True + decided_round = extract_logfmt(line, "round") + decided_leader = extract_logfmt(line, "leader_name") + decided_index = extract_logfmt(line, "leader_index") + timeline.append((ts_ns, 22, fmt(offset, "QBFT", + f"Consensus DECIDED in round {decided_round}", + f"Leader: {decided_leader} (index {decided_index})"))) + + # --- VALIDATOR API (per-peer) --- + elif msg == "Beacon block proposal 
received from validator client": + block_version = extract_logfmt(line, "block_version") + block_type = "unblinded" + timeline.append((ts_ns, 30, fmt(offset, "VAPI", + f"Block proposal received [{peer}] (version={block_version})"))) + + elif msg == "Blinded beacon block received from validator client": + block_version = extract_logfmt(line, "block_version") + block_type = "blinded" + timeline.append((ts_ns, 30, fmt(offset, "VAPI", + f"Blinded block received [{peer}] (version={block_version})"))) + + # --- SIG AGGREGATION (per-peer) --- + elif msg == "Successfully aggregated partial signatures to reach threshold": + vapi_endpoint = extract_logfmt(line, "vapi_endpoint") + ep_part = f" ({vapi_endpoint})" if vapi_endpoint else "" + timeline.append((ts_ns, 40, fmt(offset, "SIGAGG", + f"Threshold reached [{peer}]{ep_part}"))) + + # --- BROADCAST (per-peer) --- + elif msg in ("Successfully submitted proposal to beacon node", + "Successfully submitted block proposal to beacon node", + "Successfully submitted v2 attestations to beacon node"): + delay = extract_logfmt(line, "delay") + broadcast_success = True + delay_part = f" (delay={delay})" if delay else "" + timeline.append((ts_ns, 50, fmt(offset, "BCAST", + f"Broadcast SUCCESS [{peer}]{delay_part}"))) + if delay: + broadcast_delays[peer] = delay + + elif msg == "Timeout calling bcast/broadcast, duty expired": + vapi_endpoint = extract_logfmt(line, "vapi_endpoint") + broadcast_timeout = True + timeline.append((ts_ns, 51, fmt(offset, "BCAST", + f"TIMEOUT: duty expired [{peer}] ({vapi_endpoint})"))) + + # --- SSE EVENTS (per-peer for "too late", first for normal) --- + elif msg == "Beacon node received block_gossip event too late": + gossip_delay = extract_logfmt(line, "gossip_delay") or extract_logfmt(line, "delay") + delay_part = f" (delay={gossip_delay})" if gossip_delay else "" + timeline.append((ts_ns, 55, fmt(offset, "SSE", + f"block_gossip TOO LATE [{peer}]{delay_part}"))) + + elif msg == "Beacon node received block 
event too late": + block_delay = extract_logfmt(line, "block_delay") or extract_logfmt(line, "delay") + delay_part = f" (delay={block_delay})" if block_delay else "" + timeline.append((ts_ns, 55, fmt(offset, "SSE", + f"block event TOO LATE [{peer}]{delay_part}"))) + + elif msg in ("SSE block gossip event", "SSE head event", "SSE block event"): + key = f"sse:{msg}" + if key not in seen_first: + seen_first.add(key) + timeline.append((ts_ns, 56, fmt(offset, "SSE", msg))) + + # --- TRACKER (first only) --- + elif msg == "All peers participated in duty": + if "tracker_all" not in seen_first: + seen_first.add("tracker_all") + tracker_all = True + timeline.append((ts_ns, 60, fmt(offset, "TRACKER", + "All peers participated"))) + + elif msg == "Not all peers participated in duty": + if "tracker_partial" not in seen_first: + seen_first.add("tracker_partial") + tracker_partial = True + tracker_absent = extract_logfmt(line, "absent") + timeline.append((ts_ns, 60, fmt(offset, "TRACKER", + "Not all peers participated", + f"Absent: {tracker_absent}"))) + + elif msg == "Broadcasted block never included on-chain": + if "tracker_missed" not in seen_first: + seen_first.add("tracker_missed") + tracker_missed = True + pubkey = extract_logfmt(line, "pubkey") + tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") + timeline.append((ts_ns, 61, fmt(offset, "TRACKER", + "BLOCK MISSED: never included on-chain", + f"Pubkey: {pubkey}, Broadcast delay: {tracker_broadcast_delay}"))) + + elif msg == "Broadcasted blinded block never included on-chain": + if "tracker_missed_blinded" not in seen_first: + seen_first.add("tracker_missed_blinded") + tracker_missed = True + pubkey = extract_logfmt(line, "pubkey") + tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") + timeline.append((ts_ns, 61, fmt(offset, "TRACKER", + "BLINDED BLOCK MISSED: never included on-chain", + f"Pubkey: {pubkey}, Broadcast delay: {tracker_broadcast_delay}"))) + + # --- ERRORS (per-peer) --- + elif 
level == "error" and ("consensus timeout" in msg.lower() or "permanent failure" in msg.lower()): + error_peers[peer].append(msg) + timeline.append((ts_ns, 70, fmt(offset, "ERROR", + f"{msg} [{peer}]"))) + +# Sort and print timeline +timeline.sort(key=lambda x: (x[0], x[1])) + +print("=== Event Timeline ===") +print(f"(Offset relative to slot start time: {slot_time})") +print() + +for _, _, line in timeline: + print(line) + +print() +print("=== Summary ===") # Consensus summary -if [ "$CONSENSUS_STARTED" = true ]; then - if [ "$CONSENSUS_DECIDED" = true ]; then - NUM_TIMEOUTS=${#ROUND_TIMEOUT_REASONS[@]} - if [ "$NUM_TIMEOUTS" -eq 0 ]; then - echo "Consensus: ✓ Completed in round 1 (optimal)" - else - echo "Consensus: ✓ Completed in round ${DECIDED_ROUND} after ${NUM_TIMEOUTS} timeout(s)" - echo " Leader: ${DECIDED_LEADER} (index ${DECIDED_INDEX})" - if [ -n "${ROUND_TIMEOUT_REASONS[1]:-}" ]; then - echo " ⚠️ Round 1 leader ${LEADER_PEER_R1} failed" - fi - fi - else - echo "Consensus: ❌ Did NOT complete" - fi -else - echo "Consensus: ⚠️ Not started (logs may be incomplete)" -fi +if consensus_started: + if consensus_decided: + num_timeouts = len(round_timeout_reasons) + if num_timeouts == 0: + print("Consensus: Completed in round 1 (optimal)") + else: + print(f"Consensus: Completed in round {decided_round} after {num_timeouts} timeout(s)") + print(f" Leader: {decided_leader} (index {decided_index})") + if "1" in round_timeout_reasons: + print(f" Round 1 leader {leader_peer_r1} failed") + else: + print("Consensus: Did NOT complete") +else: + print("Consensus: Not started (logs may be incomplete)") + +# Block type +if block_type: + print(f"Block type: {block_type}") # Broadcast summary -if [ -n "${SEEN_EVENTS[bcast_timeout:submit_proposal_v2]:-}" ] || [ -n "${SEEN_EVENTS[bcast_timeout:submit_attestation]:-}" ]; then - echo "Broadcast: ❌ TIMEOUT - duty expired before broadcast" -elif [ -n "${SEEN_EVENTS[bcast_success:Successfully submitted v2 attestations to beacon 
node]:-}" ] || \ - [ -n "${SEEN_EVENTS[bcast_success:Successfully submitted proposal to beacon node]:-}" ]; then - echo "Broadcast: ✓ Successfully submitted to beacon node" -else - echo "Broadcast: ⚠️ No broadcast event found in logs" -fi +if broadcast_timeout: + print("Broadcast: TIMEOUT - duty expired before broadcast") +elif broadcast_success: + if broadcast_delays: + delays_str = ", ".join(f"{p}={d}" for p, d in sorted(broadcast_delays.items())) + # Parse delay values for min-max + delay_vals = [] + for d in broadcast_delays.values(): + m = re.search(r'[\d.]+', d) + if m: + delay_vals.append(float(m.group())) + if len(delay_vals) >= 2: + print(f"Broadcast: Successfully submitted (delay range: {min(delay_vals):.1f}s-{max(delay_vals):.1f}s)") + else: + print(f"Broadcast: Successfully submitted ({delays_str})") + else: + print("Broadcast: Successfully submitted to beacon node") +else: + print("Broadcast: No broadcast event found in logs") + +# BN call RTT summary +if bn_call_rtts: + rtt_vals = [] + for r in bn_call_rtts.values(): + m = re.search(r'[\d.]+', r) + if m: + rtt_vals.append(float(m.group())) + if rtt_vals: + if len(rtt_vals) >= 2: + print(f"BN call RTT: {min(rtt_vals):.1f}s-{max(rtt_vals):.1f}s across {len(rtt_vals)} peers") + else: + rtts_str = ", ".join(f"{p}={r}" for p, r in sorted(bn_call_rtts.items())) + print(f"BN call RTT: {rtts_str}") # Inclusion summary (for proposer) -if [ "$DUTY_TYPE" = "proposer" ]; then - if [ -n "${SEEN_EVENTS[tracker_missed]:-}" ]; then - echo "Inclusion: ❌ MISSED - block never included on-chain" - elif [ -n "${SEEN_EVENTS[tracker_all]:-}" ]; then - echo "Inclusion: ✓ Block included on-chain" - else - echo "Inclusion: ⚠️ Unknown (tracker event not found)" - fi -fi +if duty_type == "proposer": + if tracker_missed: + delay_part = f" (broadcast_delay={tracker_broadcast_delay})" if tracker_broadcast_delay else "" + print(f"Inclusion: MISSED - block never included on-chain{delay_part}") + elif tracker_all: + print("Inclusion: 
Block included on-chain") + else: + print("Inclusion: Unknown (tracker event not found)") # Participation summary -if [ -n "${SEEN_EVENTS[tracker_partial]:-}" ]; then - echo "Participation: ⚠️ Not all peers participated" -elif [ -n "${SEEN_EVENTS[tracker_all]:-}" ]; then - echo "Participation: ✓ All peers participated" -fi - -echo "" +if tracker_partial: + print(f"Participation: Not all peers participated (absent: {tracker_absent})") +elif tracker_all: + print("Participation: All peers participated") + +# Error summary +if error_peers: + print("Errors:") + for p, msgs in sorted(error_peers.items()): + for m in msgs: + print(f" - [{p}] {m}") + +print() +PYTHON_SCRIPT From 14e8f35749f5e6d87fa382a10a3af25d2b1417af Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 17:10:48 +0300 Subject: [PATCH 3/7] *: improved bcast and tracker checks --- scripts/duty-timeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/duty-timeline.sh b/scripts/duty-timeline.sh index ced6cf9c1c..e51cef9c4a 100644 --- a/scripts/duty-timeline.sh +++ b/scripts/duty-timeline.sh @@ -138,7 +138,7 @@ echo "" # Query Loki for all logs related to this slot and duty # Match various log formats for the duty DUTY_PATTERN="${SLOT}/${DUTY_TYPE}" -LOGQL="{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"} |~ \`${DUTY_PATTERN}|duty=${DUTY_TYPE}.*slot=${SLOT}|slot.*${SLOT}.*${DUTY_TYPE}\`" +LOGQL="{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"} |~ \`${DUTY_PATTERN}|duty=${DUTY_TYPE}.*slot=${SLOT}|slot.*${SLOT}.*${DUTY_TYPE}|block_slot=${SLOT}\`" loki_query() { local query="$1" From 02fd9e8ac6b13b77919eb33276dd1962db1bb383 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 17:31:50 +0300 Subject: [PATCH 4/7] Use tracker metric to confirm inclusion --- scripts/duty-timeline.sh | 61 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/scripts/duty-timeline.sh 
b/scripts/duty-timeline.sh index e51cef9c4a..17b84229ac 100644 --- a/scripts/duty-timeline.sh +++ b/scripts/duty-timeline.sh @@ -74,9 +74,10 @@ SLOT_TIMESTAMP=$((GENESIS + SLOT * SECONDS_PER_SLOT)) START_NS=$(( (SLOT_TIMESTAMP - 15) * 1000000000 )) END_NS=$(( (SLOT_TIMESTAMP + 500) * 1000000000 )) # ~8 minutes for tracker inclusion checks -# Discover Loki URL +# Discover Loki and Prometheus URLs DATASOURCES=$("$SCRIPT_DIR/grafana-datasources.sh") LOKI_URL=$(echo "$DATASOURCES" | grep '^LOKI_URL=' | cut -d= -f2-) +PROM_URL=$(echo "$DATASOURCES" | grep '^PROMETHEUS_URL=' | cut -d= -f2-) if [ -z "$LOKI_URL" ]; then echo "Error: could not discover Loki URL" >&2 @@ -151,6 +152,36 @@ loki_query() { "${LOKI_URL}query_range" } +# Query core_tracker_inclusion_missed_total metric delta around the slot's inclusion check window. +# InclCheckLag=6 slots, InclMissedLag=32 slots (from core/tracker/inclusion.go). +# We sample the counter just before the check window opens and just after it closes. +INCL_METRIC_DELTA="unknown" +if [ -n "$PROM_URL" ] && [ "$DUTY_TYPE" = "proposer" ]; then + INCL_CHECK_LAG=6 + INCL_MISSED_LAG=32 + INCL_BEFORE_TIME=$(( SLOT_TIMESTAMP + INCL_CHECK_LAG * SECONDS_PER_SLOT - 1 )) + INCL_AFTER_TIME=$(( SLOT_TIMESTAMP + (INCL_MISSED_LAG + 2) * SECONDS_PER_SLOT )) + METRIC_QUERY="core_tracker_inclusion_missed_total{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\",duty=\"proposer\"}" + # Sum across all peers to get cluster-wide delta + VAL_BEFORE_SUM=$(curl -sf -G \ + -H "$AUTH" \ + --data-urlencode "query=sum(${METRIC_QUERY})" \ + --data-urlencode "time=${INCL_BEFORE_TIME}" \ + "${PROM_URL}query" | jq -r 'if .data.result | length == 0 then "0" else .data.result[0].value[1] end' 2>/dev/null || echo "0") + VAL_AFTER_SUM=$(curl -sf -G \ + -H "$AUTH" \ + --data-urlencode "query=sum(${METRIC_QUERY})" \ + --data-urlencode "time=${INCL_AFTER_TIME}" \ + "${PROM_URL}query" | jq -r 'if .data.result | length == 0 then "0" else .data.result[0].value[1] 
end' 2>/dev/null || echo "0") + # Use integer arithmetic to determine delta + DELTA=$(echo "$VAL_BEFORE_SUM $VAL_AFTER_SUM" | awk '{d=$2-$1; if(d<0) d=0; printf "%d", d}') + if [ "$DELTA" -gt 0 ] 2>/dev/null; then + INCL_METRIC_DELTA="missed" + elif [ "$VAL_AFTER_SUM" != "0" ] || [ "$VAL_BEFORE_SUM" != "0" ]; then + INCL_METRIC_DELTA="not_missed" + fi +fi + echo "=== Fetching Logs ===" LOGS_RAW=$(loki_query "$LOGQL") @@ -164,7 +195,7 @@ python3 - "$LOKI_TMPFILE" "$SLOT" "$SLOT_TIMESTAMP" "$DUTY_TYPE" \ "$LEADER_PEER_R1" "$LEADER_R1" \ "$LEADER_PEER_R2" "$LEADER_R2" \ "$LEADER_PEER_R3" "$LEADER_R3" \ - "$SLOT_TIME" <<'PYTHON_SCRIPT' + "$SLOT_TIME" "$INCL_METRIC_DELTA" <<'PYTHON_SCRIPT' import json import re import sys @@ -178,6 +209,7 @@ leader_peer_r1, leader_idx_r1 = sys.argv[5], sys.argv[6] leader_peer_r2, leader_idx_r2 = sys.argv[7], sys.argv[8] leader_peer_r3, leader_idx_r3 = sys.argv[9], sys.argv[10] slot_time = sys.argv[11] +incl_metric_delta = sys.argv[12] # "missed", "not_missed", or "unknown" slot_timestamp_ns = slot_timestamp * 1_000_000_000 @@ -258,6 +290,7 @@ tracker_all = False tracker_partial = False tracker_absent = "" tracker_missed = False +tracker_included = False tracker_broadcast_delay = "" error_peers = defaultdict(list) # peer -> [error messages] @@ -407,6 +440,17 @@ for ts_ns, peer, line in entries: "Not all peers participated", f"Absent: {tracker_absent}"))) + elif msg in ("Broadcasted block included on-chain", "Broadcasted blinded block included on-chain"): + if "tracker_included" not in seen_first: + seen_first.add("tracker_included") + tracker_included = True + pubkey = extract_logfmt(line, "pubkey") + tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") + label = "BLINDED BLOCK included on-chain" if "blinded" in msg else "BLOCK included on-chain" + timeline.append((ts_ns, 61, fmt(offset, "TRACKER", + label, + f"Pubkey: {pubkey}, Broadcast delay: {tracker_broadcast_delay}"))) + elif msg == "Broadcasted block never included 
on-chain": if "tracker_missed" not in seen_first: seen_first.add("tracker_missed") @@ -506,16 +550,23 @@ if duty_type == "proposer": if tracker_missed: delay_part = f" (broadcast_delay={tracker_broadcast_delay})" if tracker_broadcast_delay else "" print(f"Inclusion: MISSED - block never included on-chain{delay_part}") - elif tracker_all: - print("Inclusion: Block included on-chain") + elif tracker_included: + delay_part = f" (broadcast_delay={tracker_broadcast_delay})" if tracker_broadcast_delay else "" + print(f"Inclusion: Block included on-chain{delay_part}") + elif incl_metric_delta == "missed": + print("Inclusion: MISSED - inferred from core_tracker_inclusion_missed_total metric (no tracker log found)") + elif incl_metric_delta == "not_missed": + print("Inclusion: Block likely included on-chain (metric counter did not increase)") else: - print("Inclusion: Unknown (tracker event not found)") + print("Inclusion: Unknown (no tracker log or metric data found)") # Participation summary if tracker_partial: print(f"Participation: Not all peers participated (absent: {tracker_absent})") elif tracker_all: print("Participation: All peers participated") +else: + print("Participation: Unknown (tracker event not found)") # Error summary if error_peers: From 7fc12b1cb1cd13c6f4c885c2729d9aeb77a87580 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 17:41:53 +0300 Subject: [PATCH 5/7] Added feature flags to cluster-config --- .claude/skills/cluster-config/SKILL.md | 2 ++ scripts/cluster-config.sh | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.claude/skills/cluster-config/SKILL.md b/.claude/skills/cluster-config/SKILL.md index b2b196a765..4a8c5a3690 100644 --- a/.claude/skills/cluster-config/SKILL.md +++ b/.claude/skills/cluster-config/SKILL.md @@ -31,4 +31,6 @@ Present the results to the user in a readable format: - **Active Validators**: currently active validators - **Total Validators**: total validators in the cluster +The 
peers table includes: index, peer name, operator nickname, version, and **feature flags** (comma-separated list of enabled flags, or `-` if none / not reporting). + If the script exits with an error (cluster not found), relay the error and suggest the user double-check the cluster name spelling or try a different network. diff --git a/scripts/cluster-config.sh b/scripts/cluster-config.sh index e1c8a078f3..51d2137d75 100755 --- a/scripts/cluster-config.sh +++ b/scripts/cluster-config.sh @@ -82,9 +82,11 @@ fi # app_peerinfo_* metrics use 'peer' label (= cluster_peer value of the described peer). # app_peer_name uses 'cluster_peer' as key and 'peer_name' as the human-readable name. # Multiple nodes report peerinfo for all peers, so results are deduplicated by peer. +# app_feature_flags is reported by each node for itself, keyed by 'cluster_peer'. idx_raw=$(prom_query "app_peerinfo_index") nick_raw=$(prom_query "app_peerinfo_nickname") ver_raw=$(prom_query "app_peerinfo_version") +flags_raw=$(prom_query "app_feature_flags") echo "=== Cluster Info ===" echo "Name: ${CLUSTER_NAME}" @@ -99,17 +101,19 @@ jq -rn \ --argjson idx "$idx_raw" \ --argjson nicks "$nick_raw" \ --argjson vers "$ver_raw" \ + --argjson flags "$flags_raw" \ ' - ($nicks.data.result | map({(.metric.peer): (.metric.peer_nickname // "?")}) | add // {}) as $nick_map | - ($vers.data.result | map({(.metric.peer): (.metric.version // "?")}) | add // {}) as $ver_map | - ["INDEX", "PEER", "NICKNAME", "VERSION"], + ($nicks.data.result | map({(.metric.peer): (.metric.peer_nickname // "?")}) | add // {}) as $nick_map | + ($vers.data.result | map({(.metric.peer): (.metric.version // "?")}) | add // {}) as $ver_map | + ($flags.data.result | map({(.metric.cluster_peer): (.metric.feature_flags // "")}) | add // {}) as $flags_map | + ["INDEX", "PEER", "NICKNAME", "VERSION", "FEATURE_FLAGS"], ( $idx.data.result | map({peer: .metric.peer, index: (.value[1] | tonumber)}) | unique_by(.peer) | sort_by(.index) | .[] - | 
[(.index | tostring), .peer, ($nick_map[.peer] // "?"), ($ver_map[.peer] // "?")] + | [(.index | tostring), .peer, ($nick_map[.peer] // "?"), ($ver_map[.peer] // "?"), ($flags_map[.peer] // "-")] ) | @tsv ' | column -t -s $'\t' From bbf445ca8a6ab1a1e30cf6a96405308f3b1de2c7 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 4 Mar 2026 20:49:29 +0300 Subject: [PATCH 6/7] Redesigned for single script and skill --- .claude/skills/cluster-config/SKILL.md | 36 -- .claude/skills/consensus-leader/SKILL.md | 63 -- .claude/skills/duty-timeline/SKILL.md | 192 ------ .claude/skills/grafana-datasources/SKILL.md | 20 - .claude/skills/missed-proposal/SKILL.md | 198 +++++++ scripts/README.md | 94 --- scripts/cluster-config.sh | 119 ---- scripts/consensus-leader.sh | 134 ----- scripts/debug/missed_proposal.py | 610 ++++++++++++++++++++ scripts/duty-timeline.sh | 579 ------------------- scripts/grafana-datasources.sh | 34 -- 11 files changed, 808 insertions(+), 1271 deletions(-) delete mode 100644 .claude/skills/cluster-config/SKILL.md delete mode 100644 .claude/skills/consensus-leader/SKILL.md delete mode 100644 .claude/skills/duty-timeline/SKILL.md delete mode 100644 .claude/skills/grafana-datasources/SKILL.md create mode 100644 .claude/skills/missed-proposal/SKILL.md delete mode 100755 scripts/cluster-config.sh delete mode 100755 scripts/consensus-leader.sh create mode 100755 scripts/debug/missed_proposal.py delete mode 100644 scripts/duty-timeline.sh delete mode 100755 scripts/grafana-datasources.sh diff --git a/.claude/skills/cluster-config/SKILL.md b/.claude/skills/cluster-config/SKILL.md deleted file mode 100644 index 4a8c5a3690..0000000000 --- a/.claude/skills/cluster-config/SKILL.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -name: cluster-config -description: Fetch cluster configuration metrics (version, operators, threshold, validators) from Prometheus -user-invokable: true ---- - -# Cluster Config - -Fetch cluster configuration metrics from Prometheus for a given cluster 
name and optional network. - -## Arguments - -The user must provide: -- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf` -- **network** (optional, default: `mainnet`): e.g. `mainnet`, `hoodi` - -## Execution - -Run the script with the cluster name and network: -```bash -bash scripts/cluster-config.sh "" "" -``` - -## Output - -Present the results to the user in a readable format: -- **Cluster**: name and network -- **App Version**: charon version running -- **Operators**: number of operators in the cluster -- **Threshold**: signature threshold -- **Active Validators**: currently active validators -- **Total Validators**: total validators in the cluster - -The peers table includes: index, peer name, operator nickname, version, and **feature flags** (comma-separated list of enabled flags, or `-` if none / not reporting). - -If the script exits with an error (cluster not found), relay the error and suggest the user double-check the cluster name spelling or try a different network. diff --git a/.claude/skills/consensus-leader/SKILL.md b/.claude/skills/consensus-leader/SKILL.md deleted file mode 100644 index ce22c926ca..0000000000 --- a/.claude/skills/consensus-leader/SKILL.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -name: consensus-leader -description: Calculate consensus leader sequence for a given slot and cluster -user-invokable: true ---- - -# Consensus Leader - -Calculate the consensus leader sequence for a given slot number using the QBFT leader election formula. - -## Arguments - -The user must provide: -- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf` -- **slot number** (required): e.g. `13813408` -- **network** (optional, default: `mainnet`): e.g. `mainnet`, `hoodi` -- **duty type** (optional, default: `proposer`): e.g. 
`proposer`, `attester`, `randao`, `sync_message` - -## Duty Types - -Valid duty types (from `core/types.go`): -- `proposer` (1) - block proposal -- `attester` (2) - attestation -- `signature` (3) - generic signature -- `exit` (4) - voluntary exit -- `builder_registration` (6) - MEV builder registration -- `randao` (7) - RANDAO reveal -- `prepare_aggregator` (8) - aggregator preparation -- `aggregator` (9) - attestation aggregation -- `sync_message` (10) - sync committee message -- `prepare_sync_contribution` (11) - sync contribution preparation -- `sync_contribution` (12) - sync committee contribution -- `info_sync` (13) - info sync - -## Execution - -Run the script with the required arguments: -```bash -bash scripts/consensus-leader.sh "" [network] [duty_type] -``` - -## Leader Election Formula - -The consensus leader for each round is calculated as: -``` -leader_index = (slot + duty_type + round) % num_nodes -``` - -Where: -- `slot` is the beacon chain slot number -- `duty_type` is the numeric value of the duty type -- `round` is the QBFT consensus round (1, 2, or 3) -- `num_nodes` is the number of operators in the cluster - -## Output - -Present the results to the user including: -- **Slot Info**: slot number, epoch, slot within epoch, absolute time (UTC) -- **Network**: the Ethereum network -- **Duty**: the duty type being calculated -- **Leaders**: table showing round number, leader index, and peer name for rounds 1-3 - -This helps diagnose consensus issues by identifying which node was responsible for leading each round. 
diff --git a/.claude/skills/duty-timeline/SKILL.md b/.claude/skills/duty-timeline/SKILL.md deleted file mode 100644 index 01a02be073..0000000000 --- a/.claude/skills/duty-timeline/SKILL.md +++ /dev/null @@ -1,192 +0,0 @@ ---- -name: duty-timeline -description: Generate a comprehensive timeline of events for a duty across all peers -user-invokable: true ---- - -# Duty Timeline - -Generate a detailed timeline showing the complete lifecycle of a validator duty (block proposal, attestation, etc.) across all cluster peers by analyzing Loki logs. Shows **per-peer details** for key events like BN calls, broadcasts, and errors to support root cause analysis. - -## Arguments - -The user must provide: -- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf` -- **slot number** (required): e.g. `13813408` -- **network** (optional, default: `mainnet`): e.g. `mainnet`, `hoodi` -- **duty type** (optional, default: `proposer`): e.g. `proposer`, `attester`, `randao`, `sync_message`, `aggregator` - -## Execution - -Run the script with the required arguments: -```bash -bash scripts/duty-timeline.sh "" [network] [duty_type] -``` - -## What It Does - -1. Calculates expected consensus leaders for rounds 1, 2, and 3 -2. Queries Loki for all logs related to the duty across the time window -3. Parses logs with Python (handles nanosecond timestamps correctly) -4. Shows timing offset relative to slot start for each event -5. Displays **per-peer rows** for events where peer-level detail matters -6. 
Tracks events across all workflow components: - - Scheduler: slot ticks, duty resolution - - Fetcher: beacon node calls per peer with RTT - - QBFT: consensus start, round changes, decisions - - ValidatorAPI: block/blinded block proposals per peer - - SigAgg: threshold signature aggregation per peer - - Broadcast: submission per peer with delay - - SSE: block_gossip/block/head events, "too late" warnings per peer - - Tracker: participation and inclusion status - - Errors: consensus timeout / permanent failure per peer - -## Key Events Tracked - -| Component | Event | Per-peer? | Meaning | -|-----------|-------|-----------|---------| -| SCHED | Slot ticked | first | Slot started | -| SCHED | Resolved duty | first per pubkey | Duty assigned to validator | -| FETCHER | BN call start | yes | Fetching unsigned duty data | -| FETCHER | BN call done | yes | Data fetched (with RTT) | -| FETCHER | SLOW BN call | yes | Call took longer than expected | -| QBFT | Consensus started | first | QBFT instance initialized | -| QBFT | Round TIMEOUT | first per round | Round failed, moving to next | -| QBFT | Consensus DECIDED | first | Agreement reached | -| VAPI | Block proposal received | yes | VC submitted unblinded proposal | -| VAPI | Blinded block received | yes | VC submitted blinded proposal | -| SIGAGG | Threshold reached | yes | Enough partial sigs collected | -| BCAST | Broadcast SUCCESS | yes | Submitted to beacon node (with delay) | -| BCAST | TIMEOUT | yes | Duty expired before broadcast | -| SSE | block_gossip TOO LATE | yes | Late gossip event per peer | -| SSE | block event TOO LATE | yes | Late block event per peer | -| SSE | SSE block gossip/head/block event | first | Normal SSE events | -| TRACKER | All peers participated | first | Full participation | -| TRACKER | Not all peers participated | first | Some peers missing | -| TRACKER | BLOCK MISSED | first | Block never included on-chain | -| TRACKER | BLINDED BLOCK MISSED | first | Blinded block never included | -| 
ERROR | consensus timeout | yes | Per-peer consensus timeout | -| ERROR | permanent failure | yes | Per-peer permanent failure | - -## Output - -The script provides: - -1. **Duty Info**: slot, epoch, time, network, duty type -2. **Expected Consensus Leaders**: who should lead rounds 1, 2, 3 -3. **Event Timeline**: chronological sequence with timing offsets and per-peer detail -4. **Summary**: - - Consensus status (success/failure, round count) - - Block type (blinded/unblinded) - - Broadcast status with delay range (min-max across peers) - - BN call RTT range across peers - - Inclusion status (for proposer duties, with broadcast_delay) - - Participation status (with absent peers listed) - - Error summary per peer - -### Example Output - -``` -=== Duty Info === -Slot: 13810452 -Epoch: 431576 (slot 20 of 32) -Time: 2026-03-03T21:31:00Z -Network: mainnet -Duty: proposer - -=== Expected Consensus Leaders === -Round 1: curious-cat (index 2) -Round 2: daring-dog (index 3) -Round 3: eager-elk (index 4) - -=== Fetching Logs === -Found 87 log entries - -=== Event Timeline === -(Offset relative to slot start time: 2026-03-03T21:31:00Z) - - +0.005s [SCHED] Slot 13810452 started - +0.010s [SCHED] Resolved proposer duty (vidx=123456, pubkey=0x...) 
- +0.015s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [alpha-ant] - +0.016s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [brave-bee] - +0.018s [FETCHER] BN call start: /eth/v3/validator/blocks/13810452 [curious-cat] - +0.920s [FETCHER] BN call done: /eth/v3/validator/blocks/13810452 [alpha-ant] (RTT=0.9s) - +1.800s [FETCHER] BN call done: /eth/v3/validator/blocks/13810452 [brave-bee] (RTT=1.8s) - +2.100s [FETCHER] SLOW BN call: /eth/v3/validator/blocks/13810452 [curious-cat] (RTT=2.1s) - +2.110s [QBFT] Consensus started - +6.200s [QBFT] Round 1 TIMEOUT -> Round 2 - Reason: leader not proposing - +8.500s [QBFT] Consensus DECIDED in round 2 - Leader: daring-dog (index 3) - +8.600s [VAPI] Blinded block received [alpha-ant] (version=deneb) - +8.620s [VAPI] Blinded block received [brave-bee] (version=deneb) - +8.900s [SIGAGG] Threshold reached [alpha-ant] (submit_blinded_block) - +8.920s [SIGAGG] Threshold reached [brave-bee] (submit_blinded_block) - +9.000s [BCAST] Broadcast SUCCESS [alpha-ant] (delay=3.5s) - +9.020s [BCAST] Broadcast SUCCESS [brave-bee] (delay=3.52s) - +9.100s [SSE] block_gossip TOO LATE [alpha-ant] (delay=9.1s) - +9.150s [SSE] block event TOO LATE [brave-bee] (delay=9.15s) - +12.00s [ERROR] consensus timeout [average-road] - +480.0s [TRACKER] BLINDED BLOCK MISSED: never included on-chain - Pubkey: 0x..., Broadcast delay: 3.5s - +480.1s [TRACKER] Not all peers participated - Absent: average-road - -=== Summary === -Consensus: Completed in round 2 after 1 timeout(s) - Leader: daring-dog (index 3) - Round 1 leader curious-cat failed -Block type: blinded -Broadcast: Successfully submitted (delay range: 3.5s-3.5s) -BN call RTT: 0.9s-2.1s across 3 peers -Inclusion: MISSED - block never included on-chain (broadcast_delay=3.5s) -Participation: Not all peers participated (absent: average-road) -Errors: - - [average-road] consensus timeout -``` - -## Common Failure Patterns - -### Slow Beacon Node (per-peer) -``` - +0.920s 
[FETCHER] BN call done: /eth/v3/... [alpha-ant] (RTT=0.9s) - +2.100s [FETCHER] SLOW BN call: /eth/v3/... [curious-cat] (RTT=2.1s) -``` -Shows which specific peers have slow BN calls and the RTT spread. - -### Consensus Timeouts -``` - +6.200s [QBFT] Round 1 TIMEOUT -> Round 2 - Reason: leader not proposing -``` -Round 1 leader failed to propose, consensus moved to round 2. - -### Missed Block with Broadcast Delay -``` - +480.0s [TRACKER] BLINDED BLOCK MISSED: never included on-chain - Pubkey: 0x..., Broadcast delay: 3.5s -``` -Block was broadcast but not included on-chain. Summary includes broadcast_delay for correlation. - -### Per-peer Errors -``` - +12.00s [ERROR] consensus timeout [average-road] -``` -Shows which peer(s) experienced errors, helping identify the failing node. - -## Troubleshooting - -If no logs are found: -- Verify the cluster name spelling is exact -- Check the network is correct -- Confirm the slot had a duty (not all slots have all duty types) -- Logs may have been rotated if the slot is old - -## Dependencies - -This skill uses: -- `cluster-config.sh` - to get cluster info and peer names -- `grafana-datasources.sh` - to discover Loki URL -- Loki API - to query logs -- `python3` - to parse Loki JSON (handles nanosecond timestamps) -- Requires `OBOL_GRAFANA_API_TOKEN` environment variable diff --git a/.claude/skills/grafana-datasources/SKILL.md b/.claude/skills/grafana-datasources/SKILL.md deleted file mode 100644 index 58af8f0526..0000000000 --- a/.claude/skills/grafana-datasources/SKILL.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: grafana-datasources -description: Discover Prometheus and Loki datasource proxy URLs from Grafana -user-invokable: true ---- - -# Grafana Datasources - -Run the following script to discover Prometheus and Loki datasource proxy URLs from Grafana. The script requires the `OBOL_GRAFANA_API_TOKEN` environment variable. 
- -Execute this command: -```bash -bash scripts/grafana-datasources.sh -``` - -Present the two output URLs to the user: -- **Prometheus**: for querying metrics via the Prometheus HTTP API (e.g., `query`, `query_range`) -- **Loki**: for querying logs via the Loki HTTP API (e.g., `query`, `query_range`) - -This is a non-interactive skill. Do not ask the user any questions — just run the script and display the results. diff --git a/.claude/skills/missed-proposal/SKILL.md b/.claude/skills/missed-proposal/SKILL.md new file mode 100644 index 0000000000..0d935c3506 --- /dev/null +++ b/.claude/skills/missed-proposal/SKILL.md @@ -0,0 +1,198 @@ +```skill +--- +name: missed-proposal +description: Analyze a potentially missed block proposal for a cluster at a specific slot +user-invokable: true +--- + +# Missed Proposal Analysis + +Analyze a potentially missed block proposal by collecting cluster configuration, consensus leader information, and event logs from the specified slot. This skill gathers data and performs root cause analysis. + +## Arguments + +The user must provide: +- **cluster name** (required): e.g. `Lido x Obol: Ethereal Elf` +- **slot number** (required): e.g. `13813408` +- **network** (optional, default: `mainnet`): e.g. 
`mainnet`, `hoodi`, `sepolia` + +## Execution + +Run the Python script to collect all data: +```bash +python3 scripts/debug/missed_proposal.py "" [network] +``` + +The script outputs JSON with: +- `slot`: slot number, epoch, timestamp, time (UTC) +- `cluster`: cluster config (name, hash, version, operators, threshold, validators, peers) +- `cluster_found`: boolean indicating if cluster was found in Prometheus +- `leaders`: expected consensus leaders for rounds 1, 2, 3 +- `logs`: parsed log events from Loki with warnings +- `inclusion_metric`: "missed", "not_missed", or "unknown" + +## Handling Warnings + +The script may report warnings that require user action: + +### No Logs Available +If `logs.warnings` contains "No log streams found" or `logs.total_entries` is 0: +- Inform the user that no logs are available for this slot +- This could mean: logs have been rotated, the cluster didn't have this duty, or the cluster name is incorrect + +### Missing Peer Logs +If `logs.warnings` contains "Missing logs from peers": +- Report which peers are missing logs +- Explain that complete analysis may not be possible without all peer logs +- Ask the user if they can request logs from the missing operators + +### Cluster Not Found +If `cluster_found` is false: +- Report that the cluster was not found in Prometheus +- Suggest double-checking the cluster name spelling and network + +## Analysis and Output + +After collecting data, analyze and present findings in this format: + +### 1. Cluster Info +Present cluster configuration: +``` +=== Cluster Info === +Name: +Hash: +Version: +Network: +Nodes: (threshold: ) +Validators: active / total +``` + +### 2. Slot Info +Present slot details: +``` +=== Slot Info === +Slot: +Epoch: (slot of 32) +Time: +Duty: proposer +``` + +### 3. Expected Consensus Leaders +Present the leader table: +``` +=== Expected Consensus Leaders === +Round 1: (index ) +Round 2: (index ) +Round 3: (index ) +``` + +### 4. 
Event Timeline +Present key events chronologically with offset from slot start: +``` +=== Event Timeline === +(Offset relative to slot start time: ) + + +0.005s [SCHED] Slot started + +0.010s [SCHED] Resolved proposer duty (vidx=..., pubkey=...) + +0.015s [FETCHER] BN call start: [] + ... +``` + +Event types to show: +| Type | Tag | Description | +|------|-----|-------------| +| slot_ticked | SCHED | Slot started | +| resolved_duty | SCHED | Duty assigned to validator | +| bn_call_start | FETCHER | Fetching unsigned data from BN | +| bn_call_done | FETCHER | BN call completed with RTT | +| bn_call_slow | FETCHER | BN call took longer than expected | +| consensus_started | QBFT | Consensus instance started | +| round_timeout | QBFT | Round timed out, moving to next | +| consensus_decided | QBFT | Consensus decision reached | +| block_proposal_received | VAPI | VC submitted block proposal | +| threshold_reached | SIGAGG | Threshold signatures aggregated | +| broadcast_success | BCAST | Block broadcast to BN | +| broadcast_timeout | BCAST | Duty expired before broadcast | +| sse_block_gossip_late | SSE | Late block gossip event | +| sse_block_late | SSE | Late block event | +| tracker_all_participated | TRACKER | All peers participated | +| tracker_partial_participation | TRACKER | Some peers missing | +| tracker_block_included | TRACKER | Block included on-chain | +| tracker_block_missed | TRACKER | Block NOT included on-chain | +| error | ERROR | Error message from a peer | + +### 5. Summary +Provide analysis summary: + +**Consensus Status:** +- Did consensus complete? In which round? +- Were there round timeouts? Which leaders failed? + +**Block Type:** +- Was it a blinded or unblinded block? + +**Broadcast Status:** +- Was the block successfully broadcast? +- What was the broadcast delay range? + +**BN Call Performance:** +- What was the RTT range across peers? +- Were there any slow BN calls? + +**Inclusion Status:** +- Was the block included on-chain? 
(from logs or metric) +- If missed, what was the broadcast delay? + +**Participation:** +- Did all peers participate? +- Which peers were absent? + +**Errors:** +- List any errors per peer + +### 6. Root Cause Analysis + +Based on the data, provide a root cause analysis: + +**Common failure patterns:** + +1. **Leader failure in round 1** + - Round 1 leader did not propose + - Check if leader had connectivity issues or slow BN + +2. **Slow beacon node calls** + - High RTT on BN calls across peers + - May cause consensus to start late + +3. **Consensus timeout without decision** + - All rounds timed out + - Network connectivity issue between peers + +4. **Broadcast too late** + - Block was broadcast but with high delay (>4s) + - Block may have been included but orphaned + +5. **Partial participation** + - Some peers didn't participate + - Check if absent peers had logs at all + +6. **Block missed despite successful broadcast** + - Block was broadcast on time but not included + - May indicate relay/builder issues for MEV blocks + +## Example Usage + +User: "Analyze missed proposal for cluster 'Lido x Obol: Ethereal Elf' at slot 13813408" + +1. Run the script +2. Parse JSON output +3. Present cluster info, leaders, timeline +4. If block was missed, identify likely root cause +5. If data is incomplete, inform user what additional data is needed + +## Dependencies + +- `python3` (standard library only) +- `OBOL_GRAFANA_API_TOKEN` environment variable must be set +- Access to Grafana datasources (Prometheus and Loki) +``` diff --git a/scripts/README.md b/scripts/README.md index f981f2568a..68174e4dbb 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -37,100 +37,6 @@ It helps expand an existing cluster with new validators, given the same operator The script will execute `node_merge.sh` for each `nodeX` subfolder found in the source cluster. 
-## Monitoring and Diagnostics Scripts - -The following scripts query Obol's Grafana/Prometheus/Loki observability stack and require the `OBOL_GRAFANA_API_TOKEN` environment variable to be set: - -```bash -export OBOL_GRAFANA_API_TOKEN= -``` - -### `grafana-datasources.sh` - -Discovers Prometheus and Loki datasource proxy URLs from Grafana. Used internally by the other monitoring scripts. - -#### Usage - -```bash -./grafana-datasources.sh -``` - -Outputs two lines: -``` -PROMETHEUS_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//api/v1/ -LOKI_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//loki/api/v1/ -``` - -### `cluster-config.sh` - -Fetches cluster configuration metrics (version, operators, threshold, validators, and per-peer info) from Prometheus via Grafana proxy. - -#### Usage - -```bash -./cluster-config.sh [network] -``` - -- **: Human-readable cluster name (e.g., `"Lido x Obol: Ethereal Elf"`). -- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. - -#### Example - -```bash -./cluster-config.sh "Lido x Obol: Ethereal Elf" mainnet -``` - -### `consensus-leader.sh` - -Calculates the consensus leader sequence for a given slot and cluster using the QBFT leader election formula: `(slot + dutyType + round) % nodes`. - -#### Usage - -```bash -./consensus-leader.sh [network] [duty_type] -``` - -- **: Human-readable cluster name. -- **: Beacon chain slot number (e.g., `13813408`). -- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. -- *[duty_type]*: Duty type — `proposer` (default), `attester`, `randao`, etc. - -#### Example - -```bash -./consensus-leader.sh "Lido x Obol: Ethereal Elf" 13813408 mainnet proposer -``` - -### `duty-timeline.sh` - -Generates a comprehensive chronological timeline of events for a specific duty across all peers, pulling logs from Loki and cluster metrics from Prometheus. Useful for post-mortem analysis of missed blocks or attestations. 
- -#### Usage - -```bash -./duty-timeline.sh [network] [duty_type] -``` - -- **: Human-readable cluster name. -- **: Beacon chain slot number (e.g., `13813408`). -- *[network]*: Network name — `mainnet` (default), `hoodi`, `sepolia`, etc. -- *[duty_type]*: Duty type — `proposer` (default), `attester`, `randao`, etc. - -#### Example - -```bash -./duty-timeline.sh "Lido x Obol: Ethereal Elf" 13813408 mainnet proposer -``` - -The script outputs duty info, expected consensus leaders, a chronological event timeline with offsets relative to slot start, and a summary covering consensus outcome, broadcast status, block inclusion, and peer participation. - -## Requirements - -All scripts require **bash** (standard on Linux/macOS) and **jq** (version 1.5+). -Install via `sudo apt-get install jq` (Debian/Ubuntu) or `brew install jq` (macOS). - -The monitoring and diagnostics scripts additionally require **curl** and **bc**. - ## Important Warnings - Always back up your `cluster-lock.json`, node folders, and `validator_keys` folders before use. diff --git a/scripts/cluster-config.sh b/scripts/cluster-config.sh deleted file mode 100755 index 51d2137d75..0000000000 --- a/scripts/cluster-config.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env bash -# Fetches cluster configuration metrics from Prometheus via Grafana proxy. -# Requires OBOL_GRAFANA_API_TOKEN environment variable. -# Usage: bash scripts/cluster-config.sh [network] -# cluster_name: e.g. "Lido x Obol: Ethereal Elf" -# network: mainnet (default), hoodi, sepolia, etc. 
- -set -euo pipefail - -CLUSTER_NAME="${1:-}" -NETWORK="${2:-mainnet}" - -if [ -z "$CLUSTER_NAME" ]; then - echo "Error: cluster name is required" >&2 - echo "Usage: bash scripts/cluster-config.sh [network]" >&2 - exit 1 -fi - -if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then - echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 - exit 1 -fi - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Discover Prometheus proxy URL -PROM_URL=$("$SCRIPT_DIR/grafana-datasources.sh" | grep '^PROMETHEUS_URL=' | cut -d= -f2-) - -if [ -z "$PROM_URL" ]; then - echo "Error: could not discover Prometheus URL" >&2 - exit 1 -fi - -AUTH="Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" - -prom_query() { - local metric="$1" - curl -sf -G \ - -H "$AUTH" \ - --data-urlencode "query=${metric}{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"}" \ - "${PROM_URL}query" -} - -query_metric() { - local metric="$1" - local result - result=$(prom_query "$metric") - - if [ "$metric" = "app_version" ]; then - echo "$result" | jq -r '[.data.result[].metric.version] | unique | sort | join(", ") | if . == "" then "NOT_FOUND" else . end' - else - echo "$result" | jq -r 'if .data.result | length == 0 then "NOT_FOUND" else .data.result[0].value[1] end' - fi -} - -# Query cluster-level metrics; reuse operators raw result to extract common labels. 
-operators_raw=$(prom_query "cluster_operators") -operators=$(echo "$operators_raw" | jq -r 'if .data.result | length == 0 then "NOT_FOUND" else .data.result[0].value[1] end') -cluster_hash=$(echo "$operators_raw" | jq -r '.data.result[0].metric.cluster_hash // "NOT_FOUND"') - -version=$(query_metric "app_version") -threshold=$(query_metric "cluster_threshold") -active_validators=$(query_metric "core_scheduler_validators_active") -total_validators=$(query_metric "cluster_validators") - -# Check if cluster was found -all_not_found=true -for val in "$version" "$operators" "$threshold" "$active_validators" "$total_validators"; do - if [ -n "$val" ] && [ "$val" != "NOT_FOUND" ]; then - all_not_found=false - break - fi -done - -if $all_not_found; then - echo "Error: no cluster found for name=\"${CLUSTER_NAME}\" network=\"${NETWORK}\"" >&2 - echo "Please double-check the cluster name and network." >&2 - exit 1 -fi - -# Query per-peer info metrics for the peer table. -# app_peerinfo_* metrics use 'peer' label (= cluster_peer value of the described peer). -# app_peer_name uses 'cluster_peer' as key and 'peer_name' as the human-readable name. -# Multiple nodes report peerinfo for all peers, so results are deduplicated by peer. -# app_feature_flags is reported by each node for itself, keyed by 'cluster_peer'. 
-idx_raw=$(prom_query "app_peerinfo_index") -nick_raw=$(prom_query "app_peerinfo_nickname") -ver_raw=$(prom_query "app_peerinfo_version") -flags_raw=$(prom_query "app_feature_flags") - -echo "=== Cluster Info ===" -echo "Name: ${CLUSTER_NAME}" -echo "Hash: ${cluster_hash}" -echo "Version: ${version}" -echo "Network: ${NETWORK}" -echo "Nodes: ${operators} (threshold: ${threshold})" -echo "Validators: ${active_validators} active / ${total_validators} total" -echo "" -echo "=== Peers Info ===" -jq -rn \ - --argjson idx "$idx_raw" \ - --argjson nicks "$nick_raw" \ - --argjson vers "$ver_raw" \ - --argjson flags "$flags_raw" \ - ' - ($nicks.data.result | map({(.metric.peer): (.metric.peer_nickname // "?")}) | add // {}) as $nick_map | - ($vers.data.result | map({(.metric.peer): (.metric.version // "?")}) | add // {}) as $ver_map | - ($flags.data.result | map({(.metric.cluster_peer): (.metric.feature_flags // "")}) | add // {}) as $flags_map | - ["INDEX", "PEER", "NICKNAME", "VERSION", "FEATURE_FLAGS"], - ( - $idx.data.result - | map({peer: .metric.peer, index: (.value[1] | tonumber)}) - | unique_by(.peer) - | sort_by(.index) - | .[] - | [(.index | tostring), .peer, ($nick_map[.peer] // "?"), ($ver_map[.peer] // "?"), ($flags_map[.peer] // "-")] - ) - | @tsv - ' | column -t -s $'\t' diff --git a/scripts/consensus-leader.sh b/scripts/consensus-leader.sh deleted file mode 100755 index 9de1229421..0000000000 --- a/scripts/consensus-leader.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env bash -# Calculates consensus leader sequence for a given slot number. -# Requires OBOL_GRAFANA_API_TOKEN environment variable (passed to cluster-config.sh). -# Usage: bash scripts/consensus-leader.sh [network] [duty_type] -# cluster_name: e.g. "Lido x Obol: Ethereal Elf" -# slot: slot number (e.g. 13813408) -# network: mainnet (default), hoodi, sepolia, etc. -# duty_type: proposer (default), attester, randao, etc. 
- -set -euo pipefail - -CLUSTER_NAME="${1:-}" -SLOT="${2:-}" -NETWORK="${3:-mainnet}" -DUTY_TYPE="${4:-proposer}" - -if [ -z "$CLUSTER_NAME" ] || [ -z "$SLOT" ]; then - echo "Error: cluster name and slot are required" >&2 - echo "Usage: bash scripts/consensus-leader.sh [network] [duty_type]" >&2 - exit 1 -fi - -# Duty type name to numeric value mapping (from core/types.go) -declare -A DUTY_MAP=( - [unknown]=0 - [proposer]=1 - [attester]=2 - [signature]=3 - [exit]=4 - [builder_proposer]=5 - [builder_registration]=6 - [randao]=7 - [prepare_aggregator]=8 - [aggregator]=9 - [sync_message]=10 - [prepare_sync_contribution]=11 - [sync_contribution]=12 - [info_sync]=13 -) - -DUTY_VALUE="${DUTY_MAP[$DUTY_TYPE]:-}" -if [ -z "$DUTY_VALUE" ]; then - echo "Error: unknown duty type '$DUTY_TYPE'" >&2 - echo "Valid types: ${!DUTY_MAP[*]}" >&2 - exit 1 -fi - -# Network genesis timestamps and slots per epoch -declare -A GENESIS_TIME=( - [mainnet]=1606824023 - [hoodi]=1742212800 - [sepolia]=1655733600 -) - -SLOTS_PER_EPOCH=32 -SECONDS_PER_SLOT=12 - -# Get genesis time for the network -GENESIS="${GENESIS_TIME[$NETWORK]:-}" -if [ -z "$GENESIS" ]; then - echo "Warning: unknown genesis time for network '$NETWORK', skipping time calculation" >&2 -fi - -# Fetch cluster config using cluster-config.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CLUSTER_OUTPUT=$("$SCRIPT_DIR/cluster-config.sh" "$CLUSTER_NAME" "$NETWORK") - -# Extract number of nodes from "Nodes: N (threshold: T)" -NODES=$(echo "$CLUSTER_OUTPUT" | grep '^Nodes:' | sed -E 's/^Nodes:[[:space:]]*([0-9]+).*/\1/') - -if [ -z "$NODES" ] || [ "$NODES" -eq 0 ]; then - echo "Error: could not determine number of nodes from cluster config" >&2 - exit 1 -fi - -# Extract peer info lines (INDEX PEER NICKNAME VERSION) -# Skip header line, capture peers in order -declare -a PEERS -while IFS= read -r line; do - # Skip header and empty lines - if [[ "$line" =~ ^INDEX ]] || [ -z "$line" ]; then - continue - fi - # Extract peer 
name (second column) - PEER=$(echo "$line" | awk '{print $2}') - PEERS+=("$PEER") -done < <(echo "$CLUSTER_OUTPUT" | sed -n '/=== Peers/,$ p' | tail -n +2) - -# If we couldn't parse peers, create placeholder names -if [ ${#PEERS[@]} -eq 0 ]; then - for ((i=0; i/dev/null || TZ=UTC date -d "@$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo "") -fi - -# Calculate leader indices for rounds 1, 2, 3 -# Formula: (slot + dutyType + round) % nodes -calc_leader() { - local round=$1 - echo $(( (SLOT + DUTY_VALUE + round) % NODES )) -} - -LEADER_R1=$(calc_leader 1) -LEADER_R2=$(calc_leader 2) -LEADER_R3=$(calc_leader 3) - -# Output results -echo "=== Slot Info ===" -echo "Slot: ${SLOT}" -echo "Epoch: ${EPOCH} (slot ${SLOT_IN_EPOCH} of ${SLOTS_PER_EPOCH})" -if [ -n "$SLOT_TIME" ]; then - echo "Time: ${SLOT_TIME}" -fi -echo "Network: ${NETWORK}" -echo "Duty: ${DUTY_TYPE} (value: ${DUTY_VALUE})" -echo "" -echo "=== Consensus Leaders ===" -echo "Cluster: ${CLUSTER_NAME} (${NODES} nodes)" -echo "" -printf "%-8s %-5s %-20s\n" "ROUND" "INDEX" "PEER" -printf "%-8s %-5s %-20s\n" "1" "$LEADER_R1" "${PEERS[$LEADER_R1]:-unknown}" -printf "%-8s %-5s %-20s\n" "2" "$LEADER_R2" "${PEERS[$LEADER_R2]:-unknown}" -printf "%-8s %-5s %-20s\n" "3" "$LEADER_R3" "${PEERS[$LEADER_R3]:-unknown}" diff --git a/scripts/debug/missed_proposal.py b/scripts/debug/missed_proposal.py new file mode 100755 index 0000000000..a2720bdf5c --- /dev/null +++ b/scripts/debug/missed_proposal.py @@ -0,0 +1,610 @@ +#!/usr/bin/env python3 +""" +Collects data for missed proposal analysis. +Requires OBOL_GRAFANA_API_TOKEN environment variable. +Usage: python missed_proposal.py [network] + cluster_name: e.g. "Lido x Obol: Ethereal Elf" + slot: slot number (e.g. 13813408) + network: mainnet (default), hoodi, sepolia, etc. + +Outputs JSON with cluster config, consensus leaders, logs, and inclusion metrics. 
+""" + +import json +import os +import re +import sys +import urllib.error +import urllib.parse +import urllib.request +from datetime import datetime, timezone + +GRAFANA_BASE = "https://grafana.monitoring.gcp.obol.tech" + +# Network genesis timestamps (Unix seconds) +GENESIS_TIME = { + "mainnet": 1606824023, + "hoodi": 1742212800, + "sepolia": 1655733600, +} + +SLOTS_PER_EPOCH = 32 +SECONDS_PER_SLOT = 12 + +# Proposer duty type value (from core/types.go) +DUTY_TYPE_PROPOSER = 1 + + +def get_auth_header() -> dict: + """Return authorization header using OBOL_GRAFANA_API_TOKEN.""" + token = os.environ.get("OBOL_GRAFANA_API_TOKEN") + if not token: + return {} + return {"Authorization": f"Bearer {token}"} + + +def fetch_json(url: str, headers: dict, silent: bool = False) -> dict | None: + """Fetch JSON from URL with headers. Returns None on error if silent.""" + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode()) + except urllib.error.HTTPError as e: + if not silent: + print(f"Error: HTTP {e.code} fetching {url}", file=sys.stderr) + return None + except urllib.error.URLError as e: + if not silent: + print(f"Error: {e.reason}", file=sys.stderr) + return None + except Exception as e: + if not silent: + print(f"Error: {e}", file=sys.stderr) + return None + + +def discover_datasources(headers: dict) -> tuple[str | None, str | None]: + """Discover Prometheus and Loki datasource proxy URLs from Grafana.""" + url = f"{GRAFANA_BASE}/api/datasources" + datasources = fetch_json(url, headers) + if not datasources: + return None, None + + prom_id = None + loki_id = None + for ds in datasources: + if ds.get("type") == "prometheus" and ds.get("name") == "prometheus": + prom_id = ds.get("id") + if ds.get("type") == "loki" and ds.get("name") == "Loki": + loki_id = ds.get("id") + + prom_url = f"{GRAFANA_BASE}/api/datasources/proxy/{prom_id}/api/v1/" if prom_id else None + loki_url = 
f"{GRAFANA_BASE}/api/datasources/proxy/{loki_id}/loki/api/v1/" if loki_id else None + return prom_url, loki_url + + +def prom_query(prom_url: str, headers: dict, query: str) -> dict | None: + """Query Prometheus with raw query.""" + url = f"{prom_url}query?query={urllib.parse.quote(query)}" + return fetch_json(url, headers, silent=True) + + +def prom_query_at_time(prom_url: str, headers: dict, query: str, timestamp: int) -> dict | None: + """Query Prometheus at a specific timestamp.""" + url = f"{prom_url}query?query={urllib.parse.quote(query)}&time={timestamp}" + return fetch_json(url, headers, silent=True) + + +def prom_query_cluster(prom_url: str, headers: dict, metric: str, cluster_name: str, network: str) -> dict | None: + """Query Prometheus for a metric with cluster labels.""" + query = f'{metric}{{cluster_name="{cluster_name}",cluster_network="{network}"}}' + return prom_query(prom_url, headers, query) + + +def extract_metric_value(result: dict | None, metric: str = "") -> str: + """Extract value from Prometheus query result.""" + if not result: + return "NOT_FOUND" + data = result.get("data", {}).get("result", []) + if not data: + return "NOT_FOUND" + + if metric == "app_version": + versions = sorted(set(r.get("metric", {}).get("version", "?") for r in data)) + return ", ".join(versions) if versions else "NOT_FOUND" + + return data[0].get("value", [None, "NOT_FOUND"])[1] + + +def loki_query(loki_url: str, headers: dict, logql: str, start_ns: int, end_ns: int) -> dict | None: + """Query Loki for logs.""" + params = urllib.parse.urlencode({ + "query": logql, + "start": str(start_ns), + "end": str(end_ns), + "limit": "2000", + }) + url = f"{loki_url}query_range?{params}" + return fetch_json(url, headers, silent=True) + + +def get_cluster_config(prom_url: str, headers: dict, cluster_name: str, network: str) -> dict: + """Fetch cluster configuration from Prometheus.""" + config = { + "name": cluster_name, + "network": network, + "cluster_hash": "NOT_FOUND", + 
"version": "NOT_FOUND", + "operators": "NOT_FOUND", + "threshold": "NOT_FOUND", + "active_validators": "NOT_FOUND", + "total_validators": "NOT_FOUND", + "peers": [], + } + + if not prom_url: + return config + + # Query cluster-level metrics + operators_raw = prom_query_cluster(prom_url, headers, "cluster_operators", cluster_name, network) + config["operators"] = extract_metric_value(operators_raw) + + # Extract cluster_hash + if operators_raw: + data = operators_raw.get("data", {}).get("result", []) + if data: + config["cluster_hash"] = data[0].get("metric", {}).get("cluster_hash", "NOT_FOUND") + + config["version"] = extract_metric_value( + prom_query_cluster(prom_url, headers, "app_version", cluster_name, network), "app_version" + ) + config["threshold"] = extract_metric_value( + prom_query_cluster(prom_url, headers, "cluster_threshold", cluster_name, network) + ) + config["active_validators"] = extract_metric_value( + prom_query_cluster(prom_url, headers, "core_scheduler_validators_active", cluster_name, network) + ) + config["total_validators"] = extract_metric_value( + prom_query_cluster(prom_url, headers, "cluster_validators", cluster_name, network) + ) + + # Query per-peer info + idx_raw = prom_query_cluster(prom_url, headers, "app_peerinfo_index", cluster_name, network) + nick_raw = prom_query_cluster(prom_url, headers, "app_peerinfo_nickname", cluster_name, network) + ver_raw = prom_query_cluster(prom_url, headers, "app_peerinfo_version", cluster_name, network) + + # Build lookup maps + nick_map = {} + if nick_raw: + for r in nick_raw.get("data", {}).get("result", []): + peer = r.get("metric", {}).get("peer") + if peer: + nick_map[peer] = r.get("metric", {}).get("peer_nickname", "?") + + ver_map = {} + if ver_raw: + for r in ver_raw.get("data", {}).get("result", []): + peer = r.get("metric", {}).get("peer") + if peer: + ver_map[peer] = r.get("metric", {}).get("version", "?") + + # Build peer list + peers = [] + seen_peers = set() + if idx_raw: + for r in 
idx_raw.get("data", {}).get("result", []): + peer = r.get("metric", {}).get("peer") + index = int(r.get("value", [None, 0])[1]) + if peer and peer not in seen_peers: + seen_peers.add(peer) + peers.append({ + "index": index, + "peer": peer, + "nickname": nick_map.get(peer, "?"), + "version": ver_map.get(peer, "?"), + }) + + peers.sort(key=lambda x: x["index"]) + config["peers"] = peers + + return config + + +def calculate_leaders(slot: int, num_nodes: int, peers: list[dict]) -> list[dict]: + """Calculate consensus leaders for rounds 1, 2, 3.""" + leaders = [] + for round_num in range(1, 4): + leader_index = (slot + DUTY_TYPE_PROPOSER + round_num) % num_nodes + peer_name = "unknown" + for p in peers: + if p["index"] == leader_index: + peer_name = p["peer"] + break + leaders.append({ + "round": round_num, + "index": leader_index, + "peer": peer_name, + }) + return leaders + + +def extract_logfmt(line: str, field: str) -> str: + """Extract a field value from a logfmt-formatted line.""" + # Try quoted value first + m = re.search(rf'{field}="([^"]*)"', line) + if m: + return m.group(1) + # Try unquoted value + m = re.search(rf"{field}=(\S+)", line) + if m: + return m.group(1) + return "" + + +def parse_logs(logs_raw: dict | None, slot: int, slot_timestamp: int) -> dict: + """Parse Loki logs and extract relevant events.""" + result = { + "total_entries": 0, + "peers_with_logs": [], + "events": [], + "warnings": [], + } + + if not logs_raw: + result["warnings"].append("No logs returned from Loki") + return result + + streams = logs_raw.get("data", {}).get("result", []) + if not streams: + result["warnings"].append("No log streams found for this slot/duty") + return result + + slot_timestamp_ns = slot_timestamp * 1_000_000_000 + + # Collect all entries + entries = [] + peers_seen = set() + for stream in streams: + peer = stream.get("stream", {}).get("cluster_peer", "unknown") + peers_seen.add(peer) + for ts_str, line in stream.get("values", []): + 
entries.append((int(ts_str), peer, line)) + + result["peers_with_logs"] = sorted(peers_seen) + result["total_entries"] = len(entries) + entries.sort(key=lambda x: x[0]) + + # Parse events + seen_first = set() + events = [] + + for ts_ns, peer, line in entries: + msg = extract_logfmt(line, "msg") + level = extract_logfmt(line, "level") + if not msg: + continue + + offset_ms = (ts_ns - slot_timestamp_ns) / 1_000_000 + offset_s = offset_ms / 1000 + + event = { + "offset_s": round(offset_s, 3), + "peer": peer, + "type": "", + "details": {}, + } + + # --- SCHEDULER --- + if msg == "Slot ticked": + if "slot_ticked" not in seen_first: + seen_first.add("slot_ticked") + event["type"] = "slot_ticked" + events.append(event) + + elif msg == "Resolved proposer duty": + pubkey = extract_logfmt(line, "pubkey") + vidx = extract_logfmt(line, "vidx") + key = f"resolved:{pubkey}" + if key not in seen_first: + seen_first.add(key) + event["type"] = "resolved_duty" + event["details"] = {"pubkey": pubkey, "vidx": vidx} + events.append(event) + + # --- FETCHER --- + elif msg == "Calling beacon node endpoint...": + endpoint = extract_logfmt(line, "endpoint") + event["type"] = "bn_call_start" + event["details"] = {"endpoint": endpoint} + events.append(event) + + elif msg == "Beacon node call finished": + endpoint = extract_logfmt(line, "endpoint") + rtt = extract_logfmt(line, "rtt") + event["type"] = "bn_call_done" + event["details"] = {"endpoint": endpoint, "rtt": rtt} + events.append(event) + + elif msg == "Beacon node call took longer than expected": + endpoint = extract_logfmt(line, "endpoint") + rtt = extract_logfmt(line, "rtt") + event["type"] = "bn_call_slow" + event["details"] = {"endpoint": endpoint, "rtt": rtt} + events.append(event) + + # --- CONSENSUS --- + elif msg == "QBFT consensus instance starting": + if "consensus_started" not in seen_first: + seen_first.add("consensus_started") + event["type"] = "consensus_started" + events.append(event) + + elif msg == "QBFT round 
changed": + old_round = extract_logfmt(line, "round") + new_round = extract_logfmt(line, "new_round") + reason = extract_logfmt(line, "timeout_reason") + key = f"round_change:{old_round}" + if key not in seen_first: + seen_first.add(key) + event["type"] = "round_timeout" + event["details"] = {"old_round": old_round, "new_round": new_round, "reason": reason} + events.append(event) + + elif msg == "QBFT consensus decided": + if "consensus_decided" not in seen_first: + seen_first.add("consensus_decided") + event["type"] = "consensus_decided" + event["details"] = { + "round": extract_logfmt(line, "round"), + "leader_name": extract_logfmt(line, "leader_name"), + "leader_index": extract_logfmt(line, "leader_index"), + } + events.append(event) + + # --- VALIDATOR API --- + elif msg == "Beacon block proposal received from validator client": + block_version = extract_logfmt(line, "block_version") + event["type"] = "block_proposal_received" + event["details"] = {"block_version": block_version, "blinded": False} + events.append(event) + + elif msg == "Blinded beacon block received from validator client": + block_version = extract_logfmt(line, "block_version") + event["type"] = "block_proposal_received" + event["details"] = {"block_version": block_version, "blinded": True} + events.append(event) + + # --- SIG AGGREGATION --- + elif msg == "Successfully aggregated partial signatures to reach threshold": + vapi_endpoint = extract_logfmt(line, "vapi_endpoint") + event["type"] = "threshold_reached" + event["details"] = {"vapi_endpoint": vapi_endpoint} + events.append(event) + + # --- BROADCAST --- + elif msg in ("Successfully submitted proposal to beacon node", + "Successfully submitted block proposal to beacon node"): + delay = extract_logfmt(line, "delay") + event["type"] = "broadcast_success" + event["details"] = {"delay": delay} + events.append(event) + + elif msg == "Timeout calling bcast/broadcast, duty expired": + vapi_endpoint = extract_logfmt(line, "vapi_endpoint") + 
event["type"] = "broadcast_timeout" + event["details"] = {"vapi_endpoint": vapi_endpoint} + events.append(event) + + # --- SSE EVENTS --- + elif msg == "Beacon node received block_gossip event too late": + delay = extract_logfmt(line, "gossip_delay") or extract_logfmt(line, "delay") + event["type"] = "sse_block_gossip_late" + event["details"] = {"delay": delay} + events.append(event) + + elif msg == "Beacon node received block event too late": + delay = extract_logfmt(line, "block_delay") or extract_logfmt(line, "delay") + event["type"] = "sse_block_late" + event["details"] = {"delay": delay} + events.append(event) + + # --- TRACKER --- + elif msg == "All peers participated in duty": + if "tracker_all" not in seen_first: + seen_first.add("tracker_all") + event["type"] = "tracker_all_participated" + events.append(event) + + elif msg == "Not all peers participated in duty": + if "tracker_partial" not in seen_first: + seen_first.add("tracker_partial") + absent = extract_logfmt(line, "absent") + event["type"] = "tracker_partial_participation" + event["details"] = {"absent": absent} + events.append(event) + + elif msg in ("Broadcasted block included on-chain", "Broadcasted blinded block included on-chain"): + if "tracker_included" not in seen_first: + seen_first.add("tracker_included") + pubkey = extract_logfmt(line, "pubkey") + broadcast_delay = extract_logfmt(line, "broadcast_delay") + event["type"] = "tracker_block_included" + event["details"] = { + "pubkey": pubkey, + "broadcast_delay": broadcast_delay, + "blinded": "blinded" in msg, + } + events.append(event) + + elif msg in ("Broadcasted block never included on-chain", "Broadcasted blinded block never included on-chain"): + if "tracker_missed" not in seen_first: + seen_first.add("tracker_missed") + pubkey = extract_logfmt(line, "pubkey") + broadcast_delay = extract_logfmt(line, "broadcast_delay") + event["type"] = "tracker_block_missed" + event["details"] = { + "pubkey": pubkey, + "broadcast_delay": 
broadcast_delay, + "blinded": "blinded" in msg, + } + events.append(event) + + # --- ERRORS --- + elif level == "error" and ("consensus timeout" in msg.lower() or "permanent failure" in msg.lower()): + event["type"] = "error" + event["details"] = {"message": msg} + events.append(event) + + result["events"] = events + return result + + +def check_inclusion_metric(prom_url: str, headers: dict, cluster_name: str, network: str, slot_timestamp: int) -> str: + """Check inclusion metric delta to determine if block was missed.""" + if not prom_url: + return "unknown" + + # InclCheckLag=6 slots, InclMissedLag=32 slots (from core/tracker/inclusion.go) + incl_check_lag = 6 + incl_missed_lag = 32 + + before_time = slot_timestamp + incl_check_lag * SECONDS_PER_SLOT - 1 + after_time = slot_timestamp + (incl_missed_lag + 2) * SECONDS_PER_SLOT + + metric_query = f'sum(core_tracker_inclusion_missed_total{{cluster_name="{cluster_name}",cluster_network="{network}",duty="proposer"}})' + + before_result = prom_query_at_time(prom_url, headers, metric_query, before_time) + after_result = prom_query_at_time(prom_url, headers, metric_query, after_time) + + val_before = extract_metric_value(before_result) + val_after = extract_metric_value(after_result) + + try: + before_val = float(val_before) if val_before != "NOT_FOUND" else 0 + after_val = float(val_after) if val_after != "NOT_FOUND" else 0 + delta = after_val - before_val + if delta > 0: + return "missed" + if val_after != "NOT_FOUND" or val_before != "NOT_FOUND": + return "not_missed" + except (ValueError, TypeError): + pass + + return "unknown" + + +def main(): + if len(sys.argv) < 3: + print(json.dumps({ + "error": "cluster name and slot are required", + "usage": "python missed_proposal.py [network]", + })) + sys.exit(1) + + cluster_name = sys.argv[1] + try: + slot = int(sys.argv[2]) + except ValueError: + print(json.dumps({"error": f"invalid slot number: {sys.argv[2]}"})) + sys.exit(1) + + network = sys.argv[3] if len(sys.argv) > 3 
else "mainnet" + + # Check for auth token + headers = get_auth_header() + if not headers: + print(json.dumps({"error": "OBOL_GRAFANA_API_TOKEN environment variable is not set"})) + sys.exit(1) + + # Discover datasources + prom_url, loki_url = discover_datasources(headers) + if not prom_url and not loki_url: + print(json.dumps({"error": "Could not discover Prometheus or Loki datasources from Grafana"})) + sys.exit(1) + + # Calculate slot timing + genesis = GENESIS_TIME.get(network) + slot_timestamp = None + slot_time = None + epoch = slot // SLOTS_PER_EPOCH + slot_in_epoch = slot % SLOTS_PER_EPOCH + + if genesis: + slot_timestamp = genesis + slot * SECONDS_PER_SLOT + slot_time = datetime.fromtimestamp(slot_timestamp, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Collect cluster config + cluster_config = get_cluster_config(prom_url, headers, cluster_name, network) + + # Check if cluster was found + cluster_found = not all( + v in (None, "", "NOT_FOUND") + for v in [cluster_config["version"], cluster_config["operators"], cluster_config["threshold"]] + ) + + # Calculate leaders + leaders = [] + num_nodes = 0 + if cluster_found: + try: + num_nodes = int(cluster_config["operators"]) + except (ValueError, TypeError): + num_nodes = len(cluster_config["peers"]) + + if num_nodes > 0: + leaders = calculate_leaders(slot, num_nodes, cluster_config["peers"]) + + # Query logs from Loki + logs_data = {"total_entries": 0, "peers_with_logs": [], "events": [], "warnings": []} + if loki_url and slot_timestamp: + # Time window: 15 seconds before slot to ~8 minutes after (for tracker) + start_ns = (slot_timestamp - 15) * 1_000_000_000 + end_ns = (slot_timestamp + 500) * 1_000_000_000 + + # Query pattern for proposer duty + duty_pattern = f"{slot}/proposer" + logql = f'{{cluster_name="{cluster_name}",cluster_network="{network}"}} |~ `{duty_pattern}|duty=proposer.*slot={slot}|slot.*{slot}.*proposer|block_slot={slot}`' + + logs_raw = loki_query(loki_url, headers, logql, start_ns, 
end_ns) + logs_data = parse_logs(logs_raw, slot, slot_timestamp) + elif not loki_url: + logs_data["warnings"].append("Loki datasource not available") + elif not slot_timestamp: + logs_data["warnings"].append(f"Unknown genesis time for network '{network}'") + + # Check inclusion metric + inclusion_status = "unknown" + if prom_url and slot_timestamp: + inclusion_status = check_inclusion_metric(prom_url, headers, cluster_name, network, slot_timestamp) + + # Check for missing peer logs + if cluster_found and logs_data["peers_with_logs"]: + expected_peers = {p["peer"] for p in cluster_config["peers"]} + actual_peers = set(logs_data["peers_with_logs"]) + missing_peers = expected_peers - actual_peers + if missing_peers: + logs_data["warnings"].append(f"Missing logs from peers: {', '.join(sorted(missing_peers))}") + + # Build output + output = { + "slot": { + "number": slot, + "epoch": epoch, + "slot_in_epoch": slot_in_epoch, + "timestamp": slot_timestamp, + "time": slot_time, + }, + "duty": "proposer", + "network": network, + "cluster": cluster_config, + "cluster_found": cluster_found, + "leaders": leaders, + "logs": logs_data, + "inclusion_metric": inclusion_status, + } + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/scripts/duty-timeline.sh b/scripts/duty-timeline.sh deleted file mode 100644 index 17b84229ac..0000000000 --- a/scripts/duty-timeline.sh +++ /dev/null @@ -1,579 +0,0 @@ -#!/usr/bin/env bash -# Generates a comprehensive timeline of events for a duty across all peers. -# Requires OBOL_GRAFANA_API_TOKEN environment variable. -# Usage: bash scripts/duty-timeline.sh [network] [duty_type] -# cluster_name: e.g. "Lido x Obol: Ethereal Elf" -# slot: slot number (e.g. 13813408) -# network: mainnet (default), hoodi, sepolia, etc. -# duty_type: proposer (default), attester, randao, etc. 
- -set -euo pipefail - -CLUSTER_NAME="${1:-}" -SLOT="${2:-}" -NETWORK="${3:-mainnet}" -DUTY_TYPE="${4:-proposer}" - -if [ -z "$CLUSTER_NAME" ] || [ -z "$SLOT" ]; then - echo "Error: cluster name and slot are required" >&2 - echo "Usage: bash scripts/duty-timeline.sh [network] [duty_type]" >&2 - exit 1 -fi - -if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then - echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 - exit 1 -fi - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Duty type name to numeric value mapping (from core/types.go) -declare -A DUTY_MAP=( - [unknown]=0 - [proposer]=1 - [attester]=2 - [signature]=3 - [exit]=4 - [builder_proposer]=5 - [builder_registration]=6 - [randao]=7 - [prepare_aggregator]=8 - [aggregator]=9 - [sync_message]=10 - [prepare_sync_contribution]=11 - [sync_contribution]=12 - [info_sync]=13 -) - -DUTY_VALUE="${DUTY_MAP[$DUTY_TYPE]:-}" -if [ -z "$DUTY_VALUE" ]; then - echo "Error: unknown duty type '$DUTY_TYPE'" >&2 - echo "Valid types: ${!DUTY_MAP[*]}" >&2 - exit 1 -fi - -# Network genesis timestamps -declare -A GENESIS_TIME=( - [mainnet]=1606824023 - [hoodi]=1742212800 - [sepolia]=1655733600 -) - -SLOTS_PER_EPOCH=32 -SECONDS_PER_SLOT=12 - -GENESIS="${GENESIS_TIME[$NETWORK]:-}" -if [ -z "$GENESIS" ]; then - echo "Error: unknown genesis time for network '$NETWORK'" >&2 - exit 1 -fi - -# Calculate time window for the slot -# Start from 15 seconds before slot (to catch scheduling), end 20 seconds after + 8 minutes for tracker -SLOT_TIMESTAMP=$((GENESIS + SLOT * SECONDS_PER_SLOT)) -START_NS=$(( (SLOT_TIMESTAMP - 15) * 1000000000 )) -END_NS=$(( (SLOT_TIMESTAMP + 500) * 1000000000 )) # ~8 minutes for tracker inclusion checks - -# Discover Loki and Prometheus URLs -DATASOURCES=$("$SCRIPT_DIR/grafana-datasources.sh") -LOKI_URL=$(echo "$DATASOURCES" | grep '^LOKI_URL=' | cut -d= -f2-) -PROM_URL=$(echo "$DATASOURCES" | grep '^PROMETHEUS_URL=' | cut -d= -f2-) - -if [ -z "$LOKI_URL" ]; then - echo "Error: could not discover Loki URL" 
>&2 - exit 1 -fi - -AUTH="Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" - -# Fetch cluster config to get peer info -CLUSTER_OUTPUT=$("$SCRIPT_DIR/cluster-config.sh" "$CLUSTER_NAME" "$NETWORK" 2>/dev/null) || { - echo "Error: failed to fetch cluster config" >&2 - exit 1 -} - -NODES=$(echo "$CLUSTER_OUTPUT" | grep '^Nodes:' | sed -E 's/^Nodes:[[:space:]]*([0-9]+).*/\1/') - -# Extract peers -declare -a PEERS -while IFS= read -r line; do - if [[ "$line" =~ ^INDEX ]] || [ -z "$line" ]; then - continue - fi - PEER=$(echo "$line" | awk '{print $2}') - PEERS+=("$PEER") -done < <(echo "$CLUSTER_OUTPUT" | sed -n '/=== Peers/,$ p' | tail -n +2) - -# Calculate leaders for rounds 1, 2, 3 -calc_leader() { - local round=$1 - echo $(( (SLOT + DUTY_VALUE + round) % NODES )) -} - -LEADER_R1=$(calc_leader 1) -LEADER_R2=$(calc_leader 2) -LEADER_R3=$(calc_leader 3) - -LEADER_PEER_R1="${PEERS[$LEADER_R1]:-unknown}" -LEADER_PEER_R2="${PEERS[$LEADER_R2]:-unknown}" -LEADER_PEER_R3="${PEERS[$LEADER_R3]:-unknown}" - -# Calculate epoch and slot time -EPOCH=$((SLOT / SLOTS_PER_EPOCH)) -SLOT_IN_EPOCH=$((SLOT % SLOTS_PER_EPOCH)) -SLOT_TIME=$(TZ=UTC date -r "$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || TZ=UTC date -d "@$SLOT_TIMESTAMP" '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo "") - -echo "=== Duty Info ===" -echo "Slot: ${SLOT}" -echo "Epoch: ${EPOCH} (slot ${SLOT_IN_EPOCH} of ${SLOTS_PER_EPOCH})" -echo "Time: ${SLOT_TIME}" -echo "Network: ${NETWORK}" -echo "Duty: ${DUTY_TYPE}" -echo "" -echo "=== Expected Consensus Leaders ===" -echo "Round 1: ${LEADER_PEER_R1} (index ${LEADER_R1})" -echo "Round 2: ${LEADER_PEER_R2} (index ${LEADER_R2})" -echo "Round 3: ${LEADER_PEER_R3} (index ${LEADER_R3})" -echo "" - -# Query Loki for all logs related to this slot and duty -# Match various log formats for the duty -DUTY_PATTERN="${SLOT}/${DUTY_TYPE}" -LOGQL="{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\"} |~ 
\`${DUTY_PATTERN}|duty=${DUTY_TYPE}.*slot=${SLOT}|slot.*${SLOT}.*${DUTY_TYPE}|block_slot=${SLOT}\`" - -loki_query() { - local query="$1" - curl -sf -G \ - -H "$AUTH" \ - --data-urlencode "query=${query}" \ - --data-urlencode "start=${START_NS}" \ - --data-urlencode "end=${END_NS}" \ - --data-urlencode "limit=1000" \ - "${LOKI_URL}query_range" -} - -# Query core_tracker_inclusion_missed_total metric delta around the slot's inclusion check window. -# InclCheckLag=6 slots, InclMissedLag=32 slots (from core/tracker/inclusion.go). -# We sample the counter just before the check window opens and just after it closes. -INCL_METRIC_DELTA="unknown" -if [ -n "$PROM_URL" ] && [ "$DUTY_TYPE" = "proposer" ]; then - INCL_CHECK_LAG=6 - INCL_MISSED_LAG=32 - INCL_BEFORE_TIME=$(( SLOT_TIMESTAMP + INCL_CHECK_LAG * SECONDS_PER_SLOT - 1 )) - INCL_AFTER_TIME=$(( SLOT_TIMESTAMP + (INCL_MISSED_LAG + 2) * SECONDS_PER_SLOT )) - METRIC_QUERY="core_tracker_inclusion_missed_total{cluster_name=\"${CLUSTER_NAME}\",cluster_network=\"${NETWORK}\",duty=\"proposer\"}" - # Sum across all peers to get cluster-wide delta - VAL_BEFORE_SUM=$(curl -sf -G \ - -H "$AUTH" \ - --data-urlencode "query=sum(${METRIC_QUERY})" \ - --data-urlencode "time=${INCL_BEFORE_TIME}" \ - "${PROM_URL}query" | jq -r 'if .data.result | length == 0 then "0" else .data.result[0].value[1] end' 2>/dev/null || echo "0") - VAL_AFTER_SUM=$(curl -sf -G \ - -H "$AUTH" \ - --data-urlencode "query=sum(${METRIC_QUERY})" \ - --data-urlencode "time=${INCL_AFTER_TIME}" \ - "${PROM_URL}query" | jq -r 'if .data.result | length == 0 then "0" else .data.result[0].value[1] end' 2>/dev/null || echo "0") - # Use integer arithmetic to determine delta - DELTA=$(echo "$VAL_BEFORE_SUM $VAL_AFTER_SUM" | awk '{d=$2-$1; if(d<0) d=0; printf "%d", d}') - if [ "$DELTA" -gt 0 ] 2>/dev/null; then - INCL_METRIC_DELTA="missed" - elif [ "$VAL_AFTER_SUM" != "0" ] || [ "$VAL_BEFORE_SUM" != "0" ]; then - INCL_METRIC_DELTA="not_missed" - fi -fi - -echo "=== Fetching 
Logs ===" -LOGS_RAW=$(loki_query "$LOGQL") - -# Save raw Loki JSON to temp file for Python processing -LOKI_TMPFILE=$(mktemp) -trap 'rm -f "$LOKI_TMPFILE"' EXIT -echo "$LOGS_RAW" > "$LOKI_TMPFILE" - -# Process logs with Python (handles nanosecond timestamps correctly) -python3 - "$LOKI_TMPFILE" "$SLOT" "$SLOT_TIMESTAMP" "$DUTY_TYPE" \ - "$LEADER_PEER_R1" "$LEADER_R1" \ - "$LEADER_PEER_R2" "$LEADER_R2" \ - "$LEADER_PEER_R3" "$LEADER_R3" \ - "$SLOT_TIME" "$INCL_METRIC_DELTA" <<'PYTHON_SCRIPT' -import json -import re -import sys -from collections import defaultdict - -loki_file = sys.argv[1] -slot = int(sys.argv[2]) -slot_timestamp = int(sys.argv[3]) -duty_type = sys.argv[4] -leader_peer_r1, leader_idx_r1 = sys.argv[5], sys.argv[6] -leader_peer_r2, leader_idx_r2 = sys.argv[7], sys.argv[8] -leader_peer_r3, leader_idx_r3 = sys.argv[9], sys.argv[10] -slot_time = sys.argv[11] -incl_metric_delta = sys.argv[12] # "missed", "not_missed", or "unknown" - -slot_timestamp_ns = slot_timestamp * 1_000_000_000 - -with open(loki_file) as f: - data = json.load(f) - -results = data.get("data", {}).get("result", []) -if not results: - print() - print(f"ERROR: No logs found for {slot}/{duty_type}") - print("This could mean:") - print(f" - The cluster did not have this duty in slot {slot}") - print(" - Logs have been rotated/deleted") - print(" - The cluster name or network is incorrect") - sys.exit(1) - -# Parse all log entries -entries = [] -for stream in results: - peer = stream.get("stream", {}).get("cluster_peer", "unknown") - for ts_str, line in stream.get("values", []): - entries.append((int(ts_str), peer, line)) - -entries.sort(key=lambda x: x[0]) -print(f"Found {len(entries)} log entries") -print() - - -def extract_logfmt(line, field): - """Extract a field value from a logfmt-formatted line.""" - # Try quoted value first - m = re.search(rf'{field}="([^"]*)"', line) - if m: - return m.group(1) - # Try unquoted value - m = re.search(rf'{field}=(\S+)', line) - if m: - return 
m.group(1) - return "" - - -def calc_offset(ts_ns): - """Calculate offset from slot start in seconds.""" - offset_ms = (ts_ns - slot_timestamp_ns) / 1_000_000 - offset_s = offset_ms / 1000 - return f"{offset_s:+.3f}s" - - -def fmt(offset, tag, msg, indent_continuation=None): - """Format a timeline row.""" - line = f" {offset} [{tag}]{' ' * max(1, 10 - len(tag))} {msg}" - if indent_continuation: - line += f"\n{'':24s} {indent_continuation}" - return line - - -# --- Collect events --- -# We'll build a list of (ts_ns, sort_priority, formatted_line) tuples -# sort_priority breaks ties: lower = earlier in output for same timestamp -timeline = [] - -# Track state for summary -consensus_started = False -consensus_decided = False -decided_round = "" -decided_leader = "" -decided_index = "" -round_timeout_reasons = {} -seen_first = set() # for first-only events - -# Per-peer tracking for summary -bn_call_rtts = {} # peer -> rtt string -broadcast_delays = {} # peer -> delay string -block_type = None # "blinded" or "unblinded" -broadcast_success = False -broadcast_timeout = False -tracker_all = False -tracker_partial = False -tracker_absent = "" -tracker_missed = False -tracker_included = False -tracker_broadcast_delay = "" -error_peers = defaultdict(list) # peer -> [error messages] - -for ts_ns, peer, line in entries: - msg = extract_logfmt(line, "msg") - level = extract_logfmt(line, "level") - if not msg: - continue - - offset = calc_offset(ts_ns) - - # --- SCHEDULER --- - if msg == "Slot ticked": - if "slot_ticked" not in seen_first: - seen_first.add("slot_ticked") - timeline.append((ts_ns, 0, fmt(offset, "SCHED", f"Slot {slot} started"))) - - elif msg in ("Resolved proposer duty", "Resolved attester duty"): - pubkey = extract_logfmt(line, "pubkey") - vidx = extract_logfmt(line, "vidx") - key = f"resolved:{pubkey}" - if key not in seen_first: - seen_first.add(key) - timeline.append((ts_ns, 1, fmt(offset, "SCHED", - f"Resolved {duty_type} duty (vidx={vidx}, 
pubkey={pubkey})"))) - - # --- FETCHER (per-peer) --- - elif msg == "Calling beacon node endpoint...": - endpoint = extract_logfmt(line, "endpoint") - timeline.append((ts_ns, 10, fmt(offset, "FETCHER", - f"BN call start: {endpoint} [{peer}]"))) - - elif msg == "Beacon node call finished": - endpoint = extract_logfmt(line, "endpoint") - rtt = extract_logfmt(line, "rtt") - rtt_part = f" (RTT={rtt})" if rtt else "" - timeline.append((ts_ns, 11, fmt(offset, "FETCHER", - f"BN call done: {endpoint} [{peer}]{rtt_part}"))) - if rtt: - bn_call_rtts[peer] = rtt - - elif msg == "Beacon node call took longer than expected": - endpoint = extract_logfmt(line, "endpoint") - rtt = extract_logfmt(line, "rtt") - timeline.append((ts_ns, 12, fmt(offset, "FETCHER", - f"SLOW BN call: {endpoint} [{peer}] (RTT={rtt})"))) - if rtt: - bn_call_rtts[peer] = rtt - - # --- CONSENSUS --- - elif msg == "QBFT consensus instance starting": - if not consensus_started: - consensus_started = True - timeline.append((ts_ns, 20, fmt(offset, "QBFT", "Consensus started"))) - - elif msg == "QBFT round changed": - old_round = extract_logfmt(line, "round") - new_round = extract_logfmt(line, "new_round") - reason = extract_logfmt(line, "timeout_reason") - if old_round not in round_timeout_reasons: - round_timeout_reasons[old_round] = reason - timeline.append((ts_ns, 21, fmt(offset, "QBFT", - f"Round {old_round} TIMEOUT -> Round {new_round}", - f"Reason: {reason}"))) - - elif msg == "QBFT consensus decided": - if not consensus_decided: - consensus_decided = True - decided_round = extract_logfmt(line, "round") - decided_leader = extract_logfmt(line, "leader_name") - decided_index = extract_logfmt(line, "leader_index") - timeline.append((ts_ns, 22, fmt(offset, "QBFT", - f"Consensus DECIDED in round {decided_round}", - f"Leader: {decided_leader} (index {decided_index})"))) - - # --- VALIDATOR API (per-peer) --- - elif msg == "Beacon block proposal received from validator client": - block_version = 
extract_logfmt(line, "block_version") - block_type = "unblinded" - timeline.append((ts_ns, 30, fmt(offset, "VAPI", - f"Block proposal received [{peer}] (version={block_version})"))) - - elif msg == "Blinded beacon block received from validator client": - block_version = extract_logfmt(line, "block_version") - block_type = "blinded" - timeline.append((ts_ns, 30, fmt(offset, "VAPI", - f"Blinded block received [{peer}] (version={block_version})"))) - - # --- SIG AGGREGATION (per-peer) --- - elif msg == "Successfully aggregated partial signatures to reach threshold": - vapi_endpoint = extract_logfmt(line, "vapi_endpoint") - ep_part = f" ({vapi_endpoint})" if vapi_endpoint else "" - timeline.append((ts_ns, 40, fmt(offset, "SIGAGG", - f"Threshold reached [{peer}]{ep_part}"))) - - # --- BROADCAST (per-peer) --- - elif msg in ("Successfully submitted proposal to beacon node", - "Successfully submitted block proposal to beacon node", - "Successfully submitted v2 attestations to beacon node"): - delay = extract_logfmt(line, "delay") - broadcast_success = True - delay_part = f" (delay={delay})" if delay else "" - timeline.append((ts_ns, 50, fmt(offset, "BCAST", - f"Broadcast SUCCESS [{peer}]{delay_part}"))) - if delay: - broadcast_delays[peer] = delay - - elif msg == "Timeout calling bcast/broadcast, duty expired": - vapi_endpoint = extract_logfmt(line, "vapi_endpoint") - broadcast_timeout = True - timeline.append((ts_ns, 51, fmt(offset, "BCAST", - f"TIMEOUT: duty expired [{peer}] ({vapi_endpoint})"))) - - # --- SSE EVENTS (per-peer for "too late", first for normal) --- - elif msg == "Beacon node received block_gossip event too late": - gossip_delay = extract_logfmt(line, "gossip_delay") or extract_logfmt(line, "delay") - delay_part = f" (delay={gossip_delay})" if gossip_delay else "" - timeline.append((ts_ns, 55, fmt(offset, "SSE", - f"block_gossip TOO LATE [{peer}]{delay_part}"))) - - elif msg == "Beacon node received block event too late": - block_delay = 
extract_logfmt(line, "block_delay") or extract_logfmt(line, "delay") - delay_part = f" (delay={block_delay})" if block_delay else "" - timeline.append((ts_ns, 55, fmt(offset, "SSE", - f"block event TOO LATE [{peer}]{delay_part}"))) - - elif msg in ("SSE block gossip event", "SSE head event", "SSE block event"): - key = f"sse:{msg}" - if key not in seen_first: - seen_first.add(key) - timeline.append((ts_ns, 56, fmt(offset, "SSE", msg))) - - # --- TRACKER (first only) --- - elif msg == "All peers participated in duty": - if "tracker_all" not in seen_first: - seen_first.add("tracker_all") - tracker_all = True - timeline.append((ts_ns, 60, fmt(offset, "TRACKER", - "All peers participated"))) - - elif msg == "Not all peers participated in duty": - if "tracker_partial" not in seen_first: - seen_first.add("tracker_partial") - tracker_partial = True - tracker_absent = extract_logfmt(line, "absent") - timeline.append((ts_ns, 60, fmt(offset, "TRACKER", - "Not all peers participated", - f"Absent: {tracker_absent}"))) - - elif msg in ("Broadcasted block included on-chain", "Broadcasted blinded block included on-chain"): - if "tracker_included" not in seen_first: - seen_first.add("tracker_included") - tracker_included = True - pubkey = extract_logfmt(line, "pubkey") - tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") - label = "BLINDED BLOCK included on-chain" if "blinded" in msg else "BLOCK included on-chain" - timeline.append((ts_ns, 61, fmt(offset, "TRACKER", - label, - f"Pubkey: {pubkey}, Broadcast delay: {tracker_broadcast_delay}"))) - - elif msg == "Broadcasted block never included on-chain": - if "tracker_missed" not in seen_first: - seen_first.add("tracker_missed") - tracker_missed = True - pubkey = extract_logfmt(line, "pubkey") - tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") - timeline.append((ts_ns, 61, fmt(offset, "TRACKER", - "BLOCK MISSED: never included on-chain", - f"Pubkey: {pubkey}, Broadcast delay: 
{tracker_broadcast_delay}"))) - - elif msg == "Broadcasted blinded block never included on-chain": - if "tracker_missed_blinded" not in seen_first: - seen_first.add("tracker_missed_blinded") - tracker_missed = True - pubkey = extract_logfmt(line, "pubkey") - tracker_broadcast_delay = extract_logfmt(line, "broadcast_delay") - timeline.append((ts_ns, 61, fmt(offset, "TRACKER", - "BLINDED BLOCK MISSED: never included on-chain", - f"Pubkey: {pubkey}, Broadcast delay: {tracker_broadcast_delay}"))) - - # --- ERRORS (per-peer) --- - elif level == "error" and ("consensus timeout" in msg.lower() or "permanent failure" in msg.lower()): - error_peers[peer].append(msg) - timeline.append((ts_ns, 70, fmt(offset, "ERROR", - f"{msg} [{peer}]"))) - -# Sort and print timeline -timeline.sort(key=lambda x: (x[0], x[1])) - -print("=== Event Timeline ===") -print(f"(Offset relative to slot start time: {slot_time})") -print() - -for _, _, line in timeline: - print(line) - -print() -print("=== Summary ===") - -# Consensus summary -if consensus_started: - if consensus_decided: - num_timeouts = len(round_timeout_reasons) - if num_timeouts == 0: - print("Consensus: Completed in round 1 (optimal)") - else: - print(f"Consensus: Completed in round {decided_round} after {num_timeouts} timeout(s)") - print(f" Leader: {decided_leader} (index {decided_index})") - if "1" in round_timeout_reasons: - print(f" Round 1 leader {leader_peer_r1} failed") - else: - print("Consensus: Did NOT complete") -else: - print("Consensus: Not started (logs may be incomplete)") - -# Block type -if block_type: - print(f"Block type: {block_type}") - -# Broadcast summary -if broadcast_timeout: - print("Broadcast: TIMEOUT - duty expired before broadcast") -elif broadcast_success: - if broadcast_delays: - delays_str = ", ".join(f"{p}={d}" for p, d in sorted(broadcast_delays.items())) - # Parse delay values for min-max - delay_vals = [] - for d in broadcast_delays.values(): - m = re.search(r'[\d.]+', d) - if m: - 
delay_vals.append(float(m.group())) - if len(delay_vals) >= 2: - print(f"Broadcast: Successfully submitted (delay range: {min(delay_vals):.1f}s-{max(delay_vals):.1f}s)") - else: - print(f"Broadcast: Successfully submitted ({delays_str})") - else: - print("Broadcast: Successfully submitted to beacon node") -else: - print("Broadcast: No broadcast event found in logs") - -# BN call RTT summary -if bn_call_rtts: - rtt_vals = [] - for r in bn_call_rtts.values(): - m = re.search(r'[\d.]+', r) - if m: - rtt_vals.append(float(m.group())) - if rtt_vals: - if len(rtt_vals) >= 2: - print(f"BN call RTT: {min(rtt_vals):.1f}s-{max(rtt_vals):.1f}s across {len(rtt_vals)} peers") - else: - rtts_str = ", ".join(f"{p}={r}" for p, r in sorted(bn_call_rtts.items())) - print(f"BN call RTT: {rtts_str}") - -# Inclusion summary (for proposer) -if duty_type == "proposer": - if tracker_missed: - delay_part = f" (broadcast_delay={tracker_broadcast_delay})" if tracker_broadcast_delay else "" - print(f"Inclusion: MISSED - block never included on-chain{delay_part}") - elif tracker_included: - delay_part = f" (broadcast_delay={tracker_broadcast_delay})" if tracker_broadcast_delay else "" - print(f"Inclusion: Block included on-chain{delay_part}") - elif incl_metric_delta == "missed": - print("Inclusion: MISSED - inferred from core_tracker_inclusion_missed_total metric (no tracker log found)") - elif incl_metric_delta == "not_missed": - print("Inclusion: Block likely included on-chain (metric counter did not increase)") - else: - print("Inclusion: Unknown (no tracker log or metric data found)") - -# Participation summary -if tracker_partial: - print(f"Participation: Not all peers participated (absent: {tracker_absent})") -elif tracker_all: - print("Participation: All peers participated") -else: - print("Participation: Unknown (tracker event not found)") - -# Error summary -if error_peers: - print("Errors:") - for p, msgs in sorted(error_peers.items()): - for m in msgs: - print(f" - [{p}] {m}") - 
-print() -PYTHON_SCRIPT diff --git a/scripts/grafana-datasources.sh b/scripts/grafana-datasources.sh deleted file mode 100755 index 06a83566c7..0000000000 --- a/scripts/grafana-datasources.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# Fetches Prometheus and Loki datasource proxy URLs from Grafana. -# Requires OBOL_GRAFANA_API_TOKEN environment variable. -# Output: two lines in KEY=URL format, e.g.: -# PROMETHEUS_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//api/v1/ -# LOKI_URL=https://grafana.monitoring.gcp.obol.tech/api/datasources/proxy//loki/api/v1/ - -set -euo pipefail - -GRAFANA_BASE="https://grafana.monitoring.gcp.obol.tech" - -if [ -z "${OBOL_GRAFANA_API_TOKEN:-}" ]; then - echo "Error: OBOL_GRAFANA_API_TOKEN is not set" >&2 - exit 1 -fi - -response=$(curl -sf -H "Authorization: Bearer $OBOL_GRAFANA_API_TOKEN" "$GRAFANA_BASE/api/datasources") - -# Extract the main Prometheus (name="prometheus") and Loki datasource numeric IDs. -# Grafana datasource proxy requires numeric ID, not UID. 
-prom_id=$(echo "$response" | jq -r '.[] | select(.type=="prometheus" and .name=="prometheus") | .id') -loki_id=$(echo "$response" | jq -r '.[] | select(.type=="loki" and .name=="Loki") | .id') - -if [ -z "$prom_id" ]; then - echo "Error: Prometheus datasource not found" >&2 - exit 1 -fi -if [ -z "$loki_id" ]; then - echo "Error: Loki datasource not found" >&2 - exit 1 -fi - -echo "PROMETHEUS_URL=${GRAFANA_BASE}/api/datasources/proxy/${prom_id}/api/v1/" -echo "LOKI_URL=${GRAFANA_BASE}/api/datasources/proxy/${loki_id}/loki/api/v1/" From 421be5dc7e6cec31d860e8471810d622ae6e19aa Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Thu, 5 Mar 2026 11:27:39 +0300 Subject: [PATCH 7/7] Reverted README changes --- scripts/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index 68174e4dbb..61109bdd8b 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -37,6 +37,11 @@ It helps expand an existing cluster with new validators, given the same operator The script will execute `node_merge.sh` for each `nodeX` subfolder found in the source cluster. +## Requirements + +Both scripts require **bash** (standard on Linux/macOS) and **jq** (version 1.5+). +Install via `sudo apt-get install jq` (Debian/Ubuntu) or `brew install jq` (macOS). + ## Important Warnings - Always back up your `cluster-lock.json`, node folders, and `validator_keys` folders before use.