From ef4fade9284ebaa26b74ad163bd7a61bc8d671be Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Sun, 22 Mar 2026 20:49:53 +0000 Subject: [PATCH 1/8] Add wait_event_timing: Oracle-style wait event instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add per-backend wait event timing with nanosecond precision, controlled by the wait_event_timing GUC (default: off). When enabled, every pgstat_report_wait_start()/pgstat_report_wait_end() call records the wait duration using clock_gettime(CLOCK_MONOTONIC) and accumulates: - Per-event call count - Per-event total nanoseconds - Per-event max duration - Per-event log2 histogram (16 buckets: <1us to >=16ms) Statistics are stored in per-backend shared memory arrays, requiring no locking (each backend writes only to its own slot). External tools can read accumulated stats via the pg_stat_get_wait_event_timing() function. Overhead when enabled: two VDSO clock_gettime() calls per wait event transition (~40-100 ns), plus a few memory writes — comparable to Oracle's TIMED_STATISTICS. When disabled: one predictable branch per wait event (~1 ns), zero other cost. This provides PostgreSQL with Oracle V$SYSTEM_EVENT / V$EVENT_HISTOGRAM equivalent functionality, enabling: - Wait event time model (which events consume the most time) - Latency histograms (bimodal distributions, tail latency) - Per-backend wait profiles Motivation: external BPF-based wait event tracers (pg_wait_tracer) use hardware watchpoints on PGPROC->wait_event_info, which incur 6-30% overhead due to CPU debug exceptions. Internal instrumentation via clock_gettime VDSO achieves equivalent functionality at ~1-5% overhead. 
New files: src/include/utils/wait_event_timing.h -- data structures, inline helpers src/backend/utils/activity/wait_event_timing.c -- shmem init, SQL function Modified files: src/include/utils/wait_event.h -- timing calls in inline functions src/backend/storage/lmgr/proc.c -- init/cleanup timing storage src/backend/storage/ipc/ipci.c -- shared memory allocation src/backend/utils/misc/guc_parameters.dat -- wait_event_timing GUC src/include/catalog/pg_proc.dat -- pg_stat_get_wait_event_timing() Co-Authored-By: Claude Opus 4.6 (1M context) --- src/backend/storage/ipc/ipci.c | 2 + src/backend/storage/lmgr/proc.c | 4 + src/backend/utils/activity/Makefile | 3 +- src/backend/utils/activity/meson.build | 1 + .../utils/activity/wait_event_timing.c | 176 ++++++++++++++++++ src/backend/utils/misc/guc_parameters.dat | 6 + src/backend/utils/misc/guc_tables.c | 1 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/catalog/pg_proc.dat | 10 + src/include/utils/wait_event.h | 52 ++++++ src/include/utils/wait_event_timing.h | 156 ++++++++++++++++ 11 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 src/backend/utils/activity/wait_event_timing.c create mode 100644 src/include/utils/wait_event_timing.h diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index a4785daf1e52f..e07f893301d40 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -137,6 +137,7 @@ CalculateShmemSize(void) size = add_size(size, AsyncShmemSize()); size = add_size(size, StatsShmemSize()); size = add_size(size, WaitEventCustomShmemSize()); + size = add_size(size, WaitEventTimingShmemSize()); size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); @@ -325,6 +326,7 @@ CreateOrAttachShmemStructs(void) AsyncShmemInit(); StatsShmemInit(); WaitEventCustomShmemInit(); + WaitEventTimingShmemInit(); InjectionPointShmemInit(); AioShmemInit(); WaitLSNShmemInit(); 
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 8f5ce0e2a8a82..e91aa33d3ed53 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -529,6 +529,7 @@ InitProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(GetNumberFromPGProc(MyProc)); /* * We might be reusing a semaphore that belonged to a failed process. So @@ -701,6 +702,7 @@ InitAuxiliaryProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(GetNumberFromPGProc(MyProc)); /* Check that group locking fields are in a proper initial state. */ Assert(MyProc->lockGroupLeader == NULL); @@ -991,6 +993,7 @@ ProcKill(int code, Datum arg) */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); proc = MyProc; MyProc = NULL; @@ -1056,6 +1059,7 @@ AuxiliaryProcKill(int code, Datum arg) /* look at the equivalent ProcKill() code for comments */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); proc = MyProc; MyProc = NULL; diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index c37bfb350bbc5..3d06147bce569 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -34,7 +34,8 @@ OBJS = \ pgstat_wal.o \ pgstat_xact.o \ wait_event.o \ - wait_event_funcs.o + wait_event_funcs.o \ + wait_event_timing.o # Force these dependencies to be known even without dependency info built: wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build index 53bd5a246cabe..f0a108dfb909a 100644 --- a/src/backend/utils/activity/meson.build +++ 
b/src/backend/utils/activity/meson.build @@ -18,6 +18,7 @@ backend_sources += files( 'pgstat_subscription.c', 'pgstat_wal.c', 'pgstat_xact.c', + 'wait_event_timing.c', ) # this includes a .c file with contents generated in ../../../include/activity, diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c new file mode 100644 index 0000000000000..09a444f4ab881 --- /dev/null +++ b/src/backend/utils/activity/wait_event_timing.c @@ -0,0 +1,176 @@ +/*------------------------------------------------------------------------- + * + * wait_event_timing.c + * Per-backend wait event timing and histogram accumulation. + * + * This module provides Oracle-style wait event instrumentation: every + * call to pgstat_report_wait_start()/pgstat_report_wait_end() records + * the wait duration using clock_gettime() and accumulates per-event + * statistics (count, total nanoseconds, max, histogram) in shared memory. + * + * Overhead: two VDSO clock_gettime() calls per wait event transition + * (~40-100 ns total), plus a few memory writes to per-backend arrays. + * No locking is needed since each backend writes only to its own slot. + * + * Controlled by the wait_event_timing GUC (default: off). 
+ * + * Copyright (c) 2001-2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/wait_event_timing.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/wait_event.h" +#include "utils/wait_event_timing.h" + +/* GUC variable */ +bool wait_event_timing = false; + +/* Pointer to this backend's timing state */ +WaitEventTimingState *my_wait_event_timing = NULL; + +/* Shared memory base pointer (array of MaxBackends entries) */ +static WaitEventTimingState *WaitEventTimingArray = NULL; + +/* + * Report the shared memory space needed. + */ +Size +WaitEventTimingShmemSize(void) +{ + return mul_size(MaxBackends, sizeof(WaitEventTimingState)); +} + +/* + * Initialize shared memory for wait event timing. + */ +void +WaitEventTimingShmemInit(void) +{ + bool found; + Size size; + + size = WaitEventTimingShmemSize(); + + WaitEventTimingArray = (WaitEventTimingState *) + ShmemInitStruct("WaitEventTimingArray", size, &found); + + if (!found) + memset(WaitEventTimingArray, 0, size); +} + +/* + * Point my_wait_event_timing at this backend's slot. + * Called from InitProcess() after the backend has a valid procNumber. + * + * procNumber is the PGPROC array index (from GetNumberFromPGProc). + */ +void +pgstat_set_wait_event_timing_storage(int procNumber) +{ + Assert(WaitEventTimingArray != NULL); + Assert(procNumber >= 0 && procNumber < MaxBackends); + + my_wait_event_timing = &WaitEventTimingArray[procNumber]; + + /* Zero the state for this new backend session */ + memset(my_wait_event_timing, 0, sizeof(WaitEventTimingState)); +} + +/* + * Detach from timing state on backend exit. + */ +void +pgstat_reset_wait_event_timing_storage(void) +{ + my_wait_event_timing = NULL; +} + +/* + * SQL function: pg_stat_get_wait_event_timing(OUT ...) 
+ * + * Returns one row per (backend_id, wait_event) with non-zero counts. + */ +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int backend_idx; + + InitMaterializedSRF(fcinfo, 0); + + if (WaitEventTimingArray == NULL) + PG_RETURN_VOID(); + + for (backend_idx = 0; backend_idx < MaxBackends; backend_idx++) + { + WaitEventTimingState *state = &WaitEventTimingArray[backend_idx]; + int i; + + for (i = 0; i < WAIT_EVENT_TIMING_NUM_EVENTS; i++) + { + WaitEventTimingEntry *entry = &state->events[i]; + Datum values[8]; + bool nulls[8]; + uint32 wait_event_info; + const char *event_type; + const char *event_name; + int bucket; + + if (entry->count == 0) + continue; + + /* Reconstruct wait_event_info from flat index */ + wait_event_info = ((i / WAIT_EVENT_TIMING_EVENTS_PER_CLASS) << 24) | + (i % WAIT_EVENT_TIMING_EVENTS_PER_CLASS); + + event_type = pgstat_get_wait_event_type(wait_event_info); + event_name = pgstat_get_wait_event(wait_event_info); + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(backend_idx + 1); /* backend_id (1-based) */ + values[1] = CStringGetTextDatum(event_type); + values[2] = CStringGetTextDatum(event_name); + values[3] = Int64GetDatum(entry->count); + values[4] = Float8GetDatum((double) entry->total_ns / 1000000.0); /* ms */ + values[5] = Float8GetDatum(entry->count > 0 + ? 
(double) entry->total_ns / entry->count / 1000.0 + : 0.0); /* avg us */ + values[6] = Float8GetDatum((double) entry->max_ns / 1000.0); /* max us */ + + /* Pack histogram into a text representation */ + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfoChar(&buf, '{'); + for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++) + { + if (bucket > 0) + appendStringInfoChar(&buf, ','); + appendStringInfo(&buf, "%d", entry->histogram[bucket]); + } + appendStringInfoChar(&buf, '}'); + values[7] = CStringGetTextDatum(buf.data); + pfree(buf.data); + } + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 0c9854ad8fc05..df494f3fa40ab 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3131,6 +3131,12 @@ boot_val => 'false', }, +{ name => 'wait_event_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects timing statistics for wait events (count, duration, histogram).', + variable => 'wait_event_timing', + boot_val => 'false', +}, + { name => 'transaction_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', short_desc => 'Sets the size of the dedicated buffer pool used for the transaction status cache.', long_desc => '0 means use a fraction of "shared_buffers".', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 1e14b7b4af060..5875817fd67c6 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -78,6 +78,7 @@ #include "replication/syncrep.h" #include "storage/aio.h" #include "storage/bufmgr.h" +#include "utils/wait_event_timing.h" #include "storage/bufpage.h" #include "storage/copydir.h" #include "storage/fd.h" diff --git a/src/backend/utils/misc/postgresql.conf.sample 
b/src/backend/utils/misc/postgresql.conf.sample index e4abe6c007776..1a08bd9c32d55 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -689,6 +689,7 @@ #track_cost_delay_timing = off #track_io_timing = off #track_wal_io_timing = off +#wait_event_timing = off #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 84e7adde0e5b5..83a8182fd895e 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12843,4 +12843,14 @@ proname => 'hashoid8extended', prorettype => 'int8', proargtypes => 'oid8 int8', prosrc => 'hashoid8extended' }, +{ oid => '9950', + descr => 'statistics: per-backend wait event timing (count, duration, histogram)', + proname => 'pg_stat_get_wait_event_timing', prorows => '100', + proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,text,text,int8,float8,float8,float8,text}', + proargmodes => '{o,o,o,o,o,o,o,o}', + proargnames => '{backend_id,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}', + prosrc => 'pg_stat_get_wait_event_timing' }, + ] diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 34c27cc3dc3e1..c584dc5853df6 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -12,6 +12,7 @@ /* enums for wait events */ #include "utils/wait_event_types.h" +#include "utils/wait_event_timing.h" extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); @@ -63,6 +64,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents); * * my_wait_event_info initially points to local memory, making it safe to * call this before MyProc has been initialized. 
+ * + * When wait_event_timing is enabled, we also record the start timestamp + * for later duration computation in pgstat_report_wait_end(). * ---------- */ static inline void @@ -73,17 +77,65 @@ pgstat_report_wait_start(uint32 wait_event_info) * four-bytes, updates are atomic. */ *(volatile uint32 *) my_wait_event_info = wait_event_info; + + /* + * Record start timestamp when wait_event_timing is enabled. + * The branch is predictable (always taken or never taken for the + * lifetime of the session) and costs ~1 ns when not taken. + */ + if (wait_event_timing && likely(my_wait_event_timing != NULL)) + { + INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start); + my_wait_event_timing->current_event = wait_event_info; + } } /* ---------- * pgstat_report_wait_end() - * * Called to report end of a wait. + * + * When wait_event_timing is enabled, computes the wait duration and + * accumulates it into per-event statistics. * ---------- */ static inline void pgstat_report_wait_end(void) { + /* + * When timing is enabled, compute duration and accumulate stats + * before clearing the wait event. 
+ */ + if (wait_event_timing && likely(my_wait_event_timing != NULL)) + { + uint32 event = my_wait_event_timing->current_event; + + if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start)) + { + instr_time now; + int64 duration_ns; + int idx; + + INSTR_TIME_SET_CURRENT(now); + duration_ns = INSTR_TIME_GET_NANOSEC(now) - + INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start); + + idx = wait_event_timing_index(event); + if (likely(idx >= 0)) + { + WaitEventTimingEntry *entry = &my_wait_event_timing->events[idx]; + + entry->count++; + entry->total_ns += duration_ns; + if (duration_ns > entry->max_ns) + entry->max_ns = duration_ns; + entry->histogram[wait_event_timing_bucket(duration_ns)]++; + } + + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); + } + } + /* see pgstat_report_wait_start() */ *(volatile uint32 *) my_wait_event_info = 0; } diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h new file mode 100644 index 0000000000000..403f8e4857c9d --- /dev/null +++ b/src/include/utils/wait_event_timing.h @@ -0,0 +1,156 @@ +/*------------------------------------------------------------------------- + * + * wait_event_timing.h + * Per-backend wait event timing and histogram infrastructure. + * + * When enabled via the wait_event_timing GUC, every call to + * pgstat_report_wait_start()/pgstat_report_wait_end() records the wait + * duration and accumulates per-event statistics (count, total time, + * histogram) in shared memory. + * + * The overhead is two clock_gettime(CLOCK_MONOTONIC) calls per wait event + * transition (~40-100 ns via VDSO), plus a few memory writes to per-backend + * arrays. No locking is needed because each backend writes only to its own + * stats slot. + * + * External tools can read the accumulated stats from shared memory, or + * use the pg_wait_event_timing view. 
+ * + * Copyright (c) 2001-2026, PostgreSQL Global Development Group + * + * src/include/utils/wait_event_timing.h + *------------------------------------------------------------------------- + */ +#ifndef WAIT_EVENT_TIMING_H +#define WAIT_EVENT_TIMING_H + +#include "port/pg_bitutils.h" +#include "portability/instr_time.h" + +/* + * Number of log2 histogram buckets. Bucket i covers durations in + * [2^(i-1), 2^i) microseconds, except bucket 0 which covers [0, 1) us + * and the last bucket which covers [2^(NBUCKETS-2), infinity). + * + * 16 buckets cover: <1us, 1-2us, 2-4us, ... 8-16ms, >=16ms + */ +#define WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS 16 + +/* + * Maximum number of distinct wait events we track. This must be large + * enough to cover all built-in events (currently ~288) plus some headroom + * for custom extension events. The wait event ID (low 16 bits of + * wait_event_info) is used as the index within a class, and we store + * per-class arrays. + * + * Wait event classes and their max event counts: + * LWLock (0x01): up to 256 tranches (dynamic; IDs >= 256 are not tracked) + * Lock (0x03): ~10 lock types + * Buffer (0x04): ~6 events + * Activity (0x05): ~20 events + * Client (0x06): ~14 events + * Extension (0x07): up to 128 custom events + * IPC (0x08): ~59 events + * Timeout (0x09): ~13 events + * IO (0x0A): ~85 events + * InjectionPoint (0x0B): up to 128 custom events + * + * We use a flat array indexed by (wait_event_info >> 24) * 256 + eventId. + * With 12 classes * 256 = 3072 slots of 88 bytes, this uses ~264 KB per backend. + */ +#define WAIT_EVENT_TIMING_CLASSES 12 /* 0x00 .. 0x0B */ +#define WAIT_EVENT_TIMING_EVENTS_PER_CLASS 256 +#define WAIT_EVENT_TIMING_NUM_EVENTS \ + (WAIT_EVENT_TIMING_CLASSES * WAIT_EVENT_TIMING_EVENTS_PER_CLASS) + +/* + * Per-event accumulated statistics. One entry per distinct wait event + * per backend. These are written only by the owning backend, so no + * locking is needed.
External readers may see torn reads for 64-bit + * fields on 32-bit platforms, but that is acceptable for statistics. + */ +typedef struct WaitEventTimingEntry +{ + int64 count; /* number of occurrences */ + int64 total_ns; /* total wait duration in nanoseconds */ + int64 max_ns; /* longest single wait in nanoseconds */ + int32 histogram[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS]; +} WaitEventTimingEntry; + +/* + * Per-backend wait event timing state. Allocated in shared memory, + * one per MaxBackends slot. + */ +typedef struct WaitEventTimingState +{ + /* Current wait start timestamp (set by pgstat_report_wait_start) */ + instr_time wait_start; + + /* Current wait_event_info (cached for use in wait_end) */ + uint32 current_event; + + /* Reset counter — incremented by pg_stat_reset_wait_event_timing() */ + int64 reset_count; + + /* Per-event statistics array */ + WaitEventTimingEntry events[WAIT_EVENT_TIMING_NUM_EVENTS]; +} WaitEventTimingState; + + +/* GUC variable */ +extern PGDLLIMPORT bool wait_event_timing; + +/* Pointer to this backend's timing state in shared memory */ +extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; + +/* Shared memory setup */ +extern Size WaitEventTimingShmemSize(void); +extern void WaitEventTimingShmemInit(void); + +/* Called from InitProcess() to point my_wait_event_timing at our slot */ +extern void pgstat_set_wait_event_timing_storage(int procNumber); +extern void pgstat_reset_wait_event_timing_storage(void); + + +/* Convert wait_event_info to a flat index for the events[] array */ +static inline int +wait_event_timing_index(uint32 wait_event_info) +{ + int classId = (wait_event_info >> 24) & 0xFF; + int eventId = wait_event_info & 0xFFFF; + + if (unlikely(classId >= WAIT_EVENT_TIMING_CLASSES || + eventId >= WAIT_EVENT_TIMING_EVENTS_PER_CLASS)) + return -1; + + return classId * WAIT_EVENT_TIMING_EVENTS_PER_CLASS + eventId; +} + +/* + * Compute histogram bucket index for a duration in nanoseconds. 
+ * Uses log2 bucketing in microseconds: + * bucket 0: [0, 1) us + * bucket 1: [1, 2) us + * bucket 2: [2, 4) us + * ... + * bucket 15: [16384, inf) us (>= 16ms) + */ +static inline int +wait_event_timing_bucket(int64 duration_ns) +{ + int64 duration_us = duration_ns / 1000; + int bucket; + + if (duration_us <= 0) + return 0; + + /* pg_leftmost_one_pos64 returns the position of the highest set bit */ + bucket = pg_leftmost_one_pos64((uint64) duration_us) + 1; + + if (bucket >= WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS) + bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1; + + return bucket; +} + +#endif /* WAIT_EVENT_TIMING_H */ From 5120e80ffaeb35efe262a18206dd01108233a019 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Mon, 23 Mar 2026 17:03:32 +0000 Subject: [PATCH 2/8] Fix: remove pg_bitutils.h dependency, use simple bit shift for histogram bucket --- src/include/utils/wait_event_timing.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index 403f8e4857c9d..db723cc133425 100644 --- a/src/include/utils/wait_event_timing.h +++ b/src/include/utils/wait_event_timing.h @@ -24,7 +24,6 @@ #ifndef WAIT_EVENT_TIMING_H #define WAIT_EVENT_TIMING_H -#include "port/pg_bitutils.h" #include "portability/instr_time.h" /* @@ -144,8 +143,18 @@ wait_event_timing_bucket(int64 duration_ns) if (duration_us <= 0) return 0; - /* pg_leftmost_one_pos64 returns the position of the highest set bit */ - bucket = pg_leftmost_one_pos64((uint64) duration_us) + 1; + /* Find position of highest set bit (log2) using bit shifting */ + { + uint64 val = (uint64) duration_us; + + bucket = 0; + while (val > 1) + { + val >>= 1; + bucket++; + } + bucket++; /* shift so bucket 0 = [0,1)us, bucket 1 = [1,2)us */ + } if (bucket >= WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS) bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1; From 6a81684f9d1e1e79ec83b995d5a21f115ff1d035 Mon Sep 17 00:00:00 2001 From: Dmitry 
Fomin Date: Mon, 23 Mar 2026 17:05:07 +0000 Subject: [PATCH 3/8] Fix segfault: handle auxiliary processes with procNumber >= MaxBackends --- src/backend/utils/activity/wait_event_timing.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index 09a444f4ab881..7d81561142d9d 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -71,12 +71,21 @@ WaitEventTimingShmemInit(void) * Called from InitProcess() after the backend has a valid procNumber. * * procNumber is the PGPROC array index (from GetNumberFromPGProc). + * Auxiliary processes (startup, bgwriter, etc.) have procNumbers beyond + * MaxBackends — we silently skip timing for them. */ void pgstat_set_wait_event_timing_storage(int procNumber) { - Assert(WaitEventTimingArray != NULL); - Assert(procNumber >= 0 && procNumber < MaxBackends); + if (WaitEventTimingArray == NULL) + return; + + /* Auxiliary processes have procNumbers >= MaxBackends; skip them */ + if (procNumber < 0 || procNumber >= MaxBackends) + { + my_wait_event_timing = NULL; + return; + } my_wait_event_timing = &WaitEventTimingArray[procNumber]; From 7cf09dbb28edc6058914a7d116f63e5c5f63bd17 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Mon, 23 Mar 2026 20:16:05 +0000 Subject: [PATCH 4/8] Add query attribution: per (query_id, event) wait statistics When wait_event_timing is enabled, also accumulates wait stats per (query_id, wait_event) pair in a per-backend hash table (1024 slots, open addressing with linear probing). Query_id is read from PgBackendStatus->st_query_id via a cached pointer set during pgstat_beinit(). Zero overhead when query_id is not set (most background processes). New SQL function: pg_stat_get_wait_event_timing_by_query() Returns (backend_id, query_id, wait_event_type, wait_event, calls, total_time_ms) for all backends with non-zero counts. 
Shared memory: 32 KB per backend (1024 * 32-byte entries). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/backend/storage/ipc/ipci.c | 2 + src/backend/utils/activity/backend_status.c | 8 ++ .../utils/activity/wait_event_timing.c | 107 +++++++++++++++++- src/include/catalog/pg_proc.dat | 10 ++ src/include/utils/wait_event.h | 10 ++ src/include/utils/wait_event_timing.h | 83 ++++++++++++++ 6 files changed, 219 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index e07f893301d40..e02a9a000049c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -138,6 +138,7 @@ CalculateShmemSize(void) size = add_size(size, StatsShmemSize()); size = add_size(size, WaitEventCustomShmemSize()); size = add_size(size, WaitEventTimingShmemSize()); + size = add_size(size, WaitEventQueryShmemSize()); size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); @@ -327,6 +328,7 @@ CreateOrAttachShmemStructs(void) StatsShmemInit(); WaitEventCustomShmemInit(); WaitEventTimingShmemInit(); + WaitEventQueryShmemInit(); InjectionPointShmemInit(); AioShmemInit(); WaitLSNShmemInit(); diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index cd08712946911..fb920c1481b6c 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -20,6 +20,7 @@ #include "storage/proc.h" /* for MyProc */ #include "storage/procarray.h" #include "utils/ascii.h" +#include "utils/wait_event_timing.h" #include "utils/guc.h" /* for application_name */ #include "utils/memutils.h" @@ -249,6 +250,13 @@ pgstat_beinit(void) Assert(MyProcNumber >= 0 && MyProcNumber < NumBackendStatSlots); MyBEEntry = &BackendStatusArray[MyProcNumber]; + /* + * Point the wait event timing query_id pointer at our st_query_id. 
+ * This must happen here (not in InitProcess) because MyBEEntry is + * not yet set when pgstat_set_wait_event_timing_storage() runs. + */ + my_wait_event_query_id_ptr = &MyBEEntry->st_query_id; + /* Set up a process-exit hook to clean up */ on_shmem_exit(pgstat_beshutdown_hook, 0); } diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index 7d81561142d9d..d31a3a6a4bc7a 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -36,8 +36,15 @@ bool wait_event_timing = false; /* Pointer to this backend's timing state */ WaitEventTimingState *my_wait_event_timing = NULL; -/* Shared memory base pointer (array of MaxBackends entries) */ +/* Pointer to this backend's query attribution hash */ +WaitEventQueryState *my_wait_event_query = NULL; + +/* Pointer to current backend's query_id in PgBackendStatus */ +volatile int64 *my_wait_event_query_id_ptr = NULL; + +/* Shared memory base pointers */ static WaitEventTimingState *WaitEventTimingArray = NULL; +static WaitEventQueryState *WaitEventQueryArray = NULL; /* * Report the shared memory space needed. @@ -66,6 +73,33 @@ WaitEventTimingShmemInit(void) memset(WaitEventTimingArray, 0, size); } +/* + * Report the shared memory space needed for query attribution. + */ +Size +WaitEventQueryShmemSize(void) +{ + return mul_size(MaxBackends, sizeof(WaitEventQueryState)); +} + +/* + * Initialize shared memory for query attribution. + */ +void +WaitEventQueryShmemInit(void) +{ + bool found; + Size size; + + size = WaitEventQueryShmemSize(); + + WaitEventQueryArray = (WaitEventQueryState *) + ShmemInitStruct("WaitEventQueryArray", size, &found); + + if (!found) + memset(WaitEventQueryArray, 0, size); +} + /* * Point my_wait_event_timing at this backend's slot. * Called from InitProcess() after the backend has a valid procNumber. 
@@ -91,6 +125,13 @@ pgstat_set_wait_event_timing_storage(int procNumber) /* Zero the state for this new backend session */ memset(my_wait_event_timing, 0, sizeof(WaitEventTimingState)); + + /* Set up query attribution hash (same index) */ + if (WaitEventQueryArray != NULL) + { + my_wait_event_query = &WaitEventQueryArray[procNumber]; + memset(my_wait_event_query, 0, sizeof(WaitEventQueryState)); + } } /* @@ -100,6 +141,8 @@ void pgstat_reset_wait_event_timing_storage(void) { my_wait_event_timing = NULL; + my_wait_event_query = NULL; + my_wait_event_query_id_ptr = NULL; } /* @@ -183,3 +226,65 @@ pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * SQL function: pg_stat_get_wait_event_timing_by_query(OUT ...) + * + * Returns one row per (backend_id, query_id, wait_event) with non-zero counts. + */ +Datum +pg_stat_get_wait_event_timing_by_query(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int backend_idx; + + InitMaterializedSRF(fcinfo, 0); + + if (WaitEventQueryArray == NULL) + PG_RETURN_VOID(); + + for (backend_idx = 0; backend_idx < MaxBackends; backend_idx++) + { + WaitEventQueryState *qs = &WaitEventQueryArray[backend_idx]; + int i; + + for (i = 0; i < WAIT_EVENT_QUERY_HASH_SIZE; i++) + { + WaitEventQueryEntry *entry = &qs->entries[i]; + Datum values[6]; + bool nulls[6]; + uint32 wait_event_info; + const char *event_type; + const char *event_name; + + if (entry->query_id == 0) + continue; + + /* Reconstruct wait_event_info from flat index */ + wait_event_info = + ((entry->event_idx / WAIT_EVENT_TIMING_EVENTS_PER_CLASS) << 24) | + (entry->event_idx % WAIT_EVENT_TIMING_EVENTS_PER_CLASS); + + event_type = pgstat_get_wait_event_type(wait_event_info); + event_name = pgstat_get_wait_event(wait_event_info); + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(backend_idx + 1); + values[1] = Int64GetDatum(entry->query_id); + 
values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Int64GetDatum(entry->count); + values[5] = Float8GetDatum((double) entry->total_ns / 1000000.0); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + + PG_RETURN_VOID(); +} diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 83a8182fd895e..e0faeb2303b28 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12853,4 +12853,14 @@ proargnames => '{backend_id,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}', prosrc => 'pg_stat_get_wait_event_timing' }, +{ oid => '9951', + descr => 'statistics: per-backend wait event timing by query (count, duration)', + proname => 'pg_stat_get_wait_event_timing_by_query', prorows => '100', + proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,int8,text,text,int8,float8}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{backend_id,query_id,wait_event_type,wait_event,calls,total_time_ms}', + prosrc => 'pg_stat_get_wait_event_timing_by_query' }, + ] diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index c584dc5853df6..611df800020a3 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -132,6 +132,16 @@ pgstat_report_wait_end(void) entry->histogram[wait_event_timing_bucket(duration_ns)]++; } + /* Query attribution: accumulate per (query_id, event) */ + if (my_wait_event_query_id_ptr) + { + int64 qid = *my_wait_event_query_id_ptr; + + if (qid != 0) + wait_event_query_accumulate(my_wait_event_query, + qid, idx, duration_ns); + } + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); } } diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index db723cc133425..2f9f0fa0c294b 100644 --- a/src/include/utils/wait_event_timing.h 
+++ b/src/include/utils/wait_event_timing.h @@ -96,15 +96,47 @@ typedef struct WaitEventTimingState } WaitEventTimingState; +/* + * Per-(query_id, event) accumulated statistics for query attribution. + * Open-addressing hash table with linear probing, per backend. + */ +#define WAIT_EVENT_QUERY_HASH_SIZE 1024 /* must be power of 2 */ + +typedef struct WaitEventQueryEntry +{ + int64 query_id; /* 0 means slot is empty */ + int32 event_idx; /* flat index from wait_event_timing_index() */ + int32 pad; + int64 count; + int64 total_ns; +} WaitEventQueryEntry; /* 32 bytes */ + +typedef struct WaitEventQueryState +{ + int32 num_used; /* occupied slots */ + int32 pad; + int64 overflow_count; /* waits dropped due to full table */ + WaitEventQueryEntry entries[WAIT_EVENT_QUERY_HASH_SIZE]; +} WaitEventQueryState; + + /* GUC variable */ extern PGDLLIMPORT bool wait_event_timing; /* Pointer to this backend's timing state in shared memory */ extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; +/* Pointer to this backend's query attribution hash in shared memory */ +extern PGDLLIMPORT WaitEventQueryState *my_wait_event_query; + +/* Pointer to current backend's query_id in PgBackendStatus (set late) */ +extern PGDLLIMPORT volatile int64 *my_wait_event_query_id_ptr; + /* Shared memory setup */ extern Size WaitEventTimingShmemSize(void); extern void WaitEventTimingShmemInit(void); +extern Size WaitEventQueryShmemSize(void); +extern void WaitEventQueryShmemInit(void); /* Called from InitProcess() to point my_wait_event_timing at our slot */ extern void pgstat_set_wait_event_timing_storage(int procNumber); @@ -162,4 +194,55 @@ wait_event_timing_bucket(int64 duration_ns) return bucket; } +/* + * Accumulate a wait event duration into the per-(query_id, event) hash table. + * Uses open addressing with linear probing. Called from the hot path in + * pgstat_report_wait_end() when wait_event_timing is on and query_id != 0. 
+ */ +static inline void +wait_event_query_accumulate(WaitEventQueryState *qs, int64 query_id, + int event_idx, int64 duration_ns) +{ + uint64 hash; + int slot; + int i; + + if (qs == NULL) + return; + + /* Simple hash combining query_id and event_idx */ + hash = ((uint64) query_id * 0x9e3779b97f4a7c15ULL) ^ (uint64) event_idx; + slot = (int) (hash & (WAIT_EVENT_QUERY_HASH_SIZE - 1)); + + for (i = 0; i < WAIT_EVENT_QUERY_HASH_SIZE; i++) + { + WaitEventQueryEntry *e = &qs->entries[slot]; + + if (e->query_id == query_id && e->event_idx == event_idx) + { + /* Found existing entry */ + e->count++; + e->total_ns += duration_ns; + return; + } + + if (e->query_id == 0) + { + /* Empty slot — insert new entry */ + e->query_id = query_id; + e->event_idx = event_idx; + e->count = 1; + e->total_ns = duration_ns; + qs->num_used++; + return; + } + + /* Collision — linear probe */ + slot = (slot + 1) & (WAIT_EVENT_QUERY_HASH_SIZE - 1); + } + + /* Table full — drop this measurement */ + qs->overflow_count++; +} + #endif /* WAIT_EVENT_TIMING_H */ From e8a17171aded61cd3b7cf474546b9d9a82a4c47e Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Mon, 23 Mar 2026 20:25:11 +0000 Subject: [PATCH 5/8] Add query attribution: per (query_id, event) wait statistics When wait_event_timing is enabled, also accumulates wait stats per (query_id, wait_event) pair in a per-backend hash table (1024 slots, open addressing with linear probing). Query_id is read from PgBackendStatus->st_query_id via a cached pointer set during pgstat_beinit(). Zero overhead when query_id is not set (most background processes). New SQL function: pg_stat_get_wait_event_timing_by_query() Returns (backend_id, query_id, wait_event_type, wait_event, calls, total_time_ms) for all backends with non-zero counts. Shared memory: 32 KB per backend (1024 * 32-byte entries). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/include/utils/wait_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 611df800020a3..423cac6532d99 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -133,7 +133,8 @@ pgstat_report_wait_end(void) } /* Query attribution: accumulate per (query_id, event) */ - if (my_wait_event_query_id_ptr) + if (likely(idx >= 0) && my_wait_event_query != NULL && + my_wait_event_query_id_ptr != NULL) { int64 qid = *my_wait_event_query_id_ptr; From 1b95736c912e3c410312834e6e4a670ad71a8750 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Mon, 23 Mar 2026 21:06:14 +0000 Subject: [PATCH 6/8] Add 10046-style per-session wait event trace ring buffer When wait_event_trace GUC is enabled for a session (PGC_USERSET), every pgstat_report_wait_end() writes a record to a per-backend ring buffer in shared memory: {timestamp_ns, event, duration_ns, query_id}. Ring buffer holds 4096 records (128 KB) per backend. At 220K events/sec this covers ~18ms of history. External tools read via pg_stat_get_wait_event_trace(backend_id). Requires wait_event_timing = on (trace code runs inside the timing block to reuse the already-computed timestamp and duration). New GUC: wait_event_trace (bool, PGC_USERSET, default off) New SQL function: pg_stat_get_wait_event_trace(backend_id int4) Returns (seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id) in chronological order. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/backend/storage/ipc/ipci.c | 2 + .../utils/activity/wait_event_timing.c | 124 +++++++++++++++++- src/backend/utils/misc/guc_parameters.dat | 18 ++- src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/catalog/pg_proc.dat | 10 ++ src/include/utils/wait_event.h | 15 +++ src/include/utils/wait_event_timing.h | 33 ++++- 7 files changed, 195 insertions(+), 8 deletions(-) diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index e02a9a000049c..f7512905ae10c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -139,6 +139,7 @@ CalculateShmemSize(void) size = add_size(size, WaitEventCustomShmemSize()); size = add_size(size, WaitEventTimingShmemSize()); size = add_size(size, WaitEventQueryShmemSize()); + size = add_size(size, WaitEventTraceShmemSize()); size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); @@ -329,6 +330,7 @@ CreateOrAttachShmemStructs(void) WaitEventCustomShmemInit(); WaitEventTimingShmemInit(); WaitEventQueryShmemInit(); + WaitEventTraceShmemInit(); InjectionPointShmemInit(); AioShmemInit(); WaitLSNShmemInit(); diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index d31a3a6a4bc7a..feb322330cac5 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -30,8 +30,9 @@ #include "utils/wait_event.h" #include "utils/wait_event_timing.h" -/* GUC variable */ +/* GUC variables */ bool wait_event_timing = false; +bool wait_event_trace = false; /* Pointer to this backend's timing state */ WaitEventTimingState *my_wait_event_timing = NULL; @@ -39,12 +40,16 @@ WaitEventTimingState *my_wait_event_timing = NULL; /* Pointer to this backend's query attribution hash */ WaitEventQueryState *my_wait_event_query = NULL; +/* Pointer to this backend's trace ring 
buffer */ +WaitEventTraceState *my_wait_event_trace = NULL; + /* Pointer to current backend's query_id in PgBackendStatus */ volatile int64 *my_wait_event_query_id_ptr = NULL; /* Shared memory base pointers */ static WaitEventTimingState *WaitEventTimingArray = NULL; static WaitEventQueryState *WaitEventQueryArray = NULL; +static WaitEventTraceState *WaitEventTraceArray = NULL; /* * Report the shared memory space needed. @@ -100,6 +105,33 @@ WaitEventQueryShmemInit(void) memset(WaitEventQueryArray, 0, size); } +/* + * Report the shared memory space needed for trace ring buffers. + */ +Size +WaitEventTraceShmemSize(void) +{ + return mul_size(MaxBackends, sizeof(WaitEventTraceState)); +} + +/* + * Initialize shared memory for trace ring buffers. + */ +void +WaitEventTraceShmemInit(void) +{ + bool found; + Size size; + + size = WaitEventTraceShmemSize(); + + WaitEventTraceArray = (WaitEventTraceState *) + ShmemInitStruct("WaitEventTraceArray", size, &found); + + if (!found) + memset(WaitEventTraceArray, 0, size); +} + /* * Point my_wait_event_timing at this backend's slot. * Called from InitProcess() after the backend has a valid procNumber. 
@@ -132,6 +164,14 @@ pgstat_set_wait_event_timing_storage(int procNumber) my_wait_event_query = &WaitEventQueryArray[procNumber]; memset(my_wait_event_query, 0, sizeof(WaitEventQueryState)); } + + /* Set up trace ring buffer (same index) */ + if (WaitEventTraceArray != NULL) + { + my_wait_event_trace = &WaitEventTraceArray[procNumber]; + memset(my_wait_event_trace, 0, sizeof(WaitEventTraceState)); + pg_atomic_init_u64(&my_wait_event_trace->write_pos, 0); + } } /* @@ -142,6 +182,7 @@ pgstat_reset_wait_event_timing_storage(void) { my_wait_event_timing = NULL; my_wait_event_query = NULL; + my_wait_event_trace = NULL; my_wait_event_query_id_ptr = NULL; } @@ -288,3 +329,84 @@ pg_stat_get_wait_event_timing_by_query(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * SQL function: pg_stat_get_wait_event_trace(backend_id int4) + * + * Returns trace records from a backend's ring buffer in chronological order. + * Pass NULL or 0 for own backend. + */ +Datum +pg_stat_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int backend_idx; + WaitEventTraceState *ts; + uint64 write_pos; + uint64 read_start; + uint64 i; + + InitMaterializedSRF(fcinfo, 0); + + if (WaitEventTraceArray == NULL) + PG_RETURN_VOID(); + + /* Determine which backend to read */ + if (PG_ARGISNULL(0) || PG_GETARG_INT32(0) <= 0) + { + /* Own backend — find our index */ + if (my_wait_event_trace == NULL) + PG_RETURN_VOID(); + backend_idx = (int) (my_wait_event_trace - WaitEventTraceArray); + } + else + { + backend_idx = PG_GETARG_INT32(0) - 1; /* 1-based to 0-based */ + } + + if (backend_idx < 0 || backend_idx >= MaxBackends) + PG_RETURN_VOID(); + + ts = &WaitEventTraceArray[backend_idx]; + write_pos = pg_atomic_read_u64(&ts->write_pos); + + if (write_pos == 0) + PG_RETURN_VOID(); + + /* Read from oldest available to newest */ + read_start = (write_pos > WAIT_EVENT_TRACE_RING_SIZE) + ? 
write_pos - WAIT_EVENT_TRACE_RING_SIZE : 0; + + for (i = read_start; i < write_pos; i++) + { + WaitEventTraceRecord *rec = + &ts->records[i & (WAIT_EVENT_TRACE_RING_SIZE - 1)]; + Datum values[6]; + bool nulls[6]; + const char *event_type; + const char *event_name; + + if (rec->event == 0) + continue; + + event_type = pgstat_get_wait_event_type(rec->event); + event_name = pgstat_get_wait_event(rec->event); + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int64GetDatum((int64) i); /* seq */ + values[1] = Int64GetDatum(rec->timestamp_ns); + values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Float8GetDatum((double) rec->duration_ns / 1000.0); + values[5] = Int64GetDatum(rec->query_id); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index df494f3fa40ab..436b751af504a 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3131,12 +3131,6 @@ boot_val => 'false', }, -{ name => 'wait_event_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', - short_desc => 'Collects timing statistics for wait events (count, duration, histogram).', - variable => 'wait_event_timing', - boot_val => 'false', -}, - { name => 'transaction_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', short_desc => 'Sets the size of the dedicated buffer pool used for the transaction status cache.', long_desc => '0 means use a fraction of "shared_buffers".', @@ -3336,6 +3330,18 @@ boot_val => 'true', }, +{ name => 'wait_event_timing', type => 'bool', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Collects timing statistics for wait events (count, duration, histogram).', + variable => 'wait_event_timing', + 
boot_val => 'false', +}, + +{ name => 'wait_event_trace', type => 'bool', context => 'PGC_USERSET', group => 'STATS_CUMULATIVE', + short_desc => 'Enables per-session wait event trace recording into a ring buffer.', + variable => 'wait_event_trace', + boot_val => 'false', +}, + { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows the block size in the write ahead log.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 1a08bd9c32d55..ad88b593d9955 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -690,6 +690,7 @@ #track_io_timing = off #track_wal_io_timing = off #wait_event_timing = off +#wait_event_trace = off #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index e0faeb2303b28..e087513710e30 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12863,4 +12863,14 @@ proargnames => '{backend_id,query_id,wait_event_type,wait_event,calls,total_time_ms}', prosrc => 'pg_stat_get_wait_event_timing_by_query' }, +{ oid => '9952', + descr => 'statistics: per-session wait event trace ring buffer', + proname => 'pg_stat_get_wait_event_trace', prorows => '1000', + proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int8,int8,text,text,float8,int8}', + proargmodes => '{i,o,o,o,o,o,o}', + proargnames => '{backend_id,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}', + prosrc => 'pg_stat_get_wait_event_trace' }, + ] diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 423cac6532d99..03e4a71e79525 100644 --- a/src/include/utils/wait_event.h +++ 
b/src/include/utils/wait_event.h @@ -143,6 +143,21 @@ pgstat_report_wait_end(void) qid, idx, duration_ns); } + /* 10046-style per-session trace ring buffer */ + if (unlikely(wait_event_trace && my_wait_event_trace != NULL)) + { + uint64 pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos); + WaitEventTraceRecord *rec = + &my_wait_event_trace->records[pos & (WAIT_EVENT_TRACE_RING_SIZE - 1)]; + + rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now); + rec->event = event; + rec->duration_ns = duration_ns; + rec->query_id = my_wait_event_query_id_ptr + ? *my_wait_event_query_id_ptr : 0; + pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1); + } + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); } } diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index 2f9f0fa0c294b..ce951ef1ec3c5 100644 --- a/src/include/utils/wait_event_timing.h +++ b/src/include/utils/wait_event_timing.h @@ -24,6 +24,7 @@ #ifndef WAIT_EVENT_TIMING_H #define WAIT_EVENT_TIMING_H +#include "port/atomics.h" #include "portability/instr_time.h" /* @@ -120,8 +121,33 @@ typedef struct WaitEventQueryState } WaitEventQueryState; -/* GUC variable */ +/* + * Per-session wait event trace ring buffer (10046-style). + * When wait_event_trace GUC is on for a session, every wait_end writes + * a record to a per-backend ring buffer. External tools read the buffer + * via pg_stat_get_wait_event_trace(). 
+ */ +#define WAIT_EVENT_TRACE_RING_SIZE 4096 /* must be power of 2 */ + +typedef struct WaitEventTraceRecord +{ + int64 timestamp_ns; /* monotonic clock */ + uint32 event; /* wait_event_info */ + uint32 pad; + int64 duration_ns; + int64 query_id; +} WaitEventTraceRecord; /* 32 bytes */ + +typedef struct WaitEventTraceState +{ + pg_atomic_uint64 write_pos; /* monotonically increasing write position */ + WaitEventTraceRecord records[WAIT_EVENT_TRACE_RING_SIZE]; +} WaitEventTraceState; /* ~128 KB */ + + +/* GUC variables */ extern PGDLLIMPORT bool wait_event_timing; +extern PGDLLIMPORT bool wait_event_trace; /* Pointer to this backend's timing state in shared memory */ extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; @@ -129,6 +155,9 @@ extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; /* Pointer to this backend's query attribution hash in shared memory */ extern PGDLLIMPORT WaitEventQueryState *my_wait_event_query; +/* Pointer to this backend's trace ring buffer in shared memory */ +extern PGDLLIMPORT WaitEventTraceState *my_wait_event_trace; + /* Pointer to current backend's query_id in PgBackendStatus (set late) */ extern PGDLLIMPORT volatile int64 *my_wait_event_query_id_ptr; @@ -137,6 +166,8 @@ extern Size WaitEventTimingShmemSize(void); extern void WaitEventTimingShmemInit(void); extern Size WaitEventQueryShmemSize(void); extern void WaitEventQueryShmemInit(void); +extern Size WaitEventTraceShmemSize(void); +extern void WaitEventTraceShmemInit(void); /* Called from InitProcess() to point my_wait_event_timing at our slot */ extern void pgstat_set_wait_event_timing_storage(int procNumber); From 7cadcee4a33208409514f1025a979a35463c6866 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Tue, 24 Mar 2026 09:40:14 +0000 Subject: [PATCH 7/8] Make wait_event_timing a compile-time option (--enable-wait-event-timing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiled without the flag 
(default), zero instructions are added to pgstat_report_wait_start/end — identical binary to stock PostgreSQL. When compiled with --enable-wait-event-timing: ./configure --enable-wait-event-timing meson setup -Dwait_event_timing=true The wait_event_timing and wait_event_trace GUCs still exist in both builds (setting them is harmless without the compile flag). This eliminates the ~2% overhead from the compiled-in branch check that was present even with GUC off. Co-Authored-By: Claude Opus 4.6 (1M context) --- configure.ac | 8 +++ meson.build | 1 + meson_options.txt | 3 + src/backend/storage/ipc/ipci.c | 4 ++ src/backend/storage/lmgr/proc.c | 8 +++ src/backend/utils/activity/backend_status.c | 4 ++ .../utils/activity/wait_event_timing.c | 58 +++++++++++++++++-- src/backend/utils/misc/guc_tables.c | 4 +- src/include/pg_config.h.in | 3 + src/include/utils/wait_event.h | 25 ++++---- 10 files changed, 100 insertions(+), 18 deletions(-) diff --git a/configure.ac b/configure.ac index f8327a7020a03..547354d2955a2 100644 --- a/configure.ac +++ b/configure.ac @@ -225,6 +225,14 @@ fi AC_SUBST(DTRACEFLAGS)]) AC_SUBST(enable_dtrace) +# +# --enable-wait-event-timing adds wait event timing instrumentation +# +PGAC_ARG_BOOL(enable, wait-event-timing, no, + [build with wait event timing instrumentation], + [AC_DEFINE([USE_WAIT_EVENT_TIMING], 1, + [Define to 1 to build with wait event timing. (--enable-wait-event-timing)])]) + # # TAP tests # diff --git a/meson.build b/meson.build index 0a181909fab15..ab4c486289d46 100644 --- a/meson.build +++ b/meson.build @@ -505,6 +505,7 @@ meson_bin = find_program(meson_binpath, native: true) cdata.set('USE_ASSERT_CHECKING', get_option('cassert') ? 1 : false) cdata.set('USE_INJECTION_POINTS', get_option('injection_points') ? 1 : false) +cdata.set('USE_WAIT_EVENT_TIMING', get_option('wait_event_timing') ? 
1 : false) blocksize = get_option('blocksize').to_int() * 1024 diff --git a/meson_options.txt b/meson_options.txt index 6a793f3e47943..1f191d3a9d621 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -40,6 +40,9 @@ option('pgport', type: 'integer', value: 5432, option('cassert', type: 'boolean', value: false, description: 'Enable assertion checks (for debugging)') +option('wait_event_timing', type: 'boolean', value: false, + description: 'Enable wait event timing instrumentation') + option('tap_tests', type: 'feature', value: 'auto', description: 'Enable TAP tests') diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index f7512905ae10c..6868b723d9ffd 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -137,9 +137,11 @@ CalculateShmemSize(void) size = add_size(size, AsyncShmemSize()); size = add_size(size, StatsShmemSize()); size = add_size(size, WaitEventCustomShmemSize()); +#ifdef USE_WAIT_EVENT_TIMING size = add_size(size, WaitEventTimingShmemSize()); size = add_size(size, WaitEventQueryShmemSize()); size = add_size(size, WaitEventTraceShmemSize()); +#endif size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); @@ -328,9 +330,11 @@ CreateOrAttachShmemStructs(void) AsyncShmemInit(); StatsShmemInit(); WaitEventCustomShmemInit(); +#ifdef USE_WAIT_EVENT_TIMING WaitEventTimingShmemInit(); WaitEventQueryShmemInit(); WaitEventTraceShmemInit(); +#endif InjectionPointShmemInit(); AioShmemInit(); WaitLSNShmemInit(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e91aa33d3ed53..1ccd9a848a045 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -529,7 +529,9 @@ InitProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); +#ifdef USE_WAIT_EVENT_TIMING 
pgstat_set_wait_event_timing_storage(GetNumberFromPGProc(MyProc)); +#endif /* * We might be reusing a semaphore that belonged to a failed process. So @@ -702,7 +704,9 @@ InitAuxiliaryProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); +#ifdef USE_WAIT_EVENT_TIMING pgstat_set_wait_event_timing_storage(GetNumberFromPGProc(MyProc)); +#endif /* Check that group locking fields are in a proper initial state. */ Assert(MyProc->lockGroupLeader == NULL); @@ -993,7 +997,9 @@ ProcKill(int code, Datum arg) */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); +#ifdef USE_WAIT_EVENT_TIMING pgstat_reset_wait_event_timing_storage(); +#endif proc = MyProc; MyProc = NULL; @@ -1059,7 +1065,9 @@ AuxiliaryProcKill(int code, Datum arg) /* look at the equivalent ProcKill() code for comments */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); +#ifdef USE_WAIT_EVENT_TIMING pgstat_reset_wait_event_timing_storage(); +#endif proc = MyProc; MyProc = NULL; diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index fb920c1481b6c..9dae9d946bf42 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -20,7 +20,9 @@ #include "storage/proc.h" /* for MyProc */ #include "storage/procarray.h" #include "utils/ascii.h" +#ifdef USE_WAIT_EVENT_TIMING #include "utils/wait_event_timing.h" +#endif #include "utils/guc.h" /* for application_name */ #include "utils/memutils.h" @@ -250,12 +252,14 @@ pgstat_beinit(void) Assert(MyProcNumber >= 0 && MyProcNumber < NumBackendStatSlots); MyBEEntry = &BackendStatusArray[MyProcNumber]; +#ifdef USE_WAIT_EVENT_TIMING /* * Point the wait event timing query_id pointer at our st_query_id. * This must happen here (not in InitProcess) because MyBEEntry is * not yet set when pgstat_set_wait_event_timing_storage() runs. 
*/ my_wait_event_query_id_ptr = &MyBEEntry->st_query_id; +#endif /* Set up a process-exit hook to clean up */ on_shmem_exit(pgstat_beshutdown_hook, 0); diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index feb322330cac5..32f417ecb198d 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -23,6 +23,58 @@ */ #include "postgres.h" +/* + * GUC variables — always defined so the GUC system works even when + * compiled without --enable-wait-event-timing. Setting them to 'on' + * without the compile flag is harmless (no-op). + */ +bool wait_event_timing = false; +bool wait_event_trace = false; + +#ifndef USE_WAIT_EVENT_TIMING + +/* + * Stub SQL functions when compiled without --enable-wait-event-timing. + * These are referenced by pg_proc.dat and must exist as symbols. + */ +#include "fmgr.h" + +Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS); +Datum pg_stat_get_wait_event_timing_by_query(PG_FUNCTION_ARGS); +Datum pg_stat_get_wait_event_trace(PG_FUNCTION_ARGS); + +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("wait_event_timing is not supported by this build"), + errhint("Compile PostgreSQL with --enable-wait-event-timing."))); + PG_RETURN_VOID(); +} + +Datum +pg_stat_get_wait_event_timing_by_query(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("wait_event_timing is not supported by this build"), + errhint("Compile PostgreSQL with --enable-wait-event-timing."))); + PG_RETURN_VOID(); +} + +Datum +pg_stat_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("wait_event_timing is not supported by this build"), + errhint("Compile PostgreSQL with --enable-wait-event-timing."))); + PG_RETURN_VOID(); +} + +#else /* USE_WAIT_EVENT_TIMING */ + #include "funcapi.h" #include 
"miscadmin.h" #include "storage/shmem.h" @@ -30,10 +82,6 @@ #include "utils/wait_event.h" #include "utils/wait_event_timing.h" -/* GUC variables */ -bool wait_event_timing = false; -bool wait_event_trace = false; - /* Pointer to this backend's timing state */ WaitEventTimingState *my_wait_event_timing = NULL; @@ -410,3 +458,5 @@ pg_stat_get_wait_event_trace(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +#endif /* USE_WAIT_EVENT_TIMING */ diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 5875817fd67c6..3c44a006c0e49 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -78,7 +78,9 @@ #include "replication/syncrep.h" #include "storage/aio.h" #include "storage/bufmgr.h" -#include "utils/wait_event_timing.h" +/* wait_event_timing GUC variables — always available, no-op without compile flag */ +extern bool wait_event_timing; +extern bool wait_event_trace; #include "storage/bufpage.h" #include "storage/copydir.h" #include "storage/fd.h" diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8d61918affec..850e1e562b9d8 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -674,6 +674,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to build with wait event timing. (--enable-wait-event-timing) */ +#undef USE_WAIT_EVENT_TIMING + /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. 
*/ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 03e4a71e79525..88adf6560a2b5 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -12,7 +12,10 @@ /* enums for wait events */ #include "utils/wait_event_types.h" + +#ifdef USE_WAIT_EVENT_TIMING #include "utils/wait_event_timing.h" +#endif extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); @@ -65,8 +68,8 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents); * my_wait_event_info initially points to local memory, making it safe to * call this before MyProc has been initialized. * - * When wait_event_timing is enabled, we also record the start timestamp - * for later duration computation in pgstat_report_wait_end(). + * When compiled with --enable-wait-event-timing, also records the start + * timestamp for later duration computation in pgstat_report_wait_end(). * ---------- */ static inline void @@ -78,16 +81,13 @@ pgstat_report_wait_start(uint32 wait_event_info) */ *(volatile uint32 *) my_wait_event_info = wait_event_info; - /* - * Record start timestamp when wait_event_timing is enabled. - * The branch is predictable (always taken or never taken for the - * lifetime of the session) and costs ~1 ns when not taken. - */ +#ifdef USE_WAIT_EVENT_TIMING if (wait_event_timing && likely(my_wait_event_timing != NULL)) { INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start); my_wait_event_timing->current_event = wait_event_info; } +#endif } /* ---------- @@ -95,17 +95,15 @@ pgstat_report_wait_start(uint32 wait_event_info) * * Called to report end of a wait. * - * When wait_event_timing is enabled, computes the wait duration and - * accumulates it into per-event statistics. 
+ * When compiled with --enable-wait-event-timing and the GUC is enabled, + * computes the wait duration and accumulates it into per-event statistics, + * query attribution hash, and optional per-session trace ring buffer. * ---------- */ static inline void pgstat_report_wait_end(void) { - /* - * When timing is enabled, compute duration and accumulate stats - * before clearing the wait event. - */ +#ifdef USE_WAIT_EVENT_TIMING if (wait_event_timing && likely(my_wait_event_timing != NULL)) { uint32 event = my_wait_event_timing->current_event; @@ -161,6 +159,7 @@ pgstat_report_wait_end(void) INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); } } +#endif /* see pgstat_report_wait_start() */ *(volatile uint32 *) my_wait_event_info = 0; From 4ccf36acda3a033aa85ded1e55d46a2ff1d3a0fc Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Tue, 24 Mar 2026 13:12:00 +0000 Subject: [PATCH 8/8] Increase trace ring buffer from 4K to 128K records per backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At 32 bytes per record, this is 4 MB per backend. At 220K events/sec (worst case), holds ~0.6 seconds of history — enough for a background worker polling at 100ms to drain without losing events. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/include/utils/wait_event_timing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index ce951ef1ec3c5..e25c11f273de4 100644 --- a/src/include/utils/wait_event_timing.h +++ b/src/include/utils/wait_event_timing.h @@ -127,7 +127,7 @@ typedef struct WaitEventQueryState * a record to a per-backend ring buffer. External tools read the buffer * via pg_stat_get_wait_event_trace(). */ -#define WAIT_EVENT_TRACE_RING_SIZE 4096 /* must be power of 2 */ +#define WAIT_EVENT_TRACE_RING_SIZE 131072 /* must be power of 2, 128K records = 4 MB per backend */ typedef struct WaitEventTraceRecord {