diff --git a/charts/bootchain-operator/README.md b/charts/bootchain-operator/README.md index ee2e60c..be67a1d 100644 --- a/charts/bootchain-operator/README.md +++ b/charts/bootchain-operator/README.md @@ -40,6 +40,8 @@ Kubernetes: `>=1.25.0` | crds.install | bool | `true` | | | crds.keep | bool | `true` | | | fullnameOverride | string | `""` | | +| grafana.dashboard.enabled | bool | `false` | | +| grafana.dashboard.labels.grafana_dashboard | string | `"1"` | | | image.pullPolicy | string | `"IfNotPresent"` | | | image.repository | string | `"ghcr.io/user-cube/bootchain-operator"` | | | image.tag | string | `""` | | diff --git a/charts/bootchain-operator/templates/grafana-dashboard.yaml b/charts/bootchain-operator/templates/grafana-dashboard.yaml new file mode 100644 index 0000000..e77f9dd --- /dev/null +++ b/charts/bootchain-operator/templates/grafana-dashboard.yaml @@ -0,0 +1,758 @@ +{{- if and .Values.metrics.enabled .Values.grafana.dashboard.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "bootchain-operator.fullname" . }}-dashboard + namespace: {{ include "bootchain-operator.namespace" . }} + labels: + {{- include "bootchain-operator.labels" . | nindent 4 }} + {{- with .Values.grafana.dashboard.labels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +data: + bootchain-operator.json: |- + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.0.0" }, + { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, + { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, + { "type": "panel", "id": "stat", "name": "Stat", "version": "" }, + { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" }, + { "type": "panel", "id": "table", "name": "Table", "version": "" } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "bootchain-operator — dependency health, reconciliation throughput, and latency", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + 
"pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "count(bootchain_dependencies_total{namespace=~\"$namespace\"})", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "BootDependency Resources", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(bootchain_dependencies_ready{namespace=~\"$namespace\"})", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Dependencies Ready (total)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": 
"prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(bootchain_dependencies_total{namespace=~\"$namespace\"}) - sum(bootchain_dependencies_ready{namespace=~\"$namespace\"})", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Dependencies Not Ready", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(bootchain_reconcile_total{result=\"error\"}[$__rate_interval])) / sum(rate(bootchain_reconcile_total[$__rate_interval]))", + "instant": false, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Reconcile Error Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode":
"auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum(rate(bootchain_reconcile_duration_seconds_bucket[$__rate_interval])) by (le))", + "instant": false, + "legendFormat": "p99", + "refId": "A" + } + ], + "title": "Reconcile Latency p99", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "Dependency Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "green", "value": 1 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 10, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "bootchain_dependencies_ready{namespace=~\"$namespace\", name=~\"$bootdependency\"} / bootchain_dependencies_total{namespace=~\"$namespace\", name=~\"$bootdependency\"}", + "instant": true, + "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}name{{ "}}" }}", + "refId": "A" + } + ], + "title": "Dependency Readiness Ratio", + "type": "gauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 11, + "options": { + "legend": { "calcs": ["lastNotNull", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "bootchain_dependencies_ready{namespace=~\"$namespace\", name=~\"$bootdependency\"}", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}name{{ "}}" }} ready", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "bootchain_dependencies_total{namespace=~\"$namespace\", name=~\"$bootdependency\"}", + "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}name{{ "}}" }} total", + "refId": "B" + } + ], + "title": "Ready vs Total Dependencies Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "color-background" }, + "inspect": false + }, + "mappings": [ + { "options": { "0": { "color": "red", "index": 0, "text": "Not Ready" }, "1": { "color": "green", "index": 1, "text": "Ready" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": 
"green", "value": 1 } + ] + } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "namespace" }, "properties": [{ "id": "custom.width", "value": 120 }] }, + { "matcher": { "id": "byName", "options": "name" }, "properties": [{ "id": "custom.width", "value": 200 }] }, + { "matcher": { "id": "byName", "options": "ready" }, "properties": [{ "id": "custom.width", "value": 80 }] }, + { "matcher": { "id": "byName", "options": "total" }, "properties": [{ "id": "custom.width", "value": 70 }] } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 }, + "id": 12, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "ready" }] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "label_replace(\n bootchain_dependencies_ready{namespace=~\"$namespace\", name=~\"$bootdependency\"}\n == bootchain_dependencies_total{namespace=~\"$namespace\", name=~\"$bootdependency\"},\n \"status\", \"1\", \"\", \"\"\n) or label_replace(\n bootchain_dependencies_ready{namespace=~\"$namespace\", name=~\"$bootdependency\"}\n != bootchain_dependencies_total{namespace=~\"$namespace\", name=~\"$bootdependency\"},\n \"status\", \"0\", \"\", \"\"\n)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "bootchain_dependencies_ready{namespace=~\"$namespace\", name=~\"$bootdependency\"}", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "bootchain_dependencies_total{namespace=~\"$namespace\", name=~\"$bootdependency\"}", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + } + ], + "title": "BootDependency Status Table", + 
"transformations": [ + { "id": "joinByField", "options": { "byField": "name", "mode": "outer" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, + "namespace 2": true, "namespace 3": true, + "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, + "job 1": true, "job 2": true, "job 3": true, + "instance 1": true, "instance 2": true, "instance 3": true, + "status": true + }, + "renameByName": { + "namespace 1": "namespace", + "name": "name", + "Value #B": "ready", + "Value #C": "total", + "Value #A": "all_ready" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 102, + "title": "Reconciliation", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "reconciles / s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, + "id": 20, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" 
}, + "expr": "rate(bootchain_reconcile_total{result=\"success\"}[$__rate_interval])", + "legendFormat": "success", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rate(bootchain_reconcile_total{result=\"error\"}[$__rate_interval])", + "legendFormat": "error", + "refId": "B" + } + ], + "title": "Reconcile Throughput", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 }, + "id": 21, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.50, sum(rate(bootchain_reconcile_duration_seconds_bucket[$__rate_interval])) by (le, result))", + "legendFormat": "p50 {{ "{{" }}result{{ "}}" }}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum(rate(bootchain_reconcile_duration_seconds_bucket[$__rate_interval])) 
by (le, result))", + "legendFormat": "p95 {{ "{{" }}result{{ "}}" }}", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum(rate(bootchain_reconcile_duration_seconds_bucket[$__rate_interval])) by (le, result))", + "legendFormat": "p99 {{ "{{" }}result{{ "}}" }}", + "refId": "C" + } + ], + "title": "Reconcile Duration (p50 / p95 / p99)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 103, + "title": "Webhook", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "requests / s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "id": 30, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rate(controller_runtime_webhook_requests_total{webhook=~\"/mutate-apps-v1-deployment|/validate-core-bootchain.*\"}[$__rate_interval])", + "legendFormat": "{{ "{{" }}webhook{{ "}}" }} ({{ 
"{{" }}code{{ "}}" }})", + "refId": "A" + } + ], + "title": "Webhook Request Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "id": 31, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum(rate(controller_runtime_webhook_latency_seconds_bucket{webhook=~\"/mutate-apps-v1-deployment|/validate-core-bootchain.*\"}[$__rate_interval])) by (le, webhook))", + "legendFormat": "p99 {{ "{{" }}webhook{{ "}}" }}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum(rate(controller_runtime_webhook_latency_seconds_bucket{webhook=~\"/mutate-apps-v1-deployment|/validate-core-bootchain.*\"}[$__rate_interval])) by (le, webhook))", + "legendFormat": "p95 {{ "{{" }}webhook{{ "}}" }}", + "refId": "B" + } + ], + "title": "Webhook Latency (p95 / p99)", + 
"type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["bootchain-operator", "kubernetes", "operator"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "definition": "label_values(bootchain_dependencies_total, namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(bootchain_dependencies_total, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "definition": "label_values(bootchain_dependencies_total{namespace=~\"$namespace\"}, name)", + "hide": 0, + "includeAll": true, + "label": "BootDependency", + "multi": true, + "name": "bootdependency", + "options": [], + "query": { + "query": "label_values(bootchain_dependencies_total{namespace=~\"$namespace\"}, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "bootchain-operator", + "uid": "bootchain-operator-v1", + "version": 1 + } +{{- end }} diff --git a/charts/bootchain-operator/values.yaml b/charts/bootchain-operator/values.yaml index 1881f35..e48aa88 100644 --- a/charts/bootchain-operator/values.yaml +++ b/charts/bootchain-operator/values.yaml @@ -83,6 +83,18 @@ metrics: # Extra labels to add to the ServiceMonitor (e.g. to match a Prometheus instance selector). 
additionalLabels: {} +## @section Grafana +grafana: + dashboard: + # Deploy the bootchain-operator Grafana dashboard as a ConfigMap. + # Works with the Grafana sidecar (grafana.sidecar.dashboards.enabled=true) + # and with the Grafana Operator (GrafanaDashboard CRD). + enabled: false + # Labels applied to the ConfigMap so the Grafana sidecar or operator can discover it. + # Typical kube-prometheus-stack value: { grafana_dashboard: "1" } + labels: + grafana_dashboard: "1" + ## @section Webhooks webhook: enabled: true diff --git a/docs/reference/metrics.md b/docs/reference/metrics.md index 2e1cf38..c5b87c8 100644 --- a/docs/reference/metrics.md +++ b/docs/reference/metrics.md @@ -72,6 +72,141 @@ helm upgrade bootchain-operator charts/bootchain-operator \ The `additionalLabels` must match your Prometheus instance's `serviceMonitorSelector`. +## Grafana dashboard + +The Helm chart ships a pre-built Grafana dashboard that can be deployed as a ConfigMap and auto-discovered by the Grafana sidecar or Grafana Operator. 
+ +### Panels + +| Section | Panel | Description | +|---|---|---| +| Overview | BootDependency Resources | Count of BootDependency objects being tracked | +| Overview | Dependencies Ready (total) | Sum of all reachable dependencies across all resources | +| Overview | Dependencies Not Ready | Sum of unresolved dependencies (yellow at ≥ 1, red at ≥ 5) | +| Overview | Reconcile Error Rate | Fraction of reconciliations that ended in error | +| Overview | Reconcile Latency p99 | 99th-percentile reconcile duration | +| Dependency Health | Dependency Readiness Ratio | Gauge showing ready/total per resource | +| Dependency Health | Ready vs Total Dependencies Over Time | Time-series of ready and total counts per resource | +| Dependency Health | BootDependency Status Table | Per-resource table with ready / total counts | +| Reconciliation | Reconcile Throughput | Reconcile rate (success vs error) over time | +| Reconciliation | Reconcile Duration (p50 / p95 / p99) | Latency percentiles by result | +| Webhook | Webhook Request Rate | Mutating and validating webhook request rates | +| Webhook | Webhook Latency (p95 / p99) | Webhook handler latency percentiles | + +The dashboard includes a **Datasource** selector and two template variables — **Namespace** and **BootDependency** — that filter the dependency panels to the selected resources. The reconcile error rate, reconcile latency, Reconciliation, and Webhook panels are operator-wide and are not affected by these filters. + +### Setup checklist + +Both items below are required. The dashboard will show **"No data"** if either is missing.
+ +| # | Requirement | How to verify | +|---|---|---| +| 1 | **ServiceMonitor enabled** — Prometheus must be scraping the operator's `/metrics` endpoint | Check _Status → Targets_ in the Prometheus UI for a target named `bootchain-operator` | +| 2 | **Dashboard label matches the Grafana sidecar/operator selector** — the ConfigMap must carry the label the sidecar watches | Check the sidecar container's `LABEL` / `LABEL_VALUE` environment variables, or `sidecar.dashboards.label` / `sidecar.dashboards.labelValue` in your Grafana Helm values | + +### Enable with kube-prometheus-stack (Grafana sidecar) + +kube-prometheus-stack ships a Grafana sidecar that auto-discovers ConfigMaps carrying a specific label (default: `grafana_dashboard: "1"`). Both the ServiceMonitor and the dashboard ConfigMap must be enabled together. Use `--set-string` for the label so Helm keeps `1` as a string — Kubernetes label values must be strings: + +```bash +helm upgrade bootchain-operator charts/bootchain-operator \ + --set metrics.serviceMonitor.enabled=true \ + --set metrics.serviceMonitor.additionalLabels.release=prometheus \ + --set grafana.dashboard.enabled=true \ + --set-string grafana.dashboard.labels.grafana_dashboard=1 +``` + +> **`additionalLabels.release`** must match the `serviceMonitorSelector` label of your Prometheus instance. A common value is `prometheus` or `kube-prometheus-stack`. Check with: +> ```bash +> kubectl get prometheus -A -o jsonpath='{.items[*].spec.serviceMonitorSelector}' +> ``` +> If `serviceMonitorSelector` is empty (`{}`), all ServiceMonitors are picked up and the label can be omitted. + +The sidecar will pick up the ConfigMap and import the dashboard automatically — no manual import required.
+ +**Verifying the sidecar picked up the dashboard:** + +```bash +# Check sidecar logs for "Found ConfigMap" or "Updating dashboard" +kubectl logs -n <grafana-namespace> \ + -l app.kubernetes.io/name=grafana \ + -c grafana-sc-dashboard +``` + +### Enable with Grafana Operator + +If you use the Grafana Operator, set `grafana.dashboard.labels` to match your `GrafanaDashboard` label selector: + +```bash +helm upgrade bootchain-operator charts/bootchain-operator \ + --set metrics.serviceMonitor.enabled=true \ + --set grafana.dashboard.enabled=true \ + --set grafana.dashboard.labels.app=grafana +``` + +### Manual import + +If you prefer to import the dashboard manually, extract the JSON from the ConfigMap and paste it into **Grafana → Dashboards → Import**: + +```bash +kubectl get configmap bootchain-operator-dashboard \ + -n bootchain-operator-system \ + -o jsonpath='{.data.bootchain-operator\.json}' > bootchain-operator.json +``` + +Then open Grafana, go to **Dashboards → Import**, upload `bootchain-operator.json`, and select your Prometheus datasource. + +> The ServiceMonitor must still be enabled for the imported dashboard to show data. + +### Troubleshooting + +**Dashboard does not appear in Grafana** + +1. Confirm the ConfigMap was created: + ```bash + kubectl get configmap bootchain-operator-dashboard -n bootchain-operator-system + ``` +2. Confirm the label on the ConfigMap matches the sidecar's `sidecar.dashboards.label` value (default `grafana_dashboard: "1"`): + ```bash + kubectl get configmap bootchain-operator-dashboard \ + -n bootchain-operator-system \ + --show-labels + ``` +3. If the label is missing or wrong, either re-deploy with the correct `grafana.dashboard.labels` value, or patch it directly (`--overwrite` is required when the label already exists with a different value): + ```bash + kubectl label configmap bootchain-operator-dashboard \ + grafana_dashboard="1" \ + --overwrite \ + -n bootchain-operator-system + ``` +4. Check the sidecar container logs (see _Enable with kube-prometheus-stack_ above). + +--- + +**All panels show "No data"** + +1.
Confirm the operator is exposing metrics: + ```bash + kubectl port-forward svc/bootchain-operator-metrics 8080:8080 \ + -n bootchain-operator-system + # Run curl from a second terminal while the port-forward is active + curl -s http://localhost:8080/metrics | grep bootchain + ``` + If this returns metrics, the operator is healthy. If Prometheus is still not scraping it, the ServiceMonitor is likely missing or has the wrong labels. + +2. Check whether the ServiceMonitor exists: + ```bash + kubectl get servicemonitor -n bootchain-operator-system + ``` + If it does not exist, enable it: + ```bash + helm upgrade bootchain-operator charts/bootchain-operator \ + --set metrics.serviceMonitor.enabled=true \ + --set metrics.serviceMonitor.additionalLabels.release=<prometheus-release-label> + ``` + +3. Verify the ServiceMonitor is being picked up by Prometheus (_Status → Targets_ in the Prometheus UI). If the target is missing, the `additionalLabels` on the ServiceMonitor do not match your Prometheus instance's `serviceMonitorSelector`. + +4. In the Grafana dashboard, confirm the **datasource** variable at the top is pointing to the correct Prometheus instance. + ## Suggested alerts ```yaml