From 221315e4a345cc970acb7f22446b0927f6785af7 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 20 Mar 2026 17:03:55 +0100 Subject: [PATCH 1/3] feat(sandbox): switch device plugin to CDI injection mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Configure the NVIDIA device plugin to use deviceListStrategy=cdi-cri so that GPU devices are injected via direct CDI device requests in the CRI. Sandbox pods now only require the nvidia.com/gpu resource request — runtimeClassName is no longer set on GPU pods. Signed-off-by: Evan Lezar --- architecture/gateway-single-node.md | 8 +++++--- crates/openshell-server/src/sandbox/mod.rs | 9 ++------- .../gpu-manifests/nvidia-device-plugin-helmchart.yaml | 8 ++++++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a..1f4e04cd 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -300,7 +300,7 @@ GPU support is part of the single-node gateway bootstrap path rather than a sepa - When enabled, the cluster container is created with Docker `DeviceRequests`, which is the API equivalent of `docker run --gpus all`. - `deploy/docker/Dockerfile.images` installs NVIDIA Container Toolkit packages in a dedicated Ubuntu stage and copies the runtime binaries, config, and `libnvidia-container` shared libraries into the final Ubuntu-based cluster image. - `deploy/docker/cluster-entrypoint.sh` checks `GPU_ENABLED=true` and copies GPU-only manifests from `/opt/openshell/gpu-manifests/` into k3s's manifests directory. -- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. 
NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. +- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. The chart is configured with `deviceListStrategy: cdi-cri` so the device plugin injects devices via direct CDI device requests in the CRI. - k3s auto-detects `nvidia-container-runtime` on `PATH`, registers the `nvidia` containerd runtime, and creates the `nvidia` `RuntimeClass` automatically. - The OpenShell Helm chart grants the gateway service account cluster-scoped read access to `node.k8s.io/runtimeclasses` and core `nodes` so GPU sandbox admission can verify both the `nvidia` `RuntimeClass` and allocatable GPU capacity before creating a sandbox. @@ -311,10 +311,12 @@ Host GPU drivers & NVIDIA Container Toolkit └─ Docker: --gpus all (DeviceRequests in bollard API) └─ k3s/containerd: nvidia-container-runtime on PATH -> auto-detected └─ k8s: nvidia-device-plugin DaemonSet advertises nvidia.com/gpu - └─ Pods: request nvidia.com/gpu in resource limits + └─ Pods: request nvidia.com/gpu in resource limits (CDI injection — no runtimeClassName needed) ``` -The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`. +Device injection uses CDI (`deviceListStrategy: cdi-cri`): the device plugin injects devices via direct CDI device requests in the CRI. 
Sandbox pods only need `nvidia.com/gpu: 1` in their resource limits — no `runtimeClassName` field is set on GPU pods.
+
+The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` (without `runtimeClassName`) and running `nvidia-smi`.
 
 ## Remote Image Transfer
 
diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs
index e10b33d0..cd8c4cd6 100644
--- a/crates/openshell-server/src/sandbox/mod.rs
+++ b/crates/openshell-server/src/sandbox/mod.rs
@@ -869,12 +869,7 @@ fn sandbox_template_to_k8s(
     }
 
     let mut spec = serde_json::Map::new();
-    if gpu {
-        spec.insert(
-            "runtimeClassName".to_string(),
-            serde_json::json!(GPU_RUNTIME_CLASS_NAME),
-        );
-    } else if !template.runtime_class_name.is_empty() {
+    if !template.runtime_class_name.is_empty() {
         spec.insert(
             "runtimeClassName".to_string(),
             serde_json::json!(template.runtime_class_name),
@@ -1660,7 +1655,7 @@ mod tests {
 
         assert_eq!(
             pod_template["spec"]["runtimeClassName"],
-            serde_json::json!(GPU_RUNTIME_CLASS_NAME)
+            serde_json::Value::Null
         );
         assert_eq!(
             pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME],
diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
index 088562ac..4ad6512a 100644
--- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
+++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
@@ -12,6 +12,10 @@
 # (which requires nvidia.com/gpu.present=true) is overridden to empty
 # so it schedules on any node without requiring NFD/GFD labels.
 #
+# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that
+# devices are injected via direct CDI device requests in the CRI. Sandbox pods
+# only need the nvidia.com/gpu resource request — no runtimeClassName is required.
+# # k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" # RuntimeClass automatically, so no manual RuntimeClass manifest is needed. @@ -28,6 +32,10 @@ spec: createNamespace: true valuesContent: |- runtimeClassName: nvidia + deviceListStrategy: cdi-cri + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" gfd: enabled: false nfd: From fa3fa94e1111645833a3d7be9dbd66684a16620d Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 20 Mar 2026 17:54:52 +0100 Subject: [PATCH 2/3] docs(debug-skill): add CDI device plugin diagnostics for GPU gateways Signed-off-by: Evan Lezar --- .../skills/debug-openshell-cluster/SKILL.md | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 4d0e4659..5b3b6375 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -256,7 +256,43 @@ Look for: - `OOMKilled` — memory limits too low - `FailedMount` — volume issues -### Step 8: Check DNS Resolution +### Step 8: Check GPU Device Plugin and CDI (GPU gateways only) + +Skip this step for non-GPU gateways. + +The NVIDIA device plugin DaemonSet must be running and healthy before GPU sandboxes can be created. It uses CDI injection (`deviceListStrategy: cdi-cri`) to inject GPU devices into sandbox pods — no `runtimeClassName` is set on sandbox pods. + +```bash +# DaemonSet status — numberReady must be >= 1 +openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin + +# Device plugin pod logs — look for "CDI" lines confirming CDI mode is active +openshell doctor exec -- kubectl logs -n nvidia-device-plugin -l app.kubernetes.io/name=nvidia-device-plugin --tail=50 + +# List CDI devices registered by the device plugin (requires nvidia-ctk in the cluster image). 
+# Device plugin CDI entries use the vendor string "k8s.device-plugin.nvidia.com" so entries +# will be prefixed "k8s.device-plugin.nvidia.com/gpu=". If the list is empty, CDI spec +# generation has not completed yet. +openshell doctor exec -- nvidia-ctk cdi list + +# Verify CDI spec files were generated on the node +openshell doctor exec -- ls /var/run/cdi/ + +# Helm install job logs for the device plugin chart +openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-nvidia-device-plugin --tail=100 + +# Confirm a GPU sandbox pod has no runtimeClassName (CDI injection, not runtime class) +openshell doctor exec -- kubectl get pod -n openshell -o jsonpath='{range .items[*]}{.metadata.name}{" runtimeClassName="}{.spec.runtimeClassName}{"\n"}{end}' +``` + +Common issues: + +- **DaemonSet 0/N ready**: The device plugin chart may still be deploying (k3s Helm controller can take 1–2 min) or the pod is crashing. Check pod logs. +- **`nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries**: CDI spec generation has not completed. The device plugin may still be starting or the `cdi-cri` strategy isn't active. Verify `deviceListStrategy: cdi-cri` is in the rendered Helm values. +- **No CDI spec files at `/var/run/cdi/`**: Same as above — device plugin hasn't written CDI specs yet. +- **`HEALTHCHECK_GPU_DEVICE_PLUGIN_NOT_READY` in health check logs**: Device plugin has no ready pods. Check DaemonSet events and pod logs. 
+ +### Step 9: Check DNS Resolution DNS misconfiguration is a common root cause, especially on remote/Linux hosts: @@ -315,6 +351,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` | | Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` target in `deploy/docker/Dockerfile.images`. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker | | `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy && openshell gateway start` | +| `nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries | CDI specs not yet generated by device plugin | Device plugin may still be starting; wait and retry, or check pod logs (Step 8) | ## Full Diagnostic Dump @@ -368,4 +405,9 @@ openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox echo "=== DNS Configuration ===" openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf + +# GPU gateways only +echo "=== GPU Device Plugin ===" +openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin +openshell doctor exec -- nvidia-ctk cdi list ``` From da63c432fa7f5aa5559ce67cef9acb0068708c62 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 20 Mar 2026 23:36:07 +0100 Subject: [PATCH 3/3] feat(gpu): set deviceIDStrategy=index in device plugin Helm values Using index-based device IDs improves compatibility across platforms including 
Jetson/Tegra-based systems, and aligns with the numeric device naming expected in CDI specs. --- deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml index 4ad6512a..1cb0ca70 100644 --- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml +++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml @@ -33,6 +33,7 @@ spec: valuesContent: |- runtimeClassName: nvidia deviceListStrategy: cdi-cri + deviceIDStrategy: index cdi: nvidiaHookPath: /usr/bin/nvidia-cdi-hook nvidiaDriverRoot: "/"