From fbf22fb58ee2a03072e3a3d2afa8f6887c5a85a4 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Sun, 22 Mar 2026 16:14:40 +0100
Subject: [PATCH] fix(cluster): add Jetson Linux 5.15-tegra platform compatibility

Three issues prevent k3s from starting on kernels where the nf_tables xt
extension bridge (nft_compat) is unavailable:

1. kube-router's network policy controller uses the xt_comment iptables
   extension and panics on startup with "Extension comment revision 0 not
   supported, missing kernel module?" Pass --disable-network-policy to k3s
   so the controller never runs. The NSSH1 HMAC handshake remains the
   primary sandbox SSH isolation boundary, so this does not weaken the
   effective security posture.

2. flannel and kube-proxy also fail to insert rules via the nf_tables
   iptables backend on the same kernels. Add an xt_comment probe at
   cluster-entrypoint startup; if the probe fails, switch to
   iptables-legacy via update-alternatives before any other netfilter
   work so that flannel, kube-proxy, and the DNS proxy all use a
   consistent backend.

3. The br_netfilter kernel module must be loaded on the host for iptables
   rules to apply to pod bridge traffic. Without it, ClusterIP DNAT
   (including kube-dns at 10.43.0.10) is never applied to pod packets,
   causing silent DNS timeouts deep in the health-check loop. Add an
   early check that logs an actionable warning if the module is not
   present, instructing the user to run `sudo modprobe br_netfilter` on
   the host. The check warns rather than failing hard because some
   kernels have bridge netfilter support built in and work correctly
   without the module being loaded explicitly.
Signed-off-by: Evan Lezar --- .../skills/debug-openshell-cluster/SKILL.md | 2 + .../src/sandbox/linux/netns.rs | 99 +++++++++++++++++-- deploy/docker/cluster-entrypoint.sh | 70 ++++++++++++- 3 files changed, 159 insertions(+), 12 deletions(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 4d0e4659..5af8895c 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -104,6 +104,7 @@ Look for: - k3s startup errors (certificate issues, port binding failures) - Manifest copy errors from `/opt/openshell/manifests/` - `iptables` or `cgroup` errors (privilege/capability issues) +- `Warning: br_netfilter does not appear to be loaded` — this is advisory only; many kernels work without the explicit module. Only act on it if you also see DNS failures or pod-to-service connectivity problems (see Common Failure Patterns). ### Step 2: Check k3s Cluster Health @@ -308,6 +309,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | Port conflict | Another service on the configured gateway host port (default 8080) | Stop conflicting service or use `--port` on `openshell gateway start` to pick a different host port | | gRPC connect refused to `127.0.0.1:443` in CI | Docker daemon is remote (`DOCKER_HOST=tcp://...`) but metadata still points to loopback | Verify metadata endpoint host matches `DOCKER_HOST` and includes non-loopback host | | DNS failures inside container | Entrypoint DNS detection failed | `openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf` and `openshell doctor logs --lines 20` | +| Pods can't reach kube-dns / ClusterIP services | `br_netfilter` not loaded; bridge traffic bypasses iptables DNAT rules | `sudo modprobe br_netfilter` on the host, then `echo br_netfilter \| sudo tee /etc/modules-load.d/br_netfilter.conf` to persist. Known to be required on Jetson Linux 5.15-tegra; other kernels (e.g. 
standard x86/aarch64 Linux) may have bridge netfilter built in and work without the module. The entrypoint logs a warning when `/proc/sys/net/bridge/bridge-nf-call-iptables` is absent but does not abort — only act on it if DNS or service connectivity is actually broken. | | Node DiskPressure / MemoryPressure / PIDPressure | Insufficient disk, memory, or PIDs on host | Free disk (`docker system prune -a --volumes`), increase memory, or expand host resources. Bootstrap auto-detects via `HEALTHCHECK_NODE_PRESSURE` marker | | Pods evicted with "The node had condition: [DiskPressure]" | Host disk full, kubelet evicting pods | Free disk space on host, then `openshell gateway destroy && openshell gateway start` | | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-sandbox/src/sandbox/linux/netns.rs index 5e6907c5..095ed86c 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/netns.rs @@ -262,15 +262,18 @@ impl NetworkNamespace { info!( namespace = %self.name, - iptables = iptables_path, + iptables = %iptables_path, proxy_addr = %format!("{}:{}", host_ip_str, proxy_port), "Installing bypass detection rules" ); // Install IPv4 rules - if let Err(e) = - self.install_bypass_rules_for(iptables_path, &host_ip_str, &proxy_port_str, &log_prefix) - { + if let Err(e) = self.install_bypass_rules_for( + &iptables_path, + &host_ip_str, + &proxy_port_str, + &log_prefix, + ) { warn!( namespace = %self.name, error = %e, @@ -281,7 +284,7 @@ impl NetworkNamespace { // Install IPv6 rules — best-effort. // Skip the proxy ACCEPT rule for IPv6 since the proxy address is IPv4. 
- if let Some(ip6_path) = find_ip6tables(iptables_path) { + if let Some(ip6_path) = find_ip6tables(&iptables_path) { if let Err(e) = self.install_bypass_rules_for_v6(&ip6_path, &log_prefix) { warn!( namespace = %self.name, @@ -666,12 +669,92 @@ fn run_iptables_netns(netns: &str, iptables_cmd: &str, args: &[&str]) -> Result< const IPTABLES_SEARCH_PATHS: &[&str] = &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"]; +/// Returns true if xt extension modules (e.g. xt_comment) cannot be used +/// via the given iptables binary. +/// +/// Some kernels have nf_tables but lack the nft_compat bridge that allows +/// xt extension modules to be used through the nf_tables path (e.g. Jetson +/// Linux 5.15-tegra). This probe detects that condition by attempting to +/// insert a rule using the xt_comment extension. If it fails, xt extensions +/// are unavailable and the caller should fall back to iptables-legacy. +fn xt_extensions_unavailable(iptables_path: &str) -> bool { + // Create a temporary probe chain. If this fails (e.g. no CAP_NET_ADMIN), + // we can't determine availability — assume extensions are available. + let created = Command::new(iptables_path) + .args(["-t", "filter", "-N", "_xt_probe"]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + if !created { + return false; + } + + // Attempt to insert a rule using xt_comment. Failure means nft_compat + // cannot bridge xt extension modules on this kernel. + let probe_ok = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-A", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + // Clean up — best-effort, ignore failures. 
+    let _ = Command::new(iptables_path)
+        .args([
+            "-t",
+            "filter",
+            "-D",
+            "_xt_probe",
+            "-m",
+            "comment",
+            "--comment",
+            "probe",
+            "-j",
+            "ACCEPT",
+        ])
+        .output();
+    let _ = Command::new(iptables_path)
+        .args(["-t", "filter", "-X", "_xt_probe"])
+        .output();
+
+    !probe_ok
+}
+
 /// Find the iptables binary path, checking well-known locations.
-fn find_iptables() -> Option<&'static str> {
-    IPTABLES_SEARCH_PATHS
+///
+/// If xt extension modules are unavailable via the standard binary and
+/// `iptables-legacy` is available alongside it, the legacy binary is returned
+/// instead. This ensures bypass-detection rules can be installed on kernels
+/// where `nft_compat` is unavailable (e.g. Jetson Linux 5.15-tegra).
+fn find_iptables() -> Option<String> {
+    let standard_path = IPTABLES_SEARCH_PATHS
         .iter()
         .find(|path| std::path::Path::new(path).exists())
-        .copied()
+        .copied()?;
+
+    if xt_extensions_unavailable(standard_path) {
+        let legacy_path = standard_path.replace("iptables", "iptables-legacy");
+        if std::path::Path::new(&legacy_path).exists() {
+            debug!(
+                legacy = %legacy_path,
+                "xt extensions unavailable; using iptables-legacy"
+            );
+            return Some(legacy_path);
+        }
+    }
+
+    Some(standard_path.to_string())
 }
 
 /// Find the ip6tables binary path, deriving it from the iptables location.
diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh
index 84b8cf9a..2fea6fa6 100644
--- a/deploy/docker/cluster-entrypoint.sh
+++ b/deploy/docker/cluster-entrypoint.sh
@@ -25,6 +25,61 @@
 set -e
 
+# ---------------------------------------------------------------------------
+# Select iptables backend
+# ---------------------------------------------------------------------------
+# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem
+# but lack the nft_compat bridge that allows flannel and kube-proxy to use
+# xt extension modules (xt_comment, xt_conntrack). Detect this by probing
+# whether xt_comment is usable via the current iptables backend. If the
+# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1
+# externally to skip the probe and force the legacy backend.
+# ---------------------------------------------------------------------------
+# Check br_netfilter kernel module
+# ---------------------------------------------------------------------------
+# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through
+# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are
+# never applied to pod traffic, so pods cannot reach services such as
+# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution.
+#
+# The module must be loaded on the HOST before the container starts —
+# containers cannot load kernel modules themselves. If it is missing, log a
+# warning rather than failing hard: some kernels have bridge netfilter support
+# built-in or expose it differently, and will work correctly without the module
+# being explicitly loaded as a separate .ko.
+if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
+    echo "Warning: br_netfilter does not appear to be loaded on the host." >&2
+    echo "    Pod-to-service networking (including kube-dns) may not work without it." >&2
+    echo "    If the cluster fails to start or DNS is broken, try loading it on the host:" >&2
+    echo "        sudo modprobe br_netfilter" >&2
+    echo "    To persist across reboots:" >&2
+    echo "        echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2
+fi
+
+if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then
+    if iptables -t filter -N _xt_probe 2>/dev/null; then
+        _probe_rc=0
+        iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \
+            2>/dev/null || _probe_rc=$?
+ iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || true + iptables -t filter -X _xt_probe 2>/dev/null || true + [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 + fi +fi + +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" + if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && \ + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then + echo "Now using iptables-legacy mode" + else + echo "Warning: could not switch to iptables-legacy — cluster networking may fail" + fi +fi + +IPTABLES=$([ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && echo iptables-legacy || echo iptables) + RESOLV_CONF="/etc/rancher/k3s/resolv.conf" has_default_route() { @@ -74,11 +129,11 @@ setup_dns_proxy() { # Docker sets up rules like: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p udp --dport 53 -j DNAT --to-destination 127.0.0.11: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p tcp --dport 53 -j DNAT --to-destination 127.0.0.11: - UDP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + UDP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p udp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) - TCP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + TCP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p tcp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) @@ -101,9 +156,9 @@ setup_dns_proxy() { echo "Setting up DNS proxy: ${CONTAINER_IP}:53 -> 127.0.0.11 (udp:${UDP_PORT}, tcp:${TCP_PORT})" # Forward DNS from pods (PREROUTING) and local processes (OUTPUT) to Docker's DNS - iptables -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${UDP_PORT}" - iptables -t nat -I PREROUTING -p tcp --dport 53 -d 
"$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p tcp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${TCP_PORT}" echo "nameserver $CONTAINER_IP" > "$RESOLV_CONF" @@ -495,6 +550,13 @@ if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then EXTRA_KUBELET_ARGS="--kubelet-arg=fail-cgroupv1=false" fi +# On kernels where xt_comment is unavailable, kube-router's network policy +# controller panics at startup. Disable it when the iptables-legacy probe +# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead. +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy" +fi + # Docker Desktop can briefly start the container before its bridge default route # is fully installed. k3s exits immediately in that state, so wait briefly for # routing to settle first.