From 5ae8a608df1f275f6733de3971df64299c37ac0d Mon Sep 17 00:00:00 2001 From: llink5 Date: Wed, 1 Apr 2026 11:53:51 +0000 Subject: [PATCH] runtime: use symptom-based rescan instead of runtime detection Modern container runtimes (Docker 29+) no longer advertise their identity through OCI hooks or annotations. Rather than attempting fragile runtime detection, check for the symptom: no network endpoints after sandbox creation. - Remove IsDockerContainer guard from RescanNetwork goroutine - Remove container kill on timeout (too aggressive without reliable runtime detection, breaks CNI on slow architectures) - Restore original startVM endpoint scan condition (fixes CNI regression on s390x) - RescanNetwork returns nil on timeout with warning instead of error Signed-off-by: llink5 --- src/runtime/pkg/containerd-shim-v2/start.go | 33 ++++++++------------- src/runtime/virtcontainers/sandbox.go | 8 ++--- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/start.go b/src/runtime/pkg/containerd-shim-v2/start.go index 06ace930a7..f0d04e1cbe 100644 --- a/src/runtime/pkg/containerd-shim-v2/start.go +++ b/src/runtime/pkg/containerd-shim-v2/start.go @@ -8,13 +8,11 @@ package containerdshim import ( "context" "fmt" - "syscall" "github.com/containerd/containerd/api/types/task" "github.com/sirupsen/logrus" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" - vcutils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) func startContainer(ctx context.Context, s *service, c *container) (retErr error) { @@ -48,25 +46,18 @@ func startContainer(ctx context.Context, s *service, c *container) (retErr error } go watchSandbox(ctx, s) - // Docker 26+ configures networking after the Start response. - // Run the network rescan asynchronously so we don't block - // the Start RPC — Docker won't call allocateNetwork until - // it receives the StartResponse. 
- if c.spec != nil && vcutils.IsDockerContainer(c.spec) { - go func() { - if err := s.sandbox.RescanNetwork(s.ctx); err != nil { - shimLog.WithError(err).WithFields(logrus.Fields{ - "sandbox": s.sandbox.ID(), - "container": c.id, - }).Error("Docker 26+ network setup failed: no interfaces discovered after timeout. " + - "Container killed to prevent silent networking failure. " + - "Check Docker daemon logs and network configuration.") - if sigErr := s.sandbox.SignalProcess(s.ctx, c.id, c.id, syscall.SIGKILL, true); sigErr != nil { - shimLog.WithError(sigErr).Error("failed to kill container after network setup failure") - } - } - }() - } + // Always schedule an async network rescan; there is deliberately no + // endpoint check at this call site. The rescan handles runtimes that + // configure networking after task creation (e.g. Docker 26+ configures + // networking after the Start response, and prestart hooks may + // not have run yet on slower architectures). + // RescanNetwork is idempotent — it returns immediately if + // endpoints already exist. + go func() { + if err := s.sandbox.RescanNetwork(s.ctx); err != nil { + shimLog.WithError(err).Error("async network rescan failed — container may lack networking") + } + }() // We use s.ctx(`ctx` derived from `s.ctx`) to check for cancellation of the // shim context and the context passed to startContainer for tracing. 
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index bdf2bce4d7..2a878538d2 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -371,7 +371,8 @@ func (s *Sandbox) RescanNetwork(ctx context.Context) error { case <-ctx.Done(): return ctx.Err() case <-deadline.C: - return fmt.Errorf("no network interfaces discovered after %s timeout", maxWait) + s.Logger().Warn("no network interfaces found after timeout — networking may be configured by prestart hooks") + return nil case <-ticker.C: } } @@ -1571,12 +1572,9 @@ func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Con // 3. In case of vm factory, scan the netns to hotplug interfaces after vm is started. // 4. In case of prestartHookFunc, network config might have been changed. We need to // rescan and handle the change. - // 5. If no endpoints were found pre-VM-start (e.g. Docker 26+ placed the - // hypervisor in its own pre-configured netns), rescan now that the - // hypervisor is running so addAllEndpoints can discover its namespace. if !s.config.NetworkConfig.DisableNewNetwork && caps.IsNetworkDeviceHotplugSupported() && - (s.factory != nil || prestartHookFunc != nil || len(s.network.Endpoints()) == 0) { + (s.factory != nil || prestartHookFunc != nil) { if _, err := s.network.AddEndpoints(ctx, s, nil, true); err != nil { return err }