From d13bd3f7eb47bf05debb4fece1dfaf50a55e84d2 Mon Sep 17 00:00:00 2001 From: llink5 Date: Tue, 31 Mar 2026 18:52:05 +0000 Subject: [PATCH] runtime: kill container on network timeout, address review nitpicks - Kill container via SIGKILL when RescanNetwork times out instead of silently continuing without networking - Remove unused networkReady channel - Fix import ordering, structured logging, log levels - Remove double-logging on timeout path - Add rollback comment and interface doc comment - Use logrus.Fields and plain const consistently Signed-off-by: llink5 --- src/runtime/pkg/containerd-shim-v2/start.go | 26 ++++++++++++++----- src/runtime/virtcontainers/interfaces.go | 1 + src/runtime/virtcontainers/network_linux.go | 8 +++--- src/runtime/virtcontainers/sandbox.go | 5 ++-- src/runtime/virtcontainers/utils/utils.go | 8 +++--- .../virtcontainers/utils/utils_test.go | 6 +++++ 6 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/start.go b/src/runtime/pkg/containerd-shim-v2/start.go index adc36b6bda..06ace930a7 100644 --- a/src/runtime/pkg/containerd-shim-v2/start.go +++ b/src/runtime/pkg/containerd-shim-v2/start.go @@ -8,11 +8,13 @@ package containerdshim import ( "context" "fmt" - - "github.com/sirupsen/logrus" + "syscall" "github.com/containerd/containerd/api/types/task" + "github.com/sirupsen/logrus" + "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" + vcutils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) func startContainer(ctx context.Context, s *service, c *container) (retErr error) { @@ -50,11 +52,21 @@ func startContainer(ctx context.Context, s *service, c *container) (retErr error // Run the network rescan asynchronously so we don't block // the Start RPC — Docker won't call allocateNetwork until // it receives the StartResponse. - go func() { - if err := s.sandbox.RescanNetwork(s.ctx); err != nil { - shimLog.WithError(err).Warn("async network rescan failed") - } - }() + if c.spec != nil && vcutils.IsDockerContainer(c.spec) { + go func() { + if err := s.sandbox.RescanNetwork(s.ctx); err != nil { + shimLog.WithError(err).WithFields(logrus.Fields{ + "sandbox": s.sandbox.ID(), + "container": c.id, + }).Error("Docker 26+ network setup failed: no interfaces discovered after timeout. " + + "Container killed to prevent silent networking failure. " + + "Check Docker daemon logs and network configuration.") + if sigErr := s.sandbox.SignalProcess(s.ctx, c.id, c.id, syscall.SIGKILL, true); sigErr != nil { + shimLog.WithError(sigErr).Error("failed to kill container after network setup failure") + } + } + }() + } // We use s.ctx(`ctx` derived from `s.ctx`) to check for cancellation of the // shim context and the context passed to startContainer for tracing. diff --git a/src/runtime/virtcontainers/interfaces.go b/src/runtime/virtcontainers/interfaces.go index c162dd2b50..21dd40d25e 100644 --- a/src/runtime/virtcontainers/interfaces.go +++ b/src/runtime/virtcontainers/interfaces.go @@ -72,6 +72,7 @@ type VCSandbox interface { GetOOMEvent(ctx context.Context) (string, error) GetHypervisorPid() (int, error) + // RescanNetwork re-scans the network namespace for late-discovered endpoints. RescanNetwork(ctx context.Context) error UpdateRuntimeMetrics() error diff --git a/src/runtime/virtcontainers/network_linux.go b/src/runtime/virtcontainers/network_linux.go index 68e4178ddc..a9034b20eb 100644 --- a/src/runtime/virtcontainers/network_linux.go +++ b/src/runtime/virtcontainers/network_linux.go @@ -21,6 +21,7 @@ import ( "time" "github.com/containernetworking/plugins/pkg/ns" + "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" otelTrace "go.opentelemetry.io/otel/trace" @@ -347,10 +348,10 @@ func (n *LinuxNetwork) addAllEndpoints(ctx context.Context, s *Sandbox, hotplug // hypervisor is running in a different namespace and retry there. if len(endpoints) == 0 && s != nil { if hypervisorNs, ok := n.detectHypervisorNetns(s); ok { - networkLogger().WithFields(map[string]interface{}{ + networkLogger().WithFields(logrus.Fields{ "original_netns": n.netNSPath, "hypervisor_netns": hypervisorNs, - }).Info("no endpoints in original netns, switching to hypervisor netns") + }).Debug("no endpoints in original netns, switching to hypervisor netns") origPath := n.netNSPath origCreated := n.netNSCreated @@ -414,6 +415,7 @@ func (n *LinuxNetwork) scanEndpointsInNs(ctx context.Context, s *Sandbox, nsPath for _, link := range linkList { netInfo, err := networkInfoFromLink(netlinkHandle, link) if err != nil { + // No rollback needed — no endpoints were added in this iteration yet. return nil, err } @@ -685,7 +687,7 @@ func (n *LinuxNetwork) RemoveEndpoints(ctx context.Context, s *Sandbox, endpoint if delErr := deleteNetNS(n.placeholderNetNS); delErr != nil { networkLogger().WithField("netns", n.placeholderNetNS).WithError(delErr).Warn("failed to delete placeholder netns during teardown") } else { - networkLogger().Infof("Placeholder network namespace %q deleted", n.placeholderNetNS) + networkLogger().WithField("netns", n.placeholderNetNS).Info("placeholder network namespace deleted") n.placeholderNetNS = "" } } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 21fff0c720..bdf2bce4d7 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -357,7 +357,7 @@ func (s *Sandbox) RescanNetwork(ctx context.Context) error { ticker := time.NewTicker(pollInterval) defer ticker.Stop() - s.Logger().Info("waiting for network interfaces in namespace") + s.Logger().Debug("waiting for network interfaces in namespace") for { if _, err := s.network.AddEndpoints(ctx, s, nil, true); err != nil { @@ -371,8 +371,7 @@ func (s *Sandbox) RescanNetwork(ctx context.Context) error { case <-ctx.Done(): return ctx.Err() case <-deadline.C: - s.Logger().Warn("no network interfaces found after timeout") - return nil + return fmt.Errorf("no network interfaces discovered after %s timeout", maxWait) case <-ticker.C: } } diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index e2cb4b3baf..39bcfde8f4 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -494,11 +494,9 @@ func RevertBytes(num uint64) uint64 { return 1024*RevertBytes(a) + b } -const ( - // dockerLibnetworkSetkey is the hook argument that identifies Docker's - // network configuration hook. The argument following it is the sandbox ID. - dockerLibnetworkSetkey = "libnetwork-setkey" -) +// dockerLibnetworkSetkey is the hook argument that identifies Docker's +// network configuration hook. The argument following it is the sandbox ID. +const dockerLibnetworkSetkey = "libnetwork-setkey" // dockerNetnsPrefixes are the well-known filesystem paths where the Docker // daemon bind-mounts container network namespaces. diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index 1d1c85f113..564f47d414 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -579,6 +579,9 @@ func TestRevertBytes(t *testing.T) { assert.Equal(expectedNum, num) } +// TestIsDockerContainer validates hook-detection logic in isolation. +// End-to-end Docker→containerd→kata integration is covered by +// external tests (see tests/integration/kubernetes/). func TestIsDockerContainer(t *testing.T) { assert := assert.New(t) @@ -625,6 +628,9 @@ func TestIsDockerContainer(t *testing.T) { assert.False(IsDockerContainer(ociSpec3)) } +// TestDockerNetnsPath validates netns path discovery from OCI hook args. +// This does not test the actual namespace opening or endpoint scanning; +// see integration tests for full-path coverage. func TestDockerNetnsPath(t *testing.T) { assert := assert.New(t)