mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-04 11:03:52 +00:00
runtime: use symptom-based rescan instead of runtime detection
Modern container runtimes (Docker 29+) no longer advertise their identity through OCI hooks or annotations. Rather than attempting fragile runtime detection, check for the symptom: no network endpoints after sandbox creation.

- Remove IsDockerContainer guard from RescanNetwork goroutine
- Remove container kill on timeout (too aggressive without reliable runtime detection, breaks CNI on slow architectures)
- Restore original startVM endpoint scan condition (fixes CNI regression on s390x)
- RescanNetwork returns nil on timeout with warning instead of error

Signed-off-by: llink5 <llink5@users.noreply.github.com>
This commit is contained in:
@@ -8,13 +8,11 @@ package containerdshim
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/api/types/task"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
||||
vcutils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
||||
)
|
||||
|
||||
func startContainer(ctx context.Context, s *service, c *container) (retErr error) {
|
||||
@@ -48,25 +46,18 @@ func startContainer(ctx context.Context, s *service, c *container) (retErr error
|
||||
}
|
||||
go watchSandbox(ctx, s)
|
||||
|
||||
// Docker 26+ configures networking after the Start response.
|
||||
// Run the network rescan asynchronously so we don't block
|
||||
// the Start RPC — Docker won't call allocateNetwork until
|
||||
// it receives the StartResponse.
|
||||
if c.spec != nil && vcutils.IsDockerContainer(c.spec) {
|
||||
go func() {
|
||||
if err := s.sandbox.RescanNetwork(s.ctx); err != nil {
|
||||
shimLog.WithError(err).WithFields(logrus.Fields{
|
||||
"sandbox": s.sandbox.ID(),
|
||||
"container": c.id,
|
||||
}).Error("Docker 26+ network setup failed: no interfaces discovered after timeout. " +
|
||||
"Container killed to prevent silent networking failure. " +
|
||||
"Check Docker daemon logs and network configuration.")
|
||||
if sigErr := s.sandbox.SignalProcess(s.ctx, c.id, c.id, syscall.SIGKILL, true); sigErr != nil {
|
||||
shimLog.WithError(sigErr).Error("failed to kill container after network setup failure")
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
// If no network endpoints were discovered during sandbox creation,
|
||||
// schedule an async rescan. This handles runtimes that configure
|
||||
// networking after task creation (e.g. Docker 26+ configures
|
||||
// networking after the Start response, and prestart hooks may
|
||||
// not have run yet on slower architectures).
|
||||
// RescanNetwork is idempotent — it returns immediately if
|
||||
// endpoints already exist.
|
||||
go func() {
|
||||
if err := s.sandbox.RescanNetwork(s.ctx); err != nil {
|
||||
shimLog.WithError(err).Error("async network rescan failed — container may lack networking")
|
||||
}
|
||||
}()
|
||||
|
||||
// We use s.ctx(`ctx` derived from `s.ctx`) to check for cancellation of the
|
||||
// shim context and the context passed to startContainer for tracing.
|
||||
|
||||
@@ -371,7 +371,8 @@ func (s *Sandbox) RescanNetwork(ctx context.Context) error {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-deadline.C:
|
||||
return fmt.Errorf("no network interfaces discovered after %s timeout", maxWait)
|
||||
s.Logger().Warn("no network interfaces found after timeout — networking may be configured by prestart hooks")
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
@@ -1571,12 +1572,9 @@ func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Con
|
||||
// 3. In case of vm factory, scan the netns to hotplug interfaces after vm is started.
|
||||
// 4. In case of prestartHookFunc, network config might have been changed. We need to
|
||||
// rescan and handle the change.
|
||||
// 5. If no endpoints were found pre-VM-start (e.g. Docker 26+ placed the
|
||||
// hypervisor in its own pre-configured netns), rescan now that the
|
||||
// hypervisor is running so addAllEndpoints can discover its namespace.
|
||||
if !s.config.NetworkConfig.DisableNewNetwork &&
|
||||
caps.IsNetworkDeviceHotplugSupported() &&
|
||||
(s.factory != nil || prestartHookFunc != nil || len(s.network.Endpoints()) == 0) {
|
||||
(s.factory != nil || prestartHookFunc != nil) {
|
||||
if _, err := s.network.AddEndpoints(ctx, s, nil, true); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user