From 363fbed804c721058188f7a94a77b3cf2d1afe00 Mon Sep 17 00:00:00 2001
From: bin
Date: Tue, 29 Mar 2022 16:39:01 +0800
Subject: [PATCH 1/2] runtime: stop getting OOM events on ttrpc: closed error

getOOMEvents is a long-waiting call that retries when it fails. Once the
agent has shut down, the retry should stop.

Besides the "Dead agent" case, also check whether the error is
"ttrpc: closed", which covers the case where the agent has died but the
shim has not yet detected it.

Fixes: #3815

Signed-off-by: bin
---
 src/runtime/pkg/containerd-shim-v2/wait.go | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/runtime/pkg/containerd-shim-v2/wait.go b/src/runtime/pkg/containerd-shim-v2/wait.go
index 0eeac976e0..8b2ac70fbf 100644
--- a/src/runtime/pkg/containerd-shim-v2/wait.go
+++ b/src/runtime/pkg/containerd-shim-v2/wait.go
@@ -15,7 +15,6 @@ import (
 	"github.com/containerd/containerd/api/types/task"
 	"github.com/containerd/containerd/mount"
 	"github.com/sirupsen/logrus"
-	"google.golang.org/grpc/codes"

 	"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
 )
@@ -156,13 +155,11 @@ func watchOOMEvents(ctx context.Context, s *service) {
 		default:
 			containerID, err := s.sandbox.GetOOMEvent(ctx)
 			if err != nil {
-				shimLog.WithError(err).Warn("failed to get OOM event from sandbox")
-				// If the GetOOMEvent call is not implemented, then the agent is most likely an older version,
-				// stop attempting to get OOM events.
-				// for rust agent, the response code is not found
-				if isGRPCErrorCode(codes.NotFound, err) || err.Error() == "Dead agent" {
+				if err.Error() == "ttrpc: closed" || err.Error() == "Dead agent" {
+					shimLog.WithError(err).Warn("agent has shut down, stop watching OOM events")
 					return
 				}
+				shimLog.WithError(err).Warn("failed to get OOM event from sandbox")
 				time.Sleep(defaultCheckInterval)
 				continue
 			}
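
The logic that patch 1 changes boils down to a retry loop that keeps polling the
agent for OOM events and gives up once the connection to the agent is gone. The
following standalone Go sketch illustrates that pattern; getOOMEvent, errAgentGone
and the defaultCheckInterval value are illustrative stand-ins, not the shim's real
API (the real code calls s.sandbox.GetOOMEvent and matches the error strings shown
in the patch).

// oom_watch_sketch.go - illustrative sketch only, not part of the patch series.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// defaultCheckInterval is an assumed value; the shim defines its own.
const defaultCheckInterval = 15 * time.Second

// errAgentGone stands in for the "ttrpc: closed" error seen once the
// agent connection is closed.
var errAgentGone = errors.New("ttrpc: closed")

// getOOMEvent is a stand-in for the long-waiting agent call
// (s.sandbox.GetOOMEvent in the real shim).
func getOOMEvent(ctx context.Context) (string, error) {
	return "", errAgentGone
}

func watchOOMEvents(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		default:
			containerID, err := getOOMEvent(ctx)
			if err != nil {
				// Stop retrying once the agent is known to be gone.
				if err.Error() == "ttrpc: closed" || err.Error() == "Dead agent" {
					fmt.Println("agent has shut down, stop watching OOM events:", err)
					return
				}
				// Any other error: log, wait, and retry.
				fmt.Println("failed to get OOM event:", err)
				time.Sleep(defaultCheckInterval)
				continue
			}
			fmt.Println("OOM event for container:", containerID)
		}
	}
}

func main() {
	watchOOMEvents(context.Background())
}
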
From 9f75e226f165badb4f73e594062092f6106df509 Mon Sep 17 00:00:00 2001
From: bin
Date: Tue, 29 Mar 2022 16:59:12 +0800
Subject: [PATCH 2/2] runtime: add logs around sandbox monitor

For debugging purposes, add some logs around the sandbox monitor.

Fixes: #3815

Signed-off-by: bin
---
 src/runtime/pkg/containerd-shim-v2/wait.go |  2 ++
 src/runtime/virtcontainers/monitor.go      | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/runtime/pkg/containerd-shim-v2/wait.go b/src/runtime/pkg/containerd-shim-v2/wait.go
index 8b2ac70fbf..0f6c4fe066 100644
--- a/src/runtime/pkg/containerd-shim-v2/wait.go
+++ b/src/runtime/pkg/containerd-shim-v2/wait.go
@@ -67,6 +67,7 @@ func wait(ctx context.Context, s *service, c *container, execID string) (int32,
 	if c.cType.IsSandbox() {
 		// cancel watcher
 		if s.monitor != nil {
+			shimLog.WithField("sandbox", s.sandbox.ID()).Info("cancel watcher")
 			s.monitor <- nil
 		}
 		if err = s.sandbox.Stop(ctx, true); err != nil {
@@ -110,6 +111,7 @@ func watchSandbox(ctx context.Context, s *service) {
 		return
 	}
 	err := <-s.monitor
+	shimLog.WithError(err).WithField("sandbox", s.sandbox.ID()).Info("watchSandbox received an error or stop signal")
 	if err == nil {
 		return
 	}
diff --git a/src/runtime/virtcontainers/monitor.go b/src/runtime/virtcontainers/monitor.go
index 125636a909..ae7843beae 100644
--- a/src/runtime/virtcontainers/monitor.go
+++ b/src/runtime/virtcontainers/monitor.go
@@ -18,6 +18,8 @@ const (
 	watcherChannelSize = 128
 )

+var monitorLog = virtLog.WithField("subsystem", "virtcontainers/monitor")
+
 // nolint: govet
 type monitor struct {
 	watchers []chan error
@@ -33,6 +35,9 @@ type monitor struct {
 }

 func newMonitor(s *Sandbox) *monitor {
+	// There should be only one monitor per sandbox,
+	// so it's safe to use monitorLog as a global variable.
+	monitorLog = monitorLog.WithField("sandbox", s.ID())
 	return &monitor{
 		sandbox:       s,
 		checkInterval: defaultCheckInterval,
@@ -72,6 +77,7 @@ func (m *monitor) newWatcher(ctx context.Context) (chan error, error) {
 }

 func (m *monitor) notify(ctx context.Context, err error) {
+	monitorLog.WithError(err).Warn("notify on errors")
 	m.sandbox.agent.markDead(ctx)

 	m.Lock()
@@ -85,18 +91,19 @@
 	// but just in case...
 	defer func() {
 		if x := recover(); x != nil {
-			virtLog.Warnf("watcher closed channel: %v", x)
+			monitorLog.Warnf("watcher closed channel: %v", x)
 		}
 	}()

 	for _, c := range m.watchers {
+		monitorLog.WithError(err).Warn("write error to watcher")
 		// throw away message can not write to channel
 		// make it not stuck, the first error is useful.
 		select {
 		case c <- err:

 		default:
-			virtLog.WithField("channel-size", watcherChannelSize).Warnf("watcher channel is full, throw notify message")
+			monitorLog.WithField("channel-size", watcherChannelSize).Warnf("watcher channel is full, throw notify message")
 		}
 	}
 }
@@ -104,6 +111,7 @@ func (m *monitor) stop() {
 	// wait outside of monitor lock for the watcher channel to exit.
 	defer m.wg.Wait()
+	monitorLog.Info("stopping monitor")

 	m.Lock()
 	defer m.Unlock()
@@ -122,7 +130,7 @@
 	// but just in case...
 	defer func() {
 		if x := recover(); x != nil {
-			virtLog.Warnf("watcher closed channel: %v", x)
+			monitorLog.Warnf("watcher closed channel: %v", x)
 		}
 	}()
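
Patch 2's logging sits on top of the monitor's existing fan-out: notify() writes
the error to every watcher over a buffered channel with a non-blocking send, so a
full channel drops the message instead of stalling the monitor. Below is a minimal
standalone sketch of that pattern, assuming a simplified monitor type; it omits the
sandbox, markDead and stop() handling of the real code, and the error value in
main() is made up.

// monitor_notify_sketch.go - illustrative sketch only, not part of the patch series.
package main

import (
	"errors"
	"fmt"
	"sync"
)

const watcherChannelSize = 128

type monitor struct {
	sync.Mutex
	watchers []chan error
}

// newWatcher registers a buffered error channel for one consumer.
func (m *monitor) newWatcher() chan error {
	m.Lock()
	defer m.Unlock()
	w := make(chan error, watcherChannelSize)
	m.watchers = append(m.watchers, w)
	return w
}

// notify fans the error out to all watchers without ever blocking:
// if a watcher's channel is full, the message is dropped, since the
// first error a watcher sees is the useful one.
func (m *monitor) notify(err error) {
	m.Lock()
	defer m.Unlock()
	for _, c := range m.watchers {
		select {
		case c <- err:
		default:
			fmt.Println("watcher channel is full, dropping notify message")
		}
	}
}

func main() {
	m := &monitor{}
	w := m.newWatcher()
	m.notify(errors.New("hypervisor exited"))
	fmt.Println("watcher received:", <-w)
}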