From cb0e6094ff93a2720ced71bf569c6b170b81e16c Mon Sep 17 00:00:00 2001 From: bin liu Date: Tue, 10 Nov 2020 11:58:12 +0800 Subject: [PATCH] runtime: sleep 1 second after GetOOMEvent failed In some cases, for example when the agent has crashed but has not been marked dead yet, GetOOMEvent will return errors like `connection reset by peer` or `ttrpc: closed`. Sleep for 1 second (the agent check interval) before retrying, and let the agent health check do the check. Fixes: #991 Signed-off-by: bin liu --- src/runtime/containerd-shim-v2/wait.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/runtime/containerd-shim-v2/wait.go b/src/runtime/containerd-shim-v2/wait.go index a6f524e53b..07a46d0246 100644 --- a/src/runtime/containerd-shim-v2/wait.go +++ b/src/runtime/containerd-shim-v2/wait.go @@ -20,6 +20,8 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/oci" ) +const defaultCheckInterval = 1 * time.Second + func wait(s *service, c *container, execID string) (int32, error) { var execs *exec var err error @@ -152,6 +154,7 @@ func watchOOMEvents(ctx context.Context, s *service) { if isGRPCErrorCode(codes.NotFound, err) || err.Error() == "Dead agent" { return } + time.Sleep(defaultCheckInterval) continue }