runtime: sleep 1 second after GetOOMEvent fails

In some cases, for example when the agent has crashed but has not yet been marked
dead, GetOOMEvent returns errors such as `connection reset by peer` or `ttrpc: closed`.
Sleep for 1 second (the agent check interval) before retrying, and let the agent health check do its check.

Fixes: #991

Signed-off-by: bin liu <bin@hyper.sh>
This commit is contained in:
bin liu 2020-11-10 11:58:12 +08:00
parent d22c7cf00b
commit cb0e6094ff

View File

@ -20,6 +20,8 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/oci"
)
// defaultCheckInterval is the pause applied before retrying after a
// non-fatal error in watchOOMEvents.
const defaultCheckInterval = time.Second
func wait(s *service, c *container, execID string) (int32, error) {
var execs *exec
var err error
@ -152,6 +154,7 @@ func watchOOMEvents(ctx context.Context, s *service) {
if isGRPCErrorCode(codes.NotFound, err) || err.Error() == "Dead agent" {
return
}
time.Sleep(defaultCheckInterval)
continue
}