From 035afab901a3eddb1ce35eef9cf0c87e4056978d Mon Sep 17 00:00:00 2001 From: Yu-Ju Hong Date: Mon, 13 Mar 2017 15:33:59 -0700 Subject: [PATCH] dockershim: remove corrupted sandbox checkpoints This is a workaround to ensure that kubelet doesn't block forever when the checkpoint is corrupted. --- pkg/kubelet/dockershim/docker_sandbox.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/dockershim/docker_sandbox.go b/pkg/kubelet/dockershim/docker_sandbox.go index 79dca5e67ec..e34b7ce3081 100644 --- a/pkg/kubelet/dockershim/docker_sandbox.go +++ b/pkg/kubelet/dockershim/docker_sandbox.go @@ -144,6 +144,16 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error { glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+ "Proceed without further sandbox information.", podSandboxID) } else { + if checkpointErr == errors.CorruptCheckpointError { + // Remove the corrupted checkpoint so that the next + // StopPodSandbox call can proceed. This may indicate that + // some resources won't be reclaimed. + // TODO (#43021): Fix this properly. + glog.Warningf("Removing corrupted checkpoint %q: %+v", podSandboxID, *checkpoint) + if err := ds.checkpointHandler.RemoveCheckpoint(podSandboxID); err != nil { + glog.Warningf("Unable to remove corrupted checkpoint %q: %v", podSandboxID, err) + } + } return utilerrors.NewAggregate([]error{ fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr), fmt.Errorf("failed to get sandbox status: %v", statusErr)}) @@ -393,8 +403,10 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([] glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err) if err == errors.CorruptCheckpointError { - glog.V(2).Info("Removing corrupted checkpoint %q: %+v", id, *checkpoint) - ds.checkpointHandler.RemoveCheckpoint(id) + glog.Warningf("Removing corrupted checkpoint %q: %+v", id, *checkpoint) + if err := ds.checkpointHandler.RemoveCheckpoint(id); err != nil { + glog.Warningf("Unable to remove corrupted checkpoint %q: %v", id, err) + } } continue }