Merge pull request #42768 from yujuhong/fix_sandbox_listing

Automatic merge from submit-queue

dockershim: Fix the race condition in ListPodSandbox

In ListPodSandbox(), we
 1. List all sandbox docker containers
 2. List all sandbox checkpoints. If the checkpoint does not have a
    corresponding container in (1), we return partial result based on
    the checkpoint.

The problem is that new PodSandboxes can be created between step (1) and
(2). In those cases, we will see the checkpoints, but not the sandbox
containers. This leads to strange behavior because the partial result
from the checkpoint does not include some critical information. For
example, the creation timestamp'd be zero, and that would cause kubelet's
garbage collector to immediately remove the sandbox.

This change fixes that by getting the list of checkpoints before listing
all the containers (since in RunPodSandbox we create them in the reverse
order).
This commit is contained in:
Kubernetes Submit Queue 2017-03-08 21:33:31 -08:00 committed by GitHub
commit 6fac75c80a

View File

@ -341,6 +341,18 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
}
}
}
// Make sure we get the list of checkpoints first so that we don't include
// new PodSandboxes that are being created right now.
var err error
checkpoints := []string{}
if filter == nil {
checkpoints, err = ds.checkpointHandler.ListCheckpoints()
if err != nil {
glog.Errorf("Failed to list checkpoints: %v", err)
}
}
containers, err := ds.client.ListContainers(opts)
if err != nil {
return nil, err
@ -367,27 +379,21 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
// Include sandbox that could only be found with its checkpoint if no filter is applied
// These PodSandbox will only include PodSandboxID, Name, Namespace.
// These PodSandbox will be in PodSandboxState_SANDBOX_NOTREADY state.
if filter == nil {
checkpoints, err := ds.checkpointHandler.ListCheckpoints()
for _, id := range checkpoints {
if _, ok := sandboxIDs[id]; ok {
continue
}
checkpoint, err := ds.checkpointHandler.GetCheckpoint(id)
if err != nil {
glog.Errorf("Failed to list checkpoints: %v", err)
}
for _, id := range checkpoints {
if _, ok := sandboxIDs[id]; ok {
continue
}
checkpoint, err := ds.checkpointHandler.GetCheckpoint(id)
if err != nil {
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)
if err == errors.CorruptCheckpointError {
glog.V(2).Info("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
ds.checkpointHandler.RemoveCheckpoint(id)
}
continue
if err == errors.CorruptCheckpointError {
glog.V(2).Info("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
ds.checkpointHandler.RemoveCheckpoint(id)
}
result = append(result, checkpointToRuntimeAPISandbox(id, checkpoint))
continue
}
result = append(result, checkpointToRuntimeAPISandbox(id, checkpoint))
}
// Include legacy sandboxes if there are still legacy sandboxes not cleaned up yet.