retry StopPodSandbox on Network teardown failure

This commit is contained in:
Minhan Xia 2017-02-14 11:44:05 -08:00
parent 012acad32e
commit 3cc837878f

View File

@ -115,8 +115,10 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (str
// after us? // after us?
func (ds *dockerService) StopPodSandbox(podSandboxID string) error { func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
var namespace, name string var namespace, name string
var checkpointErr, statusErr error
needNetworkTearDown := false needNetworkTearDown := false
// Try to retrieve sandbox information from docker daemon or sandbox checkpoint
status, statusErr := ds.PodSandboxStatus(podSandboxID) status, statusErr := ds.PodSandboxStatus(podSandboxID)
if statusErr == nil { if statusErr == nil {
nsOpts := status.GetLinux().GetNamespaces().GetOptions() nsOpts := status.GetLinux().GetNamespaces().GetOptions()
@ -125,36 +127,53 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
namespace = m.Namespace namespace = m.Namespace
name = m.Name name = m.Name
} else { } else {
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID) var checkpoint *PodSandboxCheckpoint
if err != nil { checkpoint, checkpointErr = ds.checkpointHandler.GetCheckpoint(podSandboxID)
glog.Errorf("Failed to get checkpoint for sandbox %q: %v", podSandboxID, err)
return fmt.Errorf("failed to get sandbox status: %v", statusErr) // Proceed if both sandbox container and checkpoint could not be found. This means that following
// actions will only have sandbox ID and not have pod namespace and name information.
// Return an error if any unexpected error is encountered.
if checkpointErr != nil {
if dockertools.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
"Proceed without further sandbox information.", podSandboxID)
} else {
return utilerrors.NewAggregate([]error{
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
} }
} else {
namespace = checkpoint.Namespace namespace = checkpoint.Namespace
name = checkpoint.Name name = checkpoint.Name
}
// Always trigger network plugin to tear down // Always trigger network plugin to tear down
needNetworkTearDown = true needNetworkTearDown = true
} }
// WARNING: The following operations make the following assumptions:
// 1. kubelet will retry on any error returned by StopPodSandbox.
// 2. tearing down network and stopping sandbox container can succeed in any sequence.
// This depends on the implementation detail of network plugin and proper error handling.
// For kubenet, if tearing down network failed and sandbox container is stopped, kubelet
// will retry. On retry, kubenet will not be able to retrieve network namespace of the sandbox
// since it is stopped. With an empty network namespace, the CNI bridge plugin will conduct best
// effort clean up and will not return error.
errList := []error{}
if needNetworkTearDown { if needNetworkTearDown {
cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID) cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID)
if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil { if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil {
// TODO: Figure out a way to retry this error. We can't errList = append(errList, fmt.Errorf("failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err))
// right now because the plugin throws errors when it doesn't find
// eth0, which might not exist for various reasons (setup failed,
// conf changed etc). In theory, it should teardown everything else
// so there's no need to retry.
glog.Errorf("Failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err)
} }
} }
if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil { if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err) glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err)
// Do not return error if the container does not exist // Do not return error if the container does not exist
if !dockertools.IsContainerNotFoundError(err) { if !dockertools.IsContainerNotFoundError(err) {
return err errList = append(errList, err)
} }
} }
return nil return utilerrors.NewAggregate(errList)
// TODO: Stop all running containers in the sandbox. // TODO: Stop all running containers in the sandbox.
} }