retry StopPodSandbox on Network teardown failure

Minhan Xia 2017-02-14 11:44:05 -08:00
parent 012acad32e
commit 3cc837878f
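The change below stops swallowing cleanup failures: network teardown and container stop are both attempted, their errors are collected, and the aggregate is returned so kubelet retries StopPodSandbox. A minimal sketch of that pattern in plain Go follows; tearDownNetwork, stopContainer, and aggregate are hypothetical stand-ins for ds.networkPlugin.TearDownPod, ds.client.StopContainer, and utilerrors.NewAggregate, not the dockershim code itself.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// Hypothetical stand-ins for ds.networkPlugin.TearDownPod and
// ds.client.StopContainer; the real calls live in dockershim.
func tearDownNetwork(sandboxID string) error { return errors.New("netns teardown failed") }
func stopContainer(sandboxID string) error   { return nil }

// aggregate plays the role of utilerrors.NewAggregate: nil for an empty
// list, otherwise one error that carries every collected failure.
func aggregate(errs []error) error {
	if len(errs) == 0 {
		return nil
	}
	msgs := make([]string, 0, len(errs))
	for _, e := range errs {
		msgs = append(msgs, e.Error())
	}
	return fmt.Errorf("[%s]", strings.Join(msgs, ", "))
}

// stopPodSandbox attempts both cleanup steps, records every failure, and
// returns the aggregate instead of logging and returning nil, so the
// caller (kubelet) sees an error and retries the whole operation.
func stopPodSandbox(sandboxID string) error {
	errList := []error{}
	if err := tearDownNetwork(sandboxID); err != nil {
		errList = append(errList, fmt.Errorf("failed to teardown sandbox %q: %v", sandboxID, err))
	}
	if err := stopContainer(sandboxID); err != nil {
		errList = append(errList, err)
	}
	return aggregate(errList)
}

func main() {
	fmt.Println(stopPodSandbox("sandbox-123"))
	// Output: [failed to teardown sandbox "sandbox-123": netns teardown failed]
}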


@@ -115,8 +115,10 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (str
// after us?
func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
var namespace, name string
+ var checkpointErr, statusErr error
+ needNetworkTearDown := false
// Try to retrieve sandbox information from docker daemon or sandbox checkpoint
status, statusErr := ds.PodSandboxStatus(podSandboxID)
if statusErr == nil {
nsOpts := status.GetLinux().GetNamespaces().GetOptions()
@@ -125,36 +127,53 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
namespace = m.Namespace
name = m.Name
} else {
- checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
- if err != nil {
- glog.Errorf("Failed to get checkpoint for sandbox %q: %v", podSandboxID, err)
- return fmt.Errorf("failed to get sandbox status: %v", statusErr)
+ var checkpoint *PodSandboxCheckpoint
+ checkpoint, checkpointErr = ds.checkpointHandler.GetCheckpoint(podSandboxID)
+ // Proceed if both the sandbox container and the checkpoint could not be found. This means that the following
+ // actions will only have the sandbox ID and will not have pod namespace and name information.
+ // Return an error if any unexpected error is encountered.
+ if checkpointErr != nil {
+ if dockertools.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
+ glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
+ "Proceed without further sandbox information.", podSandboxID)
+ } else {
+ return utilerrors.NewAggregate([]error{
+ fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
+ fmt.Errorf("failed to get sandbox status: %v", statusErr)})
+ }
+ } else {
+ namespace = checkpoint.Namespace
+ name = checkpoint.Name
+ }
- namespace = checkpoint.Namespace
- name = checkpoint.Name
+ // Always trigger network plugin to tear down
+ needNetworkTearDown = true
}
+ // WARNING: The following operations make the following assumptions:
+ // 1. kubelet will retry on any error returned by StopPodSandbox.
+ // 2. tearing down the network and stopping the sandbox container can succeed in any sequence.
+ // This depends on the implementation details of the network plugin and proper error handling.
+ // For kubenet, if tearing down the network fails and the sandbox container is stopped, kubelet
+ // will retry. On retry, kubenet will not be able to retrieve the network namespace of the sandbox
+ // since it is stopped. With an empty network namespace, the CNI bridge plugin will conduct a
+ // best-effort cleanup and will not return an error.
+ errList := []error{}
+ if needNetworkTearDown {
cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID)
if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil {
- // TODO: Figure out a way to retry this error. We can't
- // right now because the plugin throws errors when it doesn't find
- // eth0, which might not exist for various reasons (setup failed,
- // conf changed etc). In theory, it should teardown everything else
- // so there's no need to retry.
- glog.Errorf("Failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err)
+ errList = append(errList, fmt.Errorf("failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err))
}
}
if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err)
// Do not return error if the container does not exist
if !dockertools.IsContainerNotFoundError(err) {
- return err
+ errList = append(errList, err)
}
}
- return nil
+ return utilerrors.NewAggregate(errList)
// TODO: Stop all running containers in the sandbox.
}
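For context, the WARNING comment above assumes kubelet will keep calling StopPodSandbox while it returns an error. A rough, hypothetical sketch of that calling pattern follows; podSandboxStopper, flakyRuntime, and stopWithRetry are illustrative names, and the real retry is driven by kubelet's pod workers across sync-loop iterations rather than a tight loop like this.

package main

import (
	"errors"
	"fmt"
)

// podSandboxStopper is a hypothetical narrow interface covering the one
// call used here; dockerService.StopPodSandbox has this shape.
type podSandboxStopper interface {
	StopPodSandbox(podSandboxID string) error
}

// flakyRuntime fails network teardown on the first attempt and succeeds on
// the second, imitating the kubenet behavior described in the WARNING
// comment (best-effort cleanup once the network namespace is gone).
type flakyRuntime struct{ calls int }

func (r *flakyRuntime) StopPodSandbox(id string) error {
	r.calls++
	if r.calls == 1 {
		return errors.New("failed to teardown sandbox: network namespace busy")
	}
	return nil
}

// stopWithRetry illustrates assumption 1 above: any non-nil error from
// StopPodSandbox simply causes another attempt.
func stopWithRetry(rt podSandboxStopper, id string, attempts int) error {
	var lastErr error
	for i := 0; i < attempts; i++ {
		if lastErr = rt.StopPodSandbox(id); lastErr == nil {
			return nil
		}
	}
	return lastErr
}

func main() {
	rt := &flakyRuntime{}
	fmt.Println(stopWithRetry(rt, "sandbox-123", 3)) // prints <nil> after one retry
}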