Merge pull request #62874 from dcbw/dockershim-SetUpPod-cleanup-on-failure

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

dockershim/sandbox: clean up pod network even if SetUpPod() failed

If the CNI network plugin completes successfully, but something fails
between that success and dockershim's sandbox setup code, plugin resources
may not be cleaned up. A non-trivial amount of code runs after the
plugin itself exits and the CNI driver's SetUpPod() returns, and any error
condition recognized by that code would cause this leakage.

The Kubernetes CRI RunPodSandbox() caller does not attempt to clean
up on errors, since it cannot know how much (if any) networking
was actually set up. It depends on the CRI implementation to do
that cleanup for it.
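
For illustration, a minimal sketch of the caller's position (hypothetical names, not the actual kubelet code): the CRI response carries no record of partial network setup, so the caller has nothing to act on.

```go
package main

import "fmt"

// RuntimeService is a stand-in for the CRI runtime interface; only the
// call shape matters here (hypothetical sketch, not the kubelet code).
type RuntimeService interface {
	RunPodSandbox(podName string) (sandboxID string, err error)
}

// createSandbox shows the caller's position: on failure it can only
// report the error, because the CRI response says nothing about how
// much networking (if any) was configured before things went wrong.
func createSandbox(rs RuntimeService, podName string) (string, error) {
	id, err := rs.RunPodSandbox(podName)
	if err != nil {
		// No teardown is possible here; cleanup must already have
		// happened inside the CRI implementation.
		return "", fmt.Errorf("CreatePodSandbox for pod %q failed: %v", podName, err)
	}
	return id, nil
}

func main() {}
```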

In the dockershim case, a SetUpPod() failure means networkReady is
FALSE for the sandbox, and TearDownPod() will not be called later by
garbage collection even though networking may actually have been
configured, because dockershim can't know how far SetUpPod() got.
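
A minimal, self-contained sketch of that gate (illustrative names; the real dockershim tracks readiness in a mutex-protected map on dockerService):

```go
package main

import "fmt"

// networkReady mirrors dockershim's per-sandbox flag (illustrative only).
var networkReady = map[string]bool{}

// tearDownPod stands in for the CNI driver's TearDownPod.
func tearDownPod(id string) error {
	fmt.Printf("tearing down network for sandbox %s\n", id)
	return nil
}

// stopPodSandbox sketches the gate described above: when SetUpPod()
// failed, networkReady is false and teardown is skipped entirely,
// leaking whatever the plugin had already allocated.
func stopPodSandbox(id string) error {
	if !networkReady[id] {
		return nil // nothing torn down, even if the plugin partially succeeded
	}
	return tearDownPod(id)
}

func main() {
	networkReady["sandbox-a"] = false // SetUpPod() reported an error
	_ = stopPodSandbox("sandbox-a")   // TearDownPod never runs
}
```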

Concrete examples include the sandbox's container somehow being
removed during that window, some other OS error being encountered, or
the plugin returning a malformed result to the CNI driver.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1532965

```release-note
NONE
```

pkg/kubelet/dockershim/docker_sandbox.go:

```diff
@@ -163,12 +163,24 @@ func (ds *dockerService) RunPodSandbox(ctx context.Context, r *runtimeapi.RunPod
 	cID := kubecontainer.BuildContainerID(runtimeName, createResp.ID)
 	err = ds.network.SetUpPod(config.GetMetadata().Namespace, config.GetMetadata().Name, cID, config.Annotations)
 	if err != nil {
-		// TODO(random-liu): Do we need to teardown network here?
-		if err := ds.client.StopContainer(createResp.ID, defaultSandboxGracePeriod); err != nil {
-			glog.Warningf("Failed to stop sandbox container %q for pod %q: %v", createResp.ID, config.Metadata.Name, err)
+		errList := []error{fmt.Errorf("failed to set up sandbox container %q network for pod %q: %v", createResp.ID, config.Metadata.Name, err)}
+
+		// Ensure network resources are cleaned up even if the plugin
+		// succeeded but an error happened between that success and here.
+		err = ds.network.TearDownPod(config.GetMetadata().Namespace, config.GetMetadata().Name, cID)
+		if err != nil {
+			errList = append(errList, fmt.Errorf("failed to clean up sandbox container %q network for pod %q: %v", createResp.ID, config.Metadata.Name, err))
+		}
+
+		err = ds.client.StopContainer(createResp.ID, defaultSandboxGracePeriod)
+		if err != nil {
+			errList = append(errList, fmt.Errorf("failed to stop sandbox container %q for pod %q: %v", createResp.ID, config.Metadata.Name, err))
 		}
+
+		return resp, utilerrors.NewAggregate(errList)
 	}
-	return resp, err
+
+	return resp, nil
 }
 
 // StopPodSandbox stops the sandbox. If there are any running containers in the
```
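
The error path above aggregates every failure instead of surfacing only the first. utilerrors is k8s.io/apimachinery/pkg/util/errors; a small standalone example of how NewAggregate behaves:

```go
package main

import (
	"fmt"

	utilerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	errList := []error{
		fmt.Errorf("failed to set up sandbox container network"),
		fmt.Errorf("failed to stop sandbox container"),
	}
	// NewAggregate flattens the slice into one error; it returns nil
	// for an empty slice, so a clean run needs no special casing.
	fmt.Println(utilerrors.NewAggregate(errList))
	// [failed to set up sandbox container network, failed to stop sandbox container]
	fmt.Println(utilerrors.NewAggregate(nil) == nil) // true
}
```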

pkg/kubelet/dockershim/docker_sandbox_test.go:

```diff
@@ -277,6 +277,8 @@ func TestSetUpPodFailure(t *testing.T) {
 	cID := kubecontainer.ContainerID{Type: runtimeName, ID: libdocker.GetFakeContainerID(fmt.Sprintf("/%v", makeSandboxName(c)))}
 	mockPlugin.EXPECT().Name().Return("mockNetworkPlugin").AnyTimes()
 	mockPlugin.EXPECT().SetUpPod(ns, name, cID).Return(errors.New("setup pod error")).AnyTimes()
+	// If SetUpPod() fails, we expect TearDownPod() to immediately follow
+	mockPlugin.EXPECT().TearDownPod(ns, name, cID)
 	// Assume network plugin doesn't return error, dockershim should still be able to return not ready correctly.
 	mockPlugin.EXPECT().GetPodNetworkStatus(ns, name, cID).Return(&network.PodNetworkStatus{IP: net.IP("127.0.0.01")}, nil).AnyTimes()
```
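
Because the new TearDownPod() expectation carries no AnyTimes() modifier, gomock requires exactly one matching call, so the test fails if the error path skips teardown. The same assertion sketched against a hand-rolled fake, to keep the example self-contained (illustrative names, not the dockershim test helpers):

```go
package main

import (
	"errors"
	"fmt"
)

// fakePlugin counts TearDownPod calls so the check below can assert
// that a SetUpPod failure is followed by exactly one teardown. It is
// an illustrative stand-in for the gomock expectations in the real test.
type fakePlugin struct {
	tearDownCalls int
}

func (p *fakePlugin) SetUpPod(ns, name string) error {
	return errors.New("setup pod error")
}

func (p *fakePlugin) TearDownPod(ns, name string) error {
	p.tearDownCalls++
	return nil
}

// runPodSandbox mimics the fixed control flow: tear down on setup failure.
func runPodSandbox(p *fakePlugin, ns, name string) error {
	if err := p.SetUpPod(ns, name); err != nil {
		_ = p.TearDownPod(ns, name)
		return err
	}
	return nil
}

func main() {
	p := &fakePlugin{}
	_ = runPodSandbox(p, "default", "nginx")
	fmt.Println("TearDownPod calls:", p.tearDownCalls) // want exactly 1
}
```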