Merge pull request #42767 from Random-Liu/cleanup-infra-container-on-error

Automatic merge from submit-queue (batch tested with PRs 42768, 42760, 42771, 42767)

Stop the sandbox container when hitting a network error.

Fixes https://github.com/kubernetes/kubernetes/issues/42698.

This PR stops the sandbox container when hitting a network error.
This PR also adds a unit test for it.

I'm not sure whether we should try teardown pod network after `SetUpPod` failure. We don't do that in dockertools https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/dockertools/docker_manager.go#L2276.

@yujuhong @freehan
This commit is contained in:
Kubernetes Submit Queue 2017-03-09 00:08:01 -08:00 committed by GitHub
commit 4cf553f78e
2 changed files with 52 additions and 1 deletions

View File

@ -105,7 +105,12 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (str
// recognized by the CNI standard yet.
cID := kubecontainer.BuildContainerID(runtimeName, createResp.ID)
err = ds.network.SetUpPod(config.GetMetadata().Namespace, config.GetMetadata().Name, cID, config.Annotations)
// TODO: Do we need to teardown on failure or can we rely on a StopPodSandbox call with the given ID? if err != nil {
// TODO(random-liu): Do we need to teardown network here?
if err := ds.client.StopContainer(createResp.ID, defaultSandboxGracePeriod); err != nil {
glog.Warningf("Failed to stop sandbox container %q for pod %q: %v", createResp.ID, config.Metadata.Name, err)
}
}
return createResp.ID, err
}

View File

@ -17,7 +17,9 @@ limitations under the License.
package dockershim
import (
"errors"
"fmt"
"net"
"os"
"testing"
"time"
@ -203,3 +205,47 @@ func TestHostNetworkPluginInvocation(t *testing.T) {
assert.NoError(t, err)
assert.NoError(t, ds.StopPodSandbox(cID.ID))
}
// TestSetUpPodFailure checks that the sandbox is reported as not ready when
// the network plugin's SetUpPod fails: RunPodSandbox must return an error,
// and both PodSandboxStatus and ListPodSandbox must show SANDBOX_NOTREADY.
func TestSetUpPodFailure(t *testing.T) {
	ds, _, _ := newTestDockerService()
	mockPlugin := newTestNetworkPlugin(t)
	ds.network = network.NewPluginManager(mockPlugin)
	defer mockPlugin.Finish()

	name := "foo0"
	ns := "bar0"
	c := makeSandboxConfigWithLabelsAndAnnotations(
		name, ns, "0", 0,
		map[string]string{"label": name},
		map[string]string{"annotation": ns},
	)
	cID := kubecontainer.ContainerID{Type: runtimeName, ID: dockertools.GetFakeContainerID(fmt.Sprintf("/%v", makeSandboxName(c)))}
	mockPlugin.EXPECT().Name().Return("mockNetworkPlugin").AnyTimes()
	mockPlugin.EXPECT().SetUpPod(ns, name, cID).Return(errors.New("setup pod error")).AnyTimes()
	// Assume network plugin doesn't return error, dockershim should still be able to return not ready correctly.
	// Bug fix: the original used net.IP("127.0.0.01"), which converts the
	// ASCII string bytes into an 11-byte, invalid net.IP (net.IP is a []byte
	// of length 4 or 16) — and the literal itself was a typo. Use
	// net.ParseIP to build a well-formed loopback address.
	mockPlugin.EXPECT().GetPodNetworkStatus(ns, name, cID).Return(&network.PodNetworkStatus{IP: net.ParseIP("127.0.0.1")}, nil).AnyTimes()

	t.Logf("RunPodSandbox should return error")
	_, err := ds.RunPodSandbox(c)
	assert.Error(t, err)

	t.Logf("PodSandboxStatus should be not ready")
	status, err := ds.PodSandboxStatus(cID.ID)
	assert.NoError(t, err)
	assert.Equal(t, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, status.State)

	t.Logf("ListPodSandbox should also show not ready")
	sandboxes, err := ds.ListPodSandbox(nil)
	assert.NoError(t, err)
	var sandbox *runtimeapi.PodSandbox
	for _, s := range sandboxes {
		if s.Id == cID.ID {
			sandbox = s
			break
		}
	}
	assert.NotNil(t, sandbox)
	assert.Equal(t, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, sandbox.State)
}