Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-23 03:41:45 +00:00)
Merge pull request #42767 from Random-Liu/cleanup-infra-container-on-error
Automatic merge from submit-queue (batch tested with PRs 42768, 42760, 42771, 42767).

Stop sandbox container when hitting a network error. Fixes https://github.com/kubernetes/kubernetes/issues/42698.

This PR stops the sandbox container when pod network setup fails, and adds a unit test for that path. I'm not sure whether we should also try to tear down the pod network after a `SetUpPod` failure; we don't do that in dockertools (https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/dockertools/docker_manager.go#L2276).

@yujuhong @freehan
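For orientation before the diff: the change boils down to the pattern sketched below, assuming simplified stand-ins for the real dockershim client and network plugin (the interface names, the function name, and the 10-second grace period here are illustrative only; the exact code is in the first hunk that follows).

    package sketch

    import "log"

    // containerClient and podNetworkPlugin are simplified stand-ins for the
    // real dockershim dependencies; they are not the actual APIs.
    type containerClient interface {
        StopContainer(id string, gracePeriodSeconds int64) error
    }

    type podNetworkPlugin interface {
        SetUpPod(namespace, name, containerID string) error
    }

    // finishSandboxSetup shows only the tail of sandbox creation: after the
    // sandbox ("infra") container has been created and started, a failure in
    // network setup stops that container again, so the sandbox is later
    // reported as not ready instead of lingering with broken networking.
    func finishSandboxSetup(client containerClient, plugin podNetworkPlugin, namespace, name, containerID string) (string, error) {
        err := plugin.SetUpPod(namespace, name, containerID)
        if err != nil {
            // Best-effort cleanup: stop the sandbox container, log (but do not
            // return) a failure to stop, and surface the original network error.
            if stopErr := client.StopContainer(containerID, 10); stopErr != nil { // 10s grace period is arbitrary for this sketch
                log.Printf("Failed to stop sandbox container %q for pod %q: %v", containerID, name, stopErr)
            }
        }
        // The container ID is returned even on error, so callers can still clean up.
        return containerID, err
    }

The actual change keeps this shape but uses ds.client.StopContainer with defaultSandboxGracePeriod and logs the warning through glog.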
This commit is contained in: 4cf553f78e
@@ -105,7 +105,12 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (string, error) {
    // recognized by the CNI standard yet.
    cID := kubecontainer.BuildContainerID(runtimeName, createResp.ID)
    err = ds.network.SetUpPod(config.GetMetadata().Namespace, config.GetMetadata().Name, cID, config.Annotations)
    // TODO: Do we need to teardown on failure or can we rely on a StopPodSandbox call with the given ID?
    if err != nil {
        // TODO(random-liu): Do we need to teardown network here?
        if err := ds.client.StopContainer(createResp.ID, defaultSandboxGracePeriod); err != nil {
            glog.Warningf("Failed to stop sandbox container %q for pod %q: %v", createResp.ID, config.Metadata.Name, err)
        }
    }
    return createResp.ID, err
}
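The TODO in the hunk above asks whether cleanup could instead be left to a later StopPodSandbox call with the returned ID. Purely as an illustration of that alternative (hypothetical caller-side code, not anything in the kubelet; podSandboxConfig and sandboxRuntime are made-up stand-ins), a caller that receives the ID alongside the error could drive the cleanup itself:

    package callersketch

    // Hypothetical stand-ins; not the real CRI types or interfaces.
    type podSandboxConfig struct {
        Namespace, Name string
    }

    type sandboxRuntime interface {
        RunPodSandbox(config *podSandboxConfig) (string, error)
        StopPodSandbox(id string) error
    }

    // createSandbox relies on RunPodSandbox returning the container ID even
    // when it fails, so the partially created sandbox can be stopped via the
    // runtime's StopPodSandbox path.
    func createSandbox(rt sandboxRuntime, config *podSandboxConfig) (string, error) {
        id, err := rt.RunPodSandbox(config)
        if err != nil && id != "" {
            _ = rt.StopPodSandbox(id) // best effort; keep the original error
        }
        return id, err
    }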
@@ -17,7 +17,9 @@ limitations under the License.
package dockershim

import (
    "errors"
    "fmt"
    "net"
    "os"
    "testing"
    "time"
@@ -203,3 +205,47 @@ func TestHostNetworkPluginInvocation(t *testing.T) {
    assert.NoError(t, err)
    assert.NoError(t, ds.StopPodSandbox(cID.ID))
}

// TestSetUpPodFailure checks that the sandbox should be not ready when it
// hits a SetUpPod failure.
func TestSetUpPodFailure(t *testing.T) {
    ds, _, _ := newTestDockerService()
    mockPlugin := newTestNetworkPlugin(t)
    ds.network = network.NewPluginManager(mockPlugin)
    defer mockPlugin.Finish()

    name := "foo0"
    ns := "bar0"
    c := makeSandboxConfigWithLabelsAndAnnotations(
        name, ns, "0", 0,
        map[string]string{"label": name},
        map[string]string{"annotation": ns},
    )
    cID := kubecontainer.ContainerID{Type: runtimeName, ID: dockertools.GetFakeContainerID(fmt.Sprintf("/%v", makeSandboxName(c)))}
    mockPlugin.EXPECT().Name().Return("mockNetworkPlugin").AnyTimes()
    mockPlugin.EXPECT().SetUpPod(ns, name, cID).Return(errors.New("setup pod error")).AnyTimes()
    // Even if the network plugin returns no error from GetPodNetworkStatus, dockershim should still report the sandbox as not ready.
    mockPlugin.EXPECT().GetPodNetworkStatus(ns, name, cID).Return(&network.PodNetworkStatus{IP: net.IP("127.0.0.01")}, nil).AnyTimes()

    t.Logf("RunPodSandbox should return error")
    _, err := ds.RunPodSandbox(c)
    assert.Error(t, err)

    t.Logf("PodSandboxStatus should be not ready")
    status, err := ds.PodSandboxStatus(cID.ID)
    assert.NoError(t, err)
    assert.Equal(t, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, status.State)

    t.Logf("ListPodSandbox should also show not ready")
    sandboxes, err := ds.ListPodSandbox(nil)
    assert.NoError(t, err)
    var sandbox *runtimeapi.PodSandbox
    for _, s := range sandboxes {
        if s.Id == cID.ID {
            sandbox = s
            break
        }
    }
    assert.NotNil(t, sandbox)
    assert.Equal(t, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, sandbox.State)
}