dockertools: tear down dead infra containers in SyncPod()

Dead infra containers may still have network resources allocated to
them and may not be garbage collected for a long time.  Tearing the old
infra container down before SyncPod() starts its replacement prevents
network plugins from carrying stale network details (e.g. IPAM
allocations) over to the new infra container.
Author: Dan Williams
Date:   2017-01-16 13:26:38 -06:00
Commit: 20e1cdb97c (parent 4d7d7faa81)

2 changed files with 111 additions and 32 deletions
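
To make the intent concrete before the diffs, the following condensed, self-contained Go sketch mirrors the behavior this commit adds to SyncPod(): exited network (infra) containers are folded into the set of containers handed to the kill path, so their network resources are torn down before a replacement infra container is started. The types and helpers here (container, containersToKill, the isNetworked callback) are simplified stand-ins for illustration only, not the kubelet's real API; the authoritative code is in the hunks below.

```go
package main

import "fmt"

// Simplified stand-ins for the kubelet's container types (illustration only).
type containerState string

const (
    stateRunning containerState = "running"
    stateExited  containerState = "exited"
)

type container struct {
    Name  string
    State containerState
}

// containersToKill mirrors the new SyncPod() behavior in spirit: running
// containers are always handed to the kill path, and exited *network* (infra)
// containers are now included as well so their network resources get torn
// down before a new infra container is started.
func containersToKill(statuses []container, isNetworked func(name string) bool) []container {
    var toKill []container
    for _, c := range statuses {
        switch {
        case c.State == stateRunning:
            toKill = append(toKill, c)
        case isNetworked(c.Name) && c.State == stateExited:
            // Dead infra container: it may still hold network resources
            // (e.g. an IPAM allocation), so include it for teardown.
            toKill = append(toKill, c)
        }
    }
    return toKill
}

func main() {
    statuses := []container{
        {Name: "POD", State: stateExited},  // dead infra container: included
        {Name: "app", State: stateRunning}, // running app container: included
        {Name: "old", State: stateExited},  // dead regular container: skipped
    }
    isInfra := func(name string) bool { return name == "POD" }
    for _, c := range containersToKill(statuses, isInfra) {
        fmt.Printf("kill/teardown: %s (%s)\n", c.Name, c.State)
    }
}
```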


@@ -1440,21 +1440,22 @@ func (dm *DockerManager) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, grac
 }
 
 // NOTE(random-liu): The pod passed in could be *nil* when kubelet restarted.
-func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
+// runtimePod may contain either running or exited containers
+func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runtimePod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
     // Short circuit if there's nothing to kill.
-    if len(runningPod.Containers) == 0 {
+    if len(runtimePod.Containers) == 0 {
         return
     }
     // Send the kills in parallel since they may take a long time.
-    // There may be len(runningPod.Containers) or len(runningPod.Containers)-1 of result in the channel
-    containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
+    // There may be len(runtimePod.Containers) or len(runtimePod.Containers)-1 of result in the channel
+    containerResults := make(chan *kubecontainer.SyncResult, len(runtimePod.Containers))
     wg := sync.WaitGroup{}
     var (
-        networkContainer *kubecontainer.Container
-        networkSpec      *v1.Container
+        networkContainers []*kubecontainer.Container
+        networkSpecs      []*v1.Container
     )
-    wg.Add(len(runningPod.Containers))
-    for _, container := range runningPod.Containers {
+    wg.Add(len(runtimePod.Containers))
+    for _, container := range runtimePod.Containers {
         go func(container *kubecontainer.Container) {
             defer utilruntime.HandleCrash()
             defer wg.Done()
@@ -1479,21 +1480,20 @@ func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubeconta
 
             // TODO: Handle this without signaling the pod infra container to
             // adapt to the generic container runtime.
-            if container.Name == PodInfraContainerName {
+            if containerIsNetworked(container.Name) {
                 // Store the container runtime for later deletion.
                 // We do this so that PreStop handlers can run in the network namespace.
-                networkContainer = container
-                networkSpec = containerSpec
-                return
+                networkContainers = append(networkContainers, container)
+                networkSpecs = append(networkSpecs, containerSpec)
+            } else {
+                killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
+                err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.", gracePeriodOverride)
+                if err != nil {
+                    killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
+                    glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", container.ID.ID, err, runtimePod.ID)
+                }
+                containerResults <- killContainerResult
             }
-
-            killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
-            err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.", gracePeriodOverride)
-            if err != nil {
-                killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
-                glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", container.ID.ID, err, runningPod.ID)
-            }
-            containerResults <- killContainerResult
         }(container)
     }
     wg.Wait()
@@ -1501,27 +1501,37 @@ func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubeconta
     for containerResult := range containerResults {
         result.AddSyncResult(containerResult)
     }
-    if networkContainer != nil {
+
+    // Tear down any dead or running network/infra containers, but only kill
+    // those that are still running.
+    for i := range networkContainers {
+        networkContainer := networkContainers[i]
+        networkSpec := networkSpecs[i]
+
+        teardownNetworkResult := kubecontainer.NewSyncResult(kubecontainer.TeardownNetwork, kubecontainer.BuildPodFullName(runtimePod.Name, runtimePod.Namespace))
+        result.AddSyncResult(teardownNetworkResult)
         ins, err := dm.client.InspectContainer(networkContainer.ID.ID)
         if err != nil {
             err = fmt.Errorf("Error inspecting container %v: %v", networkContainer.ID.ID, err)
             glog.Error(err)
-            result.Fail(err)
-            return
+            teardownNetworkResult.Fail(kubecontainer.ErrTeardownNetwork, err.Error())
+            continue
         }
+
         if getDockerNetworkMode(ins) != namespaceModeHost {
-            teardownNetworkResult := kubecontainer.NewSyncResult(kubecontainer.TeardownNetwork, kubecontainer.BuildPodFullName(runningPod.Name, runningPod.Namespace))
-            result.AddSyncResult(teardownNetworkResult)
-            if err := dm.network.TearDownPod(runningPod.Namespace, runningPod.Name, networkContainer.ID); err != nil {
+            if err := dm.network.TearDownPod(runtimePod.Namespace, runtimePod.Name, networkContainer.ID); err != nil {
                 teardownNetworkResult.Fail(kubecontainer.ErrTeardownNetwork, err.Error())
                 glog.Error(err)
             }
         }
-        killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, networkContainer.Name)
-        result.AddSyncResult(killContainerResult)
-        if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod.", gracePeriodOverride); err != nil {
-            killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
-            glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", networkContainer.ID.ID, err, runningPod.ID)
+
+        if networkContainer.State == kubecontainer.ContainerStateRunning {
+            killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, networkContainer.Name)
+            result.AddSyncResult(killContainerResult)
+            if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod.", gracePeriodOverride); err != nil {
+                killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
+                glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", networkContainer.ID.ID, err, runtimePod.ID)
+            }
         }
     }
     return
@@ -2150,9 +2160,29 @@ func (dm *DockerManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStatus *kubecon
             glog.V(4).Infof("Killing Infra Container for %q, will start new one", format.Pod(pod))
         }
 
+        // Get list of running container(s) to kill
+        podToKill := kubecontainer.ConvertPodStatusToRunningPod(dm.Type(), podStatus)
+
+        // If there are dead network containers, also kill them to ensure
+        // their network resources get released and are available to be
+        // re-used by new net containers
+        for _, containerStatus := range podStatus.ContainerStatuses {
+            if containerIsNetworked(containerStatus.Name) && containerStatus.State == kubecontainer.ContainerStateExited {
+                container := &kubecontainer.Container{
+                    ID:      containerStatus.ID,
+                    Name:    containerStatus.Name,
+                    Image:   containerStatus.Image,
+                    ImageID: containerStatus.ImageID,
+                    Hash:    containerStatus.Hash,
+                    State:   containerStatus.State,
+                }
+                podToKill.Containers = append(podToKill.Containers, container)
+            }
+        }
+
         // Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container)
         // TODO(random-liu): We'll use pod status directly in the future
-        killResult := dm.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(dm.Type(), podStatus), nil)
+        killResult := dm.killPodWithSyncResult(pod, podToKill, nil)
         result.AddPodSyncResult(killResult)
         if killResult.Error() != nil {
             return
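
The loop added to killPodWithSyncResult() above applies an asymmetric rule: network teardown is attempted for every stored network container, whether dead or running, while KillContainerInPod is only invoked for the ones still running. The stand-alone sketch below restates that rule with illustrative placeholder names (netContainer, handleNetworkContainer) rather than the real DockerManager API.

```go
package main

import "fmt"

type containerState string

const (
    running containerState = "running"
    exited  containerState = "exited"
)

// netContainer is a minimal placeholder for the network/infra containers
// collected by the kill path.
type netContainer struct {
    ID    string
    State containerState
}

// handleNetworkContainer sketches the per-container rule of the new loop:
// teardown runs for dead and running network containers alike, but only a
// container that is still running is killed afterwards.
func handleNetworkContainer(c netContainer, tearDown, kill func(id string) error) {
    if err := tearDown(c.ID); err != nil {
        // A failed teardown is recorded but does not abort the kill below,
        // matching the "record failure and keep going" behavior in the diff.
        fmt.Printf("teardown of %s failed: %v\n", c.ID, err)
    }
    if c.State == running {
        if err := kill(c.ID); err != nil {
            fmt.Printf("kill of %s failed: %v\n", c.ID, err)
        }
    }
}

func main() {
    tearDown := func(id string) error { fmt.Println("tear down pod network for", id); return nil }
    kill := func(id string) error { fmt.Println("kill container", id); return nil }

    containers := []netContainer{
        {ID: "infra-old", State: exited},  // dead infra container: teardown only
        {ID: "infra-cur", State: running}, // running infra container: teardown + kill
    }
    for _, c := range containers {
        handleNetworkContainer(c, tearDown, kill)
    }
}
```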


@@ -1828,6 +1828,55 @@ func TestGetPodStatusNoSuchContainer(t *testing.T) {
     // Verify that we will try to start new contrainers even if the inspections
     // failed.
     verifyCalls(t, fakeDocker, []string{
+        // Inspect dead infra container for possible network teardown
+        "inspect_container",
         // Start a new infra container.
         "create", "start", "inspect_container", "inspect_container",
         // Start a new container.
         "create", "start", "inspect_container",
     })
 }
+
+func TestSyncPodDeadInfraContainerTeardown(t *testing.T) {
+    const (
+        noSuchContainerID = "nosuchcontainer"
+        infraContainerID  = "9876"
+    )
+    dm, fakeDocker := newTestDockerManager()
+    dm.podInfraContainerImage = "pod_infra_image"
+    ctrl := gomock.NewController(t)
+    defer ctrl.Finish()
+    fnp := nettest.NewMockNetworkPlugin(ctrl)
+    dm.network = network.NewPluginManager(fnp)
+
+    pod := makePod("foo", &v1.PodSpec{
+        Containers: []v1.Container{{Name: noSuchContainerID}},
+    })
+
+    fakeDocker.SetFakeContainers([]*FakeContainer{
+        {
+            ID:         infraContainerID,
+            Name:       "/k8s_POD." + strconv.FormatUint(generatePodInfraContainerHash(pod), 16) + "_foo_new_12345678_42",
+            ExitCode:   0,
+            StartedAt:  time.Now(),
+            FinishedAt: time.Now(),
+            Running:    false,
+        },
+    })
+
+    // Can be called multiple times due to GetPodStatus
+    fnp.EXPECT().Name().Return("someNetworkPlugin").AnyTimes()
+    fnp.EXPECT().TearDownPod("new", "foo", gomock.Any()).Return(nil)
+    fnp.EXPECT().GetPodNetworkStatus("new", "foo", gomock.Any()).Return(&network.PodNetworkStatus{IP: net.ParseIP("1.1.1.1")}, nil).AnyTimes()
+    fnp.EXPECT().SetUpPod("new", "foo", gomock.Any()).Return(nil)
+
+    runSyncPod(t, dm, fakeDocker, pod, nil, false)
+
+    // Verify that we will try to start new contrainers even if the inspections
+    // failed.
+    verifyCalls(t, fakeDocker, []string{
+        // Inspect dead infra container for possible network teardown
+        "inspect_container",
+        // Start a new infra container.
+        "create", "start", "inspect_container", "inspect_container",
+        // Start a new container.