dockertools: tear down dead infra containers in SyncPod()

Dead infra containers may still have network resources allocated to
them and may not be garbage collected for a long time.  Tearing the old
infra container down before SyncPod() starts its replacement prevents
network plugins from carrying stale network details (e.g. IPAM
allocations) over to the new infra container.
Author: Dan Williams
Date:   2017-01-16 13:26:38 -06:00
Commit: 20e1cdb97c (parent 4d7d7faa81)

2 changed files with 111 additions and 32 deletions
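
To make the intent concrete before the diffs, the following condensed, self-contained Go sketch mirrors the behavior this commit adds to SyncPod(): exited network (infra) containers are folded into the set of containers handed to the kill path, so their network resources are torn down before a replacement infra container is started. The types and helpers here (container, containersToKill, the isNetworked callback) are simplified stand-ins for illustration only, not the kubelet's real API; the authoritative code is in the hunks below.

```go
package main

import "fmt"

// Simplified stand-ins for the kubelet's container types (illustration only).
type containerState string

const (
    stateRunning containerState = "running"
    stateExited  containerState = "exited"
)

type container struct {
    Name  string
    State containerState
}

// containersToKill mirrors the new SyncPod() behavior in spirit: running
// containers are always handed to the kill path, and exited *network* (infra)
// containers are now included as well so their network resources get torn
// down before a new infra container is started.
func containersToKill(statuses []container, isNetworked func(name string) bool) []container {
    var toKill []container
    for _, c := range statuses {
        switch {
        case c.State == stateRunning:
            toKill = append(toKill, c)
        case isNetworked(c.Name) && c.State == stateExited:
            // Dead infra container: it may still hold network resources
            // (e.g. an IPAM allocation), so include it for teardown.
            toKill = append(toKill, c)
        }
    }
    return toKill
}

func main() {
    statuses := []container{
        {Name: "POD", State: stateExited},  // dead infra container: included
        {Name: "app", State: stateRunning}, // running app container: included
        {Name: "old", State: stateExited},  // dead regular container: skipped
    }
    isInfra := func(name string) bool { return name == "POD" }
    for _, c := range containersToKill(statuses, isInfra) {
        fmt.Printf("kill/teardown: %s (%s)\n", c.Name, c.State)
    }
}
```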


@@ -1440,21 +1440,22 @@ func (dm *DockerManager) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, grac
 }
 
 // NOTE(random-liu): The pod passed in could be *nil* when kubelet restarted.
-func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
+// runtimePod may contain either running or exited containers
+func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runtimePod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
     // Short circuit if there's nothing to kill.
-    if len(runningPod.Containers) == 0 {
+    if len(runtimePod.Containers) == 0 {
         return
     }
     // Send the kills in parallel since they may take a long time.
-    // There may be len(runningPod.Containers) or len(runningPod.Containers)-1 of result in the channel
-    containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
+    // There may be len(runtimePod.Containers) or len(runtimePod.Containers)-1 of result in the channel
+    containerResults := make(chan *kubecontainer.SyncResult, len(runtimePod.Containers))
     wg := sync.WaitGroup{}
     var (
-        networkContainer *kubecontainer.Container
-        networkSpec      *v1.Container
+        networkContainers []*kubecontainer.Container
+        networkSpecs      []*v1.Container
     )
-    wg.Add(len(runningPod.Containers))
-    for _, container := range runningPod.Containers {
+    wg.Add(len(runtimePod.Containers))
+    for _, container := range runtimePod.Containers {
         go func(container *kubecontainer.Container) {
             defer utilruntime.HandleCrash()
             defer wg.Done()
@@ -1479,21 +1480,20 @@ func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubeconta
 
             // TODO: Handle this without signaling the pod infra container to
             // adapt to the generic container runtime.
-            if container.Name == PodInfraContainerName {
+            if containerIsNetworked(container.Name) {
                 // Store the container runtime for later deletion.
                 // We do this so that PreStop handlers can run in the network namespace.
-                networkContainer = container
-                networkSpec = containerSpec
-                return
+                networkContainers = append(networkContainers, container)
+                networkSpecs = append(networkSpecs, containerSpec)
+            } else {
+                killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
+                err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.", gracePeriodOverride)
+                if err != nil {
+                    killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
+                    glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", container.ID.ID, err, runtimePod.ID)
+                }
+                containerResults <- killContainerResult
             }
-
-            killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
-            err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.", gracePeriodOverride)
-            if err != nil {
-                killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
-                glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", container.ID.ID, err, runningPod.ID)
-            }
-            containerResults <- killContainerResult
         }(container)
     }
     wg.Wait()
@@ -1501,27 +1501,37 @@ func (dm *DockerManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubeconta
     for containerResult := range containerResults {
         result.AddSyncResult(containerResult)
     }
-    if networkContainer != nil {
+
+    // Tear down any dead or running network/infra containers, but only kill
+    // those that are still running.
+    for i := range networkContainers {
+        networkContainer := networkContainers[i]
+        networkSpec := networkSpecs[i]
+
+        teardownNetworkResult := kubecontainer.NewSyncResult(kubecontainer.TeardownNetwork, kubecontainer.BuildPodFullName(runtimePod.Name, runtimePod.Namespace))
+        result.AddSyncResult(teardownNetworkResult)
         ins, err := dm.client.InspectContainer(networkContainer.ID.ID)
         if err != nil {
             err = fmt.Errorf("Error inspecting container %v: %v", networkContainer.ID.ID, err)
             glog.Error(err)
-            result.Fail(err)
-            return
+            teardownNetworkResult.Fail(kubecontainer.ErrTeardownNetwork, err.Error())
+            continue
         }
+
         if getDockerNetworkMode(ins) != namespaceModeHost {
-            teardownNetworkResult := kubecontainer.NewSyncResult(kubecontainer.TeardownNetwork, kubecontainer.BuildPodFullName(runningPod.Name, runningPod.Namespace))
-            result.AddSyncResult(teardownNetworkResult)
-            if err := dm.network.TearDownPod(runningPod.Namespace, runningPod.Name, networkContainer.ID); err != nil {
+            if err := dm.network.TearDownPod(runtimePod.Namespace, runtimePod.Name, networkContainer.ID); err != nil {
                 teardownNetworkResult.Fail(kubecontainer.ErrTeardownNetwork, err.Error())
                 glog.Error(err)
             }
         }
-        killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, networkContainer.Name)
-        result.AddSyncResult(killContainerResult)
-        if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod.", gracePeriodOverride); err != nil {
-            killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
-            glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", networkContainer.ID.ID, err, runningPod.ID)
+
+        if networkContainer.State == kubecontainer.ContainerStateRunning {
+            killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, networkContainer.Name)
+            result.AddSyncResult(killContainerResult)
+            if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod.", gracePeriodOverride); err != nil {
+                killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
+                glog.Errorf("Failed to delete container %v: %v; Skipping pod %q", networkContainer.ID.ID, err, runtimePod.ID)
+            }
         }
     }
     return
@@ -2150,9 +2160,29 @@ func (dm *DockerManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStatus *kubecon
             glog.V(4).Infof("Killing Infra Container for %q, will start new one", format.Pod(pod))
         }
 
+        // Get list of running container(s) to kill
+        podToKill := kubecontainer.ConvertPodStatusToRunningPod(dm.Type(), podStatus)
+
+        // If there are dead network containers, also kill them to ensure
+        // their network resources get released and are available to be
+        // re-used by new net containers
+        for _, containerStatus := range podStatus.ContainerStatuses {
+            if containerIsNetworked(containerStatus.Name) && containerStatus.State == kubecontainer.ContainerStateExited {
+                container := &kubecontainer.Container{
+                    ID:      containerStatus.ID,
+                    Name:    containerStatus.Name,
+                    Image:   containerStatus.Image,
+                    ImageID: containerStatus.ImageID,
+                    Hash:    containerStatus.Hash,
+                    State:   containerStatus.State,
+                }
+                podToKill.Containers = append(podToKill.Containers, container)
+            }
+        }
+
         // Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container)
         // TODO(random-liu): We'll use pod status directly in the future
-        killResult := dm.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(dm.Type(), podStatus), nil)
+        killResult := dm.killPodWithSyncResult(pod, podToKill, nil)
         result.AddPodSyncResult(killResult)
         if killResult.Error() != nil {
             return
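
The loop added to killPodWithSyncResult() above applies an asymmetric rule: network teardown is attempted for every stored network container, whether dead or running, while KillContainerInPod is only invoked for the ones still running. The stand-alone sketch below restates that rule with illustrative placeholder names (netContainer, handleNetworkContainer) rather than the real DockerManager API.

```go
package main

import "fmt"

type containerState string

const (
    running containerState = "running"
    exited  containerState = "exited"
)

// netContainer is a minimal placeholder for the network/infra containers
// collected by the kill path.
type netContainer struct {
    ID    string
    State containerState
}

// handleNetworkContainer sketches the per-container rule of the new loop:
// teardown runs for dead and running network containers alike, but only a
// container that is still running is killed afterwards.
func handleNetworkContainer(c netContainer, tearDown, kill func(id string) error) {
    if err := tearDown(c.ID); err != nil {
        // A failed teardown is recorded but does not abort the kill below,
        // matching the "record failure and keep going" behavior in the diff.
        fmt.Printf("teardown of %s failed: %v\n", c.ID, err)
    }
    if c.State == running {
        if err := kill(c.ID); err != nil {
            fmt.Printf("kill of %s failed: %v\n", c.ID, err)
        }
    }
}

func main() {
    tearDown := func(id string) error { fmt.Println("tear down pod network for", id); return nil }
    kill := func(id string) error { fmt.Println("kill container", id); return nil }

    containers := []netContainer{
        {ID: "infra-old", State: exited},  // dead infra container: teardown only
        {ID: "infra-cur", State: running}, // running infra container: teardown + kill
    }
    for _, c := range containers {
        handleNetworkContainer(c, tearDown, kill)
    }
}
```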


@@ -1828,6 +1828,55 @@ func TestGetPodStatusNoSuchContainer(t *testing.T) {
     // Verify that we will try to start new contrainers even if the inspections
     // failed.
     verifyCalls(t, fakeDocker, []string{
+        // Inspect dead infra container for possible network teardown
+        "inspect_container",
         // Start a new infra container.
         "create", "start", "inspect_container", "inspect_container",
         // Start a new container.
         "create", "start", "inspect_container",
     })
 }
+
+func TestSyncPodDeadInfraContainerTeardown(t *testing.T) {
+    const (
+        noSuchContainerID = "nosuchcontainer"
+        infraContainerID  = "9876"
+    )
+    dm, fakeDocker := newTestDockerManager()
+    dm.podInfraContainerImage = "pod_infra_image"
+    ctrl := gomock.NewController(t)
+    defer ctrl.Finish()
+    fnp := nettest.NewMockNetworkPlugin(ctrl)
+    dm.network = network.NewPluginManager(fnp)
+
+    pod := makePod("foo", &v1.PodSpec{
+        Containers: []v1.Container{{Name: noSuchContainerID}},
+    })
+
+    fakeDocker.SetFakeContainers([]*FakeContainer{
+        {
+            ID:         infraContainerID,
+            Name:       "/k8s_POD." + strconv.FormatUint(generatePodInfraContainerHash(pod), 16) + "_foo_new_12345678_42",
+            ExitCode:   0,
+            StartedAt:  time.Now(),
+            FinishedAt: time.Now(),
+            Running:    false,
+        },
+    })
+
+    // Can be called multiple times due to GetPodStatus
+    fnp.EXPECT().Name().Return("someNetworkPlugin").AnyTimes()
+    fnp.EXPECT().TearDownPod("new", "foo", gomock.Any()).Return(nil)
+    fnp.EXPECT().GetPodNetworkStatus("new", "foo", gomock.Any()).Return(&network.PodNetworkStatus{IP: net.ParseIP("1.1.1.1")}, nil).AnyTimes()
+    fnp.EXPECT().SetUpPod("new", "foo", gomock.Any()).Return(nil)
+
+    runSyncPod(t, dm, fakeDocker, pod, nil, false)
+
+    // Verify that we will try to start new contrainers even if the inspections
+    // failed.
+    verifyCalls(t, fakeDocker, []string{
+        // Inspect dead infra container for possible network teardown
+        "inspect_container",
+        // Start a new infra container.
+        "create", "start", "inspect_container", "inspect_container",
+        // Start a new container.