diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 9d52a09b9e1..f2ffe19b4f9 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -212,6 +212,9 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe } func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error { + // Garbage collect any stranded resources before allocating CPUs. + m.removeStaleState() + m.Lock() defer m.Unlock() @@ -384,18 +387,14 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec } if cstatus.State.Terminated != nil { - // Since the container is terminated, we know it is safe to - // remove it without any reconciliation. Removing the container - // will also remove it from the `containerMap` so that this - // container will be skipped next time around the loop. + // The container is terminated but we can't call m.RemoveContainer() + // here because it could remove the allocated cpuset for the container + // which may be in the process of being restarted. That would result + // in the container losing any exclusively-allocated CPUs that it + // was allocated. _, _, err := m.containerMap.GetContainerRef(containerID) if err == nil { - klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID) - err := m.RemoveContainer(containerID) - if err != nil { - klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err) - failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID}) - } + klog.Warningf("[cpumanager] reconcileState: ignoring terminated container (pod: %s, container id: %s)", pod.Name, containerID) } continue } diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go index 746a5845a63..bd18b2c95eb 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go @@ -269,12 +269,14 @@ func TestCPUManagerAdd(t *testing.T) { err: testCase.updateErr, }, containerMap: containermap.NewContainerMap(), - activePods: func() []*v1.Pod { return nil }, podStatusProvider: mockPodStatusProvider{}, + sourcesReady: &sourcesReadyStub{}, } pod := makePod("fakePod", "fakeContainer", "2", "2") container := &pod.Spec.Containers[0] + mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} } + err := mgr.Allocate(pod, container) if !reflect.DeepEqual(err, testCase.expAllocateErr) { t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v", @@ -487,8 +489,11 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { state: state, containerRuntime: mockRuntimeService{}, containerMap: containermap.NewContainerMap(), - activePods: func() []*v1.Pod { return nil }, podStatusProvider: mockPodStatusProvider{}, + sourcesReady: &sourcesReadyStub{}, + activePods: func() []*v1.Pod { + return []*v1.Pod{testCase.pod} + }, } containers := append( @@ -1021,12 +1026,14 @@ func TestCPUManagerAddWithResvList(t *testing.T) { err: testCase.updateErr, }, containerMap: containermap.NewContainerMap(), - activePods: func() []*v1.Pod { return nil }, podStatusProvider: mockPodStatusProvider{}, + sourcesReady: &sourcesReadyStub{}, } pod := makePod("fakePod", "fakeContainer", "2", "2") container := &pod.Spec.Containers[0] + mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} } + err := mgr.Allocate(pod, container) if !reflect.DeepEqual(err, testCase.expAllocateErr) { t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v", diff --git a/pkg/kubelet/cm/internal_container_lifecycle.go b/pkg/kubelet/cm/internal_container_lifecycle.go index 9e243430269..690718e4e68 100644 --- a/pkg/kubelet/cm/internal_container_lifecycle.go +++ b/pkg/kubelet/cm/internal_container_lifecycle.go @@ -54,19 +54,10 @@ func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, containe } func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error { - if i.cpuManager != nil { - return i.cpuManager.RemoveContainer(containerID) - } return nil } func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error { - if i.cpuManager != nil { - err := i.cpuManager.RemoveContainer(containerID) - if err != nil { - return err - } - } if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManager) { err := i.topologyManager.RemoveContainer(containerID) if err != nil {