Merge pull request #90377 from cbf123/container_cpuset_fixup_2

Fix exclusive CPU allocations being deleted at container restart
This commit is contained in:
Kubernetes Prow Robot 2020-04-27 13:40:04 -07:00 committed by GitHub
commit 7fdc1275d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 22 deletions

View File

@ -212,6 +212,9 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
}
func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
// Garbage collect any stranded resources before allocating CPUs.
m.removeStaleState()
m.Lock()
defer m.Unlock()
@ -384,18 +387,14 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
}
if cstatus.State.Terminated != nil {
// Since the container is terminated, we know it is safe to
// remove it without any reconciliation. Removing the container
// will also remove it from the `containerMap` so that this
// container will be skipped next time around the loop.
// The container is terminated but we can't call m.RemoveContainer()
// here because it could remove the allocated cpuset for the container
// which may be in the process of being restarted. That would result
// in the container losing any exclusively-allocated CPUs that it
// was allocated.
_, _, err := m.containerMap.GetContainerRef(containerID)
if err == nil {
klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID)
err := m.RemoveContainer(containerID)
if err != nil {
klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
}
klog.Warningf("[cpumanager] reconcileState: ignoring terminated container (pod: %s, container id: %s)", pod.Name, containerID)
}
continue
}

View File

@ -269,12 +269,14 @@ func TestCPUManagerAdd(t *testing.T) {
err: testCase.updateErr,
},
containerMap: containermap.NewContainerMap(),
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
sourcesReady: &sourcesReadyStub{},
}
pod := makePod("fakePod", "fakeContainer", "2", "2")
container := &pod.Spec.Containers[0]
mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} }
err := mgr.Allocate(pod, container)
if !reflect.DeepEqual(err, testCase.expAllocateErr) {
t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v",
@ -487,8 +489,11 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
state: state,
containerRuntime: mockRuntimeService{},
containerMap: containermap.NewContainerMap(),
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
sourcesReady: &sourcesReadyStub{},
activePods: func() []*v1.Pod {
return []*v1.Pod{testCase.pod}
},
}
containers := append(
@ -1030,12 +1035,14 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
err: testCase.updateErr,
},
containerMap: containermap.NewContainerMap(),
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
sourcesReady: &sourcesReadyStub{},
}
pod := makePod("fakePod", "fakeContainer", "2", "2")
container := &pod.Spec.Containers[0]
mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} }
err := mgr.Allocate(pod, container)
if !reflect.DeepEqual(err, testCase.expAllocateErr) {
t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v",

View File

@ -54,19 +54,10 @@ func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, containe
}
func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error {
if i.cpuManager != nil {
return i.cpuManager.RemoveContainer(containerID)
}
return nil
}
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
if i.cpuManager != nil {
err := i.cpuManager.RemoveContainer(containerID)
if err != nil {
return err
}
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManager) {
err := i.topologyManager.RemoveContainer(containerID)
if err != nil {