Fix exclusive CPU allocations being deleted at container restart
The expectation is that exclusive CPU allocations happen at pod creation time. When a container restarts, it should not have its exclusive CPU allocations removed, and it should not need to re-allocate CPUs.

There are a few places in the current code that look for containers that have exited and call CpuManager.RemoveContainer() to clean up the container. This deletes any exclusive CPU allocations for that container, and if the container restarts within the same pod it ends up using the default cpuset rather than the CPUs that should be exclusively its own.

Removing those calls and adding resource cleanup at allocation time should get rid of the problem.

Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
parent 0acf2f0983
commit ab5870d808
@@ -212,6 +212,9 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
 }
 
 func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
+	// Garbage collect any stranded resources before allocating CPUs.
+	m.removeStaleState()
+
 	m.Lock()
 	defer m.Unlock()
 
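The removeStaleState() helper that Allocate() now calls is defined elsewhere in the CPU manager and is not shown in this diff. As a standalone illustration of the kind of garbage collection it performs (the types and names below are invented for this sketch, not the kubelet code), the idea is to walk the recorded exclusive CPU assignments and reclaim any that belong to containers no longer part of an active pod.

// Standalone sketch, not the kubelet implementation: reclaim exclusive CPU
// assignments for containers that no longer belong to any active pod,
// mirroring the intent of the m.removeStaleState() call added above.
package main

import "fmt"

// assignments maps containerID -> the set of exclusively allocated CPU IDs.
type assignments map[string]map[int]bool

// removeStale drops assignments for containers missing from the active set
// and returns the CPU IDs reclaimed into the shared (default) pool.
func removeStale(assigned assignments, active map[string]bool) []int {
	var reclaimed []int
	for containerID, cpus := range assigned {
		if active[containerID] {
			continue // container still active: it keeps its exclusive CPUs
		}
		for cpu := range cpus {
			reclaimed = append(reclaimed, cpu)
		}
		delete(assigned, containerID)
	}
	return reclaimed
}

func main() {
	assigned := assignments{
		"running-container": {2: true, 3: true},
		"deleted-container": {4: true, 5: true},
	}
	active := map[string]bool{"running-container": true}

	fmt.Println("reclaimed CPUs:", removeStale(assigned, active)) // CPUs 4 and 5, in map order
	fmt.Println("remaining assignments:", assigned)               // only running-container remains
}

Because this cleanup runs at allocation time rather than at container exit, a container that merely restarts is never stripped of its exclusive cpuset, which is the point of the commit.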
@@ -384,18 +387,14 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
			}

			if cstatus.State.Terminated != nil {
-				// Since the container is terminated, we know it is safe to
-				// remove it without any reconciliation. Removing the container
-				// will also remove it from the `containerMap` so that this
-				// container will be skipped next time around the loop.
+				// The container is terminated but we can't call m.RemoveContainer()
+				// here because it could remove the allocated cpuset for the container
+				// which may be in the process of being restarted. That would result
+				// in the container losing any exclusively-allocated CPUs that it
+				// was allocated.
				_, _, err := m.containerMap.GetContainerRef(containerID)
				if err == nil {
-					klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID)
-					err := m.RemoveContainer(containerID)
-					if err != nil {
-						klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err)
-						failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
-					}
+					klog.Warningf("[cpumanager] reconcileState: ignoring terminated container (pod: %s, container id: %s)", pod.Name, containerID)
				}
				continue
			}
@@ -269,12 +269,14 @@ func TestCPUManagerAdd(t *testing.T) {
				err: testCase.updateErr,
			},
			containerMap:      containermap.NewContainerMap(),
-			activePods:        func() []*v1.Pod { return nil },
			podStatusProvider: mockPodStatusProvider{},
+			sourcesReady:      &sourcesReadyStub{},
		}

		pod := makePod("fakePod", "fakeContainer", "2", "2")
		container := &pod.Spec.Containers[0]
+		mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} }
+
		err := mgr.Allocate(pod, container)
		if !reflect.DeepEqual(err, testCase.expAllocateErr) {
			t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v",
@@ -487,8 +489,11 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
			state:             state,
			containerRuntime:  mockRuntimeService{},
			containerMap:      containermap.NewContainerMap(),
-			activePods:        func() []*v1.Pod { return nil },
			podStatusProvider: mockPodStatusProvider{},
+			sourcesReady:      &sourcesReadyStub{},
+			activePods: func() []*v1.Pod {
+				return []*v1.Pod{testCase.pod}
+			},
		}

		containers := append(
@@ -1021,12 +1026,14 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
				err: testCase.updateErr,
			},
			containerMap:      containermap.NewContainerMap(),
-			activePods:        func() []*v1.Pod { return nil },
			podStatusProvider: mockPodStatusProvider{},
+			sourcesReady:      &sourcesReadyStub{},
		}

		pod := makePod("fakePod", "fakeContainer", "2", "2")
		container := &pod.Spec.Containers[0]
+		mgr.activePods = func() []*v1.Pod { return []*v1.Pod{pod} }
+
		err := mgr.Allocate(pod, container)
		if !reflect.DeepEqual(err, testCase.expAllocateErr) {
			t.Errorf("CPU Manager Allocate() error (%v). expected error: %v but got: %v",
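The three test hunks above follow from the new removeStaleState() call in Allocate(): the fake manager now needs a sourcesReady stub, and activePods is pointed at the pod under test, presumably so that state already recorded for it is not reclaimed by the cleanup pass (this matters most in the init-container test, where Allocate() is exercised for several containers of the same pod). A minimal sketch of the "always ready" gating pattern those stubs serve, using simplified names that are assumptions rather than the actual kubelet interfaces:

// Simplified sketch, not the kubelet test code: a cleanup pass that consults
// the active-pods list is presumably gated on pod sources being ready, so
// unit tests plug in a stub that unconditionally reports ready.
package main

import "fmt"

// sourcesReady stands in for the readiness interface the manager consults.
type sourcesReady interface {
	AllReady() bool
}

// readyStub reports ready unconditionally, as a unit-test stub would.
type readyStub struct{}

func (readyStub) AllReady() bool { return true }

// maybeCleanup runs the (stubbed-out) stale-state cleanup only when sources
// are ready; before that the active-pods list cannot be trusted.
func maybeCleanup(sr sourcesReady) {
	if !sr.AllReady() {
		fmt.Println("skipping cleanup: pod sources not ready yet")
		return
	}
	fmt.Println("running cleanup against the active-pods list")
}

func main() {
	maybeCleanup(readyStub{})
}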
@@ -54,19 +54,10 @@ func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, containe
 }
 
 func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error {
-	if i.cpuManager != nil {
-		return i.cpuManager.RemoveContainer(containerID)
-	}
 	return nil
 }
 
 func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
-	if i.cpuManager != nil {
-		err := i.cpuManager.RemoveContainer(containerID)
-		if err != nil {
-			return err
-		}
-	}
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManager) {
 		err := i.topologyManager.RemoveContainer(containerID)
 		if err != nil {