diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index 58996d18b13..651e5fe79c9 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -129,7 +129,7 @@ func (p *staticPolicy) validateState(s state.State) error { } // State has already been initialized from file (is not empty) - // 1 Check if the reserved cpuset is not part of default cpuset because: + // 1. Check if the reserved cpuset is not part of default cpuset because: // - kube/system reserved have changed (increased) - may lead to some containers not being able to start // - user tampered with file if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) { @@ -145,6 +145,23 @@ func (p *staticPolicy) validateState(s state.State) error { cID, cset.String(), tmpDefaultCPUset.String()) } } + + // 3. It's possible that the set of available CPUs has changed since + // the state was written. This can be due to for example + // offlining a CPU when kubelet is not running. If this happens, + // CPU manager will run into trouble when later it tries to + // assign non-existent CPUs to containers. Validate that the + // topology that was received during CPU manager startup matches with + // the set of CPUs stored in the state. + totalKnownCPUs := tmpDefaultCPUset.Clone() + for _, cset := range tmpAssignments { + totalKnownCPUs = totalKnownCPUs.Union(cset) + } + if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) { + return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"", + p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String()) + } + return nil } diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go index a1bc3517baf..d4d84d0da19 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static_test.go +++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go @@ -87,6 +87,26 @@ func TestStaticPolicyStart(t *testing.T) { stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7, 8, 9, 10, 11), expPanic: true, }, + { + description: "core 12 is not present in topology but is in state cpuset", + topo: topoDualSocketHT, + stAssignments: state.ContainerCPUAssignments{ + "0": cpuset.NewCPUSet(0, 1, 2), + "1": cpuset.NewCPUSet(3, 4), + }, + stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10, 11, 12), + expPanic: true, + }, + { + description: "core 11 is present in topology but is not in state cpuset", + topo: topoDualSocketHT, + stAssignments: state.ContainerCPUAssignments{ + "0": cpuset.NewCPUSet(0, 1, 2), + "1": cpuset.NewCPUSet(3, 4), + }, + stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10), + expPanic: true, + }, } for _, testCase := range testCases { t.Run(testCase.description, func(t *testing.T) {