Proactively remove init Container CPUs in CPUManager static policy

This patch fixes a bug in the CPUManager, whereby it doesn't honor the
"effective requests/limits" of a Pod as defined by:

    https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#resources

The rule states that a Pod’s "effective request/limit" for a resource
should be the larger of:
    * The highest of any particular resource request or limit
      defined on all init Containers
    * The sum of all app Containers' requests/limits for a
      resource

Moreover, the rule states that:
    * The effective QoS tier is the same for init Containers
      and app containers alike

This means that the resource requests of init Containers and app
Containers should be able to overlap, such that the larger of the two
becomes the "effective resource request/limit" for the Pod. Likewise,
if a QoS tier of "Guaranteed" is determined for the Pod, then both init
Containers and app Containers should run in this tier.
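
As a minimal Go sketch of this rule (illustrative only; the function
below is not part of this patch, and it works in millicores via the
resource API), the effective CPU request of a Pod could be computed as:

    import (
        v1 "k8s.io/api/core/v1"
    )

    // effectiveCPURequestMilli returns the larger of:
    //   * the highest CPU request of any single init Container, and
    //   * the sum of the CPU requests of all app Containers.
    func effectiveCPURequestMilli(pod *v1.Pod) int64 {
        var initMax, appSum int64
        for _, c := range pod.Spec.InitContainers {
            if req := c.Resources.Requests.Cpu().MilliValue(); req > initMax {
                initMax = req
            }
        }
        for _, c := range pod.Spec.Containers {
            appSum += c.Resources.Requests.Cpu().MilliValue()
        }
        if initMax > appSum {
            return initMax
        }
        return appSum
    }

The same computation applies to CPU limits.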

In its current implementation, the CPU manager honors the effective QoS
tier for both init and app containers, but doesn't honor the "effective
request/limit" correctly.

Instead, it treats the "effective request/limit" as:
    * The sum of all init Containers' requests/limits plus the
      sum of all app Containers' requests/limits for a resource

It does this by not proactively removing the CPUs given to previous init
containers when new containers are being created. In the worst case,
this causes the CPUManager to give non-overlapping CPUs to all
containers (whether init or app) in the "Guaranteed" QoS tier before any
of the containers in the Pod actually start.

This effectively blocks these Pods from running if the total number of
CPUs requested across init and app Containers exceeds the number of CPUs
available for exclusive assignment on the node. For example, a Pod whose
single init Container and single app Container each request 10 CPUs has
an effective request of only 10 CPUs, yet on a node with 16 CPUs
available for exclusive use the CPUManager tries to set aside 20, so the
app Container can never start.

This patch fixes this problem by updating the CPUManager static policy
so that it proactively removes any guaranteed CPUs it has granted to
init Containers before allocating CPUs to app Containers. Since init
Containers always run sequentially, it also performs this proactive
removal for earlier init Containers when allocating CPUs to later ones.
Kevin Klues, 2019-06-05 14:04:34 -07:00
parent 78915a1094, commit c6d9bbcb74
3 changed files with 110 additions and 4 deletions

@@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"container_map.go",
"cpu_assignment.go",
"cpu_manager.go",
"fake_cpu_manager.go",

@@ -0,0 +1,68 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"k8s.io/api/core/v1"
)
// containerMap maps (podUID, containerName) -> containerID
type containerMap map[string]map[string]string
func newContainerMap() containerMap {
return make(containerMap)
}
func (cm containerMap) Add(p *v1.Pod, c *v1.Container, containerID string) {
podUID := string(p.UID)
if _, exists := cm[podUID]; !exists {
cm[podUID] = make(map[string]string)
}
cm[podUID][c.Name] = containerID
}
func (cm containerMap) Remove(containerID string) {
found := false
for podUID := range cm {
for containerName := range cm[podUID] {
if containerID == cm[podUID][containerName] {
delete(cm[podUID], containerName)
found = true
break
}
}
if len(cm[podUID]) == 0 {
delete(cm, podUID)
}
if found {
break
}
}
}
func (cm containerMap) Get(p *v1.Pod, c *v1.Container) (string, error) {
podUID := string(p.UID)
if _, exists := cm[podUID]; !exists {
return "", fmt.Errorf("pod %s not in containerMap", podUID)
}
if _, exists := cm[podUID][c.Name]; !exists {
return "", fmt.Errorf("container %s not in containerMap for pod %s", c.Name, podUID)
}
return cm[podUID][c.Name], nil
}
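
For illustration, a brief usage sketch of this helper (not part of the
patch; the pod UID, container name, and container ID are hypothetical):

    import (
        v1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    func exampleContainerMapUsage() {
        // Hypothetical pod, container, and runtime container ID.
        pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "pod-uid-1"}}
        container := &v1.Container{Name: "app"}

        cm := newContainerMap()
        cm.Add(pod, container, "container-id-1")

        if id, err := cm.Get(pod, container); err == nil {
            _ = id // "container-id-1"
        }

        cm.Remove("container-id-1") // also prunes the now-empty pod entry
    }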

@@ -73,6 +73,10 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
// containerMap provides a mapping from
// (pod, container) -> containerID
// for all containers in a pod
containerMap containerMap
}
// Ensure staticPolicy implements Policy interface
@@ -97,8 +101,9 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int) Policy
klog.Infof("[cpumanager] reserved %d CPUs (\"%s\") not available for exclusive assignment", reserved.Size(), reserved)
return &staticPolicy{
- topology: topology,
- reserved: reserved,
+ topology:     topology,
+ reserved:     reserved,
+ containerMap: newContainerMap(),
}
}
@@ -172,7 +177,15 @@ func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reserved)
}
- func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
+ func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) (rerr error) {
// So long as this function does not return an error,
// add (pod, container, containerID) to the containerMap.
defer func() {
if rerr == nil {
p.containerMap.Add(pod, container, containerID)
}
}()
if numCPUs := guaranteedCPUs(pod, container); numCPUs != 0 {
klog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
// container belongs in an exclusively allocated pool
@@ -182,6 +195,22 @@ func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Co
return nil
}
// Proactively remove CPUs from init containers that have already run.
// They are guaranteed to have run to completion before any other
// container is run.
for _, initContainer := range pod.Spec.InitContainers {
if container.Name != initContainer.Name {
initContainerID, err := p.containerMap.Get(pod, &initContainer)
if err != nil {
continue
}
err = p.RemoveContainer(s, initContainerID)
if err != nil {
klog.Warningf("[cpumanager] unable to remove init container (container id: %s, error: %v)", initContainerID, err)
}
}
}
cpuset, err := p.allocateCPUs(s, numCPUs)
if err != nil {
klog.Errorf("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err)
@@ -193,7 +222,15 @@ func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Co
return nil
}
- func (p *staticPolicy) RemoveContainer(s state.State, containerID string) error {
+ func (p *staticPolicy) RemoveContainer(s state.State, containerID string) (rerr error) {
// So long as this function does not return an error,
// remove containerID from the containerMap.
defer func() {
if rerr == nil {
p.containerMap.Remove(containerID)
}
}()
klog.Infof("[cpumanager] static policy: RemoveContainer (container id: %s)", containerID)
if toRelease, ok := s.GetCPUSet(containerID); ok {
s.Delete(containerID)