Update kubelet to use per-pod QoS policy.

Signed-off-by: Vishnu kannan <vishnuk@google.com>
This commit is contained in:
Vishnu kannan 2016-05-03 17:30:09 -07:00
parent f48c83600c
commit f884180deb
5 changed files with 228 additions and 165 deletions

View File

@ -1412,7 +1412,7 @@ func containerAndPodFromLabels(inspect *dockertypes.ContainerJSON) (pod *api.Pod
return
}
func (dm *DockerManager) applyOOMScoreAdj(container *api.Container, containerInfo *dockertypes.ContainerJSON) error {
func (dm *DockerManager) applyOOMScoreAdj(pod *api.Pod, container *api.Container, containerInfo *dockertypes.ContainerJSON) error {
if containerInfo.State.Pid == 0 {
// Container exited. We cannot do anything about it. Ignore this error.
glog.V(2).Infof("Failed to apply OOM score adj on container %q with ID %q. Init process does not exist.", containerInfo.Name, containerInfo.ID)
@ -1428,7 +1428,7 @@ func (dm *DockerManager) applyOOMScoreAdj(container *api.Container, containerInf
}
return err
}
oomScoreAdj := dm.calculateOomScoreAdj(container)
oomScoreAdj := dm.calculateOomScoreAdj(pod, container)
if err = dm.oomAdjuster.ApplyOOMScoreAdjContainer(cgroupName, oomScoreAdj, 5); err != nil {
if err == os.ErrNotExist {
// Container exited. We cannot do anything about it. Ignore this error.
@ -1464,7 +1464,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
utsMode = namespaceModeHost
}
oomScoreAdj := dm.calculateOomScoreAdj(container)
oomScoreAdj := dm.calculateOomScoreAdj(pod, container)
id, err := dm.runContainer(pod, container, opts, ref, netMode, ipcMode, utsMode, pidMode, restartCount, oomScoreAdj)
if err != nil {
@ -1503,7 +1503,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
// Check if current docker version is higher than 1.10. Otherwise, we have to apply OOMScoreAdj instead of using docker API.
// TODO: Remove this logic after we stop supporting docker version < 1.10.
if err := dm.applyOOMScoreAdjIfNeeded(container, containerInfo); err != nil {
if err = dm.applyOOMScoreAdjIfNeeded(pod, container, containerInfo); err != nil {
return kubecontainer.ContainerID{}, err
}
@ -1521,7 +1521,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
return id, err
}
func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, containerInfo *dockertypes.ContainerJSON) error {
func (dm *DockerManager) applyOOMScoreAdjIfNeeded(pod *api.Pod, container *api.Container, containerInfo *dockertypes.ContainerJSON) error {
// Compare current API version with expected api version.
result, err := dm.checkDockerAPIVersion(dockerv110APIVersion)
if err != nil {
@ -1529,7 +1529,7 @@ func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, cont
}
// If the current API version is older than the version that supports OOMScoreAdj, use the old way.
if result < 0 {
if err := dm.applyOOMScoreAdj(container, containerInfo); err != nil {
if err := dm.applyOOMScoreAdj(pod, container, containerInfo); err != nil {
return fmt.Errorf("Failed to apply oom-score-adj to container %q- %v", err, containerInfo.Name)
}
}
@ -1537,7 +1537,7 @@ func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, cont
return nil
}
func (dm *DockerManager) calculateOomScoreAdj(container *api.Container) int {
func (dm *DockerManager) calculateOomScoreAdj(pod *api.Pod, container *api.Container) int {
// Set OOM score of the container based on the priority of the container.
// Processes in lower-priority pods should be killed first if the system runs out of memory.
// The main pod infrastructure container is considered high priority, since if it is killed the
@ -1546,7 +1546,7 @@ func (dm *DockerManager) calculateOomScoreAdj(container *api.Container) int {
if container.Name == PodInfraContainerName {
oomScoreAdj = qos.PodInfraOOMAdj
} else {
oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, int64(dm.machineInfo.MemoryCapacity))
oomScoreAdj = qos.GetContainerOOMScoreAdjust(pod, container, int64(dm.machineInfo.MemoryCapacity))
}

View File

@ -18,53 +18,32 @@ package qos
import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/kubelet/qos/util"
)
const (
PodInfraOOMAdj int = -999
KubeletOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
guaranteedOOMScoreAdj int = -998
besteffortOOMScoreAdj int = 1000
)
// isBestEffort returns true if the container's resource requirements are best-effort.
func isBestEffort(container *api.Container) bool {
// A container is best-effort if any of its resource requests is unspecified or 0.
if container.Resources.Requests.Memory().Value() == 0 ||
container.Resources.Requests.Cpu().Value() == 0 {
return true
}
return false
}
// isGuaranteed returns true if the container's resource requirements are Guaranteed.
func isGuaranteed(container *api.Container) bool {
// A container is guaranteed if all its request == limit.
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
cpuRequest := container.Resources.Requests.Cpu().Value()
cpuLimit := container.Resources.Limits.Cpu().Value()
if memoryRequest != 0 &&
cpuRequest != 0 &&
cpuRequest == cpuLimit &&
memoryRequest == memoryLimit {
return true
}
return false
}
// GetContainerOOMAdjust returns the amount by which the OOM score of all processes in the
// container should be adjusted. The OOM score of a process is the percentage of memory it consumes
// container should be adjusted.
// The OOM score of a process is the percentage of memory it consumes
// multiplied by 10 (barring exceptional cases) + a configurable quantity which is between -1000
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
func GetContainerOOMScoreAdjust(container *api.Container, memoryCapacity int64) int {
if isGuaranteed(container) {
func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int {
switch util.GetPodQos(pod) {
case util.Guaranteed:
// Guaranteed containers should be the last to get killed.
return -999
} else if isBestEffort(container) {
// Best-effort containers should be the first to be killed.
return 1000
} else {
return guaranteedOOMScoreAdj
case util.BestEffort:
return besteffortOOMScoreAdj
}
// Burstable containers are a middle tier, between Guaranteed and Best-Effort. Ideally,
// we want to protect Burstable containers that consume less memory than requested.
// The formula below is a heuristic. A container requesting for 10% of a system's
@ -75,11 +54,14 @@ func GetContainerOOMScoreAdjust(container *api.Container, memoryCapacity int64)
// Note that this is a heuristic, it won't work if a container has many small processes.
memoryRequest := container.Resources.Requests.Memory().Value()
oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
// A guaranteed container using 100% of memory can have an OOM score of 1. Ensure
// that burstable containers have a higher OOM score.
// A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure
// that burstable pods have a higher OOM score adjustment.
if oomScoreAdjust < 2 {
return 2
}
// Give burstable pods a higher chance of survival over besteffort pods.
if int(oomScoreAdjust) == besteffortOOMScoreAdj {
return int(oomScoreAdjust - 1)
}
return int(oomScoreAdjust)
}
}

View File

@ -29,15 +29,24 @@ const (
)
var (
zeroRequestBestEffort = api.Container{
cpuLimit = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("10"),
},
},
},
},
},
}
edgeBestEffort = api.Container{
memoryLimitCPURequest = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("0"),
@ -46,19 +55,39 @@ var (
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
},
},
},
},
},
}
noRequestBestEffort = api.Container{
zeroMemoryLimit = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse("0"),
},
},
},
},
},
}
noLimitBestEffort = api.Container{}
noRequestLimit = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{},
},
},
},
}
guaranteed = api.Container{
equalRequestLimitCPUMemory = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
@ -69,9 +98,15 @@ var (
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
},
},
},
},
},
}
burstable = api.Container{
cpuUnlimitedMemoryLimitedWithRequests = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount / 2)),
@ -81,50 +116,29 @@ var (
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
},
},
},
},
},
}
burstableNoLimit = api.Container{
requestNoLimit = api.Pod{
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
api.ResourceName(api.ResourceCPU): resource.MustParse("5m"),
},
},
},
},
},
}
)
func TestIsBestEffort(t *testing.T) {
validCases := []api.Container{zeroRequestBestEffort, noRequestBestEffort, noLimitBestEffort, edgeBestEffort}
for _, container := range validCases {
if !isBestEffort(&container) {
t.Errorf("container %+v is best-effort", container)
}
}
invalidCases := []api.Container{guaranteed, burstable}
for _, container := range invalidCases {
if isBestEffort(&container) {
t.Errorf("container %+v is not best-effort", container)
}
}
}
func TestIsGuaranteed(t *testing.T) {
validCases := []api.Container{guaranteed}
for _, container := range validCases {
if !isGuaranteed(&container) {
t.Errorf("container %+v is guaranteed", container)
}
}
invalidCases := []api.Container{zeroRequestBestEffort, noRequestBestEffort, noLimitBestEffort, edgeBestEffort, burstable}
for _, container := range invalidCases {
if isGuaranteed(&container) {
t.Errorf("container %+v is not guaranteed", container)
}
}
}
type oomTest struct {
container *api.Container
pod *api.Pod
memoryCapacity int64
lowOOMScoreAdj int // The max oom_score_adj score the container should be assigned.
highOOMScoreAdj int // The min oom_score_adj score the container should be assigned.
@ -133,50 +147,50 @@ type oomTest struct {
func TestGetContainerOOMScoreAdjust(t *testing.T) {
oomTests := []oomTest{
{
container: &zeroRequestBestEffort,
pod: &cpuLimit,
memoryCapacity: 4000000000,
lowOOMScoreAdj: 1000,
highOOMScoreAdj: 1000,
lowOOMScoreAdj: 999,
highOOMScoreAdj: 999,
},
{
container: &edgeBestEffort,
pod: &memoryLimitCPURequest,
memoryCapacity: 8000000000,
lowOOMScoreAdj: 1000,
highOOMScoreAdj: 1000,
lowOOMScoreAdj: 999,
highOOMScoreAdj: 999,
},
{
container: &noRequestBestEffort,
pod: &zeroMemoryLimit,
memoryCapacity: 7230457451,
lowOOMScoreAdj: 1000,
highOOMScoreAdj: 1000,
},
{
container: &noLimitBestEffort,
pod: &noRequestLimit,
memoryCapacity: 4000000000,
lowOOMScoreAdj: 1000,
highOOMScoreAdj: 1000,
},
{
container: &guaranteed,
pod: &equalRequestLimitCPUMemory,
memoryCapacity: 123456789,
lowOOMScoreAdj: -999,
highOOMScoreAdj: -999,
lowOOMScoreAdj: -998,
highOOMScoreAdj: -998,
},
{
container: &burstable,
pod: &cpuUnlimitedMemoryLimitedWithRequests,
memoryCapacity: standardMemoryAmount,
lowOOMScoreAdj: 495,
highOOMScoreAdj: 505,
},
{
container: &burstableNoLimit,
pod: &requestNoLimit,
memoryCapacity: standardMemoryAmount,
lowOOMScoreAdj: 2,
highOOMScoreAdj: 2,
},
}
for _, test := range oomTests {
oomScoreAdj := GetContainerOOMScoreAdjust(test.container, test.memoryCapacity)
oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
if oomScoreAdj < test.lowOOMScoreAdj || oomScoreAdj > test.highOOMScoreAdj {
t.Errorf("oom_score_adj should be between %d and %d, but was %d", test.lowOOMScoreAdj, test.highOOMScoreAdj, oomScoreAdj)
}

View File

@ -18,7 +18,7 @@ package util
import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/util/sets"
"k8s.io/kubernetes/pkg/api/resource"
)
const (
@ -48,23 +48,62 @@ func isResourceBestEffort(container *api.Container, resource api.ResourceName) b
}
// GetPodQos returns the QoS class of a pod.
// The QoS class of a pod is the lowest QoS class for each resource in each container.
// A pod is besteffort if none of its containers have specified any requests or limits.
// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
// A pod is burstable if limits and requests do not match across all containers.
func GetPodQos(pod *api.Pod) string {
qosValues := sets.NewString()
requests := api.ResourceList{}
limits := api.ResourceList{}
zeroQuantity := resource.MustParse("0")
isGuaranteed := true
for _, container := range pod.Spec.Containers {
qosPerResource := GetQoS(&container)
for _, qosValue := range qosPerResource {
qosValues.Insert(qosValue)
// process requests
for name, quantity := range container.Resources.Requests {
if quantity.Cmp(zeroQuantity) == 1 {
delta := quantity.Copy()
if _, exists := requests[name]; !exists {
requests[name] = *delta
} else {
delta.Add(requests[name])
requests[name] = *delta
}
}
if qosValues.Has(BestEffort) {
}
// process limits
for name, quantity := range container.Resources.Limits {
if quantity.Cmp(zeroQuantity) == 1 {
delta := quantity.Copy()
if _, exists := limits[name]; !exists {
limits[name] = *delta
} else {
delta.Add(limits[name])
limits[name] = *delta
}
}
}
if len(container.Resources.Limits) != len(supportedComputeResources) {
isGuaranteed = false
}
}
if len(requests) == 0 && len(limits) == 0 {
return BestEffort
}
if qosValues.Has(Burstable) {
return Burstable
// Check if requests match limits for all resources.
if isGuaranteed {
for name, req := range requests {
if lim, exists := limits[name]; !exists || lim.Cmp(req) != 0 {
isGuaranteed = false
break
}
}
}
if isGuaranteed &&
len(requests) == len(limits) &&
len(limits) == len(supportedComputeResources) {
return Guaranteed
}
return Burstable
}
// GetQos returns a mapping of resource name to QoS class of a container
func GetQoS(container *api.Container) map[api.ResourceName]string {

View File

@ -64,24 +64,58 @@ func TestGetPodQos(t *testing.T) {
pod *api.Pod
expected string
}{
{
pod: newPod("guaranteed", []api.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
}),
expected: Guaranteed,
},
{
pod: newPod("guaranteed-guaranteed", []api.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
}),
expected: Guaranteed,
},
{
pod: newPod("best-effort-best-effort", []api.Container{
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
}),
expected: BestEffort,
},
{
pod: newPod("best-effort", []api.Container{
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
}),
expected: BestEffort,
},
{
pod: newPod("best-effort-burstable", []api.Container{
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
newContainer("burstable", getResourceList("1", ""), getResourceList("2", "")),
}),
expected: Burstable,
},
{
pod: newPod("best-effort-guaranteed", []api.Container{
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
newContainer("guaranteed", getResourceList("10m", "100Mi"), getResourceList("10m", "100Mi")),
}),
expected: BestEffort,
expected: Burstable,
},
{
pod: newPod("best-effort-cpu-guaranteed-memory", []api.Container{
newContainer("best-effort", getResourceList("", "100Mi"), getResourceList("", "100Mi")),
pod: newPod("burstable-cpu-guaranteed-memory", []api.Container{
newContainer("burstable", getResourceList("", "100Mi"), getResourceList("", "100Mi")),
}),
expected: BestEffort,
expected: Burstable,
},
{
pod: newPod("burstable-guaranteed", []api.Container{
newContainer("burstable", getResourceList("1", "100Mi"), getResourceList("2", "100Mi")),
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
}),
expected: Burstable,
},
{
pod: newPod("burstable", []api.Container{
@ -89,12 +123,6 @@ func TestGetPodQos(t *testing.T) {
}),
expected: Burstable,
},
{
pod: newPod("guaranteed", []api.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
}),
expected: Guaranteed,
},
}
for _, testCase := range testCases {
if actual := GetPodQos(testCase.pod); testCase.expected != actual {