diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index 371f9a50e0c..f2082144a06 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -1412,7 +1412,7 @@ func containerAndPodFromLabels(inspect *dockertypes.ContainerJSON) (pod *api.Pod return } -func (dm *DockerManager) applyOOMScoreAdj(container *api.Container, containerInfo *dockertypes.ContainerJSON) error { +func (dm *DockerManager) applyOOMScoreAdj(pod *api.Pod, container *api.Container, containerInfo *dockertypes.ContainerJSON) error { if containerInfo.State.Pid == 0 { // Container exited. We cannot do anything about it. Ignore this error. glog.V(2).Infof("Failed to apply OOM score adj on container %q with ID %q. Init process does not exist.", containerInfo.Name, containerInfo.ID) @@ -1428,7 +1428,7 @@ func (dm *DockerManager) applyOOMScoreAdj(container *api.Container, containerInf } return err } - oomScoreAdj := dm.calculateOomScoreAdj(container) + oomScoreAdj := dm.calculateOomScoreAdj(pod, container) if err = dm.oomAdjuster.ApplyOOMScoreAdjContainer(cgroupName, oomScoreAdj, 5); err != nil { if err == os.ErrNotExist { // Container exited. We cannot do anything about it. Ignore this error. @@ -1464,7 +1464,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe utsMode = namespaceModeHost } - oomScoreAdj := dm.calculateOomScoreAdj(container) + oomScoreAdj := dm.calculateOomScoreAdj(pod, container) id, err := dm.runContainer(pod, container, opts, ref, netMode, ipcMode, utsMode, pidMode, restartCount, oomScoreAdj) if err != nil { @@ -1503,7 +1503,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe // Check if current docker version is higher than 1.10. Otherwise, we have to apply OOMScoreAdj instead of using docker API. // TODO: Remove this logic after we stop supporting docker version < 1.10. 
- if err := dm.applyOOMScoreAdjIfNeeded(container, containerInfo); err != nil { + if err = dm.applyOOMScoreAdjIfNeeded(pod, container, containerInfo); err != nil { return kubecontainer.ContainerID{}, err } @@ -1521,7 +1521,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe return id, err } -func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, containerInfo *dockertypes.ContainerJSON) error { +func (dm *DockerManager) applyOOMScoreAdjIfNeeded(pod *api.Pod, container *api.Container, containerInfo *dockertypes.ContainerJSON) error { // Compare current API version with expected api version. result, err := dm.checkDockerAPIVersion(dockerv110APIVersion) if err != nil { @@ -1529,7 +1529,7 @@ func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, cont } // If current api version is older than OOMScoreAdj requested, use the old way. if result < 0 { - if err := dm.applyOOMScoreAdj(container, containerInfo); err != nil { + if err := dm.applyOOMScoreAdj(pod, container, containerInfo); err != nil { return fmt.Errorf("Failed to apply oom-score-adj to container %q- %v", err, containerInfo.Name) } } @@ -1537,7 +1537,7 @@ func (dm *DockerManager) applyOOMScoreAdjIfNeeded(container *api.Container, cont return nil } -func (dm *DockerManager) calculateOomScoreAdj(container *api.Container) int { +func (dm *DockerManager) calculateOomScoreAdj(pod *api.Pod, container *api.Container) int { // Set OOM score of the container based on the priority of the container. // Processes in lower-priority pods should be killed first if the system runs out of memory. 
// The main pod infrastructure container is considered high priority, since if it is killed the @@ -1546,7 +1546,7 @@ func (dm *DockerManager) calculateOomScoreAdj(container *api.Container) int { if container.Name == PodInfraContainerName { oomScoreAdj = qos.PodInfraOOMAdj } else { - oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, int64(dm.machineInfo.MemoryCapacity)) + oomScoreAdj = qos.GetContainerOOMScoreAdjust(pod, container, int64(dm.machineInfo.MemoryCapacity)) } diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index 252b72fca9e..511e629fadf 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -18,68 +18,50 @@ package qos import ( "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/kubelet/qos/util" ) const ( - PodInfraOOMAdj int = -999 - KubeletOOMScoreAdj int = -999 - KubeProxyOOMScoreAdj int = -999 + PodInfraOOMAdj int = -999 + KubeletOOMScoreAdj int = -999 + KubeProxyOOMScoreAdj int = -999 + guaranteedOOMScoreAdj int = -998 + besteffortOOMScoreAdj int = 1000 ) -// isBestEffort returns true if the container's resource requirements are best-effort. -func isBestEffort(container *api.Container) bool { - // A container is best-effort if any of its resource requests is unspecified or 0. - if container.Resources.Requests.Memory().Value() == 0 || - container.Resources.Requests.Cpu().Value() == 0 { - return true - } - return false -} - -// isGuaranteed returns true if the container's resource requirements are Guaranteed. -func isGuaranteed(container *api.Container) bool { - // A container is guaranteed if all its request == limit. 
- memoryRequest := container.Resources.Requests.Memory().Value() - memoryLimit := container.Resources.Limits.Memory().Value() - cpuRequest := container.Resources.Requests.Cpu().Value() - cpuLimit := container.Resources.Limits.Cpu().Value() - if memoryRequest != 0 && - cpuRequest != 0 && - cpuRequest == cpuLimit && - memoryRequest == memoryLimit { - return true - } - return false -} - // GetContainerOOMAdjust returns the amount by which the OOM score of all processes in the -// container should be adjusted. The OOM score of a process is the percentage of memory it consumes +// container should be adjusted. +// The OOM score of a process is the percentage of memory it consumes // multiplied by 10 (barring exceptional cases) + a configurable quantity which is between -1000 // and 1000. Containers with higher OOM scores are killed if the system runs out of memory. // See https://lwn.net/Articles/391222/ for more information. -func GetContainerOOMScoreAdjust(container *api.Container, memoryCapacity int64) int { - if isGuaranteed(container) { +func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int { + switch util.GetPodQos(pod) { + case util.Guaranteed: // Guaranteed containers should be the last to get killed. - return -999 - } else if isBestEffort(container) { - // Best-effort containers should be the first to be killed. - return 1000 - } else { - // Burstable containers are a middle tier, between Guaranteed and Best-Effort. Ideally, - // we want to protect Burstable containers that consume less memory than requested. - // The formula below is a heuristic. A container requesting for 10% of a system's - // memory will have an OOM score adjust of 900. If a process in container Y - // uses over 10% of memory, its OOM score will be 1000. The idea is that containers - // which use more than their request will have an OOM score of 1000 and will be prime - // targets for OOM kills. 
- // Note that this is a heuristic, it won't work if a container has many small processes. - memoryRequest := container.Resources.Requests.Memory().Value() - oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity - // A guaranteed container using 100% of memory can have an OOM score of 1. Ensure - // that burstable containers have a higher OOM score. - if oomScoreAdjust < 2 { - return 2 - } - return int(oomScoreAdjust) + return guaranteedOOMScoreAdj + case util.BestEffort: + return besteffortOOMScoreAdj } + + // Burstable containers are a middle tier, between Guaranteed and Best-Effort. Ideally, + // we want to protect Burstable containers that consume less memory than requested. + // The formula below is a heuristic. A container requesting for 10% of a system's + // memory will have an OOM score adjust of 900. If a process in container Y + // uses over 10% of memory, its OOM score will be 1000. The idea is that containers + // which use more than their request will have an OOM score of 1000 and will be prime + // targets for OOM kills. + // Note that this is a heuristic, it won't work if a container has many small processes. + memoryRequest := container.Resources.Requests.Memory().Value() + oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity + // A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure + // that burstable pods have a higher OOM score adjustment. + if oomScoreAdjust < 2 { + return 2 + } + // Give burstable pods a higher chance of survival over besteffort pods. 
+ if int(oomScoreAdjust) == besteffortOOMScoreAdj { + return int(oomScoreAdjust - 1) + } + return int(oomScoreAdjust) } diff --git a/pkg/kubelet/qos/policy_test.go b/pkg/kubelet/qos/policy_test.go index 27dd96501ab..e66b7b158ad 100644 --- a/pkg/kubelet/qos/policy_test.go +++ b/pkg/kubelet/qos/policy_test.go @@ -29,102 +29,116 @@ const ( ) var ( - zeroRequestBestEffort = api.Container{ - Resources: api.ResourceRequirements{ - Limits: api.ResourceList{ - api.ResourceName(api.ResourceCPU): resource.MustParse("10"), + cpuLimit = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Limits: api.ResourceList{ + api.ResourceName(api.ResourceCPU): resource.MustParse("10"), + }, + }, + }, }, }, } - edgeBestEffort = api.Container{ - Resources: api.ResourceRequirements{ - Requests: api.ResourceList{ - api.ResourceName(api.ResourceCPU): resource.MustParse("0"), - }, - Limits: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + memoryLimitCPURequest = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName(api.ResourceCPU): resource.MustParse("0"), + }, + Limits: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, }, }, } - noRequestBestEffort = api.Container{ - Resources: api.ResourceRequirements{ - Limits: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse("0"), + zeroMemoryLimit = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Limits: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse("0"), + }, + }, + }, }, }, } - noLimitBestEffort = api.Container{} - - guaranteed = api.Container{ - Resources: api.ResourceRequirements{ - Requests: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), - 
api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), - }, - Limits: api.ResourceList{ - api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), - api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + noRequestLimit = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{}, + }, }, }, } - burstable = api.Container{ - Resources: api.ResourceRequirements{ - Requests: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount / 2)), - api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), - }, - Limits: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + equalRequestLimitCPUMemory = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + }, + Limits: api.ResourceList{ + api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, }, }, } - burstableNoLimit = api.Container{ - Resources: api.ResourceRequirements{ - Requests: api.ResourceList{ - api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)), - api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + cpuUnlimitedMemoryLimitedWithRequests = api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount / 2)), + api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + }, + Limits: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse("10G"), + }, + }, + }, + }, + }, + } + + requestNoLimit = api.Pod{ + Spec: api.PodSpec{ + 
Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)), + api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + }, + }, + }, }, }, } ) -func TestIsBestEffort(t *testing.T) { - validCases := []api.Container{zeroRequestBestEffort, noRequestBestEffort, noLimitBestEffort, edgeBestEffort} - for _, container := range validCases { - if !isBestEffort(&container) { - t.Errorf("container %+v is best-effort", container) - } - } - invalidCases := []api.Container{guaranteed, burstable} - for _, container := range invalidCases { - if isBestEffort(&container) { - t.Errorf("container %+v is not best-effort", container) - } - } -} - -func TestIsGuaranteed(t *testing.T) { - validCases := []api.Container{guaranteed} - for _, container := range validCases { - if !isGuaranteed(&container) { - t.Errorf("container %+v is guaranteed", container) - } - } - invalidCases := []api.Container{zeroRequestBestEffort, noRequestBestEffort, noLimitBestEffort, edgeBestEffort, burstable} - for _, container := range invalidCases { - if isGuaranteed(&container) { - t.Errorf("container %+v is not guaranteed", container) - } - } -} - type oomTest struct { - container *api.Container + pod *api.Pod memoryCapacity int64 lowOOMScoreAdj int // The max oom_score_adj score the container should be assigned. highOOMScoreAdj int // The min oom_score_adj score the container should be assigned. 
@@ -133,50 +147,50 @@ type oomTest struct { func TestGetContainerOOMScoreAdjust(t *testing.T) { oomTests := []oomTest{ { - container: &zeroRequestBestEffort, + pod: &cpuLimit, memoryCapacity: 4000000000, - lowOOMScoreAdj: 1000, - highOOMScoreAdj: 1000, + lowOOMScoreAdj: 999, + highOOMScoreAdj: 999, }, { - container: &edgeBestEffort, + pod: &memoryLimitCPURequest, memoryCapacity: 8000000000, - lowOOMScoreAdj: 1000, - highOOMScoreAdj: 1000, + lowOOMScoreAdj: 999, + highOOMScoreAdj: 999, }, { - container: &noRequestBestEffort, + pod: &zeroMemoryLimit, memoryCapacity: 7230457451, lowOOMScoreAdj: 1000, highOOMScoreAdj: 1000, }, { - container: &noLimitBestEffort, + pod: &noRequestLimit, memoryCapacity: 4000000000, lowOOMScoreAdj: 1000, highOOMScoreAdj: 1000, }, { - container: &guaranteed, + pod: &equalRequestLimitCPUMemory, memoryCapacity: 123456789, - lowOOMScoreAdj: -999, - highOOMScoreAdj: -999, + lowOOMScoreAdj: -998, + highOOMScoreAdj: -998, }, { - container: &burstable, + pod: &cpuUnlimitedMemoryLimitedWithRequests, memoryCapacity: standardMemoryAmount, lowOOMScoreAdj: 495, highOOMScoreAdj: 505, }, { - container: &burstableNoLimit, + pod: &requestNoLimit, memoryCapacity: standardMemoryAmount, lowOOMScoreAdj: 2, highOOMScoreAdj: 2, }, } for _, test := range oomTests { - oomScoreAdj := GetContainerOOMScoreAdjust(test.container, test.memoryCapacity) + oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity) if oomScoreAdj < test.lowOOMScoreAdj || oomScoreAdj > test.highOOMScoreAdj { t.Errorf("oom_score_adj should be between %d and %d, but was %d", test.lowOOMScoreAdj, test.highOOMScoreAdj, oomScoreAdj) } diff --git a/pkg/kubelet/qos/util/qos.go b/pkg/kubelet/qos/util/qos.go index 8e1038eda27..7b8264e16c3 100644 --- a/pkg/kubelet/qos/util/qos.go +++ b/pkg/kubelet/qos/util/qos.go @@ -18,7 +18,7 @@ package util import ( "k8s.io/kubernetes/pkg/api" - "k8s.io/kubernetes/pkg/util/sets" + "k8s.io/kubernetes/pkg/api/resource" ) 
const ( @@ -48,22 +48,61 @@ func isResourceBestEffort(container *api.Container, resource api.ResourceName) b } // GetPodQos returns the QoS class of a pod. -// The QoS class of a pod is the lowest QoS class for each resource in each container. +// A pod is besteffort if none of its containers have specified any requests or limits. +// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal. +// A pod is burstable if limits and requests do not match across all containers. func GetPodQos(pod *api.Pod) string { - qosValues := sets.NewString() + requests := api.ResourceList{} + limits := api.ResourceList{} + zeroQuantity := resource.MustParse("0") + isGuaranteed := true for _, container := range pod.Spec.Containers { - qosPerResource := GetQoS(&container) - for _, qosValue := range qosPerResource { - qosValues.Insert(qosValue) + // process requests + for name, quantity := range container.Resources.Requests { + if quantity.Cmp(zeroQuantity) == 1 { + delta := quantity.Copy() + if _, exists := requests[name]; !exists { + requests[name] = *delta + } else { + delta.Add(requests[name]) + requests[name] = *delta + } + } + } + // process limits + for name, quantity := range container.Resources.Limits { + if quantity.Cmp(zeroQuantity) == 1 { + delta := quantity.Copy() + if _, exists := limits[name]; !exists { + limits[name] = *delta + } else { + delta.Add(limits[name]) + limits[name] = *delta + } + } + } + if len(container.Resources.Limits) != len(supportedComputeResources) { + isGuaranteed = false } } - if qosValues.Has(BestEffort) { + if len(requests) == 0 && len(limits) == 0 { return BestEffort } - if qosValues.Has(Burstable) { - return Burstable + // Check if requests match limits for all resources.
+ if isGuaranteed { + for name, req := range requests { + if lim, exists := limits[name]; !exists || lim.Cmp(req) != 0 { + isGuaranteed = false + break + } + } } - return Guaranteed + if isGuaranteed && + len(requests) == len(limits) && + len(limits) == len(supportedComputeResources) { + return Guaranteed + } + return Burstable } // GetQos returns a mapping of resource name to QoS class of a container diff --git a/pkg/kubelet/qos/util/qos_test.go b/pkg/kubelet/qos/util/qos_test.go index c931e204a21..e7a060fec5b 100644 --- a/pkg/kubelet/qos/util/qos_test.go +++ b/pkg/kubelet/qos/util/qos_test.go @@ -64,24 +64,58 @@ func TestGetPodQos(t *testing.T) { pod *api.Pod expected string }{ + { + pod: newPod("guaranteed", []api.Container{ + newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), + }), + expected: Guaranteed, + }, + { + pod: newPod("guaranteed-guaranteed", []api.Container{ + newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), + newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), + }), + expected: Guaranteed, + }, + { + pod: newPod("best-effort-best-effort", []api.Container{ + newContainer("best-effort", getResourceList("", ""), getResourceList("", "")), + newContainer("best-effort", getResourceList("", ""), getResourceList("", "")), + }), + expected: BestEffort, + }, { pod: newPod("best-effort", []api.Container{ newContainer("best-effort", getResourceList("", ""), getResourceList("", "")), }), expected: BestEffort, }, + { + pod: newPod("best-effort-burstable", []api.Container{ + newContainer("best-effort", getResourceList("", ""), getResourceList("", "")), + newContainer("burstable", getResourceList("1", ""), getResourceList("2", "")), + }), + expected: Burstable, + }, { pod: newPod("best-effort-guaranteed", []api.Container{ newContainer("best-effort", getResourceList("", ""), getResourceList("", "")), newContainer("guaranteed", 
getResourceList("10m", "100Mi"), getResourceList("10m", "100Mi")), }), - expected: BestEffort, + expected: Burstable, }, { - pod: newPod("best-effort-cpu-guaranteed-memory", []api.Container{ - newContainer("best-effort", getResourceList("", "100Mi"), getResourceList("", "100Mi")), + pod: newPod("burstable-cpu-guaranteed-memory", []api.Container{ + newContainer("burstable", getResourceList("", "100Mi"), getResourceList("", "100Mi")), }), - expected: BestEffort, + expected: Burstable, + }, + { + pod: newPod("burstable-guaranteed", []api.Container{ + newContainer("burstable", getResourceList("1", "100Mi"), getResourceList("2", "100Mi")), + newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), + }), + expected: Burstable, }, { pod: newPod("burstable", []api.Container{ @@ -89,12 +123,6 @@ func TestGetPodQos(t *testing.T) { }), expected: Burstable, }, - { - pod: newPod("guaranteed", []api.Container{ - newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")), - }), - expected: Guaranteed, - }, } for _, testCase := range testCases { if actual := GetPodQos(testCase.pod); testCase.expected != actual {