diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD
index 7d188f5a421..159d5ea4e9d 100644
--- a/pkg/kubelet/eviction/BUILD
+++ b/pkg/kubelet/eviction/BUILD
@@ -28,9 +28,9 @@ go_library(
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/cm:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
         "//pkg/kubelet/qos:go_default_library",
         "//pkg/kubelet/server/stats:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
         "//pkg/kubelet/util/format:go_default_library",
         "//pkg/quota/evaluator/core:go_default_library",
         "//pkg/util/clock:go_default_library",
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index 1ad80b41488..19cf3ce90b7 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -28,9 +28,9 @@ import (
 	"k8s.io/kubernetes/pkg/client/record"
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 	"k8s.io/kubernetes/pkg/util/clock"
 	"k8s.io/kubernetes/pkg/util/wait"
@@ -109,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 	// the node has memory pressure, admit if not best-effort
 	if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
 		notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
-		if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) {
+		if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
 			return lifecycle.PodAdmitResult{Admit: true}
 		}
 	}
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index c0f4a57f506..6994e3c4dbb 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1915,7 +1915,7 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 	var criticalPods []*v1.Pod
 	var nonCriticalPods []*v1.Pod
 	for _, p := range pods {
-		if kubepod.IsCriticalPod(p) {
+		if kubetypes.IsCriticalPod(p) {
 			criticalPods = append(criticalPods, p)
 		} else {
 			nonCriticalPods = append(nonCriticalPods, p)
diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go
index d923af9f86b..800887e23e2 100644
--- a/pkg/kubelet/pod/pod_manager.go
+++ b/pkg/kubelet/pod/pod_manager.go
@@ -21,7 +21,6 @@ import (
 
 	"k8s.io/kubernetes/pkg/api/v1"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/types"
 )
 
@@ -307,11 +306,3 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *v1.Pod) (*v1.Pod, bool) {
 	pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)]
 	return pod, ok
 }
-
-// IsCriticalPod returns true if the pod bears the critical pod annotation
-// key. Both the rescheduler and the kubelet use this key to make admission
-// and scheduling decisions.
-func IsCriticalPod(pod *v1.Pod) bool {
-	_, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey]
-	return ok
-}
diff --git a/pkg/kubelet/qos/BUILD b/pkg/kubelet/qos/BUILD
index f0f5aee2f2f..3f2e2c3e832 100644
--- a/pkg/kubelet/qos/BUILD
+++ b/pkg/kubelet/qos/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/api:go_default_library",
         "//pkg/api/resource:go_default_library",
        "//pkg/api/v1:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
         "//pkg/util/sets:go_default_library",
     ],
 )
@@ -36,5 +37,6 @@ go_test(
     deps = [
         "//pkg/api/resource:go_default_library",
         "//pkg/api/v1:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
     ],
 )
diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go
index 075bcc85439..e06a660b491 100644
--- a/pkg/kubelet/qos/policy.go
+++ b/pkg/kubelet/qos/policy.go
@@ -16,14 +16,20 @@ limitations under the License.
 
 package qos
 
-import "k8s.io/kubernetes/pkg/api/v1"
+import (
+	"k8s.io/kubernetes/pkg/api/v1"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+)
 
 const (
 	// PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
 	// sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
 	// without a process.
 	// TODO: Handle infra container oom score adj in a runtime agnostic way.
+	// TODO: Should handle critical pod oom score adj with a proper preemption priority.
+	// This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
 	PodInfraOOMAdj        int = -998
+	CriticalPodOOMAdj     int = -998
 	KubeletOOMScoreAdj    int = -999
 	DockerOOMScoreAdj     int = -999
 	KubeProxyOOMScoreAdj  int = -999
@@ -38,6 +44,10 @@ const (
 // and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
 // See https://lwn.net/Articles/391222/ for more information.
 func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
+	if kubetypes.IsCriticalPod(pod) {
+		return CriticalPodOOMAdj
+	}
+
 	switch GetPodQOS(pod) {
 	case Guaranteed:
 		// Guaranteed containers should be the last to get killed.
diff --git a/pkg/kubelet/qos/policy_test.go b/pkg/kubelet/qos/policy_test.go
index ea35de56527..e878d778551 100644
--- a/pkg/kubelet/qos/policy_test.go
+++ b/pkg/kubelet/qos/policy_test.go
@@ -22,6 +22,7 @@ import (
 
 	"k8s.io/kubernetes/pkg/api/resource"
 	"k8s.io/kubernetes/pkg/api/v1"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
 
 const (
@@ -135,6 +136,25 @@ var (
 			},
 		},
 	}
+	criticalPodWithNoLimit = v1.Pod{
+		ObjectMeta: v1.ObjectMeta{
+			Annotations: map[string]string{
+				kubetypes.CriticalPodAnnotationKey: "",
+			},
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{
+					Resources: v1.ResourceRequirements{
+						Requests: v1.ResourceList{
+							v1.ResourceName(v1.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
+							v1.ResourceName(v1.ResourceCPU):    resource.MustParse("5m"),
+						},
+					},
+				},
+			},
+		},
+	}
 )
 
 type oomTest struct {
 	pod             *v1.Pod
 	memoryCapacity  int64
 	lowOOMScoreAdj  int
 	highOOMScoreAdj int
 }
@@ -188,6 +208,12 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
 			lowOOMScoreAdj:   2,
 			highOOMScoreAdj:  2,
 		},
+		{
+			pod:              &criticalPodWithNoLimit,
+			memoryCapacity:   standardMemoryAmount,
+			lowOOMScoreAdj:   -998,
+			highOOMScoreAdj:  -998,
+		},
 	}
 	for _, test := range oomTests {
 		oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index ef1ea7ff9c0..2d8f20a5179 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -140,3 +140,11 @@ func (sp SyncPodType) String() string {
 		return "unknown"
 	}
 }
+
+// IsCriticalPod returns true if the pod bears the critical pod annotation
+// key. Both the rescheduler and the kubelet use this key to make admission
+// and scheduling decisions.
+func IsCriticalPod(pod *v1.Pod) bool {
+	_, ok := pod.Annotations[CriticalPodAnnotationKey]
+	return ok
+}