diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 59898d3a81e..7d188f5a421 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/cm:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/pod:go_default_library", "//pkg/kubelet/qos:go_default_library", "//pkg/kubelet/server/stats:go_default_library", "//pkg/kubelet/util/format:go_default_library", @@ -55,6 +56,7 @@ go_test( "//pkg/client/record:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/quota:go_default_library", "//pkg/types:go_default_library", "//pkg/util/clock:go_default_library", diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index d1a28cfa86d..1ad80b41488 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -28,6 +28,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" "k8s.io/kubernetes/pkg/kubelet/util/format" @@ -108,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd // the node has memory pressure, admit if not best-effort if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) { notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod) - if notBestEffort { + if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) { return lifecycle.PodAdmitResult{Admit: true} } } diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index a54baaa318f..1d170c3f49c 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ 
b/pkg/kubelet/eviction/eviction_manager_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" "k8s.io/kubernetes/pkg/util/clock" ) @@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) { // create a best effort pod to test admission bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi") + criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") + criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""} // synchronize manager.synchronize(diskInfoProvider, activePodsFunc) @@ -220,8 +223,8 @@ func TestMemoryPressure(t *testing.T) { } // try to admit our pods (they should succeed) - expected := []bool{true, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected := []bool{true, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -296,9 +299,10 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod with incorrect grace period. 
Expected: %d, actual: %d", 0, observedGracePeriod) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod without critical annotation should not admit, + // burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -320,9 +324,9 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod should not admit, burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -345,8 +349,8 @@ func TestMemoryPressure(t *testing.T) { } // all pods should admit now - expected = []bool{true, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected = []bool{true, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } diff --git a/pkg/kubelet/pod/pod_manager.go 
b/pkg/kubelet/pod/pod_manager.go index 800887e23e2..d923af9f86b 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -21,6 +21,7 @@ import ( "k8s.io/kubernetes/pkg/api/v1" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" ) @@ -306,3 +307,11 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *v1.Pod) (*v1.Pod, bool) { pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)] return pod, ok } + +// IsCriticalPod returns true if the pod bears the critical pod annotation +// key. Both the rescheduler and the kubelet use this key to make admission +// and scheduling decisions. +func IsCriticalPod(pod *v1.Pod) bool { + _, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey] + return ok +} diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index e560a9b91f6..ef1ea7ff9c0 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror" const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen" const ConfigHashAnnotationKey = "kubernetes.io/config.hash" +// This key needs to sync with the key used by the rescheduler, which currently +// lives in contrib. Its presence indicates 2 things, as far as the kubelet is +// concerned: +// 1. Resource related admission checks will prioritize the admission of +// pods bearing the key, over pods without the key, regardless of QoS. +// 2. The OOM score of pods bearing the key will be <= pods without +// the key (where the <= part is determined by QoS). +const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod" + // PodOperation defines what changes will be made on a pod configuration. type PodOperation int