From afd676d94c614fc7336737bad1880784e0228c08 Mon Sep 17 00:00:00 2001 From: bprashanth Date: Thu, 15 Dec 2016 12:25:07 -0800 Subject: [PATCH 1/3] Kubelet admits critical pods even under memory pressure --- pkg/kubelet/eviction/BUILD | 2 ++ pkg/kubelet/eviction/eviction_manager.go | 3 ++- pkg/kubelet/eviction/eviction_manager_test.go | 24 +++++++++++-------- pkg/kubelet/pod/pod_manager.go | 9 +++++++ pkg/kubelet/types/pod_update.go | 9 +++++++ 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 59898d3a81e..7d188f5a421 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/cm:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/pod:go_default_library", "//pkg/kubelet/qos:go_default_library", "//pkg/kubelet/server/stats:go_default_library", "//pkg/kubelet/util/format:go_default_library", @@ -55,6 +56,7 @@ go_test( "//pkg/client/record:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/quota:go_default_library", "//pkg/types:go_default_library", "//pkg/util/clock:go_default_library", diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index d1a28cfa86d..1ad80b41488 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -28,6 +28,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" "k8s.io/kubernetes/pkg/kubelet/util/format" @@ -108,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd // the node has memory pressure, admit if not best-effort if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) { notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod) - if notBestEffort { + if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) { return lifecycle.PodAdmitResult{Admit: true} } } diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index a54baaa318f..1d170c3f49c 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" "k8s.io/kubernetes/pkg/util/clock" ) @@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) { // create a best effort pod to test admission bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi") + criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") + criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""} // synchronize manager.synchronize(diskInfoProvider, activePodsFunc) @@ -220,8 +223,8 @@ func TestMemoryPressure(t *testing.T) { } // try to admit our 
pods (they should succeed) - expected := []bool{true, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected := []bool{true, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -296,9 +299,10 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod without critical annotation should not admit, + // burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -320,9 +324,9 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod should not admit, burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -345,8 +349,8 @@ func TestMemoryPressure(t *testing.T) { } // all pods should admit now - expected = []bool{true, true} - for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected = []bool{true, true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 800887e23e2..d923af9f86b 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -21,6 +21,7 @@ import ( "k8s.io/kubernetes/pkg/api/v1" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" ) @@ -306,3 +307,11 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *v1.Pod) (*v1.Pod, bool) { pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)] return pod, ok } + +// IsCriticalPod returns true if the pod bears the critical pod annotation +// key. Both the rescheduler and the kubelet use this key to make admission +// and scheduling decisions. 
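+// The check is purely presence-based: any value for the annotation key marks
+// the pod as critical (the kube-proxy manifest below sets it to '').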
+func IsCriticalPod(pod *v1.Pod) bool {
+	_, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey]
+	return ok
+}
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index e560a9b91f6..ef1ea7ff9c0 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
 const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
 const ConfigHashAnnotationKey = "kubernetes.io/config.hash"
 
+// This key needs to stay in sync with the key used by the rescheduler, which
+// currently lives in contrib. Its presence indicates two things, as far as
+// the kubelet is concerned:
+// 1. Resource-related admission checks will prioritize the admission of
+//    pods bearing the key over pods without the key, regardless of QoS.
+// 2. The OOM score of pods bearing the key will be <= that of pods without
+//    the key (where the <= part is determined by QoS).
+const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
+
 // PodOperation defines what changes will be made on a pod configuration.
 type PodOperation int
 

From 4fff49bb937df63a410c32a8fea2c2057506f365 Mon Sep 17 00:00:00 2001
From: bprashanth
Date: Thu, 15 Dec 2016 12:28:31 -0800
Subject: [PATCH 2/3] Make kube-proxy a critical pod

---
 cluster/saltbase/salt/kube-proxy/kube-proxy.manifest | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
index 573b27f2816..9c19c999e06 100644
--- a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
+++ b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
@@ -38,6 +38,14 @@ kind: Pod
 metadata:
   name: kube-proxy
   namespace: kube-system
+  # This annotation lowers the chance that kube-proxy is evicted when the
+  # node is under memory pressure, and prioritizes it for admission under
+  # that same memory pressure.
+  # Note that kube-proxy runs as a static pod, so this annotation does NOT
+  # have any effect on the rescheduler (neither the default scheduler nor
+  # the rescheduler is involved in scheduling kube-proxy).
+  annotations:
+    scheduler.alpha.kubernetes.io/critical-pod: ''
   labels:
     tier: node
     component: kube-proxy

From b7409e00380b349ec971d089363ad7ff22b37c76 Mon Sep 17 00:00:00 2001
From: bprashanth
Date: Thu, 15 Dec 2016 13:27:49 -0800
Subject: [PATCH 3/3] Sort critical pods before admission

---
 pkg/kubelet/kubelet.go      | 17 ++++++++--
 pkg/kubelet/kubelet_test.go | 62 +++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index ff607a52b46..06e48bfd12d 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1909,8 +1909,21 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) {
 // a config source.
 func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 	start := kl.clock.Now()
-	sort.Sort(sliceutils.PodsByCreationTime(pods))
-	for _, pod := range pods {
+
+	// Pass critical pods through the admission check first.
+	var criticalPods []*v1.Pod
+	var nonCriticalPods []*v1.Pod
+	for _, p := range pods {
+		if kubepod.IsCriticalPod(p) {
+			criticalPods = append(criticalPods, p)
+		} else {
+			nonCriticalPods = append(nonCriticalPods, p)
+		}
+	}
+	sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
+	sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))
+
+	for _, pod := range append(criticalPods, nonCriticalPods...) {
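+		// criticalPods and nonCriticalPods are each sorted by creation time
+		// above, so older pods within each group are admitted first.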
 		existingPods := kl.podManager.GetPods()
 		// Always add the pod to the pod manager. Kubelet relies on the pod
 		// manager as the source of truth for the desired state. If a pod does
diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index 85b9e95cfa1..351dd69b681 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -461,6 +461,68 @@ func TestHandlePortConflicts(t *testing.T) {
 	require.Equal(t, v1.PodPending, status.Phase)
 }
 
+// Tests that we sort pods based on criticality.
+func TestCriticalPrioritySorting(t *testing.T) {
+	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
+	kl := testKubelet.kubelet
+	nodes := []v1.Node{
+		{ObjectMeta: v1.ObjectMeta{Name: testKubeletHostname},
+			Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{
+				v1.ResourceCPU:    *resource.NewMilliQuantity(10, resource.DecimalSI),
+				v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
+				v1.ResourcePods:   *resource.NewQuantity(40, resource.DecimalSI),
+			}}},
+	}
+	kl.nodeLister = testNodeLister{nodes: nodes}
+	kl.nodeInfo = testNodeInfo{nodes: nodes}
+	testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
+	testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
+	testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
+
+	spec := v1.PodSpec{NodeName: string(kl.nodeName),
+		Containers: []v1.Container{{Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"memory": resource.MustParse("90"),
+			},
+		}}}}
+	pods := []*v1.Pod{
+		podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
+		podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
+		podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
+	}
+
+	// The pod list is deliberately not in creation-time order.
+	startTime := time.Now()
+	pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second))
+	pods[1].CreationTimestamp = metav1.NewTime(startTime)
+	pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second))
+
+	// Make the middle and the new pod critical; the middle pod should win
+	// even though it comes later in the list, because it was created earlier.
+	critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
+	pods[0].Annotations = critical
+	pods[1].Annotations = map[string]string{}
+	pods[2].Annotations = critical
+
+	// The non-critical pod and the newer critical pod should be rejected.
+	notfittingPods := []*v1.Pod{pods[0], pods[1]}
+	fittingPod := pods[2]
+
+	kl.HandlePodAdditions(pods)
+	// Check pod status stored in the status map.
+	// notfittingPods should be Failed.
+	for _, p := range notfittingPods {
+		status, found := kl.statusManager.GetPodStatus(p.UID)
+		require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
+		require.Equal(t, v1.PodFailed, status.Phase)
+	}
+
+	// fittingPod should be Pending.
+	status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
+	require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
+	require.Equal(t, v1.PodPending, status.Phase)
+}
+
 // Tests that we handle host name conflicts correctly by setting the failed status in status map.
 func TestHandleHostNameConflicts(t *testing.T) {
 	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
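
For reference, below is a minimal, self-contained sketch of the admission ordering that HandlePodAdditions applies in PATCH 3/3; it is not part of the patch series. The fakePod type, the admissionOrder helper, and the use of sort.Slice are illustrative stand-ins for *v1.Pod and sliceutils.PodsByCreationTime; the pod names and timestamps mirror TestCriticalPrioritySorting.

package main

import (
	"fmt"
	"sort"
	"time"
)

// fakePod is a stand-in for *v1.Pod with only the fields the ordering needs.
type fakePod struct {
	name        string
	annotations map[string]string
	created     time.Time
}

const criticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// isCritical mirrors kubepod.IsCriticalPod: presence of the key is enough.
func isCritical(p fakePod) bool {
	_, ok := p.annotations[criticalPodAnnotationKey]
	return ok
}

// admissionOrder partitions pods into critical and non-critical groups,
// sorts each group by creation time, and returns critical pods first,
// which is the order HandlePodAdditions walks pods before admission.
func admissionOrder(pods []fakePod) []fakePod {
	var critical, regular []fakePod
	for _, p := range pods {
		if isCritical(p) {
			critical = append(critical, p)
		} else {
			regular = append(regular, p)
		}
	}
	byCreation := func(s []fakePod) {
		sort.Slice(s, func(i, j int) bool { return s[i].created.Before(s[j].created) })
	}
	byCreation(critical)
	byCreation(regular)
	return append(critical, regular...)
}

func main() {
	t0 := time.Now()
	critical := map[string]string{criticalPodAnnotationKey: ""}
	pods := []fakePod{
		{name: "newpod", annotations: critical, created: t0.Add(10 * time.Second)},
		{name: "oldpod", created: t0},
		{name: "middlepod", annotations: critical, created: t0.Add(1 * time.Second)},
	}
	for _, p := range admissionOrder(pods) {
		fmt.Println(p.name) // prints middlepod, newpod, oldpod
	}
}

With the 100-unit memory capacity and 90-unit requests used by the test, only the first pod in this order (middlepod) fits; the newer critical pod and the non-critical pod are rejected, which is exactly what TestCriticalPrioritySorting asserts.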