diff --git a/pkg/kubelet/types/BUILD b/pkg/kubelet/types/BUILD index 0c94c9df6b6..762979d5baf 100644 --- a/pkg/kubelet/types/BUILD +++ b/pkg/kubelet/types/BUILD @@ -18,6 +18,7 @@ go_library( importpath = "k8s.io/kubernetes/pkg/kubelet/types", deps = [ "//pkg/apis/core:go_default_library", + "//pkg/scheduler/api:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index 6bfc3dac63f..62116985fd3 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -22,6 +22,7 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeapi "k8s.io/kubernetes/pkg/apis/core" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" ) const ( @@ -139,15 +140,16 @@ func (sp SyncPodType) String() string { } } -// IsCriticalPod returns true if the pod bears the critical pod annotation -// key. Both the rescheduler and the kubelet use this key to make admission -// and scheduling decisions. +// IsCriticalPod returns true if the pod bears the critical pod annotation key or if the pod's priority is greater than +// or equal to SystemCriticalPriority. Both the rescheduler (deprecated in 1.10) and the kubelet use this function +// to make admission and scheduling decisions. func IsCriticalPod(pod *v1.Pod) bool { - return IsCritical(pod.Namespace, pod.Annotations) + return IsCritical(pod.Namespace, pod.Annotations) || (pod.Spec.Priority != nil && IsCriticalPodBasedOnPriority(pod.Namespace, *pod.Spec.Priority)) } // IsCritical returns true if parameters bear the critical pod annotation // key. The DaemonSetController use this key directly to make scheduling decisions. +// TODO: @ravig - Deprecated. Remove this when we move to resolving critical pods based on priorityClassName. 
func IsCritical(ns string, annotations map[string]string) bool { // Critical pods are restricted to "kube-system" namespace as of now. if ns != kubeapi.NamespaceSystem { @@ -159,3 +161,15 @@ func IsCritical(ns string, annotations map[string]string) bool { } return false } + +// IsCriticalPodBasedOnPriority checks if the given pod is a critical pod based on priority resolved from pod Spec. +func IsCriticalPodBasedOnPriority(ns string, priority int32) bool { + // Critical pods are restricted to "kube-system" namespace as of now. + if ns != kubeapi.NamespaceSystem { + return false + } + if priority >= schedulerapi.SystemCriticalPriority { + return true + } + return false +} diff --git a/pkg/kubelet/types/pod_update_test.go b/pkg/kubelet/types/pod_update_test.go index 46d27829f9b..849995452f1 100644 --- a/pkg/kubelet/types/pod_update_test.go +++ b/pkg/kubelet/types/pod_update_test.go @@ -158,7 +158,7 @@ func TestIsCriticalPod(t *testing.T) { { pod: v1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", + Name: "pod4", Namespace: "kube-system", Annotations: map[string]string{ "scheduler.alpha.kubernetes.io/critical-pod": "", diff --git a/pkg/scheduler/api/types.go b/pkg/scheduler/api/types.go index af54891b8d6..090e8780a16 100644 --- a/pkg/scheduler/api/types.go +++ b/pkg/scheduler/api/types.go @@ -36,6 +36,14 @@ const ( MaxPriority = 10 // MaxWeight defines the max weight value. MaxWeight = MaxInt / MaxPriority + // HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use. + HighestUserDefinablePriority = int32(1000000000) + // SystemCriticalPriority is the beginning of the range of priority values for critical system components. 
+ SystemCriticalPriority = 2 * HighestUserDefinablePriority + // NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must + // start with scheduling.SystemPriorityClassPrefix which is by default "system-". + SystemClusterCritical = "system-cluster-critical" + SystemNodeCritical = "system-node-critical" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -231,6 +239,12 @@ type HostPriority struct { // HostPriorityList declares a []HostPriority type. type HostPriorityList []HostPriority +// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods. +var SystemPriorityClasses = map[string]int32{ + SystemClusterCritical: SystemCriticalPriority, + SystemNodeCritical: SystemCriticalPriority + 1000, +} + func (h HostPriorityList) Len() int { return len(h) } diff --git a/plugin/pkg/admission/priority/BUILD b/plugin/pkg/admission/priority/BUILD index 35b572b09b5..3f9959806f4 100644 --- a/plugin/pkg/admission/priority/BUILD +++ b/plugin/pkg/admission/priority/BUILD @@ -16,6 +16,7 @@ go_test( "//pkg/client/informers/informers_generated/internalversion:go_default_library", "//pkg/controller:go_default_library", "//pkg/features:go_default_library", + "//pkg/scheduler/api:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apiserver/pkg/admission:go_default_library", @@ -35,6 +36,8 @@ go_library( "//pkg/client/listers/scheduling/internalversion:go_default_library", "//pkg/features:go_default_library", "//pkg/kubeapiserver/admission:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/scheduler/api:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apiserver/pkg/admission:go_default_library", diff 
--git a/plugin/pkg/admission/priority/admission.go b/plugin/pkg/admission/priority/admission.go index 3750ec86aff..6b88732166d 100644 --- a/plugin/pkg/admission/priority/admission.go +++ b/plugin/pkg/admission/priority/admission.go @@ -31,26 +31,15 @@ import ( schedulinglisters "k8s.io/kubernetes/pkg/client/listers/scheduling/internalversion" "k8s.io/kubernetes/pkg/features" kubeapiserveradmission "k8s.io/kubernetes/pkg/kubeapiserver/admission" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" ) const ( // PluginName indicates name of admission plugin. PluginName = "Priority" - - // HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use. - HighestUserDefinablePriority = 1000000000 - // SystemCriticalPriority is the beginning of the range of priority values for critical system components. - SystemCriticalPriority = 2 * HighestUserDefinablePriority ) -// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods. -// NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must -// start with scheduling.SystemPriorityClassPrefix which is by default "system-". -var SystemPriorityClasses = map[string]int32{ - "system-cluster-critical": SystemCriticalPriority, - "system-node-critical": SystemCriticalPriority + 1000, -} - // Register registers a plugin func Register(plugins *admission.Plugins) { plugins.Register(PluginName, func(config io.Reader) (admission.Interface, error) { @@ -166,6 +155,13 @@ func (p *PriorityPlugin) admitPod(a admission.Attributes) error { } if utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) { var priority int32 + // TODO: @ravig - This is for backwards compatibility to ensure that critical pods with annotations just work fine. 
+ // Remove when no longer needed. + if len(pod.Spec.PriorityClassName) == 0 && + utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && + kubelettypes.IsCritical(a.GetNamespace(), pod.Annotations) { + pod.Spec.PriorityClassName = schedulerapi.SystemClusterCritical + } if len(pod.Spec.PriorityClassName) == 0 { var err error priority, err = p.getDefaultPriority() @@ -174,7 +170,7 @@ func (p *PriorityPlugin) admitPod(a admission.Attributes) error { } } else { // First try to resolve by system priority classes. - priority, ok = SystemPriorityClasses[pod.Spec.PriorityClassName] + priority, ok = schedulerapi.SystemPriorityClasses[pod.Spec.PriorityClassName] if !ok { // Now that we didn't find any system priority, try resolving by user defined priority classes. pc, err := p.lister.Get(pod.Spec.PriorityClassName) @@ -202,10 +198,10 @@ func (p *PriorityPlugin) validatePriorityClass(a admission.Attributes) error { if !ok { return errors.NewBadRequest("resource was marked with kind PriorityClass but was unable to be converted") } - if pc.Value > HighestUserDefinablePriority { - return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", HighestUserDefinablePriority)) + if pc.Value > schedulerapi.HighestUserDefinablePriority { + return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", schedulerapi.HighestUserDefinablePriority)) } - if _, ok := SystemPriorityClasses[pc.Name]; ok { + if _, ok := schedulerapi.SystemPriorityClasses[pc.Name]; ok { return admission.NewForbidden(a, fmt.Errorf("the name of the priority class is a reserved name for system use only: %v", pc.Name)) } // If the new PriorityClass tries to be the default priority, make sure that no other priority class is marked as default. 
diff --git a/plugin/pkg/admission/priority/admission_test.go b/plugin/pkg/admission/priority/admission_test.go index 06963588c61..ec00cdc9aa4 100644 --- a/plugin/pkg/admission/priority/admission_test.go +++ b/plugin/pkg/admission/priority/admission_test.go @@ -30,6 +30,7 @@ import ( informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/features" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" ) func addPriorityClasses(ctrl *PriorityPlugin, priorityClasses []*scheduling.PriorityClass) { @@ -82,7 +83,7 @@ func TestPriorityClassAdmission(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "toohighclass", }, - Value: HighestUserDefinablePriority + 1, + Value: schedulerapi.HighestUserDefinablePriority + 1, Description: "Just a test priority class", } @@ -91,9 +92,9 @@ func TestPriorityClassAdmission(t *testing.T) { Kind: "PriorityClass", }, ObjectMeta: metav1.ObjectMeta{ - Name: "system-cluster-critical", + Name: schedulerapi.SystemClusterCritical, }, - Value: HighestUserDefinablePriority + 1, + Value: schedulerapi.HighestUserDefinablePriority + 1, Description: "Name conflicts with system priority class names", } @@ -321,7 +322,7 @@ func TestPodAdmission(t *testing.T) { Name: containerName, }, }, - PriorityClassName: "system-cluster-critical", + PriorityClassName: schedulerapi.SystemClusterCritical, }, }, // pod[5]: mirror Pod with a system priority class name @@ -357,9 +358,27 @@ func TestPodAdmission(t *testing.T) { Priority: &intPriority, }, }, + // pod[7]: Pod with a critical priority annotation. 
This needs to be automatically assigned + // system-cluster-critical + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-system-priority", + Namespace: "kube-system", + Annotations: map[string]string{"scheduler.alpha.kubernetes.io/critical-pod": ""}, + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + }, + }, } // Enable PodPriority feature gate. utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority)) + // Enable ExperimentalCriticalPodAnnotation feature gate. + utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.ExperimentalCriticalPodAnnotation)) tests := []struct { name string existingClasses []*scheduling.PriorityClass @@ -402,7 +421,7 @@ func TestPodAdmission(t *testing.T) { "pod with a system priority class", []*scheduling.PriorityClass{}, *pods[4], - SystemCriticalPriority, + schedulerapi.SystemCriticalPriority, false, }, { @@ -423,7 +442,7 @@ func TestPodAdmission(t *testing.T) { "mirror pod with system priority class", []*scheduling.PriorityClass{}, *pods[5], - SystemCriticalPriority, + schedulerapi.SystemCriticalPriority, false, }, { @@ -433,6 +452,13 @@ func TestPodAdmission(t *testing.T) { 0, true, }, + { + "pod with critical pod annotation", + []*scheduling.PriorityClass{}, + *pods[7], + schedulerapi.SystemCriticalPriority, + false, + }, } for _, test := range tests { diff --git a/test/e2e/scheduling/BUILD b/test/e2e/scheduling/BUILD index a7596251368..9ee18722387 100644 --- a/test/e2e/scheduling/BUILD +++ b/test/e2e/scheduling/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/kubelet/apis:go_default_library", "//pkg/quota/evaluator/core:go_default_library", "//pkg/scheduler/algorithm/priorities/util:go_default_library", + "//pkg/scheduler/api:go_default_library", "//pkg/util/version:go_default_library", "//test/e2e/common:go_default_library", "//test/e2e/framework:go_default_library", diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go index 
92513402d23..1867ec4cc46 100644 --- a/test/e2e/scheduling/preemption.go +++ b/test/e2e/scheduling/preemption.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" "k8s.io/kubernetes/test/e2e/framework" . "github.com/onsi/ginkgo" @@ -43,7 +44,6 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func lowPriorityClassName := f.BaseName + "-low-priority" mediumPriorityClassName := f.BaseName + "-medium-priority" highPriorityClassName := f.BaseName + "-high-priority" - AfterEach(func() { }) @@ -126,6 +126,66 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func } }) + // This test verifies that when a critical pod is created and no node with + // enough resources is found, scheduler preempts a lower priority pod to schedule + // this critical pod. + It("validates lower priority pod preemption by critical pod", func() { + var podRes v1.ResourceList + // Create one pod per node that uses a lot of the node's resources. + By("Create pods that use 60% of node resources.") + pods := make([]*v1.Pod, len(nodeList.Items)) + for i, node := range nodeList.Items { + cpuAllocatable, found := node.Status.Allocatable["cpu"] + Expect(found).To(Equal(true)) + milliCPU := cpuAllocatable.MilliValue() * 40 / 100 + memAllocatable, found := node.Status.Allocatable["memory"] + Expect(found).To(Equal(true)) + memory := memAllocatable.Value() * 60 / 100 + podRes = v1.ResourceList{} + podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI) + podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI) + + // make the first pod low priority and the rest medium priority. 
+ priorityName := mediumPriorityClassName + if i == 0 { + priorityName = lowPriorityClassName + } + pods[i] = createPausePod(f, pausePodConfig{ + Name: fmt.Sprintf("pod%d-%v", i, priorityName), + PriorityClassName: priorityName, + Resources: &v1.ResourceRequirements{ + Requests: podRes, + }, + }) + framework.Logf("Created pod: %v", pods[i].Name) + } + By("Wait for pods to be scheduled.") + for _, pod := range pods { + framework.ExpectNoError(framework.WaitForPodRunningInNamespace(cs, pod)) + } + + By("Run a critical pod that use 60% of a node resources.") + // Create a critical pod and make sure it is scheduled. + runPausePod(f, pausePodConfig{ + Name: "critical-pod", + PriorityClassName: schedulerapi.SystemClusterCritical, + Resources: &v1.ResourceRequirements{ + Requests: podRes, + }, + }) + // Make sure that the lowest priority pod is deleted. + preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{}) + podDeleted := (err != nil && errors.IsNotFound(err)) || + (err == nil && preemptedPod.DeletionTimestamp != nil) + Expect(podDeleted).To(BeTrue()) + // Other pods (mid priority ones) should be present. + for i := 1; i < len(pods); i++ { + livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + Expect(livePod.DeletionTimestamp).To(BeNil()) + } + }) + // This test verifies that when a high priority pod is pending and its // scheduling violates a medium priority pod anti-affinity, the medium priority // pod is preempted to allow the higher priority pod schedule.