diff --git a/pkg/apis/scheduling/types.go b/pkg/apis/scheduling/types.go index d5db894a8d0..29c950157c6 100644 --- a/pkg/apis/scheduling/types.go +++ b/pkg/apis/scheduling/types.go @@ -23,9 +23,17 @@ const ( // that do not specify any priority class and there is no priority class // marked as default. DefaultPriorityWhenNoDefaultClassExists = 0 + // HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use. + HighestUserDefinablePriority = int32(1000000000) + // SystemCriticalPriority is the beginning of the range of priority values for critical system components. + SystemCriticalPriority = 2 * HighestUserDefinablePriority // SystemPriorityClassPrefix is the prefix reserved for system priority class names. Other priority // classes are not allowed to start with this prefix. SystemPriorityClassPrefix = "system-" + // NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must + // start with SystemPriorityClassPrefix. + SystemClusterCritical = SystemPriorityClassPrefix + "cluster-critical" + SystemNodeCritical = SystemPriorityClassPrefix + "node-critical" ) // +genclient diff --git a/pkg/apis/scheduling/validation/validation.go b/pkg/apis/scheduling/validation/validation.go index c8b85ef85d4..bd446b25585 100644 --- a/pkg/apis/scheduling/validation/validation.go +++ b/pkg/apis/scheduling/validation/validation.go @@ -17,8 +17,6 @@ limitations under the License. package validation import ( - "strings" - "k8s.io/apimachinery/pkg/util/validation/field" apivalidation "k8s.io/kubernetes/pkg/apis/core/validation" "k8s.io/kubernetes/pkg/apis/scheduling" @@ -26,12 +24,7 @@ import ( // ValidatePriorityClassName checks whether the given priority class name is valid. func ValidatePriorityClassName(name string, prefix bool) []string { - var allErrs []string - if strings.HasPrefix(name, scheduling.SystemPriorityClassPrefix) { - allErrs = append(allErrs, "priority class names with '"+scheduling.SystemPriorityClassPrefix+"' prefix are reserved for system use only") - } - allErrs = append(allErrs, apivalidation.NameIsDNSSubdomain(name, prefix)...) - return allErrs + return apivalidation.NameIsDNSSubdomain(name, prefix) } // ValidatePriorityClass tests whether required fields in the PriorityClass are diff --git a/pkg/apis/scheduling/validation/validation_test.go b/pkg/apis/scheduling/validation/validation_test.go index e35ee426936..8b0cec40b6c 100644 --- a/pkg/apis/scheduling/validation/validation_test.go +++ b/pkg/apis/scheduling/validation/validation_test.go @@ -53,10 +53,6 @@ func TestValidatePriorityClass(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "tier&1", Namespace: ""}, Value: 100, }, - "invalid system name": { - ObjectMeta: metav1.ObjectMeta{Name: scheduling.SystemPriorityClassPrefix + "test"}, - Value: 100, - }, } for k, v := range errorCases { diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index 62116985fd3..c176a08bf0b 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -22,7 +22,7 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeapi "k8s.io/kubernetes/pkg/apis/core" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" + "k8s.io/kubernetes/pkg/apis/scheduling" ) const ( @@ -168,7 +168,7 @@ func IsCriticalPodBasedOnPriority(ns string, priority int32) bool { if ns != kubeapi.NamespaceSystem { return false } - if priority >= schedulerapi.SystemCriticalPriority { + if priority >= scheduling.SystemCriticalPriority { return true } return false diff --git a/pkg/registry/scheduling/rest/storage_scheduling.go b/pkg/registry/scheduling/rest/storage_scheduling.go index 3074fb7d4af..807a4ae8a6c 100644 --- a/pkg/registry/scheduling/rest/storage_scheduling.go +++ b/pkg/registry/scheduling/rest/storage_scheduling.go @@ -17,6 +17,15 @@ limitations under the License. package rest import ( + "fmt" + "time" + + "github.com/golang/glog" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/registry/generic" "k8s.io/apiserver/pkg/registry/rest" genericapiserver "k8s.io/apiserver/pkg/server" @@ -24,11 +33,16 @@ import ( "k8s.io/kubernetes/pkg/api/legacyscheme" "k8s.io/kubernetes/pkg/apis/scheduling" schedulingapiv1alpha1 "k8s.io/kubernetes/pkg/apis/scheduling/v1alpha1" + schedulingclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/scheduling/internalversion" priorityclassstore "k8s.io/kubernetes/pkg/registry/scheduling/priorityclass/storage" ) +const PostStartHookName = "scheduling/bootstrap-system-priority-classes" + type RESTStorageProvider struct{} +var _ genericapiserver.PostStartHookProvider = RESTStorageProvider{} + func (p RESTStorageProvider) NewRESTStorage(apiResourceConfigSource serverstorage.APIResourceConfigSource, restOptionsGetter generic.RESTOptionsGetter) (genericapiserver.APIGroupInfo, bool) { apiGroupInfo := genericapiserver.NewDefaultAPIGroupInfo(scheduling.GroupName, legacyscheme.Registry, legacyscheme.Scheme, legacyscheme.ParameterCodec, legacyscheme.Codecs) @@ -49,6 +63,64 @@ func (p RESTStorageProvider) v1alpha1Storage(apiResourceConfigSource serverstora return storage } +func (p RESTStorageProvider) PostStartHook() (string, genericapiserver.PostStartHookFunc, error) { + return PostStartHookName, AddSystemPriorityClasses(), nil +} + +func AddSystemPriorityClasses() genericapiserver.PostStartHookFunc { + priorityClasses := []*scheduling.PriorityClass{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: scheduling.SystemNodeCritical, + }, + Value: scheduling.SystemCriticalPriority + 1000, + Description: "Used for system critical pods that must not be moved from their current node.", + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: scheduling.SystemClusterCritical, + }, + Value: scheduling.SystemCriticalPriority, + Description: "Used for system critical pods that must run in the cluster, but can be moved to another node if necessary.", + }, + } + return func(hookContext genericapiserver.PostStartHookContext) error { + // Adding system priority classes is important. If they fail to add, many critical system + // components may fail and cluster may break. + err := wait.Poll(1*time.Second, 30*time.Second, func() (done bool, err error) { + schedClientSet, err := schedulingclient.NewForConfig(hookContext.LoopbackClientConfig) + if err != nil { + utilruntime.HandleError(fmt.Errorf("unable to initialize client: %v", err)) + return false, nil + } + + for _, pc := range priorityClasses { + _, err := schedClientSet.PriorityClasses().Get(pc.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err := schedClientSet.PriorityClasses().Create(pc) + if err != nil { + return false, err + } else { + glog.Infof("created PriorityClass %s with value %v", pc.Name, pc.Value) + } + } else { + // Unable to get the priority class for reasons other than "not found". + return false, err + } + } + } + glog.Infof("all system priority classes are created successfully.") + return true, nil + }) + // if we're never able to make it through initialization, kill the API server. + if err != nil { + return fmt.Errorf("unable to add default system priority classes: %v", err) + } + return nil + } +} + func (p RESTStorageProvider) GroupName() string { return scheduling.GroupName } diff --git a/pkg/scheduler/api/types.go b/pkg/scheduler/api/types.go index cfc8d219ec1..4d87b3853dc 100644 --- a/pkg/scheduler/api/types.go +++ b/pkg/scheduler/api/types.go @@ -36,14 +36,6 @@ const ( MaxPriority = 10 // MaxWeight defines the max weight value. MaxWeight = MaxInt / MaxPriority - // HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use. - HighestUserDefinablePriority = int32(1000000000) - // SystemCriticalPriority is the beginning of the range of priority values for critical system components. - SystemCriticalPriority = 2 * HighestUserDefinablePriority - // NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must - // start with scheduling.SystemPriorityClassPrefix which is by default "system-". - SystemClusterCritical = "system-cluster-critical" - SystemNodeCritical = "system-node-critical" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -259,12 +251,6 @@ type HostPriority struct { // HostPriorityList declares a []HostPriority type. type HostPriorityList []HostPriority -// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods. -var SystemPriorityClasses = map[string]int32{ - SystemClusterCritical: SystemCriticalPriority, - SystemNodeCritical: SystemCriticalPriority + 1000, -} - func (h HostPriorityList) Len() int { return len(h) } diff --git a/plugin/pkg/admission/priority/admission.go b/plugin/pkg/admission/priority/admission.go index 2c7e22242cb..97add55a8d2 100644 --- a/plugin/pkg/admission/priority/admission.go +++ b/plugin/pkg/admission/priority/admission.go @@ -19,10 +19,12 @@ package priority import ( "fmt" "io" + "strings" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apiserver/pkg/admission" + "k8s.io/apiserver/pkg/authentication/user" utilfeature "k8s.io/apiserver/pkg/util/feature" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" @@ -32,7 +34,6 @@ import ( "k8s.io/kubernetes/pkg/features" kubeapiserveradmission "k8s.io/kubernetes/pkg/kubeapiserver/admission" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" ) const ( @@ -154,7 +155,7 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error { if len(pod.Spec.PriorityClassName) == 0 && utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && kubelettypes.IsCritical(a.GetNamespace(), pod.Annotations) { - pod.Spec.PriorityClassName = schedulerapi.SystemClusterCritical + pod.Spec.PriorityClassName = scheduling.SystemClusterCritical } if len(pod.Spec.PriorityClassName) == 0 { var err error @@ -163,22 +164,17 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error { return fmt.Errorf("failed to get default priority class: %v", err) } } else { - // First try to resolve by system priority classes. - priority, ok = schedulerapi.SystemPriorityClasses[pod.Spec.PriorityClassName] - if !ok { - // Now that we didn't find any system priority, try resolving by user defined priority classes. - pc, err := p.lister.Get(pod.Spec.PriorityClassName) - - if err != nil { - if errors.IsNotFound(err) { - return admission.NewForbidden(a, fmt.Errorf("no PriorityClass with name %v was found", pod.Spec.PriorityClassName)) - } - - return fmt.Errorf("failed to get PriorityClass with name %s: %v", pod.Spec.PriorityClassName, err) + // Try resolving the priority class name. + pc, err := p.lister.Get(pod.Spec.PriorityClassName) + if err != nil { + if errors.IsNotFound(err) { + return admission.NewForbidden(a, fmt.Errorf("no PriorityClass with name %v was found", pod.Spec.PriorityClassName)) } - priority = pc.Value + return fmt.Errorf("failed to get PriorityClass with name %s: %v", pod.Spec.PriorityClassName, err) } + + priority = pc.Value } pod.Spec.Priority = &priority } @@ -192,11 +188,15 @@ func (p *priorityPlugin) validatePriorityClass(a admission.Attributes) error { if !ok { return errors.NewBadRequest("resource was marked with kind PriorityClass but was unable to be converted") } - if pc.Value > schedulerapi.HighestUserDefinablePriority { - return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", schedulerapi.HighestUserDefinablePriority)) - } - if _, ok := schedulerapi.SystemPriorityClasses[pc.Name]; ok { - return admission.NewForbidden(a, fmt.Errorf("the name of the priority class is a reserved name for system use only: %v", pc.Name)) + // API server adds system critical priority classes at bootstrapping. We should + // not enforce restrictions on adding system level priority classes for API server. + if userInfo := a.GetUserInfo(); userInfo == nil || userInfo.GetName() != user.APIServerUser { + if pc.Value > scheduling.HighestUserDefinablePriority { + return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", scheduling.HighestUserDefinablePriority)) + } + if strings.HasPrefix(pc.Name, scheduling.SystemPriorityClassPrefix) { + return admission.NewForbidden(a, fmt.Errorf("priority class names with '"+scheduling.SystemPriorityClassPrefix+"' prefix are reserved for system use only")) + } } // If the new PriorityClass tries to be the default priority, make sure that no other priority class is marked as default. if pc.GlobalDefault { diff --git a/plugin/pkg/admission/priority/admission_test.go b/plugin/pkg/admission/priority/admission_test.go index 2fb2a113420..bda6a8a4eea 100644 --- a/plugin/pkg/admission/priority/admission_test.go +++ b/plugin/pkg/admission/priority/admission_test.go @@ -24,13 +24,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apiserver/pkg/admission" + "k8s.io/apiserver/pkg/authentication/user" utilfeature "k8s.io/apiserver/pkg/util/feature" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/features" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" ) func addPriorityClasses(ctrl *priorityPlugin, priorityClasses []*scheduling.PriorityClass) { @@ -75,6 +75,17 @@ var nondefaultClass1 = &scheduling.PriorityClass{ Description: "Just a test priority class", } +var systemClusterCritical = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: scheduling.SystemClusterCritical, + }, + Value: scheduling.SystemCriticalPriority, + GlobalDefault: true, +} + func TestPriorityClassAdmission(t *testing.T) { var tooHighPriorityClass = &scheduling.PriorityClass{ TypeMeta: metav1.TypeMeta{ @@ -83,7 +94,7 @@ func TestPriorityClassAdmission(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "toohighclass", }, - Value: schedulerapi.HighestUserDefinablePriority + 1, + Value: scheduling.HighestUserDefinablePriority + 1, Description: "Just a test priority class", } @@ -92,42 +103,56 @@ func TestPriorityClassAdmission(t *testing.T) { Kind: "PriorityClass", }, ObjectMeta: metav1.ObjectMeta{ - Name: schedulerapi.SystemClusterCritical, + Name: scheduling.SystemPriorityClassPrefix + "test", }, - Value: schedulerapi.HighestUserDefinablePriority + 1, - Description: "Name conflicts with system priority class names", + Value: scheduling.HighestUserDefinablePriority + 1, + Description: "Name has system critical prefix", } tests := []struct { name string existingClasses []*scheduling.PriorityClass newClass *scheduling.PriorityClass + userInfo user.Info expectError bool }{ { "one default class", []*scheduling.PriorityClass{}, defaultClass1, + nil, false, }, { "more than one default classes", []*scheduling.PriorityClass{defaultClass1}, defaultClass2, + nil, true, }, { "too high PriorityClass value", []*scheduling.PriorityClass{}, tooHighPriorityClass, + nil, true, }, { "system name conflict", []*scheduling.PriorityClass{}, systemClass, + nil, true, }, + { + "system name allowed for API server", + []*scheduling.PriorityClass{}, + systemClass, + &user.DefaultInfo{ + Name: user.APIServerUser, + }, + false, + }, } for _, test := range tests { @@ -146,7 +171,7 @@ func TestPriorityClassAdmission(t *testing.T) { scheduling.Resource("priorityclasses").WithVersion("version"), "", admission.Create, - nil, + test.userInfo, ) err := ctrl.Validate(attrs) glog.Infof("Got %v", err) @@ -322,7 +347,7 @@ func TestPodAdmission(t *testing.T) { Name: containerName, }, }, - PriorityClassName: schedulerapi.SystemClusterCritical, + PriorityClassName: scheduling.SystemClusterCritical, }, }, // pod[5]: mirror Pod with a system priority class name @@ -419,9 +444,9 @@ func TestPodAdmission(t *testing.T) { }, { "pod with a system priority class", - []*scheduling.PriorityClass{}, + []*scheduling.PriorityClass{systemClusterCritical}, *pods[4], - schedulerapi.SystemCriticalPriority, + scheduling.SystemCriticalPriority, false, }, { @@ -440,9 +465,9 @@ func TestPodAdmission(t *testing.T) { }, { "mirror pod with system priority class", - []*scheduling.PriorityClass{}, + []*scheduling.PriorityClass{systemClusterCritical}, *pods[5], - schedulerapi.SystemCriticalPriority, + scheduling.SystemCriticalPriority, false, }, { @@ -454,9 +479,9 @@ func TestPodAdmission(t *testing.T) { }, { "pod with critical pod annotation", - []*scheduling.PriorityClass{}, + []*scheduling.PriorityClass{systemClusterCritical}, *pods[7], - schedulerapi.SystemCriticalPriority, + scheduling.SystemCriticalPriority, false, }, } diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go index 1867ec4cc46..64a29dc61f8 100644 --- a/test/e2e/scheduling/preemption.go +++ b/test/e2e/scheduling/preemption.go @@ -26,7 +26,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" + "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/test/e2e/framework" . "github.com/onsi/ginkgo" @@ -168,7 +168,7 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func // Create a critical pod and make sure it is scheduled. runPausePod(f, pausePodConfig{ Name: "critical-pod", - PriorityClassName: schedulerapi.SystemClusterCritical, + PriorityClassName: scheduling.SystemClusterCritical, Resources: &v1.ResourceRequirements{ Requests: podRes, }, @@ -311,3 +311,40 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func } }) }) + +var _ = SIGDescribe("PodPriorityResolution [Serial] [Feature:PodPreemption]", func() { + var cs clientset.Interface + var ns string + f := framework.NewDefaultFramework("sched-pod-priority") + + BeforeEach(func() { + cs = f.ClientSet + ns = f.Namespace.Name + + err := framework.CheckTestingNSDeletedExcept(cs, ns) + framework.ExpectNoError(err) + }) + + // This test verifies that when a higher priority pod is created and no node with + // enough resources is found, scheduler preempts a lower priority pod to schedule + // the high priority pod. + It("validates critical system priorities are created and resolved", func() { + var podRes v1.ResourceList + // Create pods that use system critical priorities and + By("Create pods that use critical system priorities.") + systemPriorityClasses := []string{ + scheduling.SystemNodeCritical, scheduling.SystemClusterCritical, + } + for i, spc := range systemPriorityClasses { + pod := createPausePod(f, pausePodConfig{ + Name: fmt.Sprintf("pod%d-%v", i, spc), + PriorityClassName: spc, + Resources: &v1.ResourceRequirements{ + Requests: podRes, + }, + }) + Expect(pod.Spec.Priority).NotTo(BeNil()) + framework.Logf("Created pod: %v", pod.Name) + } + }) +})