diff --git a/cluster/centos/config-default.sh b/cluster/centos/config-default.sh index c66c8392699..dff6b5f387c 100755 --- a/cluster/centos/config-default.sh +++ b/cluster/centos/config-default.sh @@ -120,7 +120,7 @@ export FLANNEL_NET=${FLANNEL_NET:-"172.16.0.0/16"} # Admission Controllers to invoke prior to persisting objects in cluster # If we included ResourceQuota, we should keep it at the end of the list to prevent incrementing quota usage prematurely. -export ADMISSION_CONTROL=${ADMISSION_CONTROL:-"Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultTolerationSeconds,ResourceQuota"} +export ADMISSION_CONTROL=${ADMISSION_CONTROL:-"Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultTolerationSeconds,Priority,ResourceQuota"} # Extra options to set on the Docker command line. # This is useful for setting --insecure-registry for local registries. diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 8e052d3cd6e..1da4449fd8e 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -234,7 +234,7 @@ fi # Admission Controllers to invoke prior to persisting objects in cluster # If we included ResourceQuota, we should keep it at the end of the list to prevent incrementing quota usage prematurely. -ADMISSION_CONTROL=Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,PersistentVolumeLabel,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,ResourceQuota +ADMISSION_CONTROL=Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,PersistentVolumeLabel,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,Priority,ResourceQuota # Optional: if set to true kube-up will automatically check for existing resources and clean them up. KUBE_UP_AUTOMATIC_CLEANUP=${KUBE_UP_AUTOMATIC_CLEANUP:-false} diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 09ffd819b63..933a749c7c9 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -267,7 +267,7 @@ if [ ${ENABLE_IP_ALIASES} = true ]; then fi # If we included ResourceQuota, we should keep it at the end of the list to prevent incrementing quota usage prematurely. -ADMISSION_CONTROL="${KUBE_ADMISSION_CONTROL:-Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,PersistentVolumeLabel,PodPreset,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,ResourceQuota}" +ADMISSION_CONTROL="${KUBE_ADMISSION_CONTROL:-Initializers,NamespaceLifecycle,LimitRanger,ServiceAccount,PersistentVolumeLabel,PodPreset,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,Priority,ResourceQuota}" # Optional: if set to true kube-up will automatically check for existing resources and clean them up. KUBE_UP_AUTOMATIC_CLEANUP=${KUBE_UP_AUTOMATIC_CLEANUP:-false} diff --git a/cmd/kube-apiserver/app/plugins.go b/cmd/kube-apiserver/app/plugins.go index 8677d35f592..06e21d5a5f0 100644 --- a/cmd/kube-apiserver/app/plugins.go +++ b/cmd/kube-apiserver/app/plugins.go @@ -43,6 +43,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/admission/podnodeselector" "k8s.io/kubernetes/plugin/pkg/admission/podpreset" "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction" + podpriority "k8s.io/kubernetes/plugin/pkg/admission/priority" "k8s.io/kubernetes/plugin/pkg/admission/resourcequota" "k8s.io/kubernetes/plugin/pkg/admission/security/podsecuritypolicy" "k8s.io/kubernetes/plugin/pkg/admission/securitycontext/scdeny" @@ -73,6 +74,7 @@ func RegisterAllAdmissionPlugins(plugins *admission.Plugins) { podtolerationrestriction.Register(plugins) resourcequota.Register(plugins) podsecuritypolicy.Register(plugins) + podpriority.Register(plugins) scdeny.Register(plugins) serviceaccount.Register(plugins) setdefault.Register(plugins) diff --git a/pkg/apis/scheduling/types.go b/pkg/apis/scheduling/types.go index 8dab7df6656..06cceadb74d 100644 --- a/pkg/apis/scheduling/types.go +++ b/pkg/apis/scheduling/types.go @@ -18,6 +18,13 @@ package scheduling import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +const ( + // DefaultPriorityWhenNoDefaultClassExists is used to set priority of pods + // that do not specify any priority class and there is no priority class + // marked as default. + DefaultPriorityWhenNoDefaultClassExists = 0 +) + // +genclient // +genclient:nonNamespaced // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/plugin/pkg/admission/priority/admission.go b/plugin/pkg/admission/priority/admission.go new file mode 100644 index 00000000000..264206e3f62 --- /dev/null +++ b/plugin/pkg/admission/priority/admission.go @@ -0,0 +1,201 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package admission + +import ( + "fmt" + "io" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apiserver/pkg/admission" + "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/apis/scheduling" + "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset" + informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion" + schedulinglisters "k8s.io/kubernetes/pkg/client/listers/scheduling/internalversion" + kubeapiserveradmission "k8s.io/kubernetes/pkg/kubeapiserver/admission" +) + +const ( + pluginName = "Priority" + + // HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use. + HighestUserDefinablePriority = 1000000000 + // SystemCriticalPriority is the beginning of the range of priority values for critical system components. + SystemCriticalPriority = 2 * HighestUserDefinablePriority +) + +// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods. +var SystemPriorityClasses = map[string]int32{ + "system-cluster-critical": SystemCriticalPriority, + "system-node-critical": SystemCriticalPriority + 1000, +} + +// Register registers a plugin +func Register(plugins *admission.Plugins) { + plugins.Register(pluginName, func(config io.Reader) (admission.Interface, error) { + return NewPlugin(), nil + }) +} + +// priorityPlugin is an implementation of admission.Interface. +type priorityPlugin struct { + *admission.Handler + client internalclientset.Interface + lister schedulinglisters.PriorityClassLister +} + +var _ = kubeapiserveradmission.WantsInternalKubeInformerFactory(&priorityPlugin{}) +var _ = kubeapiserveradmission.WantsInternalKubeClientSet(&priorityPlugin{}) + +// NewPlugin creates a new priority admission plugin. +func NewPlugin() admission.Interface { + return &priorityPlugin{ + Handler: admission.NewHandler(admission.Create, admission.Update), + } +} + +func (p *priorityPlugin) Validate() error { + if p.client == nil { + return fmt.Errorf("%s requires a client", pluginName) + } + if p.lister == nil { + return fmt.Errorf("%s requires a lister", pluginName) + } + return nil +} + +func (p *priorityPlugin) SetInternalKubeClientSet(client internalclientset.Interface) { + p.client = client +} + +func (p *priorityPlugin) SetInternalKubeInformerFactory(f informers.SharedInformerFactory) { + priorityInformer := f.Scheduling().InternalVersion().PriorityClasses() + p.lister = priorityInformer.Lister() + p.SetReadyFunc(priorityInformer.Informer().HasSynced) +} + +var ( + podResource = api.Resource("pods") + priorityClassResource = api.Resource("priorityclasses") +) + +// Admit checks Pods and PriorityClasses and admits or rejects them. It also resolved the priority of pods based on their PriorityClass. +func (p *priorityPlugin) Admit(a admission.Attributes) error { + operation := a.GetOperation() + // Ignore all calls to subresources or resources other than pods. + // Ignore all operations other than Create and Update. + if len(a.GetSubresource()) != 0 || (operation != admission.Create && operation != admission.Update) { + return nil + } + + switch a.GetResource().GroupResource() { + case podResource: + return p.admitPod(a) + + case priorityClassResource: + return p.admitPriorityClass(a) + + default: + return nil + } +} + +func (p *priorityPlugin) admitPod(a admission.Attributes) error { + operation := a.GetOperation() + pod, ok := a.GetObject().(*api.Pod) + if !ok { + return errors.NewBadRequest("Resource was marked with kind Pod but was unable to be converted") + } + if _, isMirrorPod := pod.Annotations[api.MirrorPodAnnotationKey]; isMirrorPod { + return nil + } + // Make sure that the user has not set `priority` at the time of pod creation. + if operation == admission.Create && pod.Spec.Priority != nil { + return admission.NewForbidden(a, fmt.Errorf("The integer value of priority must not be provided in pod spec. The system populates the value from the given PriorityClass name")) + } + var priority int32 + if len(pod.Spec.PriorityClassName) == 0 { + dpc, err := p.findDefaultPriorityClass() + if err != nil { + return fmt.Errorf("Failed to get default priority class: %v", err) + } + if dpc != nil { + priority = dpc.Value + } else { + priority = scheduling.DefaultPriorityWhenNoDefaultClassExists + } + } else { + // First try to resolve by system priority classes. + priority, ok = SystemPriorityClasses[pod.Spec.PriorityClassName] + if !ok { + // Now that we didn't find any system priority, try resolving by user defined priority classes. + pc, err := p.lister.Get(pod.Spec.PriorityClassName) + if err != nil { + return fmt.Errorf("Failed to get default priority class %s: %v", pod.Spec.PriorityClassName, err) + } + if pc == nil { + return admission.NewForbidden(a, fmt.Errorf("No PriorityClass with name %v was found", pod.Spec.PriorityClassName)) + } + priority = pc.Value + } + } + pod.Spec.Priority = &priority + return nil +} + +func (p *priorityPlugin) admitPriorityClass(a admission.Attributes) error { + operation := a.GetOperation() + pc, ok := a.GetObject().(*scheduling.PriorityClass) + if !ok { + return errors.NewBadRequest("Resource was marked with kind Pod but was unable to be converted") + } + if pc.Value > HighestUserDefinablePriority { + return admission.NewForbidden(a, fmt.Errorf("Maximum allowed value of a user defined priority is %v", HighestUserDefinablePriority)) + } + if _, ok := SystemPriorityClasses[pc.Name]; ok { + return admission.NewForbidden(a, fmt.Errorf("The name of the priority class is a reserved name for system use only: %v", pc.Name)) + } + // If the new PriorityClass tries to be the default priority, make sure that no other priority class is marked as default. + if pc.GlobalDefault { + dpc, err := p.findDefaultPriorityClass() + if err != nil { + return fmt.Errorf("Failed to get default priority class: %v", err) + } + if dpc != nil { + // Throw an error if a second default priority class is being created, or an existing priority class is being marked as default, while another default already exists. + if operation == admission.Create || (operation == admission.Update && dpc.GetName() != pc.GetName()) { + return admission.NewForbidden(a, fmt.Errorf("PriorityClass %v is already marked as default. Only one default can exist", dpc.GetName())) + } + } + } + return nil +} + +func (p *priorityPlugin) findDefaultPriorityClass() (*scheduling.PriorityClass, error) { + list, err := p.lister.List(labels.Everything()) + if err != nil { + return nil, err + } + for _, pci := range list { + if pci.GlobalDefault { + return pci, nil + } + } + return nil, nil +} diff --git a/plugin/pkg/admission/priority/admission_test.go b/plugin/pkg/admission/priority/admission_test.go new file mode 100644 index 00000000000..63cdea3160f --- /dev/null +++ b/plugin/pkg/admission/priority/admission_test.go @@ -0,0 +1,335 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package admission + +import ( + "testing" + + "github.com/golang/glog" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apiserver/pkg/admission" + "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/apis/scheduling" + informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion" + "k8s.io/kubernetes/pkg/controller" +) + +func addPriorityClasses(ctrl *priorityPlugin, priorityClasses []*scheduling.PriorityClass) { + informerFactory := informers.NewSharedInformerFactory(nil, controller.NoResyncPeriodFunc()) + ctrl.SetInternalKubeInformerFactory(informerFactory) + // First add the existing classes to the cache. + for _, c := range priorityClasses { + informerFactory.Scheduling().InternalVersion().PriorityClasses().Informer().GetStore().Add(c) + } +} + +var defaultClass1 = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "default1", + }, + Value: 1000, + GlobalDefault: true, +} + +var defaultClass2 = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "default2", + }, + Value: 2000, + GlobalDefault: true, +} + +var nondefaultClass1 = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "nondefault1", + }, + Value: 2000, + Description: "Just a test priority class", +} + +func TestPriorityClassAdmission(t *testing.T) { + var tooHighPriorityClass = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "toohighclass", + }, + Value: HighestUserDefinablePriority + 1, + Description: "Just a test priority class", + } + + var systemClass = &scheduling.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "system-cluster-critical", + }, + Value: HighestUserDefinablePriority + 1, + Description: "Name conflicts with system priority class names", + } + + tests := []struct { + name string + existingClasses []*scheduling.PriorityClass + newClass *scheduling.PriorityClass + expectError bool + }{ + { + "one default class", + []*scheduling.PriorityClass{}, + defaultClass1, + false, + }, + { + "more than one default classes", + []*scheduling.PriorityClass{defaultClass1}, + defaultClass2, + true, + }, + { + "too high PriorityClass value", + []*scheduling.PriorityClass{}, + tooHighPriorityClass, + true, + }, + { + "system name conflict", + []*scheduling.PriorityClass{}, + systemClass, + true, + }, + } + + for _, test := range tests { + glog.V(4).Infof("starting test %q", test.name) + + ctrl := NewPlugin().(*priorityPlugin) + // Add existing priority classes. + addPriorityClasses(ctrl, test.existingClasses) + // Now add the new class. + attrs := admission.NewAttributesRecord( + test.newClass, + nil, + api.Kind("PriorityClass").WithVersion("version"), + "", + "", + api.Resource("priorityclasses").WithVersion("version"), + "", + admission.Create, + nil, + ) + err := ctrl.Admit(attrs) + glog.Infof("Got %v", err) + if err != nil && !test.expectError { + t.Errorf("Test %q: unexpected error received: %v", test.name, err) + } + if err == nil && test.expectError { + t.Errorf("Test %q: expected error and no error recevied", test.name) + } + } +} + +var intPriority = int32(1000) + +func TestPodAdmission(t *testing.T) { + containerName := "container" + + pods := []*api.Pod{ + // pod[0]: Pod with a proper priority class. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-priorityclass", + Namespace: "namespace", + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + PriorityClassName: "default1", + }, + }, + // pod[1]: Pod with no priority class + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-wo-priorityclass", + Namespace: "namespace", + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + }, + }, + // pod[2]: Pod with non-existing priority class + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-non-existing-priorityclass", + Namespace: "namespace", + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + PriorityClassName: "non-existing", + }, + }, + // pod[3]: Pod with integer value of priority + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-integer-priority", + Namespace: "namespace", + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + PriorityClassName: "default1", + Priority: &intPriority, + }, + }, + // pod[4]: Pod with a system priority class name + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-system-priority", + Namespace: "namespace", + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + PriorityClassName: "system-cluster-critical", + }, + }, + } + + tests := []struct { + name string + existingClasses []*scheduling.PriorityClass + // Admission controller changes pod spec. So, we take an api.Pod instead of + // *api.Pod to avoid interfering with other tests. + pod api.Pod + expectedPriority int32 + expectError bool + }{ + { + "Pod with priority class", + []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + *pods[0], + 1000, + false, + }, + + { + "Pod without priority class", + []*scheduling.PriorityClass{defaultClass1}, + *pods[1], + 1000, + false, + }, + { + "pod without priority class and no existing priority class", + []*scheduling.PriorityClass{}, + *pods[1], + scheduling.DefaultPriorityWhenNoDefaultClassExists, + false, + }, + { + "pod without priority class and no default class", + []*scheduling.PriorityClass{nondefaultClass1}, + *pods[1], + scheduling.DefaultPriorityWhenNoDefaultClassExists, + false, + }, + { + "pod with a system priority class", + []*scheduling.PriorityClass{}, + *pods[4], + SystemCriticalPriority, + false, + }, + { + "Pod with non-existing priority class", + []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + *pods[2], + 0, + true, + }, + { + "pod with integer priority", + []*scheduling.PriorityClass{}, + *pods[3], + 0, + true, + }, + } + + for _, test := range tests { + glog.V(4).Infof("starting test %q", test.name) + + ctrl := NewPlugin().(*priorityPlugin) + // Add existing priority classes. + addPriorityClasses(ctrl, test.existingClasses) + + // Create pod. + attrs := admission.NewAttributesRecord( + &test.pod, + nil, + api.Kind("Pod").WithVersion("version"), + test.pod.ObjectMeta.Namespace, + "", + api.Resource("pods").WithVersion("version"), + "", + admission.Create, + nil, + ) + err := ctrl.Admit(attrs) + glog.Infof("Got %v", err) + if !test.expectError { + if err != nil { + t.Errorf("Test %q: unexpected error received: %v", test.name, err) + } + if *test.pod.Spec.Priority != test.expectedPriority { + t.Errorf("Test %q: expected priority is %d, but got %d.", test.name, test.expectedPriority, *test.pod.Spec.Priority) + } + } + if err == nil && test.expectError { + t.Errorf("Test %q: expected error and no error recevied", test.name) + } + } +}