mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-11-03 23:40:03 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			273 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			273 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
 | 
						|
 | 
						|
package preemption
 | 
						|
 | 
						|
import (
 | 
						|
	"fmt"
 | 
						|
	"math"
 | 
						|
 | 
						|
	"k8s.io/api/core/v1"
 | 
						|
	"k8s.io/client-go/tools/record"
 | 
						|
	"k8s.io/klog"
 | 
						|
	"k8s.io/kubernetes/pkg/api/v1/resource"
 | 
						|
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 | 
						|
	"k8s.io/kubernetes/pkg/kubelet/events"
 | 
						|
	"k8s.io/kubernetes/pkg/kubelet/eviction"
 | 
						|
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 | 
						|
	"k8s.io/kubernetes/pkg/kubelet/metrics"
 | 
						|
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 | 
						|
	"k8s.io/kubernetes/pkg/kubelet/util/format"
 | 
						|
)
 | 
						|
 | 
						|
// message is the human-readable explanation attached to the pod status and to the
// event recorded for every pod that is preempted on behalf of a critical pod.
const message = "Preempted in order to admit critical pod"
 | 
						|
 | 
						|
// CriticalPodAdmissionHandler is an AdmissionFailureHandler that handles admission failure for Critical Pods.
 | 
						|
// If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods
 | 
						|
// so that the critical pod can be admitted.  For evictions, the CriticalPodAdmissionHandler evicts a set of pods that
 | 
						|
// frees up the required resource requests.  The set of pods is designed to minimize impact, and is prioritized according to the ordering:
 | 
						|
// minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods.
 | 
						|
// minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods.
 | 
						|
// finding the fewest total requests of pods is considered besteffort.
 | 
						|
type CriticalPodAdmissionHandler struct {
 | 
						|
	getPodsFunc eviction.ActivePodsFunc
 | 
						|
	killPodFunc eviction.KillPodFunc
 | 
						|
	recorder    record.EventRecorder
 | 
						|
}
 | 
						|
 | 
						|
// Compile-time check that *CriticalPodAdmissionHandler satisfies lifecycle.AdmissionFailureHandler.
var _ lifecycle.AdmissionFailureHandler = (*CriticalPodAdmissionHandler)(nil)
 | 
						|
 | 
						|
func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
 | 
						|
	return &CriticalPodAdmissionHandler{
 | 
						|
		getPodsFunc: getPodsFunc,
 | 
						|
		killPodFunc: killPodFunc,
 | 
						|
		recorder:    recorder,
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// HandleAdmissionFailure gracefully handles admission rejection, and, in some cases,
 | 
						|
// to allow admission of the pod despite its previous failure.
 | 
						|
func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(admitPod *v1.Pod, failureReasons []lifecycle.PredicateFailureReason) ([]lifecycle.PredicateFailureReason, error) {
 | 
						|
	if !kubetypes.IsCriticalPod(admitPod) {
 | 
						|
		return failureReasons, nil
 | 
						|
	}
 | 
						|
	// InsufficientResourceError is not a reason to reject a critical pod.
 | 
						|
	// Instead of rejecting, we free up resources to admit it, if no other reasons for rejection exist.
 | 
						|
	nonResourceReasons := []lifecycle.PredicateFailureReason{}
 | 
						|
	resourceReasons := []*admissionRequirement{}
 | 
						|
	for _, reason := range failureReasons {
 | 
						|
		if r, ok := reason.(*lifecycle.InsufficientResourceError); ok {
 | 
						|
			resourceReasons = append(resourceReasons, &admissionRequirement{
 | 
						|
				resourceName: r.ResourceName,
 | 
						|
				quantity:     r.GetInsufficientAmount(),
 | 
						|
			})
 | 
						|
		} else {
 | 
						|
			nonResourceReasons = append(nonResourceReasons, reason)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if len(nonResourceReasons) > 0 {
 | 
						|
		// Return only reasons that are not resource related, since critical pods cannot fail admission for resource reasons.
 | 
						|
		return nonResourceReasons, nil
 | 
						|
	}
 | 
						|
	err := c.evictPodsToFreeRequests(admitPod, admissionRequirementList(resourceReasons))
 | 
						|
	// if no error is returned, preemption succeeded and the pod is safe to admit.
 | 
						|
	return nil, err
 | 
						|
}
 | 
						|
 | 
						|
// evictPodsToFreeRequests takes a list of insufficient resources, and attempts to free them by evicting pods
 | 
						|
// based on requests.  For example, if the only insufficient resource is 200Mb of memory, this function could
 | 
						|
// evict a pod with request=250Mb.
 | 
						|
func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, insufficientResources admissionRequirementList) error {
 | 
						|
	podsToPreempt, err := getPodsToPreempt(admitPod, c.getPodsFunc(), insufficientResources)
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err)
 | 
						|
	}
 | 
						|
	klog.Infof("preemption: attempting to evict pods %v, in order to free up resources: %s", podsToPreempt, insufficientResources.toString())
 | 
						|
	for _, pod := range podsToPreempt {
 | 
						|
		status := v1.PodStatus{
 | 
						|
			Phase:   v1.PodFailed,
 | 
						|
			Message: message,
 | 
						|
			Reason:  events.PreemptContainer,
 | 
						|
		}
 | 
						|
		// record that we are evicting the pod
 | 
						|
		c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
 | 
						|
		// this is a blocking call and should only return when the pod and its containers are killed.
 | 
						|
		err := c.killPodFunc(pod, status, nil)
 | 
						|
		if err != nil {
 | 
						|
			klog.Warningf("preemption: pod %s failed to evict %v", format.Pod(pod), err)
 | 
						|
			// In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
 | 
						|
			continue
 | 
						|
		}
 | 
						|
		if len(insufficientResources) > 0 {
 | 
						|
			metrics.Preemptions.WithLabelValues(insufficientResources[0].resourceName.String()).Inc()
 | 
						|
		} else {
 | 
						|
			metrics.Preemptions.WithLabelValues("").Inc()
 | 
						|
		}
 | 
						|
		klog.Infof("preemption: pod %s evicted successfully", format.Pod(pod))
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements
 | 
						|
func getPodsToPreempt(pod *v1.Pod, pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
 | 
						|
	bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pod, pods)
 | 
						|
 | 
						|
	// make sure that pods exist to reclaim the requirements
 | 
						|
	unableToMeetRequirements := requirements.subtract(append(append(bestEffortPods, burstablePods...), guaranteedPods...)...)
 | 
						|
	if len(unableToMeetRequirements) > 0 {
 | 
						|
		return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", unableToMeetRequirements.toString())
 | 
						|
	}
 | 
						|
	// find the guaranteed pods we would need to evict if we already evicted ALL burstable and besteffort pods.
 | 
						|
	guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, requirements.subtract(append(bestEffortPods, burstablePods...)...))
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	// Find the burstable pods we would need to evict if we already evicted ALL besteffort pods, and the required guaranteed pods.
 | 
						|
	burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...))
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	// Find the besteffort pods we would need to evict if we already evicted the required guaranteed and burstable pods.
 | 
						|
	bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...))
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	return append(append(bestEffortToEvict, burstableToEvict...), guaranteedToEvict...), nil
 | 
						|
}
 | 
						|
 | 
						|
// getPodsToPreemptByDistance finds the pods that have pod requests >= admission requirements.
 | 
						|
// Chooses pods that minimize "distance" to the requirements.
 | 
						|
// If more than one pod exists that fulfills the remaining requirements,
 | 
						|
// it chooses the pod that has the "smaller resource request"
 | 
						|
// This method, by repeatedly choosing the pod that fulfills as much of the requirements as possible,
 | 
						|
// attempts to minimize the number of pods returned.
 | 
						|
func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
 | 
						|
	podsToEvict := []*v1.Pod{}
 | 
						|
	// evict pods by shortest distance from remaining requirements, updating requirements every round.
 | 
						|
	for len(requirements) > 0 {
 | 
						|
		if len(pods) == 0 {
 | 
						|
			return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
 | 
						|
		}
 | 
						|
		// all distances must be less than len(requirements), because the max distance for a single requirement is 1
 | 
						|
		bestDistance := float64(len(requirements) + 1)
 | 
						|
		bestPodIndex := 0
 | 
						|
		// Find the pod with the smallest distance from requirements
 | 
						|
		// Or, in the case of two equidistant pods, find the pod with "smaller" resource requests.
 | 
						|
		for i, pod := range pods {
 | 
						|
			dist := requirements.distance(pod)
 | 
						|
			if dist < bestDistance || (bestDistance == dist && smallerResourceRequest(pod, pods[bestPodIndex])) {
 | 
						|
				bestDistance = dist
 | 
						|
				bestPodIndex = i
 | 
						|
			}
 | 
						|
		}
 | 
						|
		// subtract the pod from requirements, and transfer the pod from input-pods to pods-to-evicted
 | 
						|
		requirements = requirements.subtract(pods[bestPodIndex])
 | 
						|
		podsToEvict = append(podsToEvict, pods[bestPodIndex])
 | 
						|
		pods[bestPodIndex] = pods[len(pods)-1]
 | 
						|
		pods = pods[:len(pods)-1]
 | 
						|
	}
 | 
						|
	return podsToEvict, nil
 | 
						|
}
 | 
						|
 | 
						|
type admissionRequirement struct {
 | 
						|
	resourceName v1.ResourceName
 | 
						|
	quantity     int64
 | 
						|
}
 | 
						|
 | 
						|
// admissionRequirementList is a set of resource shortfalls that must all be covered
// by evicted pods.
type admissionRequirementList []*admissionRequirement
 | 
						|
 | 
						|
// distance returns distance of the pods requests from the admissionRequirements.
 | 
						|
// The distance is measured by the fraction of the requirement satisfied by the pod,
 | 
						|
// so that each requirement is weighted equally, regardless of absolute magnitude.
 | 
						|
func (a admissionRequirementList) distance(pod *v1.Pod) float64 {
 | 
						|
	dist := float64(0)
 | 
						|
	for _, req := range a {
 | 
						|
		remainingRequest := float64(req.quantity - resource.GetResourceRequest(pod, req.resourceName))
 | 
						|
		if remainingRequest > 0 {
 | 
						|
			dist += math.Pow(remainingRequest/float64(req.quantity), 2)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return dist
 | 
						|
}
 | 
						|
 | 
						|
// subtract returns a new admissionRequirementList containing remaining requirements if the provided pod
 | 
						|
// were to be preempted
 | 
						|
func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList {
 | 
						|
	newList := []*admissionRequirement{}
 | 
						|
	for _, req := range a {
 | 
						|
		newQuantity := req.quantity
 | 
						|
		for _, pod := range pods {
 | 
						|
			newQuantity -= resource.GetResourceRequest(pod, req.resourceName)
 | 
						|
			if newQuantity <= 0 {
 | 
						|
				break
 | 
						|
			}
 | 
						|
		}
 | 
						|
		if newQuantity > 0 {
 | 
						|
			newList = append(newList, &admissionRequirement{
 | 
						|
				resourceName: req.resourceName,
 | 
						|
				quantity:     newQuantity,
 | 
						|
			})
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return newList
 | 
						|
}
 | 
						|
 | 
						|
func (a admissionRequirementList) toString() string {
 | 
						|
	s := "["
 | 
						|
	for _, req := range a {
 | 
						|
		s += fmt.Sprintf("(res: %v, q: %d), ", req.resourceName, req.quantity)
 | 
						|
	}
 | 
						|
	return s + "]"
 | 
						|
}
 | 
						|
 | 
						|
// sortPodsByQOS returns lists containing besteffort, burstable, and guaranteed pods that
 | 
						|
// can be preempted by preemptor pod.
 | 
						|
func sortPodsByQOS(preemptor *v1.Pod, pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) {
 | 
						|
	for _, pod := range pods {
 | 
						|
		if kubetypes.Preemptable(preemptor, pod) {
 | 
						|
			switch v1qos.GetPodQOS(pod) {
 | 
						|
			case v1.PodQOSBestEffort:
 | 
						|
				bestEffort = append(bestEffort, pod)
 | 
						|
			case v1.PodQOSBurstable:
 | 
						|
				burstable = append(burstable, pod)
 | 
						|
			case v1.PodQOSGuaranteed:
 | 
						|
				guaranteed = append(guaranteed, pod)
 | 
						|
			default:
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return
 | 
						|
}
 | 
						|
 | 
						|
// smallerResourceRequest returns true if pod1 has a smaller request than pod2
 | 
						|
func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
 | 
						|
	priorityList := []v1.ResourceName{
 | 
						|
		v1.ResourceMemory,
 | 
						|
		v1.ResourceCPU,
 | 
						|
	}
 | 
						|
	for _, res := range priorityList {
 | 
						|
		req1 := resource.GetResourceRequest(pod1, res)
 | 
						|
		req2 := resource.GetResourceRequest(pod2, res)
 | 
						|
		if req1 < req2 {
 | 
						|
			return true
 | 
						|
		} else if req1 > req2 {
 | 
						|
			return false
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return true
 | 
						|
}
 |