Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #33763 from jayunit100/sched-checkservice-predicateCache
Automatic merge from submit-queue

Predicate caching and cleanup

Fix to #31795. First pass at cleanup and caching of the CheckServiceAffinity function. The cleanup IMO is necessary because of the logic around the pod listing and the use of the "implicit selector" (which is reverse engineered to enable homogeneous pod groups). Should still pass the E2Es.

@timothysc @wojtek-t
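In short, the diff below has NewServiceAffinityPredicate return a (predicate, precomputation) pair so the expensive pod/service listing runs once per scheduling cycle, while checkServiceAffinity falls back to recomputing if the metadata is missing. A minimal, self-contained sketch of that shape, using simplified stand-in types rather than the real scheduler API:

package main

import "fmt"

// Simplified stand-ins for the scheduler types touched here; the names echo the
// diff below, but this is an illustrative sketch, not the real kubernetes API.
type predicateMetadata struct {
	pod          string
	matchingPods []string
}

type fitPredicate func(pod string, meta *predicateMetadata) bool
type predicateMetadataModifier func(pm *predicateMetadata)

// newServiceAffinityPredicate mirrors the new factory shape: it returns the
// predicate together with the precomputation that fills its cached metadata.
func newServiceAffinityPredicate(allPods []string) (fitPredicate, predicateMetadataModifier) {
	precompute := func(pm *predicateMetadata) {
		// The expensive listing work happens once per scheduling cycle, not once per node.
		pm.matchingPods = append([]string(nil), allPods...)
	}
	predicate := func(pod string, meta *predicateMetadata) bool {
		if meta == nil || meta.matchingPods == nil {
			// Resilient fallback: recompute if the metadata was not precomputed.
			meta = &predicateMetadata{pod: pod}
			precompute(meta)
		}
		return len(meta.matchingPods) > 0
	}
	return predicate, precompute
}

func main() {
	predicate, precompute := newServiceAffinityPredicate([]string{"pod-a", "pod-b"})

	pm := &predicateMetadata{pod: "new-pod"}
	precompute(pm) // run once before evaluating nodes

	fmt.Println(predicate("new-pod", pm))  // true, uses the cached list
	fmt.Println(predicate("new-pod", nil)) // true, falls back to recomputing
}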
@@ -36,6 +36,19 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
+// predicatePrecomputations: Helper types/variables...
+type PredicateMetadataModifier func(pm *predicateMetadata)
+
+var predicatePrecomputeRegisterLock sync.Mutex
+var predicatePrecomputations map[string]PredicateMetadataModifier = make(map[string]PredicateMetadataModifier)
+
+func RegisterPredicatePrecomputation(predicateName string, precomp PredicateMetadataModifier) {
+	predicatePrecomputeRegisterLock.Lock()
+	defer predicatePrecomputeRegisterLock.Unlock()
+	predicatePrecomputations[predicateName] = precomp
+}
+
 // Other types for predicate functions...
 type NodeInfo interface {
 	GetNodeInfo(nodeID string) (*api.Node, error)
 }
@@ -77,34 +90,21 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
 	return node.(*api.Node), nil
 }
 
 // predicateMetadata is a type that is passed as metadata for predicate functions
-type predicateMetadata struct {
-	podBestEffort             bool
-	podRequest                *schedulercache.Resource
-	podPorts                  map[int]bool
-	matchingAntiAffinityTerms []matchingPodAntiAffinityTerm
-}
-
-// Note that predicateMetadata and matchingPodAntiAffinityTerm need to be declared in the same file
-// due to the way declarations are processed in predicate declaration unit tests.
-type matchingPodAntiAffinityTerm struct {
-	term *api.PodAffinityTerm
-	node *api.Node
-}
-
-func PredicateMetadata(pod *api.Pod, nodeInfoMap map[string]*schedulercache.NodeInfo) interface{} {
-	// If we cannot compute metadata, just return nil
-	if pod == nil {
-		return nil
-	}
-	matchingTerms, err := getMatchingAntiAffinityTerms(pod, nodeInfoMap)
-	if err != nil {
-		return nil
-	}
-	return &predicateMetadata{
-		podBestEffort:             isPodBestEffort(pod),
-		podRequest:                GetResourceRequest(pod),
-		podPorts:                  GetUsedPorts(pod),
-		matchingAntiAffinityTerms: matchingTerms,
-	}
+type predicateMetadata struct {
+	pod                                *api.Pod
+	podBestEffort                      bool
+	podRequest                         *schedulercache.Resource
+	podPorts                           map[int]bool
+	matchingAntiAffinityTerms          []matchingPodAntiAffinityTerm
+	serviceAffinityMatchingPodList     []*api.Pod
+	serviceAffinityMatchingPodServices []*api.Service
 }
 
 func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
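The hunks above register per-predicate precomputations and add cache fields to predicateMetadata. A plausible way a metadata producer ties the two together, sketched here with hypothetical, simplified types (buildPredicateMetadata is a stand-in name, not the scheduler's function):

package main

import (
	"fmt"
	"sync"
)

// Hypothetical, simplified versions of the registry and metadata producer; the
// real types live in the scheduler packages shown in the diff.
type predicateMetadata struct {
	pod                            string
	serviceAffinityMatchingPodList []string
}

type PredicateMetadataModifier func(pm *predicateMetadata)

var (
	precomputeLock  sync.Mutex
	precomputations = map[string]PredicateMetadataModifier{}
)

func RegisterPredicatePrecomputation(name string, precomp PredicateMetadataModifier) {
	precomputeLock.Lock()
	defer precomputeLock.Unlock()
	precomputations[name] = precomp
}

// buildPredicateMetadata runs every registered precomputation once, so cached
// fields such as serviceAffinityMatchingPodList are filled per pod, not per node.
func buildPredicateMetadata(pod string) *predicateMetadata {
	pm := &predicateMetadata{pod: pod}
	precomputeLock.Lock()
	defer precomputeLock.Unlock()
	for _, precompute := range precomputations {
		precompute(pm)
	}
	return pm
}

func main() {
	RegisterPredicatePrecomputation("ServiceAffinity", func(pm *predicateMetadata) {
		// The real precomputation lists the pod's services and sibling pods once.
		pm.serviceAffinityMatchingPodList = []string{"sibling-1", "sibling-2"}
	})
	fmt.Println(buildPredicateMetadata("my-pod").serviceAffinityMatchingPodList)
}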
@@ -637,20 +637,42 @@ type ServiceAffinity struct {
 	labels        []string
 }
 
-func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) algorithm.FitPredicate {
+// serviceAffinityPrecomputation should be run once by the scheduler before looping through the Predicate. It is a helper function that
+// should only be referenced by NewServiceAffinityPredicate.
+func (s *ServiceAffinity) serviceAffinityPrecomputation(pm *predicateMetadata) {
+	if pm.pod == nil {
+		glog.Errorf("Cannot precompute service affinity, a pod is required to calculate service affinity.")
+		return
+	}
+
+	var errSvc, errList error
+	// Store services which match the pod.
+	pm.serviceAffinityMatchingPodServices, errSvc = s.serviceLister.GetPodServices(pm.pod)
+	selector := CreateSelectorFromLabels(pm.pod.Labels)
+	// consider only the pods that belong to the same namespace
+	allMatches, errList := s.podLister.List(selector)
+
+	// In the future maybe we will return them as part of the function.
+	if errSvc != nil || errList != nil {
+		glog.Errorf("Some errors were found while precomputing svc affinity: \nservices:%v , \npods:%v", errSvc, errList)
+	}
+	pm.serviceAffinityMatchingPodList = FilterPodsByNamespace(allMatches, pm.pod.Namespace)
+}
+
+func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) (algorithm.FitPredicate, PredicateMetadataModifier) {
 	affinity := &ServiceAffinity{
 		podLister:     podLister,
 		serviceLister: serviceLister,
 		nodeInfo:      nodeInfo,
 		labels:        labels,
 	}
-	return affinity.CheckServiceAffinity
+	return affinity.checkServiceAffinity, affinity.serviceAffinityPrecomputation
 }
 
-// The checkServiceAffinity predicate matches nodes in such a way to force that
-// ServiceAffinity.labels are homogenous for pods added to a node.
-// (i.e. it returns true IFF this pod can be added to this node, such
-// that all other pods in the same service are running on nodes w/
+// checkServiceAffinity is a predicate which matches nodes in such a way to force that
+// ServiceAffinity.labels are homogeneous for pods that are scheduled to a node.
+// (i.e. it returns true IFF this pod can be added to this node such that all other pods in
+// the same service are running on nodes with
 // the exact same ServiceAffinity.label values).
 //
 // Details:
@@ -660,46 +682,47 @@ func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister al
 // the match.
 // Otherwise:
 // Create an "implicit selector" which guarantees pods will land on nodes with similar values
 // for the affinity labels.
 //
 // To do this, we "reverse engineer" a selector by introspecting existing pods running under the same service+namespace.
 // These backfilled labels in the selector "L" are defined like so:
 // - L is a label that the ServiceAffinity object needs as a matching constraint.
 // - L is not defined in the pod itself already.
 // - and SOME pod, from a service, in the same namespace, ALREADY scheduled onto a node, has a matching value.
-func (s *ServiceAffinity) CheckServiceAffinity(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+//
+// WARNING: This Predicate is NOT guaranteed to work if some of the predicateMetadata data isn't precomputed...
+// For that reason it is not exported, i.e. it is highly coupled to the implementation of the FitPredicate construction.
+func (s *ServiceAffinity) checkServiceAffinity(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+	var services []*api.Service
+	var pods []*api.Pod
+	if pm, ok := meta.(*predicateMetadata); ok && (pm.serviceAffinityMatchingPodList != nil || pm.serviceAffinityMatchingPodServices != nil) {
+		services = pm.serviceAffinityMatchingPodServices
+		pods = pm.serviceAffinityMatchingPodList
+	} else {
+		// Make the predicate resilient in case metadata is missing.
+		pm = &predicateMetadata{pod: pod}
+		s.serviceAffinityPrecomputation(pm)
+		pods, services = pm.serviceAffinityMatchingPodList, pm.serviceAffinityMatchingPodServices
+	}
 	node := nodeInfo.Node()
 	if node == nil {
 		return false, nil, fmt.Errorf("node not found")
 	}
 
 	// check if the pod being scheduled has the affinity labels specified in its NodeSelector
 	affinityLabels := FindLabelsInSet(s.labels, labels.Set(pod.Spec.NodeSelector))
 
-	// Introspect services IFF we didn't predefine all the affinity labels in the pod itself.
+	// Step 1: If we don't have all constraints, introspect nodes to find the missing constraints.
 	if len(s.labels) > len(affinityLabels) {
-		services, err := s.serviceLister.GetPodServices(pod)
-		if err == nil && len(services) > 0 {
-			// just use the first service and get the other pods within the service
-			// TODO: a separate predicate can be created that tries to handle all services for the pod
-			selector := labels.SelectorFromSet(services[0].Spec.Selector)
-			servicePods, err := s.podLister.List(selector)
-			if err != nil {
-				return false, nil, err
-			}
-			// consider only the pods that belong to the same namespace
-			nsServicePods := FilterPodsByNamespace(servicePods, pod.Namespace)
-			if len(nsServicePods) > 0 {
-				// consider any service pod and fetch the node its hosted on
-				otherNode, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Spec.NodeName)
+		if len(services) > 0 {
+			if len(pods) > 0 {
+				nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(pods[0].Spec.NodeName)
 				if err != nil {
 					return false, nil, err
 				}
-				AddUnsetLabelsToMap(affinityLabels, s.labels, labels.Set(otherNode.Labels))
+				AddUnsetLabelsToMap(affinityLabels, s.labels, labels.Set(nodeWithAffinityLabels.Labels))
 			}
 		}
 	}
 
-	// check if the node matches the selector
+	// Step 2: Finally complete the affinity predicate based on whatever set of predicates we were able to find.
 	if CreateSelectorFromLabels(affinityLabels).Matches(labels.Set(node.Labels)) {
 		return true, nil, nil
 	}
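To make the "implicit selector" behaviour concrete, here is a toy walk-through with assumed label names and values (not the scheduler's real helpers): a label the incoming pod did not pin via its NodeSelector is backfilled from the node that already hosts a pod of the same service and namespace, so only nodes carrying the same value still match.

package main

import "fmt"

// backfillAffinityLabels copies the value of each wanted affinity label from
// the peer pod's node if the incoming pod did not constrain it itself.
func backfillAffinityLabels(affinity map[string]string, wanted []string, peerNodeLabels map[string]string) {
	for _, k := range wanted {
		if _, ok := affinity[k]; ok {
			continue // already constrained by the pod itself
		}
		if v, ok := peerNodeLabels[k]; ok {
			affinity[k] = v
		}
	}
}

func matches(selector, nodeLabels map[string]string) bool {
	for k, v := range selector {
		if nodeLabels[k] != v {
			return false
		}
	}
	return true
}

func main() {
	wanted := []string{"zone"}                          // ServiceAffinity.labels
	affinity := map[string]string{}                     // the pod's NodeSelector did not set "zone"
	peerNode := map[string]string{"zone": "us-east-1a"} // node hosting an existing pod of the same service

	backfillAffinityLabels(affinity, wanted, peerNode)

	fmt.Println(matches(affinity, map[string]string{"zone": "us-east-1a"})) // true
	fmt.Println(matches(affinity, map[string]string{"zone": "us-west-2b"})) // false
}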