mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-12-01 17:44:30 +00:00
Move service affinity predicate logic to its plugin.
This commit is contained in:
@@ -20,17 +20,27 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/klog"
|
||||
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
|
||||
"k8s.io/kubernetes/pkg/scheduler/algorithm/priorities"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/migration"
|
||||
framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
|
||||
schedulerlisters "k8s.io/kubernetes/pkg/scheduler/listers"
|
||||
"k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
||||
)
|
||||
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
const Name = "ServiceAffinity"
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = "ServiceAffinity"
|
||||
|
||||
// preFilterStateKey is the key in CycleState to InterPodAffinity pre-computed data.
|
||||
// Using the name of the plugin will likely help us avoid collisions with other plugins.
|
||||
preFilterStateKey = "PreFilter" + Name
|
||||
)
|
||||
|
||||
// Args holds the args that are used to configure the plugin.
|
||||
type Args struct {
|
||||
@@ -42,39 +52,58 @@ type Args struct {
|
||||
AntiAffinityLabelsPreference []string `json:"antiAffinityLabelsPreference,omitempty"`
|
||||
}
|
||||
|
||||
// preFilterState computed at PreFilter and used at Filter.
|
||||
type preFilterState struct {
|
||||
matchingPodList []*v1.Pod
|
||||
matchingPodServices []*v1.Service
|
||||
}
|
||||
|
||||
// Clone the prefilter state.
|
||||
func (s *preFilterState) Clone() framework.StateData {
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := preFilterState{}
|
||||
copy.matchingPodServices = append([]*v1.Service(nil),
|
||||
s.matchingPodServices...)
|
||||
copy.matchingPodList = append([]*v1.Pod(nil),
|
||||
s.matchingPodList...)
|
||||
|
||||
return ©
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(plArgs *runtime.Unknown, handle framework.FrameworkHandle) (framework.Plugin, error) {
|
||||
args := &Args{}
|
||||
if err := framework.DecodeInto(plArgs, args); err != nil {
|
||||
args := Args{}
|
||||
if err := framework.DecodeInto(plArgs, &args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
informerFactory := handle.SharedInformerFactory()
|
||||
nodeInfoLister := handle.SnapshotSharedLister().NodeInfos()
|
||||
podLister := handle.SnapshotSharedLister().Pods()
|
||||
serviceLister := informerFactory.Core().V1().Services().Lister()
|
||||
|
||||
fitPredicate, predicateMetadataProducer := predicates.NewServiceAffinityPredicate(nodeInfoLister, podLister, serviceLister, args.AffinityLabels)
|
||||
// Once we generate the predicate we should also Register the Precomputation
|
||||
predicates.RegisterPredicateMetadataProducer(predicates.CheckServiceAffinityPred, predicateMetadataProducer)
|
||||
|
||||
priorityMapFunction, priorityReduceFunction := priorities.NewServiceAntiAffinityPriority(podLister, serviceLister, args.AntiAffinityLabelsPreference)
|
||||
|
||||
return &ServiceAffinity{
|
||||
handle: handle,
|
||||
predicate: fitPredicate,
|
||||
sharedLister: handle.SnapshotSharedLister(),
|
||||
serviceLister: serviceLister,
|
||||
priorityMapFunction: priorityMapFunction,
|
||||
priorityReduceFunction: priorityReduceFunction,
|
||||
args: args,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ServiceAffinity is a plugin that checks service affinity.
|
||||
type ServiceAffinity struct {
|
||||
handle framework.FrameworkHandle
|
||||
predicate predicates.FitPredicate
|
||||
args Args
|
||||
sharedLister schedulerlisters.SharedLister
|
||||
serviceLister corelisters.ServiceLister
|
||||
priorityMapFunction priorities.PriorityMapFunction
|
||||
priorityReduceFunction priorities.PriorityReduceFunction
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &ServiceAffinity{}
|
||||
var _ framework.FilterPlugin = &ServiceAffinity{}
|
||||
var _ framework.ScorePlugin = &ServiceAffinity{}
|
||||
|
||||
@@ -83,19 +112,184 @@ func (pl *ServiceAffinity) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *ServiceAffinity) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *nodeinfo.NodeInfo) *framework.Status {
|
||||
meta, ok := migration.CovertStateRefToPredMeta(migration.PredicateMetadata(cycleState))
|
||||
if !ok {
|
||||
return framework.NewStatus(framework.Error, "looking up Metadata")
|
||||
func (pl *ServiceAffinity) createPreFilterState(pod *v1.Pod) (*preFilterState, error) {
|
||||
if pod == nil {
|
||||
return nil, fmt.Errorf("a pod is required to calculate service affinity preFilterState")
|
||||
}
|
||||
_, reasons, err := pl.predicate(pod, meta, nodeInfo)
|
||||
return migration.PredicateResultToFrameworkStatus(reasons, err)
|
||||
// Store services which match the pod.
|
||||
matchingPodServices, err := schedulerlisters.GetPodServices(pl.serviceLister, pod)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("listing pod services: %v", err.Error())
|
||||
}
|
||||
selector := predicates.CreateSelectorFromLabels(pod.Labels)
|
||||
allMatches, err := pl.sharedLister.Pods().List(selector)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("listing pods: %v", err.Error())
|
||||
}
|
||||
|
||||
// consider only the pods that belong to the same namespace
|
||||
matchingPodList := predicates.FilterPodsByNamespace(allMatches, pod.Namespace)
|
||||
|
||||
return &preFilterState{
|
||||
matchingPodList: matchingPodList,
|
||||
matchingPodServices: matchingPodServices,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// PreFilter invoked at the prefilter extension point.
|
||||
func (pl *ServiceAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) *framework.Status {
|
||||
s, err := pl.createPreFilterState(pod)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, fmt.Sprintf("could not create preFilterState: %v", err))
|
||||
|
||||
}
|
||||
cycleState.Write(preFilterStateKey, s)
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *ServiceAffinity) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
// AddPod from pre-computed data in cycleState.
|
||||
func (pl *ServiceAffinity) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podToAdd *v1.Pod, nodeInfo *nodeinfo.NodeInfo) *framework.Status {
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, err.Error())
|
||||
}
|
||||
|
||||
// If addedPod is in the same namespace as the pod, update the list
|
||||
// of matching pods if applicable.
|
||||
if s == nil || podToAdd.Namespace != podToSchedule.Namespace {
|
||||
return nil
|
||||
}
|
||||
|
||||
selector := predicates.CreateSelectorFromLabels(podToSchedule.Labels)
|
||||
if selector.Matches(labels.Set(podToAdd.Labels)) {
|
||||
s.matchingPodList = append(s.matchingPodList, podToAdd)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemovePod from pre-computed data in cycleState.
|
||||
func (pl *ServiceAffinity) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podToRemove *v1.Pod, nodeInfo *nodeinfo.NodeInfo) *framework.Status {
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, err.Error())
|
||||
}
|
||||
|
||||
if s == nil ||
|
||||
len(s.matchingPodList) == 0 ||
|
||||
podToRemove.Namespace != s.matchingPodList[0].Namespace {
|
||||
return nil
|
||||
}
|
||||
|
||||
for i, pod := range s.matchingPodList {
|
||||
if pod.Name == podToRemove.Name && pod.Namespace == podToRemove.Namespace {
|
||||
s.matchingPodList = append(s.matchingPodList[:i], s.matchingPodList[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
// The metadata wasn't pre-computed in prefilter. We ignore the error for now since
|
||||
// Filter is able to handle that by computing it again.
|
||||
klog.Error(fmt.Sprintf("reading %q from cycleState: %v", preFilterStateKey, err))
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if c == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
s, ok := c.(*preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to interpodaffinity.state error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Filter matches nodes in such a way to force that
|
||||
// ServiceAffinity.labels are homogeneous for pods that are scheduled to a node.
|
||||
// (i.e. it returns true IFF this pod can be added to this node such that all other pods in
|
||||
// the same service are running on nodes with the exact same ServiceAffinity.label values).
|
||||
//
|
||||
// For example:
|
||||
// If the first pod of a service was scheduled to a node with label "region=foo",
|
||||
// all the other subsequent pods belong to the same service will be schedule on
|
||||
// nodes with the same "region=foo" label.
|
||||
//
|
||||
// Details:
|
||||
//
|
||||
// If (the svc affinity labels are not a subset of pod's label selectors )
|
||||
// The pod has all information necessary to check affinity, the pod's label selector is sufficient to calculate
|
||||
// the match.
|
||||
// Otherwise:
|
||||
// Create an "implicit selector" which guarantees pods will land on nodes with similar values
|
||||
// for the affinity labels.
|
||||
//
|
||||
// To do this, we "reverse engineer" a selector by introspecting existing pods running under the same service+namespace.
|
||||
// These backfilled labels in the selector "L" are defined like so:
|
||||
// - L is a label that the ServiceAffinity object needs as a matching constraint.
|
||||
// - L is not defined in the pod itself already.
|
||||
// - and SOME pod, from a service, in the same namespace, ALREADY scheduled onto a node, has a matching value.
|
||||
func (pl *ServiceAffinity) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *nodeinfo.NodeInfo) *framework.Status {
|
||||
if len(pl.args.AffinityLabels) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return framework.NewStatus(framework.Error, "node not found")
|
||||
}
|
||||
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, err.Error())
|
||||
}
|
||||
if s == nil {
|
||||
// Make the filter resilient in case preFilterState is missing.
|
||||
s, err = pl.createPreFilterState(pod)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, fmt.Sprintf("could not create preFilterState: %v", err))
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pods, services := s.matchingPodList, s.matchingPodServices
|
||||
filteredPods := nodeInfo.FilterOutPods(pods)
|
||||
// check if the pod being scheduled has the affinity labels specified in its NodeSelector
|
||||
affinityLabels := predicates.FindLabelsInSet(pl.args.AffinityLabels, labels.Set(pod.Spec.NodeSelector))
|
||||
// Step 1: If we don't have all constraints, introspect nodes to find the missing constraints.
|
||||
if len(pl.args.AffinityLabels) > len(affinityLabels) {
|
||||
if len(services) > 0 {
|
||||
if len(filteredPods) > 0 {
|
||||
nodeWithAffinityLabels, err := pl.sharedLister.NodeInfos().Get(filteredPods[0].Spec.NodeName)
|
||||
if err != nil {
|
||||
return framework.NewStatus(framework.Error, "node not found")
|
||||
}
|
||||
predicates.AddUnsetLabelsToMap(affinityLabels, pl.args.AffinityLabels, labels.Set(nodeWithAffinityLabels.Node().Labels))
|
||||
}
|
||||
}
|
||||
}
|
||||
// Step 2: Finally complete the affinity predicate based on whatever set of predicates we were able to find.
|
||||
if predicates.CreateSelectorFromLabels(affinityLabels).Matches(labels.Set(node.Labels)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
return migration.PredicateResultToFrameworkStatus([]predicates.PredicateFailureReason{predicates.ErrServiceAffinityViolated}, nil)
|
||||
}
|
||||
|
||||
// Score invoked at the Score extension point.
|
||||
func (pl *ServiceAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v", nodeName, err))
|
||||
}
|
||||
@@ -107,7 +301,7 @@ func (pl *ServiceAffinity) Score(ctx context.Context, state *framework.CycleStat
|
||||
// NormalizeScore invoked after scoring all nodes.
|
||||
func (pl *ServiceAffinity) NormalizeScore(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
|
||||
// Note that priorityReduceFunction doesn't use priority metadata, hence passing nil here.
|
||||
err := pl.priorityReduceFunction(pod, nil, pl.handle.SnapshotSharedLister(), scores)
|
||||
err := pl.priorityReduceFunction(pod, nil, pl.sharedLister, scores)
|
||||
return migration.ErrorToFrameworkStatus(err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user