diff --git a/pkg/scheduler/algorithm/predicates/metadata.go b/pkg/scheduler/algorithm/predicates/metadata.go index cd5d0367d1a..f1f14969d38 100644 --- a/pkg/scheduler/algorithm/predicates/metadata.go +++ b/pkg/scheduler/algorithm/predicates/metadata.go @@ -74,7 +74,9 @@ type topologyPairsMaps struct { type topologyPairsPodSpreadMap struct { // This map is keyed with a topology key, and valued with minimum number // of pods matched on that topology domain. + // TODO(Huang-Wei): refactor to {tpKey->tpValSet(or tpValSlice)} topologyKeyToMinPodsMap map[string]int32 + // TODO(Huang-Wei): refactor to {tpPair->count, podName->tpPairSet(optional)} *topologyPairsMaps } @@ -231,10 +233,8 @@ func getTPMapMatchingSpreadConstraints(pod *v1.Pod, nodeInfoMap map[string]*sche return } // Ensure current node's labels contains all topologyKeys in 'constraints'. - for _, constraint := range constraints { - if _, ok := node.Labels[constraint.TopologyKey]; !ok { - return - } + if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) { + return } nodeTopologyMaps := newTopologyPairsMaps() @@ -319,6 +319,16 @@ func podMatchesSpreadConstraint(podLabelSet labels.Set, constraint v1.TopologySp return true, nil } +// check if ALL topology keys in spread constraints are present in node labels +func nodeLabelsMatchSpreadConstraints(nodeLabels map[string]string, constraints []v1.TopologySpreadConstraint) bool { + for _, constraint := range constraints { + if _, ok := nodeLabels[constraint.TopologyKey]; !ok { + return false + } + } + return true +} + // returns a pointer to a new topologyPairsMaps func newTopologyPairsMaps() *topologyPairsMaps { return &topologyPairsMaps{topologyPairToPods: make(map[topologyPair]podSet), @@ -374,6 +384,89 @@ func (m *topologyPairsMaps) clone() *topologyPairsMaps { return copy } +func (m *topologyPairsPodSpreadMap) addPod(addedPod, preemptorPod *v1.Pod, node *v1.Node) error { + if addedPod.Namespace != preemptorPod.Namespace { + return nil + } + constraints := getHardTopologySpreadConstraints(preemptorPod) + if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) { + return nil + } + + // records which topology key(s) needs to be updated + minMatchNeedingUpdate := make(map[string]struct{}) + podLabelSet := labels.Set(addedPod.Labels) + for _, constraint := range constraints { + if match, err := podMatchesSpreadConstraint(podLabelSet, constraint); err != nil { + return err + } else if !match { + continue + } + pair := topologyPair{ + key: constraint.TopologyKey, + value: node.Labels[constraint.TopologyKey], + } + // it means current node is one of the critical paths of topologyKeyToMinPodsMap[TopologyKey] + if int32(len(m.topologyPairToPods[pair])) == m.topologyKeyToMinPodsMap[pair.key] { + minMatchNeedingUpdate[pair.key] = struct{}{} + } + m.addTopologyPair(pair, addedPod) + } + // no need to addTopologyPairWithoutPods b/c if a pair without pods must be present, + // it should have already been created earlier in removePod() phase + + // In most cases, min match map doesn't need to be updated. + // But it's required to be updated when current node is the ONLY critical path which impacts + // the min match. With that said, in this case min match needs to be updated to min match + 1 + if len(minMatchNeedingUpdate) != 0 { + // TODO(Huang-Wei): performance can be optimized. + // A possible solution is to record number of critical paths which co-impact the min match. + tempMinMatchMap := make(map[string]int32, len(minMatchNeedingUpdate)) + for key := range minMatchNeedingUpdate { + tempMinMatchMap[key] = math.MaxInt32 + } + for pair, podSet := range m.topologyPairToPods { + if _, ok := minMatchNeedingUpdate[pair.key]; !ok { + continue + } + if l := int32(len(podSet)); l < tempMinMatchMap[pair.key] { + tempMinMatchMap[pair.key] = l + } + } + for key, tempMin := range tempMinMatchMap { + if tempMin == m.topologyKeyToMinPodsMap[key]+1 { + m.topologyKeyToMinPodsMap[key] = tempMin + } + } + } + return nil +} + +func (m *topologyPairsPodSpreadMap) removePod(deletedPod *v1.Pod) { + if m == nil || deletedPod == nil { + return + } + + deletedPodFullName := schedutil.GetPodFullName(deletedPod) + pairSet, ok := m.podToTopologyPairs[deletedPodFullName] + if !ok { + return + } + topologyPairToPods := m.topologyPairToPods + for pair := range pairSet { + delete(topologyPairToPods[pair], deletedPod) + // if topologyPairToPods[pair] is empty after deletion + // don't clean it up as that topology counts as a match now + + // removal of the deletedPod would probably genereate a smaller matching number + // so re-calculate minMatch to a smaller value if possible + if l := int32(len(topologyPairToPods[pair])); l < m.topologyKeyToMinPodsMap[pair.key] { + m.topologyKeyToMinPodsMap[pair.key] = l + } + } + delete(m.podToTopologyPairs, deletedPodFullName) +} + func (m *topologyPairsPodSpreadMap) clone() *topologyPairsPodSpreadMap { // m could be nil when EvenPodsSpread feature is disabled if m == nil { @@ -400,6 +493,8 @@ func (meta *predicateMetadata) RemovePod(deletedPod *v1.Pod) error { // Delete pod from the matching affinity or anti-affinity topology pairs maps. meta.topologyPairsPotentialAffinityPods.removePod(deletedPod) meta.topologyPairsPotentialAntiAffinityPods.removePod(deletedPod) + // Delete pod from the pod spread topology maps. + meta.topologyPairsPodSpreadMap.removePod(deletedPod) // All pods in the serviceAffinityMatchingPodList are in the same namespace. // So, if the namespace of the first one is not the same as the namespace of the // deletedPod, we don't need to check the list, as deletedPod isn't in the list. @@ -461,6 +556,12 @@ func (meta *predicateMetadata) AddPod(addedPod *v1.Pod, nodeInfo *schedulernodei } } } + // Update meta.topologyPairsPodSpreadMap if meta.pod has hard spread constraints + // and addedPod matches that + if err := meta.topologyPairsPodSpreadMap.addPod(addedPod, meta.pod, nodeInfo.Node()); err != nil { + return err + } + // If addedPod is in the same namespace as the meta.pod, update the list // of matching pods if applicable. if meta.serviceAffinityInUse && addedPod.Namespace == meta.pod.Namespace { diff --git a/pkg/scheduler/algorithm/predicates/metadata_test.go b/pkg/scheduler/algorithm/predicates/metadata_test.go index 4a284667f60..911c05922bb 100644 --- a/pkg/scheduler/algorithm/predicates/metadata_test.go +++ b/pkg/scheduler/algorithm/predicates/metadata_test.go @@ -1243,6 +1243,547 @@ func TestGetTPMapMatchingSpreadConstraints(t *testing.T) { } } +func TestPodSpreadMap_addPod(t *testing.T) { + tests := []struct { + name string + preemptorPod *v1.Pod + addedPod *v1.Pod + existingPods []*v1.Pod + nodeIdx int // denotes which node 'addedPod' belongs to + nodes []*v1.Node + injectPodPointers map[topologyPair][]int // non-negative index refers to existingPods[i], negative index refers to addedPod + want *topologyPairsPodSpreadMap + }{ + { + name: "node a and b both impact current min match", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: nil, // it's an empty cluster + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-b"}: {}, + }, + want: &topologyPairsPodSpreadMap{ + // min match map shouldn't be changed b/c node-b is still on the critical path + // determining min match + topologyKeyToMinPodsMap: map[string]int32{"node": 0}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("node", "node-a"), + }, + }, + }, + }, + { + name: "only node a impacts current min match", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-b"}: {0}, + }, + want: &topologyPairsPodSpreadMap{ + // min match should be changed from 0 to 1 + topologyKeyToMinPodsMap: map[string]int32{"node": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("node", "node-a"), + "p-b1_": newPairSet("node", "node-b"), + }, + }, + }, + }, + { + name: "add a pod with mis-matched namespace doesn't change topologyKeyToMinPodsMap", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Namespace("ns1").Node("node-a").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "node", value: "node-a"}: {}, + {key: "node", value: "node-b"}: {0}, + }, + want: &topologyPairsPodSpreadMap{ + // min match remains the same + topologyKeyToMinPodsMap: map[string]int32{"node": 0}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + // "p-a1_": newPairSet("node", "node-a") shouldn't exist + "p-b1_": newPairSet("node", "node-b"), + }, + }, + }, + }, + { + name: "add pod on non-critical node won't trigger re-calculation", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-b2").Node("node-b").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + }, + nodeIdx: 1, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "node", value: "node-a"}: {}, + {key: "node", value: "node-b"}: {-1, 0}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"node": 0}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-b1_": newPairSet("node", "node-b"), + "p-b2_": newPairSet("node", "node-b"), + }, + }, + }, + }, + { + name: "node a and x both impact topologyKeyToMinPodsMap on zone and node", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: nil, // it's an empty cluster + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {-1}, + {key: "zone", value: "zone2"}: {}, + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-x"}: {}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"zone": 0, "node": 0}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1", "node", "node-a"), + }, + }, + }, + }, + { + name: "only node a impacts topologyKeyToMinPodsMap on zone and node", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {-1}, + {key: "zone", value: "zone2"}: {0}, + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-x"}: {0}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"zone": 1, "node": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1", "node", "node-a"), + "p-x1_": newPairSet("zone", "zone2", "node", "node-x"), + }, + }, + }, + }, + { + name: "node a impacts topologyKeyToMinPodsMap on node, node x impacts topologyKeyToMinPodsMap on zone", + preemptorPod: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-b2").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {-1, 0, 1}, + {key: "zone", value: "zone2"}: {2}, + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-b"}: {0, 1}, + {key: "node", value: "node-x"}: {2}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"zone": 1, "node": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1", "node", "node-a"), + "p-b1_": newPairSet("zone", "zone1", "node", "node-b"), + "p-b2_": newPairSet("zone", "zone1", "node", "node-b"), + "p-x1_": newPairSet("zone", "zone2", "node", "node-x"), + }, + }, + }, + }, + { + name: "constraints hold different labelSelectors, node a impacts topologyKeyToMinPodsMap on node", + preemptorPod: st.MakePod().Name("p").Label("foo", "").Label("bar", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("bar").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Label("bar", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-x2").Node("node-x").Label("bar", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {-1, 0}, + {key: "zone", value: "zone2"}: {1}, + {key: "node", value: "node-a"}: {}, + {key: "node", value: "node-b"}: {0}, + {key: "node", value: "node-x"}: {2}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"zone": 1, "node": 0}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1"), + "p-b1_": newPairSet("zone", "zone1", "node", "node-b"), + "p-x1_": newPairSet("zone", "zone2"), + "p-x2_": newPairSet("node", "node-x"), + }, + }, + }, + }, + { + name: "constraints hold different labelSelectors, node a impacts topologyKeyToMinPodsMap on both zone and node", + preemptorPod: st.MakePod().Name("p").Label("foo", "").Label("bar", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("bar").Obj()). + Obj(), + addedPod: st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Label("bar", "").Obj(), + existingPods: []*v1.Pod{ + st.MakePod().Name("p-b1").Node("node-b").Label("bar", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-x2").Node("node-x").Label("bar", "").Obj(), + }, + nodeIdx: 0, + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {-1}, + {key: "zone", value: "zone2"}: {1}, + {key: "node", value: "node-a"}: {-1}, + {key: "node", value: "node-b"}: {0}, + {key: "node", value: "node-x"}: {2}, + }, + want: &topologyPairsPodSpreadMap{ + topologyKeyToMinPodsMap: map[string]int32{"zone": 1, "node": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1", "node", "node-a"), + "p-b1_": newPairSet("node", "node-b"), + "p-x1_": newPairSet("zone", "zone2"), + "p-x2_": newPairSet("node", "node-x"), + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.want.topologyPairToPods = make(map[topologyPair]podSet) + for pair, indexes := range tt.injectPodPointers { + pSet := make(podSet) + for _, i := range indexes { + if i >= 0 { + pSet[tt.existingPods[i]] = struct{}{} + } else { + pSet[tt.addedPod] = struct{}{} + } + } + tt.want.topologyPairToPods[pair] = pSet + } + + nodeInfoMap := schedulernodeinfo.CreateNodeNameToInfoMap(tt.existingPods, tt.nodes) + podSpreadMap, _ := getTPMapMatchingSpreadConstraints(tt.preemptorPod, nodeInfoMap) + + podSpreadMap.addPod(tt.addedPod, tt.preemptorPod, tt.nodes[tt.nodeIdx]) + if !reflect.DeepEqual(podSpreadMap, tt.want) { + t.Errorf("podSpreadMap#addPod() = %v, want %v", podSpreadMap, tt.want) + } + }) + } +} + +func TestPodSpreadMap_removePod(t *testing.T) { + tests := []struct { + name string + preemptor *v1.Pod // preemptor pod + nodes []*v1.Node + existingPods []*v1.Pod + deletedPodIdx int // need to reuse *Pod of existingPods[i] + deletedPod *v1.Pod // if deletedPodIdx is invalid, this field is bypassed + injectPodPointers map[topologyPair][]int + want *topologyPairsPodSpreadMap + }{ + { + // A high priority pod may not be scheduled due to node taints or resource shortage. + // So preemption is triggered. + name: "one spreadConstraint on zone, topologyKeyToMinPodsMap unchanged", + preemptor: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + existingPods: []*v1.Pod{ + st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + }, + deletedPodIdx: 0, // remove pod "p-a1" + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {1}, + {key: "zone", value: "zone2"}: {2}, + }, + want: &topologyPairsPodSpreadMap{ + // topologyKeyToMinPodsMap actually doesn't change + topologyKeyToMinPodsMap: map[string]int32{"zone": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-b1_": newPairSet("zone", "zone1"), + "p-x1_": newPairSet("zone", "zone2"), + }, + }, + }, + }, + { + name: "one spreadConstraint on node, topologyKeyToMinPodsMap changed", + preemptor: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + st.MakeNode().Name("node-y").Label("zone", "zone2").Label("node", "node-y").Obj(), + }, + existingPods: []*v1.Pod{ + st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-y1").Node("node-y").Label("foo", "").Obj(), + }, + deletedPodIdx: 0, // remove pod "p-a1" + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {1}, + {key: "zone", value: "zone2"}: {2, 3}, + }, + want: &topologyPairsPodSpreadMap{ + // topologyKeyToMinPodsMap is expected to be re-calculated from {"zone": 2} + // to {"zone": 1} + topologyKeyToMinPodsMap: map[string]int32{"zone": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-b1_": newPairSet("zone", "zone1"), + "p-x1_": newPairSet("zone", "zone2"), + "p-y1_": newPairSet("zone", "zone2"), + }, + }, + }, + }, + { + name: "delete an irrelevant pod won't help", + preemptor: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + st.MakeNode().Name("node-y").Label("zone", "zone2").Label("node", "node-y").Obj(), + }, + existingPods: []*v1.Pod{ + st.MakePod().Name("p-a0").Node("node-a").Label("bar", "").Obj(), + st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-y1").Node("node-y").Label("foo", "").Obj(), + }, + deletedPodIdx: 0, // remove pod "p-a0" + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {1, 2}, + {key: "zone", value: "zone2"}: {3, 4}, + }, + want: &topologyPairsPodSpreadMap{ + // topologyKeyToMinPodsMap is unchanged + topologyKeyToMinPodsMap: map[string]int32{"zone": 2}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1"), + "p-b1_": newPairSet("zone", "zone1"), + "p-x1_": newPairSet("zone", "zone2"), + "p-y1_": newPairSet("zone", "zone2"), + }, + }, + }, + }, + { + name: "delete a non-existing pod won't help", + preemptor: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + st.MakeNode().Name("node-y").Label("zone", "zone2").Label("node", "node-y").Obj(), + }, + existingPods: []*v1.Pod{ + st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-y1").Node("node-y").Label("foo", "").Obj(), + }, + deletedPodIdx: -1, + deletedPod: st.MakePod().Name("p-a0").Node("node-a").Label("bar", "").Obj(), + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {0, 1}, + {key: "zone", value: "zone2"}: {2, 3}, + }, + want: &topologyPairsPodSpreadMap{ + // topologyKeyToMinPodsMap is unchanged + topologyKeyToMinPodsMap: map[string]int32{"zone": 2}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1"), + "p-b1_": newPairSet("zone", "zone1"), + "p-x1_": newPairSet("zone", "zone2"), + "p-y1_": newPairSet("zone", "zone2"), + }, + }, + }, + }, + { + name: "two spreadConstraints", + preemptor: st.MakePod().Name("p").Label("foo", ""). + SpreadConstraint(1, "zone", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "node", hardSpread, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + nodes: []*v1.Node{ + st.MakeNode().Name("node-a").Label("zone", "zone1").Label("node", "node-a").Obj(), + st.MakeNode().Name("node-b").Label("zone", "zone1").Label("node", "node-b").Obj(), + st.MakeNode().Name("node-x").Label("zone", "zone2").Label("node", "node-x").Obj(), + }, + existingPods: []*v1.Pod{ + st.MakePod().Name("p-a1").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-a2").Node("node-a").Label("foo", "").Obj(), + st.MakePod().Name("p-b1").Node("node-b").Label("foo", "").Obj(), + st.MakePod().Name("p-x1").Node("node-x").Label("foo", "").Obj(), + st.MakePod().Name("p-x2").Node("node-x").Label("foo", "").Obj(), + }, + deletedPodIdx: 3, // remove pod "p-x1" + injectPodPointers: map[topologyPair][]int{ + {key: "zone", value: "zone1"}: {0, 1, 2}, + {key: "zone", value: "zone2"}: {4}, + {key: "node", value: "node-a"}: {0, 1}, + {key: "node", value: "node-b"}: {2}, + {key: "node", value: "node-x"}: {4}, + }, + want: &topologyPairsPodSpreadMap{ + // topologyKeyToMinPodsMap is expected to be re-calculated from {"zone": 2, "node": 1} + // to {"zone": 1, "node": 1} + topologyKeyToMinPodsMap: map[string]int32{"zone": 1, "node": 1}, + topologyPairsMaps: &topologyPairsMaps{ + podToTopologyPairs: map[string]topologyPairSet{ + "p-a1_": newPairSet("zone", "zone1", "node", "node-a"), + "p-a2_": newPairSet("zone", "zone1", "node", "node-a"), + "p-b1_": newPairSet("zone", "zone1", "node", "node-b"), + "p-x2_": newPairSet("zone", "zone2", "node", "node-x"), + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.want.topologyPairToPods = make(map[topologyPair]podSet) + for pair, indexes := range tt.injectPodPointers { + pSet := make(podSet) + for _, i := range indexes { + pSet[tt.existingPods[i]] = struct{}{} + } + tt.want.topologyPairToPods[pair] = pSet + } + + nodeInfoMap := schedulernodeinfo.CreateNodeNameToInfoMap(tt.existingPods, tt.nodes) + podSpreadMap, _ := getTPMapMatchingSpreadConstraints(tt.preemptor, nodeInfoMap) + + var deletedPod *v1.Pod + if tt.deletedPodIdx < len(tt.existingPods) && tt.deletedPodIdx >= 0 { + deletedPod = tt.existingPods[tt.deletedPodIdx] + } else { + deletedPod = tt.deletedPod + } + podSpreadMap.removePod(deletedPod) + if !reflect.DeepEqual(podSpreadMap, tt.want) { + t.Errorf("podSpreadMap#removePod() = %v, want %v", podSpreadMap, tt.want) + } + }) + } +} + var ( hardSpread = v1.DoNotSchedule softSpread = v1.ScheduleAnyway diff --git a/pkg/scheduler/algorithm/predicates/predicates.go b/pkg/scheduler/algorithm/predicates/predicates.go index 4a47662c2ca..059c11220a6 100644 --- a/pkg/scheduler/algorithm/predicates/predicates.go +++ b/pkg/scheduler/algorithm/predicates/predicates.go @@ -1722,7 +1722,6 @@ func EvenPodsSpreadPredicate(pod *v1.Pod, meta PredicateMetadata, nodeInfo *sche if node == nil { return false, nil, fmt.Errorf("node not found") } - constraints := getHardTopologySpreadConstraints(pod) if len(constraints) == 0 { return true, nil, nil diff --git a/pkg/scheduler/core/generic_scheduler_test.go b/pkg/scheduler/core/generic_scheduler_test.go index cd5bf50493a..61a955bcecc 100644 --- a/pkg/scheduler/core/generic_scheduler_test.go +++ b/pkg/scheduler/core/generic_scheduler_test.go @@ -553,11 +553,9 @@ func TestGenericScheduler(t *testing.T) { pvcLister := schedulertesting.FakePersistentVolumeClaimLister(pvcs) - var predMetaProducer algorithmpredicates.PredicateMetadataProducer + predMetaProducer := algorithmpredicates.EmptyPredicateMetadataProducer if test.buildPredMeta { predMetaProducer = algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister(test.pods)) - } else { - predMetaProducer = algorithmpredicates.EmptyPredicateMetadataProducer } scheduler := NewGenericScheduler( cache, @@ -1083,7 +1081,83 @@ func TestSelectNodesForPreemption(t *testing.T) { expected: map[string]map[string]bool{"machine1": {"a": true}, "machine2": {}}, addAffinityPredicate: true, }, + { + name: "preemption to resolve even pods spread FitError", + predicates: map[string]algorithmpredicates.FitPredicate{ + "matches": algorithmpredicates.EvenPodsSpreadPredicate, + }, + nodes: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", + Labels: map[string]string{"foo": ""}, + }, + Spec: v1.PodSpec{ + Priority: &highPriority, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + { + MaxSkew: 1, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &midPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + }, + expected: map[string]map[string]bool{ + "node-a": {"pod-a2": true}, + "node-b": {"pod-b1": true}, + }, + }, } + labelKeys := []string{"hostname", "zone", "region"} for _, test := range tests { t.Run(test.name, func(t *testing.T) { assignDefaultStartTime(test.pods) @@ -1091,7 +1165,13 @@ func TestSelectNodesForPreemption(t *testing.T) { nodes := []*v1.Node{} for _, n := range test.nodes { node := makeNode(n, 1000*5, priorityutil.DefaultMemoryRequest*5) - node.ObjectMeta.Labels = map[string]string{"hostname": node.Name} + // if possible, split node name by '/' to form labels in a format of + // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} + node.ObjectMeta.Labels = make(map[string]string) + for i, label := range strings.Split(node.Name, "/") { + node.ObjectMeta.Labels[labelKeys[i]] = label + } + node.Name = node.ObjectMeta.Labels["hostname"] nodes = append(nodes, node) } if test.addAffinityPredicate { @@ -1416,6 +1496,15 @@ func TestNodesWherePreemptionMightHelp(t *testing.T) { }, expected: map[string]bool{"machine4": true}, }, + { + name: "ErrTopologySpreadConstraintsNotMatch should be tried as it indicates that the pod is unschedulable due to topology spread constraints", + failedPredMap: FailedPredicateMap{ + "machine1": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrTopologySpreadConstraintsNotMatch}, + "machine2": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrPodNotMatchHostName}, + "machine3": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrTopologySpreadConstraintsNotMatch}, + }, + expected: map[string]bool{"machine1": true, "machine3": true, "machine4": true}, + }, } for _, test := range tests { @@ -1435,27 +1524,31 @@ func TestNodesWherePreemptionMightHelp(t *testing.T) { func TestPreempt(t *testing.T) { defer algorithmpredicates.SetPredicatesOrderingDuringTest(order)() - failedPredMap := FailedPredicateMap{ + defaultFailedPredMap := FailedPredicateMap{ "machine1": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 500, 300)}, "machine2": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrDiskConflict}, "machine3": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.NewInsufficientResourceError(v1.ResourceMemory, 1000, 600, 400)}, } // Prepare 3 node names. - nodeNames := []string{} + defaultNodeNames := []string{} for i := 1; i < 4; i++ { - nodeNames = append(nodeNames, fmt.Sprintf("machine%d", i)) + defaultNodeNames = append(defaultNodeNames, fmt.Sprintf("machine%d", i)) } var ( preemptLowerPriority = v1.PreemptLowerPriority preemptNever = v1.PreemptNever ) tests := []struct { - name string - pod *v1.Pod - pods []*v1.Pod - extenders []*FakeExtender - expectedNode string - expectedPods []string // list of preempted pods + name string + pod *v1.Pod + pods []*v1.Pod + extenders []*FakeExtender + failedPredMap FailedPredicateMap + nodeNames []string + predicate algorithmpredicates.FitPredicate + buildPredMeta bool + expectedNode string + expectedPods []string // list of preempted pods }{ { name: "basic preemption logic", @@ -1489,6 +1582,83 @@ func TestPreempt(t *testing.T) { expectedNode: "machine3", expectedPods: []string{}, }, + { + name: "preemption for topology spread constraints", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p", + Labels: map[string]string{"foo": ""}, + }, + Spec: v1.PodSpec{ + Priority: &highPriority, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + { + MaxSkew: 1, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a1", UID: types.UID("pod-a1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-a2", UID: types.UID("pod-a2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-a", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-b1", UID: types.UID("pod-b1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-b", Priority: &lowPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x1", UID: types.UID("pod-x1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-x2", UID: types.UID("pod-x2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{NodeName: "node-x", Priority: &highPriority}, + Status: v1.PodStatus{Phase: v1.PodRunning}, + }, + }, + failedPredMap: FailedPredicateMap{ + "node-a": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrTopologySpreadConstraintsNotMatch}, + "node-b": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrTopologySpreadConstraintsNotMatch}, + "node-x": []algorithmpredicates.PredicateFailureReason{algorithmpredicates.ErrTopologySpreadConstraintsNotMatch}, + }, + predicate: algorithmpredicates.EvenPodsSpreadPredicate, + buildPredMeta: true, + nodeNames: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, + expectedNode: "node-b", + expectedPods: []string{"pod-b1"}, + }, { name: "Scheduler extenders allow only machine1, otherwise machine3 would have been chosen", pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1")}, Spec: v1.PodSpec{ @@ -1618,6 +1788,7 @@ func TestPreempt(t *testing.T) { }, } + labelKeys := []string{"hostname", "zone", "region"} for _, test := range tests { t.Run(test.name, func(t *testing.T) { t.Logf("===== Running test %v", t.Name()) @@ -1627,14 +1798,26 @@ func TestPreempt(t *testing.T) { cache.AddPod(pod) } cachedNodeInfoMap := map[string]*schedulernodeinfo.NodeInfo{} - for _, name := range nodeNames { + nodeNames := defaultNodeNames + if len(test.nodeNames) != 0 { + nodeNames = test.nodeNames + } + for i, name := range nodeNames { node := makeNode(name, 1000*5, priorityutil.DefaultMemoryRequest*5) + // if possible, split node name by '/' to form labels in a format of + // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} + node.ObjectMeta.Labels = make(map[string]string) + for i, label := range strings.Split(node.Name, "/") { + node.ObjectMeta.Labels[labelKeys[i]] = label + } + node.Name = node.ObjectMeta.Labels["hostname"] cache.AddNode(node) + nodeNames[i] = node.Name // Set nodeInfo to extenders to mock extenders' cache for preemption. cachedNodeInfo := schedulernodeinfo.NewNodeInfo() cachedNodeInfo.SetNode(node) - cachedNodeInfoMap[name] = cachedNodeInfo + cachedNodeInfoMap[node.Name] = cachedNodeInfo } extenders := []algorithm.SchedulerExtender{} for _, extender := range test.extenders { @@ -1642,11 +1825,19 @@ func TestPreempt(t *testing.T) { extender.cachedNodeNameToInfo = cachedNodeInfoMap extenders = append(extenders, extender) } + predicate := algorithmpredicates.PodFitsResources + if test.predicate != nil { + predicate = test.predicate + } + predMetaProducer := algorithmpredicates.EmptyPredicateMetadataProducer + if test.buildPredMeta { + predMetaProducer = algorithmpredicates.NewPredicateMetadataFactory(schedulertesting.FakePodLister(test.pods)) + } scheduler := NewGenericScheduler( cache, internalqueue.NewSchedulingQueue(nil, nil), - map[string]algorithmpredicates.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, - algorithmpredicates.EmptyPredicateMetadataProducer, + map[string]algorithmpredicates.FitPredicate{"matches": predicate}, + predMetaProducer, []priorities.PriorityConfig{{Function: numericPriority, Weight: 1}}, priorities.EmptyPriorityMetadataProducer, emptyFramework, @@ -1660,6 +1851,10 @@ func TestPreempt(t *testing.T) { true) scheduler.(*genericScheduler).snapshot() // Call Preempt and check the expected results. + failedPredMap := defaultFailedPredMap + if test.failedPredMap != nil { + failedPredMap = test.failedPredMap + } node, victims, _, err := scheduler.Preempt(test.pod, schedulertesting.FakeNodeLister(makeNodeList(nodeNames)), error(&FitError{Pod: test.pod, FailedPredicates: failedPredMap})) if err != nil { t.Errorf("unexpected error in preemption: %v", err)