diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities.go b/plugin/pkg/scheduler/algorithm/priorities/priorities.go index c1cce819f88..c6f7bc62ce8 100644 --- a/plugin/pkg/scheduler/algorithm/priorities/priorities.go +++ b/plugin/pkg/scheduler/algorithm/priorities/priorities.go @@ -21,6 +21,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource" "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm" "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm/predicates" "github.com/golang/glog" @@ -28,42 +29,72 @@ import ( // the unused capacity is calculated on a scale of 0-10 // 0 being the lowest priority and 10 being the highest -func calculateScore(requested, capacity int64, node string) int { +func calculateScore(requested int64, capacity int64, node string) int { if capacity == 0 { return 0 } if requested > capacity { - glog.Infof("Combined requested resources from existing pods exceeds capacity on minion: %s", node) + glog.Infof("Combined requested resources %d from existing pods exceeds capacity %d on minion: %s", + requested, capacity, node) return 0 } return int(((capacity - requested) * 10) / capacity) } +// For each of these resources, a pod that doesn't request the resource explicitly +// will be treated as having requested the amount indicated below, for the purpose +// of computing priority only. This ensures that when scheduling zero-limit pods, such +// pods will not all be scheduled to the machine with the smallest in-use limit, +// and that when scheduling regular pods, such pods will not see zero-limit pods as +// consuming no resources whatsoever. +const defaultMilliCpuLimit int64 = 100 // 0.1 core +const defaultMemoryLimit int64 = 60 * 1024 * 1024 // 60 MB + +// TODO: Consider setting default as a fixed fraction of machine capacity (take "capacity api.ResourceList" +// as an additional argument here) rather than using constants +func toNonzeroLimits(limits *api.ResourceList) (int64, int64) { + var out_millicpu, out_memory int64 + // Override if un-set, but not if explicitly set to zero + if (*limits.Cpu() == resource.Quantity{}) { + out_millicpu = defaultMilliCpuLimit + } else { + out_millicpu = limits.Cpu().MilliValue() + } + // Override if un-set, but not if explicitly set to zero + if (*limits.Memory() == resource.Quantity{}) { + out_memory = defaultMemoryLimit + } else { + out_memory = limits.Memory().Value() + } + return out_millicpu, out_memory +} + // Calculate the resource occupancy on a node. 'node' has information about the resources on the node. // 'pods' is a list of pods currently scheduled on the node. func calculateResourceOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority { totalMilliCPU := int64(0) totalMemory := int64(0) + capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue() + capacityMemory := node.Status.Capacity.Memory().Value() + for _, existingPod := range pods { for _, container := range existingPod.Spec.Containers { - totalMilliCPU += container.Resources.Limits.Cpu().MilliValue() - totalMemory += container.Resources.Limits.Memory().Value() + cpu, memory := toNonzeroLimits(&container.Resources.Limits) + totalMilliCPU += cpu + totalMemory += memory } } // Add the resources requested by the current pod being scheduled. // This also helps differentiate between differently sized, but empty, minions. 
for _, container := range pod.Spec.Containers { - totalMilliCPU += container.Resources.Limits.Cpu().MilliValue() - totalMemory += container.Resources.Limits.Memory().Value() + cpu, memory := toNonzeroLimits(&container.Resources.Limits) + totalMilliCPU += cpu + totalMemory += memory } - capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue() - capacityMemory := node.Status.Capacity.Memory().Value() - cpuScore := calculateScore(totalMilliCPU, capacityMilliCPU, node.Name) memoryScore := calculateScore(totalMemory, capacityMemory, node.Name) -// glog.V(10).Infof( - glog.Infof( + glog.V(10).Infof( "%v -> %v: Least Requested Priority, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d, %d)", pod.Name, node.Name, totalMilliCPU, totalMemory, @@ -95,47 +126,6 @@ func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, minionL return list, nil } -func min(l, r int64) (m int64) { - m = r - if l < r { - m = l - } - return m -} - -// See comment for DumbSpreadingPriority() -const dumbSpreadingDenominator int64 = 10 - -// DumbSpreadingPriority is a priority function that favors nodes with fewer pods. -// It works like LeastRequestedPeriority but instead of using 10 * percentage of machine free by resource, -// it uses 10 * percentage of machine free by pod, with "percentage of machine free by pod" claculated as -// (dumbSpreadingDenominator - number of pods already on the node + 1) / dumbSpreadingDenominator. -// dumbSpreadingDenominator serves like the machine capacity in LeasRequestedPriority but is chosen -// so that we equate one pod with a reasonable amount of resources when we combine all the scores together. -func DumbSpreadingPriority(pod *api.Pod, podLister algorithm.PodLister, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) { - nodes, err := minionLister.List() - if err != nil { - return algorithm.HostPriorityList{}, err - } - podsToMachines, err := predicates.MapPodsToMachines(podLister) - - list := algorithm.HostPriorityList{} - for _, node := range nodes.Items { - npods := int64(len(podsToMachines[node.Name])) - score := calculateScore(min(npods+1, dumbSpreadingDenominator), dumbSpreadingDenominator, node.Name) -// glog.V(10).Infof( - glog.Infof( - "%v -> %v: DumbSpreadPriority, Old # pods (%d) Score: (%d)", - pod.Name, node.Name, npods, score, - ) - list = append(list, algorithm.HostPriority{ - Host: node.Name, - Score: score, - }) - } - return list, nil -} - type NodeLabelPrioritizer struct { label string presence bool @@ -205,15 +195,17 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap score := int(0) for _, existingPod := range pods { for _, container := range existingPod.Spec.Containers { - totalMilliCPU += container.Resources.Limits.Cpu().MilliValue() - totalMemory += container.Resources.Limits.Memory().Value() + cpu, memory := toNonzeroLimits(&container.Resources.Limits) + totalMilliCPU += cpu + totalMemory += memory } } // Add the resources requested by the current pod being scheduled. // This also helps differentiate between differently sized, but empty, minions. 
 	for _, container := range pod.Spec.Containers {
-		totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-		totalMemory += container.Resources.Limits.Memory().Value()
+		cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+		totalMilliCPU += cpu
+		totalMemory += memory
 	}
 
 	capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
@@ -232,8 +224,7 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
 		diff := math.Abs(cpuFraction - memoryFraction)
 		score = int(10 - diff*10)
 	}
-//	glog.V(10).Infof(
-	glog.Infof(
+	glog.V(10).Infof(
 		"%v -> %v: Balanced Resource Allocation, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d)",
 		pod.Name, node.Name,
 		totalMilliCPU, totalMemory,
diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
index e4262dc2e6d..4662786fedb 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -19,6 +19,7 @@ package priorities
 import (
 	"reflect"
 	"sort"
+	"strconv"
 	"testing"
 
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
@@ -39,72 +40,83 @@ func makeMinion(node string, milliCPU, memory int64) api.Node {
 	}
 }
 
-func TestDumbSpreading(t *testing.T) {
+func TestZeroLimit(t *testing.T) {
+	// A pod with no resources. We expect spreading to count it as having the default resources.
 	noResources := api.PodSpec{
-		Containers: []api.Container{},
+		Containers: []api.Container{
+			{},
+		},
 	}
+	noResources1 := noResources
+	noResources1.NodeName = "machine1"
+	// A pod that explicitly requests the same resources a 0-limit pod is treated as requesting by default (for spreading).
 	small := api.PodSpec{
-		NodeName: "machine1",
 		Containers: []api.Container{
 			{
 				Resources: api.ResourceRequirements{
 					Limits: api.ResourceList{
-						"cpu": resource.MustParse("100m"),
-						"memory": resource.MustParse("1000"),
+						"cpu": resource.MustParse(
+							strconv.FormatInt(defaultMilliCpuLimit, 10) + "m"),
+						"memory": resource.MustParse(
+							strconv.FormatInt(defaultMemoryLimit, 10)),
 					},
 				},
 			},
 		},
 	}
+	small2 := small
+	small2.NodeName = "machine2"
+	// A larger pod.
 	large := api.PodSpec{
-		NodeName: "machine2",
 		Containers: []api.Container{
 			{
 				Resources: api.ResourceRequirements{
 					Limits: api.ResourceList{
-						"cpu": resource.MustParse("600m"),
-						"memory": resource.MustParse("6000"),
+						"cpu": resource.MustParse(
+							strconv.FormatInt(defaultMilliCpuLimit * 3, 10) + "m"),
+						"memory": resource.MustParse(
+							strconv.FormatInt(defaultMemoryLimit * 3, 10)),
 					},
 				},
 			},
 		},
 	}
+	large1 := large
+	large1.NodeName = "machine1"
+	large2 := large
+	large2.NodeName = "machine2"
 	tests := []struct {
 		pod          *api.Pod
 		pods         []*api.Pod
 		nodes        []api.Node
-		expectedList algorithm.HostPriorityList
 		test         string
 	}{
+		// The point of these tests is to show that you get the same priority for a zero-limit pod
+		// as for a pod with the default limits, both when the zero-limit pod is already on the machine
+		// and when the zero-limit pod is the one being scheduled.
{ - /* Minion1 CPU capacity 1000m, free 700m/7000, 3 pods - LeastRequestedPriority score 7 - BalancedResourceAllocation score 10 - ServiceSpreadingPriority score 10 - DumbSpreadingPriority score 6 - Total: 7 + 10 + 10 + 2*6 = 39 - - Minion2 CPU capacity 1000m, free 400m/4000, 1 pod - LeastRequestedPriority score 4 - BalancedResourceAllocation score 10 - ServiceSpreadingPriority score 10 - DumbSpreadingPriority score 8 - Total: 4 + 10 + 10 + 2*8 = 40 - - Moral of the story: We prefer the machine that is more heavily loaded, - because it has fewer pods. - */ pod: &api.Pod{Spec: noResources}, - nodes: []api.Node{makeMinion("machine1", 1000, 10000), makeMinion("machine2", 1000, 10000)}, - expectedList: []algorithm.HostPriority{{"machine1", 39}, {"machine2", 40}}, - test: "nothing scheduled, nothing requested", + // match current f1-micro on GCE + nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit * 10), makeMinion("machine2", 1000, defaultMemoryLimit * 10)}, + test: "test priority of zero-limit pod with machine with zero-limit pod", pods: []*api.Pod { - {Spec: small}, {Spec: small}, {Spec: small}, - {Spec: large}, + {Spec: large1}, {Spec: noResources1}, + {Spec: large2}, {Spec: small2}, + }, + }, + { + pod: &api.Pod{Spec: small}, + // match current f1-micro on GCE + nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit * 10), makeMinion("machine2", 1000, defaultMemoryLimit * 10)}, + test: "test priority of nonzero-limit pod with machine with zero-limit pod", + pods: []*api.Pod { + {Spec: large1}, {Spec: noResources1}, + {Spec: large2}, {Spec: small2}, }, }, } + const expectedPriority int = 25 for _, test := range tests { list, err := scheduler.PrioritizeNodes( test.pod, @@ -112,13 +124,15 @@ func TestDumbSpreading(t *testing.T) { // This should match the configuration in defaultPriorities() in // plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go if you want // to test what's actually in production. 
-			[]algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: DumbSpreadingPriority, Weight: 2}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
+			[]algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
 			algorithm.FakeMinionLister(api.NodeList{Items: test.nodes}))
 		if err != nil {
 			t.Errorf("unexpected error: %v", err)
 		}
-		if !reflect.DeepEqual(test.expectedList, list) {
-			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		for _, hp := range list {
+			if hp.Score != expectedPriority {
+				t.Errorf("%s: expected %d for all priorities, got list %#v", test.test, expectedPriority, list)
+			}
 		}
 	}
 }
@@ -149,6 +163,7 @@ func TestLeastRequested(t *testing.T) {
 					Resources: api.ResourceRequirements{
 						Limits: api.ResourceList{
 							"cpu": resource.MustParse("1000m"),
+							"memory": resource.MustParse("0"),
 						},
 					},
 				},
@@ -156,6 +171,7 @@
 					Resources: api.ResourceRequirements{
 						Limits: api.ResourceList{
 							"cpu": resource.MustParse("2000m"),
+							"memory": resource.MustParse("0"),
 						},
 					},
 				},
@@ -479,6 +495,7 @@ func TestBalancedResourceAllocation(t *testing.T) {
 					Resources: api.ResourceRequirements{
 						Limits: api.ResourceList{
 							"cpu": resource.MustParse("1000m"),
+							"memory": resource.MustParse("0"),
 						},
 					},
 				},
@@ -486,6 +503,7 @@
 					Resources: api.ResourceRequirements{
 						Limits: api.ResourceList{
 							"cpu": resource.MustParse("2000m"),
+							"memory": resource.MustParse("0"),
 						},
 					},
 				},
diff --git a/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
index ff9216e62d6..3435b4e7cba 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
@@ -83,8 +83,7 @@ func (s *ServiceSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorith
 			fScore = 10 * (float32(maxCount-counts[minion.Name]) / float32(maxCount))
 		}
 		result = append(result, algorithm.HostPriority{Host: minion.Name, Score: int(fScore)})
-		// glog.V(10).Infof(
-		glog.Infof(
+		glog.V(10).Infof(
 			"%v -> %v: ServiceSpreadPriority, Sore: (%d)",
 			pod.Name, minion.Name, int(fScore),
 		)
 	}
diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
index 5863b62b4c7..1546412b78a 100644
--- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
+++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -65,8 +65,6 @@ func defaultPriorities() util.StringSet {
 		factory.RegisterPriorityFunction("LeastRequestedPriority", priorities.LeastRequestedPriority, 1),
 		// Prioritizes nodes to help achieve balanced resource usage
 		factory.RegisterPriorityFunction("BalancedResourceAllocation", priorities.BalancedResourceAllocation, 1),
-		// Prioritizes nodes to achieve approximately equal number of pods per node
-		factory.RegisterPriorityFunction("DumbSpreadingPriority", priorities.DumbSpreadingPriority, 2),
 		// spreads pods by minimizing the number of pods (belonging to the same service) on the same minion.
 		factory.RegisterPriorityConfigFactory(
 			"ServiceSpreadingPriority",
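For context on the scoring math the patch relies on, here is a minimal standalone sketch (plain Go, no Kubernetes imports; the names containerLimits, nonzeroLimits, and score are simplified stand-ins for illustration, not the patch's actual code) showing why a pod with no limits and a pod that explicitly requests the 100m CPU / 60 MB defaults end up with the same per-resource score. Note one simplification: this sketch treats a zero value as "unset", whereas the real toNonzeroLimits distinguishes an unset Quantity from an explicit zero.

package main

import "fmt"

// Illustrative copies of the defaults introduced in the patch.
const defaultMilliCPU int64 = 100            // 0.1 core
const defaultMemory int64 = 60 * 1024 * 1024 // 60 MB

// containerLimits stands in for a container's resource limits; a zero
// field means "not specified" in this simplified sketch.
type containerLimits struct {
	milliCPU int64
	memory   int64
}

// nonzeroLimits mirrors the idea of toNonzeroLimits: unspecified limits
// fall back to the defaults so zero-limit pods still "consume" something.
func nonzeroLimits(l containerLimits) (int64, int64) {
	cpu, mem := l.milliCPU, l.memory
	if cpu == 0 {
		cpu = defaultMilliCPU
	}
	if mem == 0 {
		mem = defaultMemory
	}
	return cpu, mem
}

// score mirrors calculateScore: 10 * free/capacity, with 0 for a full or
// zero-capacity node.
func score(requested, capacity int64) int {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return int(((capacity - requested) * 10) / capacity)
}

func main() {
	capacityMilliCPU := int64(1000)      // 1 core
	capacityMemory := defaultMemory * 10 // roughly the f1-micro-sized node used in the test

	zeroLimit := containerLimits{} // nothing specified
	defaulted := containerLimits{milliCPU: defaultMilliCPU, memory: defaultMemory}

	for _, c := range []containerLimits{zeroLimit, defaulted} {
		cpu, mem := nonzeroLimits(c)
		fmt.Println(score(cpu, capacityMilliCPU), score(mem, capacityMemory))
	}
	// Both iterations print "9 9": the zero-limit pod is scored as if it
	// had requested the defaults, which is what the new test asserts.
}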