diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities.go b/plugin/pkg/scheduler/algorithm/priorities/priorities.go
index 44e874a722a..4a4e333131a 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/priorities.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/priorities.go
@@ -20,6 +20,7 @@ import (
     "math"
 
     "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+    "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
     "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
     "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
     "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
@@ -28,38 +29,69 @@ import (
 
 // the unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest
-func calculateScore(requested, capacity int64, node string) int {
+func calculateScore(requested int64, capacity int64, node string) int {
     if capacity == 0 {
         return 0
     }
     if requested > capacity {
-        glog.Infof("Combined requested resources from existing pods exceeds capacity on minion: %s", node)
+        glog.Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
+            requested, capacity, node)
         return 0
     }
     return int(((capacity - requested) * 10) / capacity)
 }
 
-// Calculate the occupancy on a node. 'node' has information about the resources on the node.
+// For each of these resources, a pod that doesn't request the resource explicitly
+// will be treated as having requested the amount indicated below, for the purpose
+// of computing priority only. This ensures that when scheduling zero-limit pods, such
+// pods will not all be scheduled to the machine with the smallest in-use limit,
+// and that when scheduling regular pods, such pods will not see zero-limit pods as
+// consuming no resources whatsoever.
+const defaultMilliCpuLimit int64 = 100            // 0.1 core
+const defaultMemoryLimit int64 = 60 * 1024 * 1024 // 60 MB
+
+// TODO: Consider setting default as a fixed fraction of machine capacity (take "capacity api.ResourceList"
+// as an additional argument here) rather than using constants
+func getNonzeroLimits(limits *api.ResourceList) (int64, int64) {
+    var out_millicpu, out_memory int64
+    // Override if un-set, but not if explicitly set to zero
+    if (*limits.Cpu() == resource.Quantity{}) {
+        out_millicpu = defaultMilliCpuLimit
+    } else {
+        out_millicpu = limits.Cpu().MilliValue()
+    }
+    // Override if un-set, but not if explicitly set to zero
+    if (*limits.Memory() == resource.Quantity{}) {
+        out_memory = defaultMemoryLimit
+    } else {
+        out_memory = limits.Memory().Value()
+    }
+    return out_millicpu, out_memory
+}
+
+// Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
 // 'pods' is a list of pods currently scheduled on the node.
-func calculateOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
+func calculateResourceOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
     totalMilliCPU := int64(0)
     totalMemory := int64(0)
+    capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
+    capacityMemory := node.Status.Capacity.Memory().Value()
+
     for _, existingPod := range pods {
         for _, container := range existingPod.Spec.Containers {
-            totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-            totalMemory += container.Resources.Limits.Memory().Value()
+            cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+            totalMilliCPU += cpu
+            totalMemory += memory
         }
     }
     // Add the resources requested by the current pod being scheduled.
     // This also helps differentiate between differently sized, but empty, minions.
     for _, container := range pod.Spec.Containers {
-        totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-        totalMemory += container.Resources.Limits.Memory().Value()
+        cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+        totalMilliCPU += cpu
+        totalMemory += memory
     }
-    capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
-    capacityMemory := node.Status.Capacity.Memory().Value()
-
     cpuScore := calculateScore(totalMilliCPU, capacityMilliCPU, node.Name)
     memoryScore := calculateScore(totalMemory, capacityMemory, node.Name)
     glog.V(10).Infof(
@@ -89,7 +121,7 @@ func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, minionL
     list := algorithm.HostPriorityList{}
 
     for _, node := range nodes.Items {
-        list = append(list, calculateOccupancy(pod, node, podsToMachines[node.Name]))
+        list = append(list, calculateResourceOccupancy(pod, node, podsToMachines[node.Name]))
     }
     return list, nil
 }
@@ -163,15 +195,17 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
     score := int(0)
     for _, existingPod := range pods {
         for _, container := range existingPod.Spec.Containers {
-            totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-            totalMemory += container.Resources.Limits.Memory().Value()
+            cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+            totalMilliCPU += cpu
+            totalMemory += memory
         }
     }
     // Add the resources requested by the current pod being scheduled.
     // This also helps differentiate between differently sized, but empty, minions.
     for _, container := range pod.Spec.Containers {
-        totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-        totalMemory += container.Resources.Limits.Memory().Value()
+        cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+        totalMilliCPU += cpu
+        totalMemory += memory
     }
 
     capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
index ab43cecfedb..b6af3446f29 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -19,10 +19,12 @@ package priorities
 import (
     "reflect"
     "sort"
+    "strconv"
     "testing"
 
     "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
     "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
+    "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler"
     "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
 )
 
@@ -38,6 +40,103 @@ func makeMinion(node string, milliCPU, memory int64) api.Node {
     }
 }
 
+func TestZeroLimit(t *testing.T) {
+    // A pod with no resources. We expect spreading to count it as having the default resources.
+    noResources := api.PodSpec{
+        Containers: []api.Container{
+            {},
+        },
+    }
+    noResources1 := noResources
+    noResources1.NodeName = "machine1"
+    // A pod that explicitly requests the same resources that a zero-limit pod is treated as having (for spreading).
+    small := api.PodSpec{
+        Containers: []api.Container{
+            {
+                Resources: api.ResourceRequirements{
+                    Limits: api.ResourceList{
+                        "cpu": resource.MustParse(
+                            strconv.FormatInt(defaultMilliCpuLimit, 10) + "m"),
+                        "memory": resource.MustParse(
+                            strconv.FormatInt(defaultMemoryLimit, 10)),
+                    },
+                },
+            },
+        },
+    }
+    small2 := small
+    small2.NodeName = "machine2"
+    // A larger pod.
+    large := api.PodSpec{
+        Containers: []api.Container{
+            {
+                Resources: api.ResourceRequirements{
+                    Limits: api.ResourceList{
+                        "cpu": resource.MustParse(
+                            strconv.FormatInt(defaultMilliCpuLimit*3, 10) + "m"),
+                        "memory": resource.MustParse(
+                            strconv.FormatInt(defaultMemoryLimit*3, 10)),
+                    },
+                },
+            },
+        },
+    }
+    large1 := large
+    large1.NodeName = "machine1"
+    large2 := large
+    large2.NodeName = "machine2"
+    tests := []struct {
+        pod   *api.Pod
+        pods  []*api.Pod
+        nodes []api.Node
+        test  string
+    }{
+        // The point of these tests is to show that you get the same priority for a zero-limit pod
+        // as for a pod with the default limits, both when the zero-limit pod is already on the machine
+        // and when the zero-limit pod is the one being scheduled.
+        {
+            pod: &api.Pod{Spec: noResources},
+            // match current f1-micro on GCE
+            nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit*10), makeMinion("machine2", 1000, defaultMemoryLimit*10)},
+            test:  "test priority of zero-limit pod on machine with zero-limit pod",
+            pods: []*api.Pod{
+                {Spec: large1}, {Spec: noResources1},
+                {Spec: large2}, {Spec: small2},
+            },
+        },
+        {
+            pod: &api.Pod{Spec: small},
+            // match current f1-micro on GCE
+            nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit*10), makeMinion("machine2", 1000, defaultMemoryLimit*10)},
+            test:  "test priority of nonzero-limit pod on machine with zero-limit pod",
+            pods: []*api.Pod{
+                {Spec: large1}, {Spec: noResources1},
+                {Spec: large2}, {Spec: small2},
+            },
+        },
+    }
+
+    const expectedPriority int = 25
+    for _, test := range tests {
+        list, err := scheduler.PrioritizeNodes(
+            test.pod,
+            algorithm.FakePodLister(test.pods),
+            // This should match the configuration in defaultPriorities() in
+            // plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go if you want
+            // to test what's actually in production.
+            []algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
+            algorithm.FakeMinionLister(api.NodeList{Items: test.nodes}))
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+        }
+        for _, hp := range list {
+            if hp.Score != expectedPriority {
+                t.Errorf("%s: expected %d for all priorities, got list %#v", test.test, expectedPriority, list)
+            }
+        }
+    }
+}
+
 func TestLeastRequested(t *testing.T) {
     labels1 := map[string]string{
         "foo": "bar",
@@ -62,14 +161,16 @@ func TestLeastRequested(t *testing.T) {
         {
             Resources: api.ResourceRequirements{
                 Limits: api.ResourceList{
-                    "cpu": resource.MustParse("1000m"),
+                    "cpu":    resource.MustParse("1000m"),
+                    "memory": resource.MustParse("0"),
                 },
             },
         },
         {
             Resources: api.ResourceRequirements{
                 Limits: api.ResourceList{
-                    "cpu": resource.MustParse("2000m"),
+                    "cpu":    resource.MustParse("2000m"),
+                    "memory": resource.MustParse("0"),
                 },
             },
         },
@@ -392,14 +493,16 @@ func TestBalancedResourceAllocation(t *testing.T) {
         {
             Resources: api.ResourceRequirements{
                 Limits: api.ResourceList{
-                    "cpu": resource.MustParse("1000m"),
+                    "cpu":    resource.MustParse("1000m"),
+                    "memory": resource.MustParse("0"),
                 },
             },
         },
         {
             Resources: api.ResourceRequirements{
                 Limits: api.ResourceList{
-                    "cpu": resource.MustParse("2000m"),
+                    "cpu":    resource.MustParse("2000m"),
+                    "memory": resource.MustParse("0"),
                 },
             },
         },
diff --git a/plugin/pkg/scheduler/algorithm/priorities/spreading.go b/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
similarity index 97%
rename from plugin/pkg/scheduler/algorithm/priorities/spreading.go
rename to plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
index eaddad66d0d..663f638ea19 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/spreading.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
@@ -20,6 +20,7 @@ import (
     "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
     "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
     "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
+    "github.com/golang/glog"
 )
 
 type ServiceSpread struct {
@@ -82,6 +83,9 @@ func (s *ServiceSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorith
             fScore = 10 * (float32(maxCount-counts[minion.Name]) / float32(maxCount))
         }
         result = append(result, algorithm.HostPriority{Host: minion.Name, Score: int(fScore)})
+        glog.V(10).Infof(
+            "%v -> %v: ServiceSpreadPriority, Score: (%d)", pod.Name, minion.Name, int(fScore),
+        )
     }
     return result, nil
 }
diff --git a/plugin/pkg/scheduler/algorithm/priorities/spreading_test.go b/plugin/pkg/scheduler/algorithm/priorities/service_spreading_test.go
similarity index 100%
rename from plugin/pkg/scheduler/algorithm/priorities/spreading_test.go
rename to plugin/pkg/scheduler/algorithm/priorities/service_spreading_test.go
diff --git a/plugin/pkg/scheduler/generic_scheduler.go b/plugin/pkg/scheduler/generic_scheduler.go
index c9f5453d18e..f49fc41257d 100644
--- a/plugin/pkg/scheduler/generic_scheduler.go
+++ b/plugin/pkg/scheduler/generic_scheduler.go
@@ -74,7 +74,7 @@ func (g *genericScheduler) Schedule(pod *api.Pod, minionLister algorithm.MinionL
         return "", err
     }
 
-    priorityList, err := prioritizeNodes(pod, g.pods, g.prioritizers, algorithm.FakeMinionLister(filteredNodes))
+    priorityList, err := PrioritizeNodes(pod, g.pods, g.prioritizers, algorithm.FakeMinionLister(filteredNodes))
     if err != nil {
         return "", err
     }
@@ -142,7 +142,7 @@ func findNodesThatFit(pod *api.Pod, podLister algorithm.PodLister, predicateFunc
 // Each priority function can also have its own weight
 // The minion scores returned by the priority function are multiplied by the weights to get weighted scores
 // All scores are finally combined (added) to get the total weighted scores of all minions
-func prioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfigs []algorithm.PriorityConfig, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
+func PrioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfigs []algorithm.PriorityConfig, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
     result := algorithm.HostPriorityList{}
 
     // If no priority configs are provided, then the EqualPriority function is applied
@@ -168,6 +168,7 @@ func prioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfig
         }
     }
     for host, score := range combinedScores {
+        glog.V(10).Infof("Host %s Score %d", host, score)
        result = append(result, algorithm.HostPriority{Host: host, Score: score})
     }
     return result, nil
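
Note on the expectedPriority of 25 in TestZeroLimit above: the arithmetic can be reproduced with a small standalone sketch. The program below is illustrative only and is not part of the patch; it assumes details that are only partially visible in this diff, namely that LeastRequestedPriority returns the average of the CPU and memory scores from calculateScore, that BalancedResourceAllocation returns 10 when the CPU and memory usage fractions are equal, and that ServiceSpread scores 10 on every node when no services are defined.

package main

import "fmt"

// Same default values as in priorities.go above.
const (
    defaultMilliCpuLimit int64 = 100              // 0.1 core
    defaultMemoryLimit   int64 = 60 * 1024 * 1024 // 60 MB
)

// calculateScore mirrors the function in priorities.go (minus the node-name logging):
// unused capacity mapped onto a 0-10 scale.
func calculateScore(requested, capacity int64) int {
    if capacity == 0 || requested > capacity {
        return 0
    }
    return int(((capacity - requested) * 10) / capacity)
}

func main() {
    // Node from the test: 1000 millicores and 10x the default memory limit.
    capacityMilliCPU := int64(1000)
    capacityMemory := 10 * defaultMemoryLimit

    // On machine1: a "large" pod (3x the defaults), a zero-limit pod (counted as the
    // defaults), plus the zero-limit pod being scheduled (also counted as the defaults).
    requestedMilliCPU := 3*defaultMilliCpuLimit + defaultMilliCpuLimit + defaultMilliCpuLimit // 500m
    requestedMemory := 3*defaultMemoryLimit + defaultMemoryLimit + defaultMemoryLimit         // 5x default

    cpuScore := calculateScore(requestedMilliCPU, capacityMilliCPU) // (1000-500)*10/1000 = 5
    memScore := calculateScore(requestedMemory, capacityMemory)     // (10-5)*10/10 = 5
    leastRequested := (cpuScore + memScore) / 2                     // 5 (assumed averaging)

    balanced := 10 // assumed: CPU and memory fractions are both 0.5, so the node is perfectly balanced
    spread := 10   // assumed: no services are defined, so ServiceSpread gives every node the maximum score

    fmt.Println(leastRequested + balanced + spread) // 25, matching expectedPriority
}

With the defaults above, machine2 carries the same totals (large2 + small2 + the pod being scheduled), so both machines score identically whether the zero-limit pod is already placed or is the one being scheduled.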