	ClusterAutoscaler-friendly scheduler priority function that promotes well-used nodes.
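This change adds MostRequestedPriority, a counterpart to LeastRequestedPriority: instead of spreading pods onto the emptiest nodes, it scores a node higher the more of its allocatable CPU and memory is already requested, packing work onto fewer nodes so the cluster autoscaler can remove the ones left idle. A minimal standalone sketch of the scoring idea (illustrative only, not the scheduler code; usedScore is a hypothetical name mirroring calculateUsedScore from the diff below):

package main

import "fmt"

// usedScore rates one resource on a 0-10 scale by how much of the node's
// allocatable amount is already requested; zero capacity or over-commitment
// scores 0, matching the guard clauses in the diff.
func usedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (requested * 10) / capacity
}

func main() {
	// A node with 4000 millicores and 10000 memory allocatable, of which
	// 3000 millicores and 5000 memory are already requested.
	cpu := usedScore(3000, 4000)  // 7 (integer division truncates 7.5)
	mem := usedScore(5000, 10000) // 5
	fmt.Println((cpu + mem) / 2)  // 6 -- fuller nodes get the higher score
}

The final host score is the average of the per-resource scores, as in the calculateUsedPriority function added below.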
		| @@ -39,9 +39,10 @@ func getNonZeroRequests(pod *api.Pod) *schedulercache.Resource { | |||||||
| 	return result | 	return result | ||||||
| } | } | ||||||
|  |  | ||||||
| // the unused capacity is calculated on a scale of 0-10 | // The unused capacity is calculated on a scale of 0-10 | ||||||
| // 0 being the lowest priority and 10 being the highest | // 0 being the lowest priority and 10 being the highest. | ||||||
| func calculateScore(requested int64, capacity int64, node string) int64 { | // The more unused resources the higher the score is. | ||||||
|  | func calculateUnusedScore(requested int64, capacity int64, node string) int64 { | ||||||
| 	if capacity == 0 { | 	if capacity == 0 { | ||||||
| 		return 0 | 		return 0 | ||||||
| 	} | 	} | ||||||
| @@ -53,17 +54,37 @@ func calculateScore(requested int64, capacity int64, node string) int64 { | |||||||
| 	return ((capacity - requested) * 10) / capacity | 	return ((capacity - requested) * 10) / capacity | ||||||
| } | } | ||||||
|  |  | ||||||
| // Calculate the resource occupancy on a node.  'node' has information about the resources on the node. | // The used capacity is calculated on a scale of 0-10 | ||||||
|  | // 0 being the lowest priority and 10 being the highest. | ||||||
|  | // The more resources are used the higher the score is. This function | ||||||
|  | // is almost a reversed version of calculateUnusedScore (10 - calculateUnusedScore). | ||||||
|  | // The main difference is in rounding. It was added to keep the | ||||||
|  | // final formula clean and not to modify the widely used (by users | ||||||
|  | // in their default scheduling policies) calculateUnusedScore. | ||||||
|  | func calculateUsedScore(requested int64, capacity int64, node string) int64 { | ||||||
|  | 	if capacity == 0 { | ||||||
|  | 		return 0 | ||||||
|  | 	} | ||||||
|  | 	if requested > capacity { | ||||||
|  | 		glog.V(2).Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s", | ||||||
|  | 			requested, capacity, node) | ||||||
|  | 		return 0 | ||||||
|  | 	} | ||||||
|  | 	return (requested * 10) / capacity | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Calculates host priority based on the amount of unused resources. | ||||||
|  | // 'node' has information about the resources on the node. | ||||||
| // 'pods' is a list of pods currently scheduled on the node. | // 'pods' is a list of pods currently scheduled on the node. | ||||||
| // TODO: Use Node() from nodeInfo instead of passing it. | // TODO: Use Node() from nodeInfo instead of passing it. | ||||||
| func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority { | func calculateUnusedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority { | ||||||
| 	allocatableResources := nodeInfo.AllocatableResource() | 	allocatableResources := nodeInfo.AllocatableResource() | ||||||
| 	totalResources := *podRequests | 	totalResources := *podRequests | ||||||
| 	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU | 	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU | ||||||
| 	totalResources.Memory += nodeInfo.NonZeroRequest().Memory | 	totalResources.Memory += nodeInfo.NonZeroRequest().Memory | ||||||
|  |  | ||||||
| 	cpuScore := calculateScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name) | 	cpuScore := calculateUnusedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name) | ||||||
| 	memoryScore := calculateScore(totalResources.Memory, allocatableResources.Memory, node.Name) | 	memoryScore := calculateUnusedScore(totalResources.Memory, allocatableResources.Memory, node.Name) | ||||||
| 	if glog.V(10) { | 	if glog.V(10) { | ||||||
| 		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is | 		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is | ||||||
| 		// not logged. There is visible performance gain from it. | 		// not logged. There is visible performance gain from it. | ||||||
| @@ -82,6 +103,35 @@ func calculateResourceOccupancy(pod *api.Pod, podRequests *schedulercache.Resour | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // Calculates host priority based on the amount of used resources.  'node' has information about the resources on the node. | ||||||
|  | // 'pods' is a list of pods currently scheduled on the node. | ||||||
|  | // TODO: Use Node() from nodeInfo instead of passing it. | ||||||
|  | func calculateUsedPriority(pod *api.Pod, podRequests *schedulercache.Resource, node *api.Node, nodeInfo *schedulercache.NodeInfo) schedulerapi.HostPriority { | ||||||
|  | 	allocatableResources := nodeInfo.AllocatableResource() | ||||||
|  | 	totalResources := *podRequests | ||||||
|  | 	totalResources.MilliCPU += nodeInfo.NonZeroRequest().MilliCPU | ||||||
|  | 	totalResources.Memory += nodeInfo.NonZeroRequest().Memory | ||||||
|  |  | ||||||
|  | 	cpuScore := calculateUsedScore(totalResources.MilliCPU, allocatableResources.MilliCPU, node.Name) | ||||||
|  | 	memoryScore := calculateUsedScore(totalResources.Memory, allocatableResources.Memory, node.Name) | ||||||
|  | 	if glog.V(10) { | ||||||
|  | 		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is | ||||||
|  | 		// not logged. There is visible performance gain from it. | ||||||
|  | 		glog.V(10).Infof( | ||||||
|  | 			"%v -> %v: Most Requested Priority, capacity %d millicores %d memory bytes, total request %d millicores %d memory bytes, score %d CPU %d memory", | ||||||
|  | 			pod.Name, node.Name, | ||||||
|  | 			allocatableResources.MilliCPU, allocatableResources.Memory, | ||||||
|  | 			totalResources.MilliCPU, totalResources.Memory, | ||||||
|  | 			cpuScore, memoryScore, | ||||||
|  | 		) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return schedulerapi.HostPriority{ | ||||||
|  | 		Host:  node.Name, | ||||||
|  | 		Score: int((cpuScore + memoryScore) / 2), | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. | // LeastRequestedPriority is a priority function that favors nodes with fewer requested resources. | ||||||
| // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes | // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes | ||||||
| // based on the minimum of the average of the fraction of requested to capacity. | // based on the minimum of the average of the fraction of requested to capacity. | ||||||
| @@ -90,7 +140,20 @@ func LeastRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulerca | |||||||
| 	podResources := getNonZeroRequests(pod) | 	podResources := getNonZeroRequests(pod) | ||||||
| 	list := make(schedulerapi.HostPriorityList, 0, len(nodes)) | 	list := make(schedulerapi.HostPriorityList, 0, len(nodes)) | ||||||
| 	for _, node := range nodes { | 	for _, node := range nodes { | ||||||
| 		list = append(list, calculateResourceOccupancy(pod, podResources, node, nodeNameToInfo[node.Name])) | 		list = append(list, calculateUnusedPriority(pod, podResources, node, nodeNameToInfo[node.Name])) | ||||||
|  | 	} | ||||||
|  | 	return list, nil | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // MostRequestedPriority is a priority function that favors nodes with most requested resources. | ||||||
|  | // It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes | ||||||
|  | // based on the maximum of the average of the fraction of requested to capacity. | ||||||
|  | // Details: (cpu(10 * sum(requested) / capacity) + memory(10 * sum(requested) / capacity)) / 2 | ||||||
|  | func MostRequestedPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*api.Node) (schedulerapi.HostPriorityList, error) { | ||||||
|  | 	podResources := getNonZeroRequests(pod) | ||||||
|  | 	list := make(schedulerapi.HostPriorityList, 0, len(nodes)) | ||||||
|  | 	for _, node := range nodes { | ||||||
|  | 		list = append(list, calculateUsedPriority(pod, podResources, node, nodeNameToInfo[node.Name])) | ||||||
| 	} | 	} | ||||||
| 	return list, nil | 	return list, nil | ||||||
| } | } | ||||||
|   | |||||||
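The expectations in the test below follow the integer arithmetic of calculateUsedScore, which is also where it differs from simply computing 10 - calculateUnusedScore. A worked example using the numbers from the second test case (machine1 has 4000 millicores and 10000 memory allocatable; the pod requests 3000 millicores and 5000 memory):

	usedScore(cpu)     = (3000 * 10) / 4000                  = 7   (7.5 truncated)
	10 - unusedScore   = 10 - ((4000 - 3000) * 10) / 4000    = 10 - 2 = 8   (rounds the other way)
	usedScore(memory)  = (5000 * 10) / 10000                 = 5
	machine1 score     = (7 + 5) / 2                         = 6

machine2, with 6000 millicores, scores (5 + 5) / 2 = 5, so MostRequestedPriority prefers the fuller machine1 -- the opposite of what LeastRequestedPriority would pick.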
| @@ -411,6 +411,161 @@ func TestLeastRequested(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestMostRequested(t *testing.T) { | ||||||
|  | 	labels1 := map[string]string{ | ||||||
|  | 		"foo": "bar", | ||||||
|  | 		"baz": "blah", | ||||||
|  | 	} | ||||||
|  | 	labels2 := map[string]string{ | ||||||
|  | 		"bar": "foo", | ||||||
|  | 		"baz": "blah", | ||||||
|  | 	} | ||||||
|  | 	noResources := api.PodSpec{ | ||||||
|  | 		Containers: []api.Container{}, | ||||||
|  | 	} | ||||||
|  | 	cpuOnly := api.PodSpec{ | ||||||
|  | 		NodeName: "machine1", | ||||||
|  | 		Containers: []api.Container{ | ||||||
|  | 			{ | ||||||
|  | 				Resources: api.ResourceRequirements{ | ||||||
|  | 					Requests: api.ResourceList{ | ||||||
|  | 						"cpu":    resource.MustParse("1000m"), | ||||||
|  | 						"memory": resource.MustParse("0"), | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			{ | ||||||
|  | 				Resources: api.ResourceRequirements{ | ||||||
|  | 					Requests: api.ResourceList{ | ||||||
|  | 						"cpu":    resource.MustParse("2000m"), | ||||||
|  | 						"memory": resource.MustParse("0"), | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	cpuOnly2 := cpuOnly | ||||||
|  | 	cpuOnly2.NodeName = "machine2" | ||||||
|  | 	cpuAndMemory := api.PodSpec{ | ||||||
|  | 		NodeName: "machine2", | ||||||
|  | 		Containers: []api.Container{ | ||||||
|  | 			{ | ||||||
|  | 				Resources: api.ResourceRequirements{ | ||||||
|  | 					Requests: api.ResourceList{ | ||||||
|  | 						"cpu":    resource.MustParse("1000m"), | ||||||
|  | 						"memory": resource.MustParse("2000"), | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			{ | ||||||
|  | 				Resources: api.ResourceRequirements{ | ||||||
|  | 					Requests: api.ResourceList{ | ||||||
|  | 						"cpu":    resource.MustParse("2000m"), | ||||||
|  | 						"memory": resource.MustParse("3000"), | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	tests := []struct { | ||||||
|  | 		pod          *api.Pod | ||||||
|  | 		pods         []*api.Pod | ||||||
|  | 		nodes        []*api.Node | ||||||
|  | 		expectedList schedulerapi.HostPriorityList | ||||||
|  | 		test         string | ||||||
|  | 	}{ | ||||||
|  | 		{ | ||||||
|  | 			/* | ||||||
|  | 				Node1 scores (used resources) on 0-10 scale | ||||||
|  | 				CPU Score: (0 * 10) / 4000 = 0 | ||||||
|  | 				Memory Score: (0 * 10) / 10000 = 0 | ||||||
|  | 				Node1 Score: (0 + 0) / 2 = 0 | ||||||
|  |  | ||||||
|  | 				Node2 scores (used resources) on 0-10 scale | ||||||
|  | 				CPU Score: (0 * 10) / 4000 = 0 | ||||||
|  | 				Memory Score: (0 * 10) / 10000 = 0 | ||||||
|  | 				Node2 Score: (0 + 0) / 2 = 0 | ||||||
|  | 			*/ | ||||||
|  | 			pod:          &api.Pod{Spec: noResources}, | ||||||
|  | 			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 4000, 10000)}, | ||||||
|  | 			expectedList: []schedulerapi.HostPriority{{"machine1", 0}, {"machine2", 0}}, | ||||||
|  | 			test:         "nothing scheduled, nothing requested", | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			/* | ||||||
|  | 				Node1 scores on 0-10 scale | ||||||
|  | 				CPU Score: (3000 * 10) / 4000 = 7.5 | ||||||
|  | 				Memory Score: (5000 * 10) / 10000 = 5 | ||||||
|  | 				Node1 Score: (7.5 + 5) / 2 = 6 | ||||||
|  |  | ||||||
|  | 				Node2 scores on 0-10 scale | ||||||
|  | 				CPU Score: (3000 * 10) / 6000 = 5 | ||||||
|  | 				Memory Score: (5000 * 10) / 10000 = 5 | ||||||
|  | 				Node2 Score: (5 + 5) / 2 = 5 | ||||||
|  | 			*/ | ||||||
|  | 			pod:          &api.Pod{Spec: cpuAndMemory}, | ||||||
|  | 			nodes:        []*api.Node{makeNode("machine1", 4000, 10000), makeNode("machine2", 6000, 10000)}, | ||||||
|  | 			expectedList: []schedulerapi.HostPriority{{"machine1", 6}, {"machine2", 5}}, | ||||||
|  | 			test:         "nothing scheduled, resources requested, differently sized machines", | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			/* | ||||||
|  | 				Node1 scores on 0-10 scale | ||||||
|  | 				CPU Score: (6000 * 10) / 10000 = 6 | ||||||
|  | 				Memory Score: (0 * 10) / 20000 = 0 | ||||||
|  | 				Node1 Score: (6 + 0) / 2 = 3 | ||||||
|  |  | ||||||
|  | 				Node2 scores on 0-10 scale | ||||||
|  | 				CPU Score: (6000 * 10) / 10000 = 6 | ||||||
|  | 				Memory Score: (5000 * 10) / 20000 = 2.5 | ||||||
|  | 				Node2 Score: (6 + 2.5) / 2 = 4 | ||||||
|  | 			*/ | ||||||
|  | 			pod:          &api.Pod{Spec: noResources}, | ||||||
|  | 			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)}, | ||||||
|  | 			expectedList: []schedulerapi.HostPriority{{"machine1", 3}, {"machine2", 4}}, | ||||||
|  | 			test:         "no resources requested, pods scheduled with resources", | ||||||
|  | 			pods: []*api.Pod{ | ||||||
|  | 				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels2}}, | ||||||
|  | 				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels1}}, | ||||||
|  | 				{Spec: cpuOnly2, ObjectMeta: api.ObjectMeta{Labels: labels1}}, | ||||||
|  | 				{Spec: cpuAndMemory, ObjectMeta: api.ObjectMeta{Labels: labels1}}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		{ | ||||||
|  | 			/* | ||||||
|  | 				Node1 scores on 0-10 scale | ||||||
|  | 				CPU Score: (6000 * 10) / 10000 = 6 | ||||||
|  | 				Memory Score: (5000 * 10) / 20000 = 2.5 | ||||||
|  | 				Node1 Score: (6 + 2.5) / 2 = 4 | ||||||
|  |  | ||||||
|  | 				Node2 scores on 0-10 scale | ||||||
|  | 				CPU Score: (6000 * 10) / 10000 = 6 | ||||||
|  | 				Memory Score: (10000 * 10) / 20000 = 5 | ||||||
|  | 				Node2 Score: (6 + 5) / 2 = 5 | ||||||
|  | 			*/ | ||||||
|  | 			pod:          &api.Pod{Spec: cpuAndMemory}, | ||||||
|  | 			nodes:        []*api.Node{makeNode("machine1", 10000, 20000), makeNode("machine2", 10000, 20000)}, | ||||||
|  | 			expectedList: []schedulerapi.HostPriority{{"machine1", 4}, {"machine2", 5}}, | ||||||
|  | 			test:         "resources requested, pods scheduled with resources", | ||||||
|  | 			pods: []*api.Pod{ | ||||||
|  | 				{Spec: cpuOnly}, | ||||||
|  | 				{Spec: cpuAndMemory}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	for _, test := range tests { | ||||||
|  | 		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap(test.pods, test.nodes) | ||||||
|  | 		list, err := MostRequestedPriority(test.pod, nodeNameToInfo, test.nodes) | ||||||
|  | 		if err != nil { | ||||||
|  | 			t.Errorf("unexpected error: %v", err) | ||||||
|  | 		} | ||||||
|  | 		if !reflect.DeepEqual(test.expectedList, list) { | ||||||
|  | 			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestNewNodeLabelPriority(t *testing.T) { | func TestNewNodeLabelPriority(t *testing.T) { | ||||||
| 	label1 := map[string]string{"foo": "bar"} | 	label1 := map[string]string{"foo": "bar"} | ||||||
| 	label2 := map[string]string{"bar": "foo"} | 	label2 := map[string]string{"bar": "foo"} | ||||||
|   | |||||||
| @@ -93,6 +93,8 @@ func init() { | |||||||
| 	factory.RegisterFitPredicate("HostName", predicates.PodFitsHost) | 	factory.RegisterFitPredicate("HostName", predicates.PodFitsHost) | ||||||
| 	// Fit is determined by node selector query. | 	// Fit is determined by node selector query. | ||||||
| 	factory.RegisterFitPredicate("MatchNodeSelector", predicates.PodSelectorMatches) | 	factory.RegisterFitPredicate("MatchNodeSelector", predicates.PodSelectorMatches) | ||||||
|  | 	// Optional, cluster-autoscaler friendly priority function - gives used nodes higher priority. | ||||||
|  | 	factory.RegisterPriorityFunction("MostRequestedPriority", priorities.MostRequestedPriority, 1) | ||||||
| } | } | ||||||
|  |  | ||||||
| func defaultPredicates() sets.String { | func defaultPredicates() sets.String { | ||||||
|   | |||||||
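Because the function is only registered by name here and, per the comment, is optional rather than part of the default priority set, a cluster opts in through the scheduler's policy configuration. A hedged sketch of such a policy file, assuming the --policy-config-file JSON format of this Kubernetes era (verify the exact fields against your scheduler version):

{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    {"name": "MatchNodeSelector"},
    {"name": "HostName"}
  ],
  "priorities": [
    {"name": "MostRequestedPriority", "weight": 1}
  ]
}

Listing MostRequestedPriority instead of LeastRequestedPriority flips the scheduler from spreading to packing, which is what makes the resulting clusters friendlier to the cluster autoscaler.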