From 71b2af1e0c1c2fc6d9907dc23db5e35d81c58046 Mon Sep 17 00:00:00 2001
From: dingh
Date: Mon, 30 Mar 2015 00:43:24 +0800
Subject: [PATCH] Implement BRA algorithm as a new priority function in scheduler

Balanced Resource Allocation policy can now be enabled so that hosts with
balanced resource usage are preferred. The score given by BRA also scales
from 0 to 10, with 10 indicating that resource usage is well balanced.
---
 pkg/scheduler/priorities.go      |  79 +++++++++++
 pkg/scheduler/priorities_test.go | 233 +++++++++++++++++++++++++++++++
 2 files changed, 312 insertions(+)

diff --git a/pkg/scheduler/priorities.go b/pkg/scheduler/priorities.go
index aa9c0bdb368..eab28e21cb8 100644
--- a/pkg/scheduler/priorities.go
+++ b/pkg/scheduler/priorities.go
@@ -17,6 +17,8 @@ limitations under the License.
 package scheduler
 
 import (
+	"math"
+
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
 	"github.com/golang/glog"
@@ -132,3 +134,80 @@ func (n *NodeLabelPrioritizer) CalculateNodeLabelPriority(pod api.Pod, podLister
 	}
 	return result, nil
 }
+
+// BalancedResourceAllocation favors nodes with balanced resource usage rate.
+// BalancedResourceAllocation should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority.
+// It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how
+// close the two metrics are to each other.
+// Detail: score = 10 - abs(cpuFraction-memoryFraction)*10. The algorithm is partly inspired by:
+// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
+func BalancedResourceAllocation(pod api.Pod, podLister PodLister, minionLister MinionLister) (HostPriorityList, error) {
+	nodes, err := minionLister.List()
+	if err != nil {
+		return HostPriorityList{}, err
+	}
+	podsToMachines, err := MapPodsToMachines(podLister)
+	if err != nil {
+		return HostPriorityList{}, err
+	}
+
+	list := HostPriorityList{}
+	for _, node := range nodes.Items {
+		list = append(list, calculateBalancedResourceAllocation(pod, node, podsToMachines[node.Name]))
+	}
+	return list, nil
+}
+
+func calculateBalancedResourceAllocation(pod api.Pod, node api.Node, pods []api.Pod) HostPriority {
+	totalMilliCPU := int64(0)
+	totalMemory := int64(0)
+	score := int(0)
+	for _, existingPod := range pods {
+		for _, container := range existingPod.Spec.Containers {
+			totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
+			totalMemory += container.Resources.Limits.Memory().Value()
+		}
+	}
+	// Add the resources requested by the current pod being scheduled.
+	// This also helps differentiate between differently sized, but empty, minions.
+	for _, container := range pod.Spec.Containers {
+		totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
+		totalMemory += container.Resources.Limits.Memory().Value()
+	}
+
+	capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
+	capacityMemory := node.Status.Capacity.Memory().Value()
+
+	cpuFraction := fractionOfCapacity(totalMilliCPU, capacityMilliCPU, node.Name)
+	memoryFraction := fractionOfCapacity(totalMemory, capacityMemory, node.Name)
+	if cpuFraction >= 1 || memoryFraction >= 1 {
+		// if requested >= capacity, the corresponding host should never be preferred.
+		score = 0
+	} else {
+		// Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
+		// respectively. Multiplying the absolute value of the difference by 10 scales the value to
+		// 0-10, with 0 representing a well balanced allocation and 10 a poorly balanced one. Subtracting
+		// it from 10 gives a score that also scales from 0 to 10, with 10 representing a well balanced
+		// allocation.
+		diff := math.Abs(cpuFraction - memoryFraction)
+		score = int(10 - diff*10)
+	}
+	glog.V(4).Infof(
+		"%v -> %v: Balanced Resource Allocation, Requested/Capacity: (%d, %d) / (%d, %d) Score: (%d)",
+		pod.Name, node.Name,
+		totalMilliCPU, totalMemory,
+		capacityMilliCPU, capacityMemory,
+		score,
+	)
+
+	return HostPriority{
+		host:  node.Name,
+		score: score,
+	}
+}
+
+func fractionOfCapacity(requested, capacity int64, node string) float64 {
+	if capacity == 0 {
+		return 1
+	}
+	return float64(requested) / float64(capacity)
+}
diff --git a/pkg/scheduler/priorities_test.go b/pkg/scheduler/priorities_test.go
index 486cb0a7a39..88c5aa31f41 100644
--- a/pkg/scheduler/priorities_test.go
+++ b/pkg/scheduler/priorities_test.go
@@ -368,3 +368,236 @@ func TestNewNodeLabelPriority(t *testing.T) {
 		}
 	}
 }
+
+func TestBalancedResourceAllocation(t *testing.T) {
+	labels1 := map[string]string{
+		"foo": "bar",
+		"baz": "blah",
+	}
+	labels2 := map[string]string{
+		"bar": "foo",
+		"baz": "blah",
+	}
+	machine1Spec := api.PodSpec{
+		Host: "machine1",
+	}
+	machine2Spec := api.PodSpec{
+		Host: "machine2",
+	}
+	noResources := api.PodSpec{
+		Containers: []api.Container{},
+	}
+	cpuOnly := api.PodSpec{
+		Host: "machine1",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Limits: api.ResourceList{
+						"cpu": resource.MustParse("1000m"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Limits: api.ResourceList{
+						"cpu": resource.MustParse("2000m"),
+					},
+				},
+			},
+		},
+	}
+	cpuOnly2 := cpuOnly
+	cpuOnly2.Host = "machine2"
+	cpuAndMemory := api.PodSpec{
+		Host: "machine2",
+		Containers: []api.Container{
+			{
+				Resources: api.ResourceRequirements{
+					Limits: api.ResourceList{
+						"cpu":    resource.MustParse("1000m"),
+						"memory": resource.MustParse("2000"),
+					},
+				},
+			},
+			{
+				Resources: api.ResourceRequirements{
+					Limits: api.ResourceList{
+						"cpu":    resource.MustParse("2000m"),
+						"memory": resource.MustParse("3000"),
+					},
+				},
+			},
+		},
+	}
+	tests := []struct {
+		pod          api.Pod
+		pods         []api.Pod
+		nodes        []api.Node
+		expectedList HostPriorityList
+		test         string
+	}{
+		{
+			/*
+				Minion1 scores (remaining resources) on 0-10 scale
+				CPU Fraction: 0 / 4000 = 0%
+				Memory Fraction: 0 / 10000 = 0%
+				Minion1 Score: 10 - (0-0)*10 = 10
+
+				Minion2 scores (remaining resources) on 0-10 scale
+				CPU Fraction: 0 / 4000 = 0%
+				Memory Fraction: 0 / 10000 = 0%
+				Minion2 Score: 10 - (0-0)*10 = 10
+			*/
+			pod:          api.Pod{Spec: noResources},
+			nodes:        []api.Node{makeMinion("machine1", 4000, 10000), makeMinion("machine2", 4000, 10000)},
+			expectedList: []HostPriority{{"machine1", 10}, {"machine2", 10}},
+			test:         "nothing scheduled, nothing requested",
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 3000 / 4000 = 75%
+				Memory Fraction: 5000 / 10000 = 50%
+				Minion1 Score: 10 - (0.75-0.5)*10 = 7
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 3000 / 6000 = 50%
+				Memory Fraction: 5000 / 10000 = 50%
+				Minion2 Score: 10 - (0.5-0.5)*10 = 10
+			*/
+			pod:          api.Pod{Spec: cpuAndMemory},
+			nodes:        []api.Node{makeMinion("machine1", 4000, 10000), makeMinion("machine2", 6000, 10000)},
+			expectedList: []HostPriority{{"machine1", 7}, {"machine2", 10}},
+			test:         "nothing scheduled, resources requested, differently sized machines",
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 0 / 4000 = 0%
+				Memory Fraction: 0 / 10000 = 0%
+				Minion1 Score: 10 - (0-0)*10 = 10
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 0 / 4000 = 0%
+				Memory Fraction: 0 / 10000 = 0%
+				Minion2 Score: 10 - (0-0)*10 = 10
+			*/
+			pod:          api.Pod{Spec: noResources},
+			nodes:        []api.Node{makeMinion("machine1", 4000, 10000), makeMinion("machine2", 4000, 10000)},
+			expectedList: []HostPriority{{"machine1", 10}, {"machine2", 10}},
+			test:         "no resources requested, pods scheduled",
+			pods: []api.Pod{
+				{Spec: machine1Spec, ObjectMeta: api.ObjectMeta{Labels: labels2}},
+				{Spec: machine1Spec, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: machine2Spec, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: machine2Spec, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+			},
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 0 / 20000 = 0%
+				Minion1 Score: 10 - (0.6-0)*10 = 4
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 5000 / 20000 = 25%
+				Minion2 Score: 10 - (0.6-0.25)*10 = 6
+			*/
+			pod:          api.Pod{Spec: noResources},
+			nodes:        []api.Node{makeMinion("machine1", 10000, 20000), makeMinion("machine2", 10000, 20000)},
+			expectedList: []HostPriority{{"machine1", 4}, {"machine2", 6}},
+			test:         "no resources requested, pods scheduled with resources",
+			pods: []api.Pod{
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels2}},
+				{Spec: cpuOnly, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuOnly2, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+				{Spec: cpuAndMemory, ObjectMeta: api.ObjectMeta{Labels: labels1}},
+			},
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 5000 / 20000 = 25%
+				Minion1 Score: 10 - (0.6-0.25)*10 = 6
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 10000 / 20000 = 50%
+				Minion2 Score: 10 - (0.6-0.5)*10 = 9
+			*/
+			pod:          api.Pod{Spec: cpuAndMemory},
+			nodes:        []api.Node{makeMinion("machine1", 10000, 20000), makeMinion("machine2", 10000, 20000)},
+			expectedList: []HostPriority{{"machine1", 6}, {"machine2", 9}},
+			test:         "resources requested, pods scheduled with resources",
+			pods: []api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 5000 / 20000 = 25%
+				Minion1 Score: 10 - (0.6-0.25)*10 = 6
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 6000 / 10000 = 60%
+				Memory Fraction: 10000 / 50000 = 20%
+				Minion2 Score: 10 - (0.6-0.2)*10 = 6
+			*/
+			pod:          api.Pod{Spec: cpuAndMemory},
+			nodes:        []api.Node{makeMinion("machine1", 10000, 20000), makeMinion("machine2", 10000, 50000)},
+			expectedList: []HostPriority{{"machine1", 6}, {"machine2", 6}},
+			test:         "resources requested, pods scheduled with resources, differently sized machines",
+			pods: []api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+		{
+			/*
+				Minion1 scores on 0-10 scale
+				CPU Fraction: 6000 / 4000 > 100% ==> Score := 0
+				Memory Fraction: 0 / 10000 = 0%
+				Minion1 Score: 0
+
+				Minion2 scores on 0-10 scale
+				CPU Fraction: 6000 / 4000 > 100% ==> Score := 0
+				Memory Fraction: 5000 / 10000 = 50%
+				Minion2 Score: 0
+			*/
+			pod:          api.Pod{Spec: cpuOnly},
+			nodes:        []api.Node{makeMinion("machine1", 4000, 10000), makeMinion("machine2", 4000, 10000)},
+			expectedList: []HostPriority{{"machine1", 0}, {"machine2", 0}},
+			test:         "requested resources exceed minion capacity",
+			pods: []api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+		{
+			pod:          api.Pod{Spec: noResources},
+			nodes:        []api.Node{makeMinion("machine1", 0, 0), makeMinion("machine2", 0, 0)},
+			expectedList: []HostPriority{{"machine1", 0}, {"machine2", 0}},
+			test:         "zero minion resources, pods scheduled with resources",
+			pods: []api.Pod{
+				{Spec: cpuOnly},
+				{Spec: cpuAndMemory},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		list, err := BalancedResourceAllocation(test.pod, FakePodLister(test.pods), FakeMinionLister(api.NodeList{Items: test.nodes}))
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
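
Reviewer note (not part of the patch): the scoring rule is small enough to check by hand. Below is a minimal standalone Go sketch that assumes nothing beyond the documented formula score = 10 - abs(cpuFraction-memoryFraction)*10 and the numbers from the "nothing scheduled, resources requested, differently sized machines" test case; the helper names balancedScore and fraction are illustrative only and are not part of the scheduler package.

package main

import (
	"fmt"
	"math"
)

// balancedScore mirrors the arithmetic in calculateBalancedResourceAllocation:
// a host whose requested cpu or memory meets or exceeds capacity scores 0;
// otherwise the score grows as the two usage fractions approach each other.
func balancedScore(requestedMilliCPU, capacityMilliCPU, requestedMemory, capacityMemory int64) int {
	cpuFraction := fraction(requestedMilliCPU, capacityMilliCPU)
	memoryFraction := fraction(requestedMemory, capacityMemory)
	if cpuFraction >= 1 || memoryFraction >= 1 {
		return 0
	}
	diff := math.Abs(cpuFraction - memoryFraction)
	return int(10 - diff*10)
}

func fraction(requested, capacity int64) float64 {
	if capacity == 0 {
		return 1
	}
	return float64(requested) / float64(capacity)
}

func main() {
	// machine1: cpu 3000m of 4000m (0.75), memory 5000 of 10000 (0.50)
	// => 10 - |0.75-0.50|*10 = 7.5, truncated to 7
	fmt.Println(balancedScore(3000, 4000, 5000, 10000)) // 7
	// machine2: cpu 3000m of 6000m (0.50), memory 5000 of 10000 (0.50) => 10
	fmt.Println(balancedScore(3000, 6000, 5000, 10000)) // 10
}

As in the patch, the int conversion truncates toward zero, which is why machine1 scores 7 rather than 8.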