From 8a4218969090c1eebafc1dd6befe071f6defbb86 Mon Sep 17 00:00:00 2001
From: Connor Doyle
Date: Mon, 27 Feb 2017 22:33:14 -0800
Subject: [PATCH] Fix unbounded growth of cached OIRs in sched cache

- Added schedulercache.Resource.SetOpaque helper.
- Amend kubelet allocatable sync so that when OIRs are removed from
  capacity they are also removed from allocatable.
- Fixes #41861.
---
 pkg/kubelet/kubelet_node_status.go              |  8 ++++++
 .../algorithm/predicates/predicates.go          | 28 +++++++++++++++++--
 .../pkg/scheduler/schedulercache/node_info.go   |  8 ++++--
 3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go
index 791effef0fd..e07596f6e59 100644
--- a/pkg/kubelet/kubelet_node_status.go
+++ b/pkg/kubelet/kubelet_node_status.go
@@ -529,6 +529,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 	if node.Status.Allocatable == nil {
 		node.Status.Allocatable = make(v1.ResourceList)
 	}
+	// Remove opaque integer resources from allocatable that are no longer
+	// present in capacity.
+	for k := range node.Status.Allocatable {
+		_, found := node.Status.Capacity[k]
+		if !found && v1.IsOpaqueIntResourceName(k) {
+			delete(node.Status.Allocatable, k)
+		}
+	}
 	allocatableReservation := kl.containerManager.GetNodeAllocatableReservation()
 	for k, v := range node.Status.Capacity {
 		value := *(v.Copy())
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index 4f1387ca085..d255374d91a 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -468,6 +468,30 @@ func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *s
 	return true, nil, nil
 }
 
+// Returns a *schedulercache.Resource that covers the largest width in each
+// resource dimension. Because init-containers run sequentially, we collect the
+// max in each dimension iteratively. In contrast, we sum the resource vectors
+// for regular containers since they run simultaneously.
+//
+// Example:
+//
+// Pod:
+//   InitContainers
+//     IC1:
+//       CPU: 2
+//       Memory: 1G
+//     IC2:
+//       CPU: 2
+//       Memory: 3G
+//   Containers
+//     C1:
+//       CPU: 2
+//       Memory: 1G
+//     C2:
+//       CPU: 1
+//       Memory: 1G
+//
+// Result: CPU: 3, Memory: 3G
 func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 	result := schedulercache.Resource{}
 	for _, container := range pod.Spec.Containers {
@@ -505,10 +529,8 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 		default:
 			if v1.IsOpaqueIntResourceName(rName) {
 				value := rQuantity.Value()
-				// Ensure the opaque resource map is initialized in the result.
-				result.AddOpaque(rName, int64(0))
 				if value > result.OpaqueIntResources[rName] {
-					result.OpaqueIntResources[rName] = value
+					result.SetOpaque(rName, value)
 				}
 			}
 		}
diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go
index 4fca801e419..3e3d417b66e 100644
--- a/plugin/pkg/scheduler/schedulercache/node_info.go
+++ b/plugin/pkg/scheduler/schedulercache/node_info.go
@@ -83,11 +83,15 @@ func (r *Resource) ResourceList() v1.ResourceList {
 }
 
 func (r *Resource) AddOpaque(name v1.ResourceName, quantity int64) {
+	r.SetOpaque(name, r.OpaqueIntResources[name]+quantity)
+}
+
+func (r *Resource) SetOpaque(name v1.ResourceName, quantity int64) {
 	// Lazily allocate opaque integer resource map.
 	if r.OpaqueIntResources == nil {
 		r.OpaqueIntResources = map[v1.ResourceName]int64{}
 	}
-	r.OpaqueIntResources[name] += quantity
+	r.OpaqueIntResources[name] = quantity
 }
 
 // NewNodeInfo returns a ready to use empty NodeInfo object.
@@ -333,7 +337,7 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
 			n.allowedPodNumber = int(rQuant.Value())
 		default:
 			if v1.IsOpaqueIntResourceName(rName) {
-				n.allocatableResource.AddOpaque(rName, rQuant.Value())
+				n.allocatableResource.SetOpaque(rName, rQuant.Value())
 			}
 		}
 	}
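
For reviewers, the standalone Go sketch below (not part of the patch) illustrates why SetNode must overwrite rather than accumulate: the cached allocatable Resource is reused across node updates, so the old AddOpaque call grew the cached OIR quantity on every sync, while SetOpaque keeps it fixed. The resourceName and resource types here are simplified stand-ins for v1.ResourceName and schedulercache.Resource, and the resource name string is arbitrary.

package main

import "fmt"

// resourceName and resource are simplified stand-ins for v1.ResourceName and
// schedulercache.Resource; they mirror only the opaque-integer-resource map
// handling touched by this patch.
type resourceName string

type resource struct {
	OpaqueIntResources map[resourceName]int64
}

// SetOpaque overwrites the stored quantity (the helper added by this patch).
func (r *resource) SetOpaque(name resourceName, quantity int64) {
	// Lazily allocate opaque integer resource map.
	if r.OpaqueIntResources == nil {
		r.OpaqueIntResources = map[resourceName]int64{}
	}
	r.OpaqueIntResources[name] = quantity
}

// AddOpaque accumulates on top of the stored quantity, expressed in terms of
// SetOpaque as in the patched node_info.go.
func (r *resource) AddOpaque(name resourceName, quantity int64) {
	r.SetOpaque(name, r.OpaqueIntResources[name]+quantity)
}

func main() {
	// Arbitrary opaque integer resource name, for illustration only.
	const oir = resourceName("example.com/oir-foo")

	// Simulate the scheduler cache seeing the same node update three times,
	// each reporting an allocatable OIR quantity of 5.
	accumulated, overwritten := &resource{}, &resource{}
	for i := 0; i < 3; i++ {
		accumulated.AddOpaque(oir, 5) // pre-patch SetNode behavior: grows each sync
		overwritten.SetOpaque(oir, 5) // post-patch SetNode behavior: idempotent
	}
	fmt.Println(accumulated.OpaqueIntResources[oir]) // 15 (unbounded growth)
	fmt.Println(overwritten.OpaqueIntResources[oir]) // 5
}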