From 8a4218969090c1eebafc1dd6befe071f6defbb86 Mon Sep 17 00:00:00 2001
From: Connor Doyle
Date: Mon, 27 Feb 2017 22:33:14 -0800
Subject: [PATCH] Fix unbounded growth of cached OIRs in sched cache

- Added schedulercache.Resource.SetOpaque helper.
- Amend kubelet allocatable sync so that when OIRs are removed from
  capacity they are also removed from allocatable.
- Fixes #41861.
---
 pkg/kubelet/kubelet_node_status.go              |  8 ++++++
 .../algorithm/predicates/predicates.go          | 28 +++++++++++++++++--
 .../pkg/scheduler/schedulercache/node_info.go   |  8 ++++--
 3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go
index 791effef0fd..e07596f6e59 100644
--- a/pkg/kubelet/kubelet_node_status.go
+++ b/pkg/kubelet/kubelet_node_status.go
@@ -529,6 +529,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 	if node.Status.Allocatable == nil {
 		node.Status.Allocatable = make(v1.ResourceList)
 	}
+	// Remove opaque integer resources from allocatable that are no longer
+	// present in capacity.
+	for k := range node.Status.Allocatable {
+		_, found := node.Status.Capacity[k]
+		if !found && v1.IsOpaqueIntResourceName(k) {
+			delete(node.Status.Allocatable, k)
+		}
+	}
 	allocatableReservation := kl.containerManager.GetNodeAllocatableReservation()
 	for k, v := range node.Status.Capacity {
 		value := *(v.Copy())
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index 4f1387ca085..d255374d91a 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -468,6 +468,30 @@ func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *s
 	return true, nil, nil
 }
 
+// Returns a *schedulercache.Resource that covers the largest width in each
+// resource dimension. Because init-containers run sequentially, we collect the
+// max in each dimension iteratively. In contrast, we sum the resource vectors
+// for regular containers since they run simultaneously.
+//
+// Example:
+//
+// Pod:
+//   InitContainers
+//     IC1:
+//       CPU: 2
+//       Memory: 1G
+//     IC2:
+//       CPU: 2
+//       Memory: 3G
+//   Containers
+//     C1:
+//       CPU: 2
+//       Memory: 1G
+//     C2:
+//       CPU: 1
+//       Memory: 1G
+//
+// Result: CPU: 3, Memory: 3G
 func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 	result := schedulercache.Resource{}
 	for _, container := range pod.Spec.Containers {
@@ -505,10 +529,8 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 		default:
 			if v1.IsOpaqueIntResourceName(rName) {
 				value := rQuantity.Value()
-				// Ensure the opaque resource map is initialized in the result.
-				result.AddOpaque(rName, int64(0))
 				if value > result.OpaqueIntResources[rName] {
-					result.OpaqueIntResources[rName] = value
+					result.SetOpaque(rName, value)
 				}
 			}
 		}
diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go
index 4fca801e419..3e3d417b66e 100644
--- a/plugin/pkg/scheduler/schedulercache/node_info.go
+++ b/plugin/pkg/scheduler/schedulercache/node_info.go
@@ -83,11 +83,15 @@ func (r *Resource) ResourceList() v1.ResourceList {
 }
 
 func (r *Resource) AddOpaque(name v1.ResourceName, quantity int64) {
+	r.SetOpaque(name, r.OpaqueIntResources[name]+quantity)
+}
+
+func (r *Resource) SetOpaque(name v1.ResourceName, quantity int64) {
 	// Lazily allocate opaque integer resource map.
 	if r.OpaqueIntResources == nil {
 		r.OpaqueIntResources = map[v1.ResourceName]int64{}
 	}
-	r.OpaqueIntResources[name] += quantity
+	r.OpaqueIntResources[name] = quantity
 }
 
 // NewNodeInfo returns a ready to use empty NodeInfo object.
@@ -333,7 +337,7 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
 			n.allowedPodNumber = int(rQuant.Value())
 		default:
 			if v1.IsOpaqueIntResourceName(rName) {
-				n.allocatableResource.AddOpaque(rName, rQuant.Value())
+				n.allocatableResource.SetOpaque(rName, rQuant.Value())
 			}
 		}
 	}
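
For reviewers, the standalone Go sketch below (not part of the patch) illustrates why SetNode must overwrite rather than accumulate: the cached allocatable Resource is reused across node updates, so the old AddOpaque call grew the cached OIR quantity on every sync, while SetOpaque keeps it fixed. The resourceName and resource types here are simplified stand-ins for v1.ResourceName and schedulercache.Resource, and the resource name string is arbitrary.

package main

import "fmt"

// resourceName and resource are simplified stand-ins for v1.ResourceName and
// schedulercache.Resource; they mirror only the opaque-integer-resource map
// handling touched by this patch.
type resourceName string

type resource struct {
	OpaqueIntResources map[resourceName]int64
}

// SetOpaque overwrites the stored quantity (the helper added by this patch).
func (r *resource) SetOpaque(name resourceName, quantity int64) {
	// Lazily allocate opaque integer resource map.
	if r.OpaqueIntResources == nil {
		r.OpaqueIntResources = map[resourceName]int64{}
	}
	r.OpaqueIntResources[name] = quantity
}

// AddOpaque accumulates on top of the stored quantity, expressed in terms of
// SetOpaque as in the patched node_info.go.
func (r *resource) AddOpaque(name resourceName, quantity int64) {
	r.SetOpaque(name, r.OpaqueIntResources[name]+quantity)
}

func main() {
	// Arbitrary opaque integer resource name, for illustration only.
	const oir = resourceName("example.com/oir-foo")

	// Simulate the scheduler cache seeing the same node update three times,
	// each reporting an allocatable OIR quantity of 5.
	accumulated, overwritten := &resource{}, &resource{}
	for i := 0; i < 3; i++ {
		accumulated.AddOpaque(oir, 5) // pre-patch SetNode behavior: grows each sync
		overwritten.SetOpaque(oir, 5) // post-patch SetNode behavior: idempotent
	}
	fmt.Println(accumulated.OpaqueIntResources[oir]) // 15 (unbounded growth)
	fmt.Println(overwritten.OpaqueIntResources[oir]) // 5
}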