Consume TopologyHints in the devicemanager

Kevin Klues 2019-08-14 19:56:39 +02:00
parent a3320f80d9
commit cc567afaf0


@@ -22,6 +22,7 @@ import (
"net"
"os"
"path/filepath"
"sort"
"sync"
"time"
@@ -41,6 +42,7 @@ import (
cputopology "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/socketmask"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
@@ -654,7 +656,14 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
if available.Len() < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
// By default, pull devices from the unsorted list of available devices.
allocated := available.UnsortedList()[:needed]
// If topology alignment is desired, update allocated to the set of devices
// with the best alignment.
hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
if m.deviceHasTopologyAlignment(resource) && hint.NUMANodeAffinity != nil {
allocated = m.takeByTopology(resource, available, hint.NUMANodeAffinity, needed)
}
// Updates m.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
// not holding the lock during the rpc call.
@@ -665,6 +674,74 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
return devices, nil
}
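The branch added above only takes the topology-aware path when the plugin reported topology information for the resource and the TopologyManager recorded a non-nil NUMA affinity for the container; otherwise the first 'needed' devices are taken as before. A minimal standalone sketch of that gating (not part of the change; the name selectDevices is hypothetical and a plain *uint64 bitmask stands in for socketmask.SocketMask):

package main

import "fmt"

// selectDevices mirrors the gating added to devicesToAllocate: take the first
// 'needed' available devices unless the resource has topology information and
// a NUMA affinity hint exists, in which case a topology-aware picker is used.
func selectDevices(available []string, needed int, hasTopology bool,
	affinity *uint64, takeByTopology func([]string, uint64, int) []string) []string {
	allocated := available[:needed]
	if hasTopology && affinity != nil {
		allocated = takeByTopology(available, *affinity, needed)
	}
	return allocated
}

func main() {
	available := []string{"dev0", "dev1", "dev2"}
	// No hint recorded for this container: the plain prefix is used.
	fmt.Println(selectDevices(available, 2, true, nil, nil)) // [dev0 dev1]
}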
func (m *ManagerImpl) takeByTopology(resource string, available sets.String, affinity socketmask.SocketMask, request int) []string {
// Build a map of NUMA Nodes to the devices associated with them. A
// device may be associated with multiple NUMA nodes at the same time. If an
// available device does not have any NUMA Nodes associated with it, add it
// to the list for the fake NUMA Node -1.
perNodeDevices := make(map[int]sets.String)
for d := range available {
var nodes []int
if m.allDevices[resource][d].Topology != nil {
for _, node := range m.allDevices[resource][d].Topology.Nodes {
nodes = append(nodes, int(node.ID))
}
}
if len(nodes) == 0 {
nodes = []int{-1}
}
for _, node := range nodes {
if _, ok := perNodeDevices[node]; !ok {
perNodeDevices[node] = sets.NewString()
}
perNodeDevices[node].Insert(d)
}
}
// Get a flat list of all of the nodes associated with available devices.
var nodes []int
for node := range perNodeDevices {
nodes = append(nodes, node)
}
// Sort the list of nodes by how many devices they contain.
sort.Slice(nodes, func(i, j int) bool {
return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()
})
// Generate three lists of devices. Devices in the first list come from
// valid NUMA Nodes contained in the affinity mask. Devices in the second
// list come from valid NUMA Nodes not in the affinity mask. Devices in the
// third list have no NUMA Node association (i.e. they are mapped to the
// fake NUMA Node -1). Because the list of NUMA nodes is walked in sorted
// order, a device associated with multiple NUMA Nodes is classified by the
// least populated node that contains it.
var fromAffinity []string
var notFromAffinity []string
var withoutTopology []string
for d := range available {
// Since the same device may be associated with multiple NUMA Nodes, we
// need to be careful not to add it to more than one list. The logic
// below ensures this by breaking out of the loop at the first NUMA node
// that contains the device.
for _, n := range nodes {
if perNodeDevices[n].Has(d) {
if n == -1 {
withoutTopology = append(withoutTopology, d)
} else if affinity.IsSet(n) {
fromAffinity = append(fromAffinity, d)
} else {
notFromAffinity = append(notFromAffinity, d)
}
break
}
}
}
// Concatenate the lists above and return the first 'request' devices.
return append(append(fromAffinity, notFromAffinity...), withoutTopology...)[:request]
}
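For a concrete sense of the ordering takeByTopology produces, here is a small standalone sketch (not part of the change; device names are made up, a map[int]bool stands in for the affinity mask, and each device is listed under a single NUMA node so the dedup/break logic is elided):

package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical devices keyed by NUMA node; node -1 holds devices that
	// reported no topology at all.
	perNodeDevices := map[int][]string{
		0:  {"dev0", "dev1"},
		1:  {"dev2", "dev3", "dev4"},
		-1: {"dev5"},
	}
	// Affinity mask from the TopologyManager hint: only NUMA node 0 is set.
	affinity := map[int]bool{0: true}

	// Visit NUMA nodes with fewer devices first, mirroring the sort above.
	var nodes []int
	for n := range perNodeDevices {
		nodes = append(nodes, n)
	}
	sort.Slice(nodes, func(i, j int) bool {
		return len(perNodeDevices[nodes[i]]) < len(perNodeDevices[nodes[j]])
	})

	// Partition the devices into the same three lists used by takeByTopology.
	var fromAffinity, notFromAffinity, withoutTopology []string
	for _, n := range nodes {
		for _, d := range perNodeDevices[n] {
			switch {
			case n == -1:
				withoutTopology = append(withoutTopology, d)
			case affinity[n]:
				fromAffinity = append(fromAffinity, d)
			default:
				notFromAffinity = append(notFromAffinity, d)
			}
		}
	}

	// Requesting 3 devices prefers both devices on the aligned node 0 and
	// pads the remainder from nodes outside the affinity mask.
	ordered := append(append(fromAffinity, notFromAffinity...), withoutTopology...)
	fmt.Println(ordered[:3]) // [dev0 dev1 dev2]
}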
// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,