Merge pull request #57266 from vikaschoudhary16/unhealthy_device
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Handle unhealthy devices: update node capacity with the sum of both healthy and unhealthy devices, while node allocatable reflects only healthy devices.

**What this PR does / why we need it**: Currently node capacity reflects only healthy devices; unhealthy devices are ignored entirely when node status is updated. This PR accounts for unhealthy devices while updating node status.

**Which issue(s) this PR fixes**: Fixes #57241

**Special notes for your reviewer**:

**Release note**:

```release-note
Handle Unhealthy devices
```

/cc @tengqm @ConnorDoyle @jiayingz @vishh @jeremyeder @sjenning @resouer @ScorpioCPH @lichuqiang @RenaudWasTaken @balajismaniam
/sig node
This commit is contained in: commit f2e46a2147
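
In short, the change tracks healthy and unhealthy devices separately: node capacity becomes the sum of both, while node allocatable counts only healthy devices. Below is a minimal, self-contained sketch of that accounting; it is not code from this PR, and the type and variable names are illustrative only:

```go
package main

import "fmt"

// deviceCounts is an illustrative stand-in for the per-resource sets the
// device plugin manager keeps after this change (healthyDevices / unhealthyDevices).
type deviceCounts struct {
    healthy   int
    unhealthy int
}

func main() {
    devices := map[string]deviceCounts{
        "domain1.com/resource1": {healthy: 2, unhealthy: 1},
    }

    capacity := map[string]int{}    // reported as node capacity
    allocatable := map[string]int{} // reported as node allocatable

    for name, c := range devices {
        capacity[name] = c.healthy + c.unhealthy // capacity = healthy + unhealthy
        allocatable[name] = c.healthy            // allocatable = healthy only
    }

    fmt.Println(capacity["domain1.com/resource1"], allocatable["domain1.com/resource1"]) // 3 2
}
```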
```diff
@@ -70,9 +70,10 @@ type ContainerManager interface {
     // GetCapacity returns the amount of compute resources tracked by container manager available on the node.
     GetCapacity() v1.ResourceList
 
-    // GetDevicePluginResourceCapacity returns the amount of device plugin resources available on the node
-    // and inactive device plugin resources previously registered on the node.
-    GetDevicePluginResourceCapacity() (v1.ResourceList, []string)
+    // GetDevicePluginResourceCapacity returns the node capacity (amount of total device plugin resources),
+    // node allocatable (amount of total healthy resources reported by device plugin),
+    // and inactive device plugin resources previously registered on the node.
+    GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string)
 
     // UpdateQOSCgroups performs housekeeping updates to ensure that the top
     // level QoS containers have their desired state in a thread-safe way
```
```diff
@@ -887,6 +887,6 @@ func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
     return cm.capacity
 }
 
-func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, []string) {
+func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
     return cm.devicePluginManager.GetCapacity()
 }
```
```diff
@@ -70,8 +70,8 @@ func (cm *containerManagerStub) GetCapacity() v1.ResourceList {
     return nil
 }
 
-func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, []string) {
-    return nil, []string{}
+func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
+    return nil, nil, []string{}
 }
 
 func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
```
```diff
@@ -73,8 +73,11 @@ type ManagerImpl struct {
     // e.g. a new device is advertised, two old devices are deleted and a running device fails.
     callback monitorCallback
 
-    // allDevices contains all of registered resourceNames and their exported device IDs.
-    allDevices map[string]sets.String
+    // healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
+    healthyDevices map[string]sets.String
+
+    // unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
+    unhealthyDevices map[string]sets.String
 
     // allocatedDevices contains allocated deviceIds, keyed by resourceName.
     allocatedDevices map[string]sets.String
```
```diff
@@ -106,7 +109,8 @@ func newManagerImpl(socketPath string) (*ManagerImpl, error) {
         endpoints:        make(map[string]endpoint),
         socketname:       file,
         socketdir:        dir,
-        allDevices:       make(map[string]sets.String),
+        healthyDevices:   make(map[string]sets.String),
+        unhealthyDevices: make(map[string]sets.String),
         allocatedDevices: make(map[string]sets.String),
         podDevices:       make(podDevices),
     }
```
```diff
@@ -128,20 +132,24 @@ func newManagerImpl(socketPath string) (*ManagerImpl, error) {
 func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, added, updated, deleted []pluginapi.Device) {
     kept := append(updated, added...)
     m.mutex.Lock()
-    if _, ok := m.allDevices[resourceName]; !ok {
-        m.allDevices[resourceName] = sets.NewString()
+    if _, ok := m.healthyDevices[resourceName]; !ok {
+        m.healthyDevices[resourceName] = sets.NewString()
+    }
+    if _, ok := m.unhealthyDevices[resourceName]; !ok {
+        m.unhealthyDevices[resourceName] = sets.NewString()
     }
-    // For now, Manager only keeps track of healthy devices.
-    // TODO: adds support to track unhealthy devices.
     for _, dev := range kept {
         if dev.Health == pluginapi.Healthy {
-            m.allDevices[resourceName].Insert(dev.ID)
+            m.healthyDevices[resourceName].Insert(dev.ID)
+            m.unhealthyDevices[resourceName].Delete(dev.ID)
         } else {
-            m.allDevices[resourceName].Delete(dev.ID)
+            m.unhealthyDevices[resourceName].Insert(dev.ID)
+            m.healthyDevices[resourceName].Delete(dev.ID)
         }
     }
     for _, dev := range deleted {
-        m.allDevices[resourceName].Delete(dev.ID)
+        m.healthyDevices[resourceName].Delete(dev.ID)
+        m.unhealthyDevices[resourceName].Delete(dev.ID)
     }
     m.mutex.Unlock()
     m.writeCheckpoint()
```
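
The callback above now keeps each device ID in exactly one of the two sets. A minimal, dependency-free sketch of that transition (illustrative names, plain maps in place of the k8s string sets):

```go
package main

import "fmt"

// markDevice mirrors the insert/delete pairs in genericDeviceUpdateCallback:
// a device ID lives in the healthy set or the unhealthy set, never both.
func markDevice(healthy, unhealthy map[string]bool, id string, isHealthy bool) {
    if isHealthy {
        healthy[id] = true
        delete(unhealthy, id)
    } else {
        unhealthy[id] = true
        delete(healthy, id)
    }
}

func main() {
    healthy := map[string]bool{"dev1": true, "dev2": true}
    unhealthy := map[string]bool{}

    markDevice(healthy, unhealthy, "dev2", false) // dev2 reported as Unhealthy
    fmt.Println(len(healthy), len(unhealthy))     // 1 1
}
```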
```diff
@@ -371,7 +379,8 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
 
 // GetCapacity is expected to be called when Kubelet updates its node status.
 // The first returned variable contains the registered device plugin resource capacity.
-// The second returned variable contains previously registered resources that are no longer active.
+// The second returned variable contains the registered device plugin resource allocatable.
+// The third returned variable contains previously registered resources that are no longer active.
 // Kubelet uses this information to update resource capacity/allocatable in its node status.
 // After the call, device plugin can remove the inactive resources from its internal list as the
 // change is already reflected in Kubelet node status.
```
```diff
@@ -380,25 +389,47 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
 // cm.UpdatePluginResource() run during predicate Admit guarantees we adjust nodeinfo
 // capacity for already allocated pods so that they can continue to run. However, new pods
 // requiring device plugin resources will not be scheduled till device plugin re-registers.
-func (m *ManagerImpl) GetCapacity() (v1.ResourceList, []string) {
+func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
     needsUpdateCheckpoint := false
     var capacity = v1.ResourceList{}
+    var allocatable = v1.ResourceList{}
     var deletedResources []string
     m.mutex.Lock()
-    for resourceName, devices := range m.allDevices {
+    for resourceName, devices := range m.healthyDevices {
         if _, ok := m.endpoints[resourceName]; !ok {
-            delete(m.allDevices, resourceName)
+            delete(m.healthyDevices, resourceName)
             deletedResources = append(deletedResources, resourceName)
             needsUpdateCheckpoint = true
         } else {
             capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
+            allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
+        }
+    }
+    for resourceName, devices := range m.unhealthyDevices {
+        if _, ok := m.endpoints[resourceName]; !ok {
+            delete(m.unhealthyDevices, resourceName)
+            alreadyDeleted := false
+            for _, name := range deletedResources {
+                if name == resourceName {
+                    alreadyDeleted = true
+                }
+            }
+            if !alreadyDeleted {
+                deletedResources = append(deletedResources, resourceName)
+            }
+            needsUpdateCheckpoint = true
+        } else {
+            capacityCount := capacity[v1.ResourceName(resourceName)]
+            unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
+            capacityCount.Add(unhealthyCount)
+            capacity[v1.ResourceName(resourceName)] = capacityCount
         }
     }
     m.mutex.Unlock()
     if needsUpdateCheckpoint {
         m.writeCheckpoint()
     }
-    return capacity, deletedResources
+    return capacity, allocatable, deletedResources
 }
 
 // checkpointData struct is used to store pod to device allocation information
```
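
GetCapacity now walks both sets. When a resource's endpoint has gone away, the resource is dropped from both the healthy and unhealthy maps but should appear only once in the returned deleted list. A tiny sketch of that de-duplication (illustrative helper names, not the PR's API):

```go
package main

import "fmt"

// appendIfMissing mirrors the alreadyDeleted check above: a resource removed
// from both the healthy and unhealthy sets is reported once, not twice.
func appendIfMissing(deleted []string, name string) []string {
    for _, n := range deleted {
        if n == name {
            return deleted
        }
    }
    return append(deleted, name)
}

func main() {
    var deleted []string
    deleted = appendIfMissing(deleted, "domain1.com/resource1") // from the healthyDevices loop
    deleted = appendIfMissing(deleted, "domain1.com/resource1") // from the unhealthyDevices loop
    fmt.Println(deleted) // [domain1.com/resource1]
}
```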
```diff
@@ -416,7 +447,7 @@ func (m *ManagerImpl) writeCheckpoint() error {
         PodDeviceEntries:  m.podDevices.toCheckpointData(),
         RegisteredDevices: make(map[string][]string),
     }
-    for resource, devices := range m.allDevices {
+    for resource, devices := range m.healthyDevices {
         data.RegisteredDevices[resource] = devices.UnsortedList()
     }
     m.mutex.Unlock()
```
```diff
@@ -453,9 +484,10 @@ func (m *ManagerImpl) readCheckpoint() error {
     m.podDevices.fromCheckpointData(data.PodDeviceEntries)
     m.allocatedDevices = m.podDevices.devices()
     for resource, devices := range data.RegisteredDevices {
-        m.allDevices[resource] = sets.NewString()
+        // TODO: Support Checkpointing for unhealthy devices as well
+        m.healthyDevices[resource] = sets.NewString()
         for _, dev := range devices {
-            m.allDevices[resource].Insert(dev)
+            m.healthyDevices[resource].Insert(dev)
         }
     }
     return nil
```
```diff
@@ -508,7 +540,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
     }
     glog.V(3).Infof("Needs to allocate %v %v for pod %q container %q", needed, resource, podUID, contName)
     // Needs to allocate additional devices.
-    if _, ok := m.allDevices[resource]; !ok {
+    if _, ok := m.healthyDevices[resource]; !ok {
         return nil, fmt.Errorf("can't allocate unregistered device %v", resource)
     }
     devices = sets.NewString()
```
```diff
@@ -527,7 +559,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
     // Gets Devices in use.
     devicesInUse := m.allocatedDevices[resource]
     // Gets a list of available devices.
-    available := m.allDevices[resource].Difference(devicesInUse)
+    available := m.healthyDevices[resource].Difference(devicesInUse)
     if int(available.Len()) < needed {
         return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
     }
```
```diff
@@ -558,7 +590,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
     resource := string(k)
     needed := int(v.Value())
     glog.V(3).Infof("needs %d %s", needed, resource)
-    _, registeredResource := m.allDevices[resource]
+    _, registeredResource := m.healthyDevices[resource]
     _, allocatedResource := m.allocatedDevices[resource]
     // Continues if this is neither an active device plugin resource nor
     // a resource we have previously allocated.
```
|
@ -58,6 +58,6 @@ func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GetCapacity simply returns nil capacity and empty removed resource list.
|
// GetCapacity simply returns nil capacity and empty removed resource list.
|
||||||
func (h *ManagerStub) GetCapacity() (v1.ResourceList, []string) {
|
func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
|
||||||
return nil, []string{}
|
return nil, nil, []string{}
|
||||||
}
|
}
|
||||||
|
@ -138,7 +138,7 @@ func cleanup(t *testing.T, m Manager, p *Stub) {
|
|||||||
m.Stop()
|
m.Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestUpdateCapacity(t *testing.T) {
|
func TestUpdateCapacityAllocatable(t *testing.T) {
|
||||||
testManager, err := newManagerImpl(socketName)
|
testManager, err := newManagerImpl(socketName)
|
||||||
as := assert.New(t)
|
as := assert.New(t)
|
||||||
as.NotNil(testManager)
|
as.NotNil(testManager)
|
||||||
```diff
@@ -156,61 +156,81 @@ func TestUpdateCapacity(t *testing.T) {
     resourceName1 := "domain1.com/resource1"
     testManager.endpoints[resourceName1] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
     callback(resourceName1, devs, []pluginapi.Device{}, []pluginapi.Device{})
-    capacity, removedResources := testManager.GetCapacity()
+    capacity, allocatable, removedResources := testManager.GetCapacity()
     resource1Capacity, ok := capacity[v1.ResourceName(resourceName1)]
     as.True(ok)
-    as.Equal(int64(2), resource1Capacity.Value())
+    resource1Allocatable, ok := allocatable[v1.ResourceName(resourceName1)]
+    as.True(ok)
+    as.Equal(int64(3), resource1Capacity.Value())
+    as.Equal(int64(2), resource1Allocatable.Value())
     as.Equal(0, len(removedResources))
 
-    // Deletes an unhealthy device should NOT change capacity.
+    // Deletes an unhealthy device should NOT change allocatable but change capacity.
     callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]})
-    capacity, removedResources = testManager.GetCapacity()
+    capacity, allocatable, removedResources = testManager.GetCapacity()
     resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
     as.True(ok)
+    resource1Allocatable, ok = allocatable[v1.ResourceName(resourceName1)]
+    as.True(ok)
     as.Equal(int64(2), resource1Capacity.Value())
+    as.Equal(int64(2), resource1Allocatable.Value())
     as.Equal(0, len(removedResources))
 
-    // Updates a healthy device to unhealthy should reduce capacity by 1.
+    // Updates a healthy device to unhealthy should reduce allocatable by 1.
     dev2 := devs[1]
     dev2.Health = pluginapi.Unhealthy
     callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{dev2}, []pluginapi.Device{})
-    capacity, removedResources = testManager.GetCapacity()
+    capacity, allocatable, removedResources = testManager.GetCapacity()
     resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
     as.True(ok)
-    as.Equal(int64(1), resource1Capacity.Value())
+    resource1Allocatable, ok = allocatable[v1.ResourceName(resourceName1)]
+    as.True(ok)
+    as.Equal(int64(2), resource1Capacity.Value())
+    as.Equal(int64(1), resource1Allocatable.Value())
     as.Equal(0, len(removedResources))
 
-    // Deletes a healthy device should reduce capacity by 1.
+    // Deletes a healthy device should reduce capacity and allocatable by 1.
     callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[0]})
-    capacity, removedResources = testManager.GetCapacity()
+    capacity, allocatable, removedResources = testManager.GetCapacity()
     resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
     as.True(ok)
-    as.Equal(int64(0), resource1Capacity.Value())
+    resource1Allocatable, ok = allocatable[v1.ResourceName(resourceName1)]
+    as.True(ok)
+    as.Equal(int64(0), resource1Allocatable.Value())
+    as.Equal(int64(1), resource1Capacity.Value())
     as.Equal(0, len(removedResources))
 
     // Tests adding another resource.
     resourceName2 := "resource2"
     testManager.endpoints[resourceName2] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
     callback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{})
-    capacity, removedResources = testManager.GetCapacity()
+    capacity, allocatable, removedResources = testManager.GetCapacity()
     as.Equal(2, len(capacity))
     resource2Capacity, ok := capacity[v1.ResourceName(resourceName2)]
     as.True(ok)
-    as.Equal(int64(2), resource2Capacity.Value())
+    resource2Allocatable, ok := allocatable[v1.ResourceName(resourceName2)]
+    as.True(ok)
+    as.Equal(int64(3), resource2Capacity.Value())
+    as.Equal(int64(2), resource2Allocatable.Value())
     as.Equal(0, len(removedResources))
 
     // Removes resourceName1 endpoint. Verifies testManager.GetCapacity() reports that resourceName1
-    // is removed from capacity and it no longer exists in allDevices after the call.
+    // is removed from capacity and it no longer exists in healthyDevices after the call.
     delete(testManager.endpoints, resourceName1)
-    capacity, removed := testManager.GetCapacity()
+    capacity, allocatable, removed := testManager.GetCapacity()
     as.Equal([]string{resourceName1}, removed)
     _, ok = capacity[v1.ResourceName(resourceName1)]
     as.False(ok)
     val, ok := capacity[v1.ResourceName(resourceName2)]
     as.True(ok)
-    as.Equal(int64(2), val.Value())
-    _, ok = testManager.allDevices[resourceName1]
+    as.Equal(int64(3), val.Value())
+    _, ok = testManager.healthyDevices[resourceName1]
     as.False(ok)
+    _, ok = testManager.unhealthyDevices[resourceName1]
+    as.False(ok)
+    fmt.Println("removed: ", removed)
+    as.Equal(1, len(removed))
+
 }
 
 type stringPairType struct {
```
```diff
@@ -259,7 +279,7 @@ func TestCheckpoint(t *testing.T) {
     defer os.RemoveAll(tmpDir)
     testManager := &ManagerImpl{
         socketdir:        tmpDir,
-        allDevices:       make(map[string]sets.String),
+        healthyDevices:   make(map[string]sets.String),
         allocatedDevices: make(map[string]sets.String),
         podDevices:       make(podDevices),
     }
```
```diff
@@ -283,19 +303,19 @@ func TestCheckpoint(t *testing.T) {
         constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"},
             map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
 
-    testManager.allDevices[resourceName1] = sets.NewString()
-    testManager.allDevices[resourceName1].Insert("dev1")
-    testManager.allDevices[resourceName1].Insert("dev2")
-    testManager.allDevices[resourceName1].Insert("dev3")
-    testManager.allDevices[resourceName1].Insert("dev4")
-    testManager.allDevices[resourceName1].Insert("dev5")
-    testManager.allDevices[resourceName2] = sets.NewString()
-    testManager.allDevices[resourceName2].Insert("dev1")
-    testManager.allDevices[resourceName2].Insert("dev2")
+    testManager.healthyDevices[resourceName1] = sets.NewString()
+    testManager.healthyDevices[resourceName1].Insert("dev1")
+    testManager.healthyDevices[resourceName1].Insert("dev2")
+    testManager.healthyDevices[resourceName1].Insert("dev3")
+    testManager.healthyDevices[resourceName1].Insert("dev4")
+    testManager.healthyDevices[resourceName1].Insert("dev5")
+    testManager.healthyDevices[resourceName2] = sets.NewString()
+    testManager.healthyDevices[resourceName2].Insert("dev1")
+    testManager.healthyDevices[resourceName2].Insert("dev2")
 
     expectedPodDevices := testManager.podDevices
     expectedAllocatedDevices := testManager.podDevices.devices()
-    expectedAllDevices := testManager.allDevices
+    expectedAllDevices := testManager.healthyDevices
 
     err = testManager.writeCheckpoint()
 
```
```diff
@@ -320,7 +340,7 @@ func TestCheckpoint(t *testing.T) {
         }
     }
     as.True(reflect.DeepEqual(expectedAllocatedDevices, testManager.allocatedDevices))
-    as.True(reflect.DeepEqual(expectedAllDevices, testManager.allDevices))
+    as.True(reflect.DeepEqual(expectedAllDevices, testManager.healthyDevices))
 }
 
 type activePodsStub struct {
```
```diff
@@ -377,7 +397,7 @@ func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestReso
     testManager := &ManagerImpl{
         socketdir:        tmpDir,
         callback:         monitorCallback,
-        allDevices:       make(map[string]sets.String),
+        healthyDevices:   make(map[string]sets.String),
         allocatedDevices: make(map[string]sets.String),
         endpoints:        make(map[string]endpoint),
         podDevices:       make(podDevices),
```
```diff
@@ -386,9 +406,9 @@ func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestReso
     }
     testManager.store, _ = utilstore.NewFileStore("/tmp/", utilfs.DefaultFs{})
     for _, res := range testRes {
-        testManager.allDevices[res.resourceName] = sets.NewString()
+        testManager.healthyDevices[res.resourceName] = sets.NewString()
         for _, dev := range res.devs {
-            testManager.allDevices[res.resourceName].Insert(dev)
+            testManager.healthyDevices[res.resourceName].Insert(dev)
         }
         if res.resourceName == "domain1.com/resource1" {
             testManager.endpoints[res.resourceName] = &MockEndpoint{
```
```diff
@@ -675,7 +695,7 @@ func TestSanitizeNodeAllocatable(t *testing.T) {
 
     testManager := &ManagerImpl{
         callback:         monitorCallback,
-        allDevices:       make(map[string]sets.String),
+        healthyDevices:   make(map[string]sets.String),
         allocatedDevices: make(map[string]sets.String),
         podDevices:       make(podDevices),
     }
```
|
@ -53,9 +53,9 @@ type Manager interface {
|
|||||||
// for the found one. An empty struct is returned in case no cached state is found.
|
// for the found one. An empty struct is returned in case no cached state is found.
|
||||||
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
|
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
|
||||||
|
|
||||||
// GetCapacity returns the amount of available device plugin resource capacity
|
// GetCapacity returns the amount of available device plugin resource capacity, resource allocatable
|
||||||
// and inactive device plugin resources previously registered on the node.
|
// and inactive device plugin resources previously registered on the node.
|
||||||
GetCapacity() (v1.ResourceList, []string)
|
GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
|
||||||
}
|
}
|
||||||
|
|
||||||
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
|
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
|
||||||
|
@ -548,6 +548,10 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var devicePluginAllocatable v1.ResourceList
|
||||||
|
var devicePluginCapacity v1.ResourceList
|
||||||
|
var removedDevicePlugins []string
|
||||||
|
|
||||||
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
|
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
|
||||||
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
|
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
|
||||||
info, err := kl.GetCachedMachineInfo()
|
info, err := kl.GetCachedMachineInfo()
|
||||||
```diff
@@ -592,13 +596,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
         }
     }
 
-    devicePluginCapacity, removedDevicePlugins := kl.containerManager.GetDevicePluginResourceCapacity()
+    devicePluginCapacity, devicePluginAllocatable, removedDevicePlugins = kl.containerManager.GetDevicePluginResourceCapacity()
     if devicePluginCapacity != nil {
         for k, v := range devicePluginCapacity {
             glog.V(2).Infof("Update capacity for %s to %d", k, v.Value())
             node.Status.Capacity[k] = v
         }
     }
 
     for _, removedResource := range removedDevicePlugins {
         glog.V(2).Infof("Remove capacity for %s", removedResource)
         delete(node.Status.Capacity, v1.ResourceName(removedResource))
```
```diff
@@ -629,6 +634,12 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
         }
         node.Status.Allocatable[k] = value
     }
+    if devicePluginAllocatable != nil {
+        for k, v := range devicePluginAllocatable {
+            glog.V(2).Infof("Update allocatable for %s to %d", k, v.Value())
+            node.Status.Allocatable[k] = v
+        }
+    }
     // for every huge page reservation, we need to remove it from allocatable memory
     for k, v := range node.Status.Capacity {
         if v1helper.IsHugePageResourceName(k) {
```