Merge pull request #73824 from jiayingz/reallocate

Checks whether we have cached runtime state before starting a container
This commit is contained in:
Kubernetes Prow Robot 2019-02-15 20:35:30 -08:00 committed by GitHub
commit c88dcee3e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 4 deletions

View File

@ -311,10 +311,7 @@ func (m *ManagerImpl) isVersionCompatibleWithPlugin(versions []string) bool {
return false
}
// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
func (m *ManagerImpl) Allocate(node *schedulernodeinfo.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
func (m *ManagerImpl) allocatePodResources(pod *v1.Pod) error {
devicesToReuse := make(map[string]sets.String)
for _, container := range pod.Spec.InitContainers {
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
@ -328,6 +325,18 @@ func (m *ManagerImpl) Allocate(node *schedulernodeinfo.NodeInfo, attrs *lifecycl
}
m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
}
return nil
}
// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
func (m *ManagerImpl) Allocate(node *schedulernodeinfo.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
err := m.allocatePodResources(pod)
if err != nil {
klog.Errorf("Failed to allocate device plugin resource for pod %s: %v", string(pod.UID), err)
return err
}
m.mutex.Lock()
defer m.mutex.Unlock()
@ -717,6 +726,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
podUID := string(pod.UID)
contName := container.Name
needsReAllocate := false
for k := range container.Resources.Limits {
resource := string(k)
if !m.isDevicePluginResource(resource) {
@ -726,6 +736,16 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
if err != nil {
return nil, err
}
// This is a device plugin resource yet we don't have cached
// resource state. This is likely due to a race during node
// restart. We re-issue allocate request to cover this race.
if m.podDevices.containerDevices(podUID, contName, resource) == nil {
needsReAllocate = true
}
}
if needsReAllocate {
klog.V(2).Infof("needs re-allocate device plugin resources for pod %s", podUID)
m.allocatePodResources(pod)
}
m.mutex.Lock()
defer m.mutex.Unlock()

View File

@ -134,6 +134,7 @@ func (kl *Kubelet) reconcileExtendedResource(initialNode, node *v1.Node) bool {
requiresUpdate := false
for k := range node.Status.Capacity {
if v1helper.IsExtendedResourceName(k) {
klog.Infof("Zero out resource %s capacity in existing node.", k)
node.Status.Capacity[k] = *resource.NewQuantity(int64(0), resource.DecimalSI)
node.Status.Allocatable[k] = *resource.NewQuantity(int64(0), resource.DecimalSI)
requiresUpdate = true