add admission handler for device resources allocation

This commit is contained in:
lichuqiang 2017-11-02 09:17:48 +08:00
parent 441e646827
commit ebd445eb8c
12 changed files with 157 additions and 37 deletions

View File

@ -37,8 +37,10 @@ go_library(
"//pkg/kubelet/cm/cpumanager:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//pkg/util/mount:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",

View File

@ -26,7 +26,9 @@ import (
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
"fmt"
"strconv"
@ -74,7 +76,14 @@ type ContainerManager interface {
// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
// extended resources required by container.
GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error)
GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)
// UpdatePluginResources calls Allocate on the device plugin handler for any
// device plugin resources the pod requests, and returns an error if that fails.
// Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource.
UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error
InternalContainerLifecycle() InternalContainerLifecycle
}

View File

@ -48,6 +48,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/deviceplugin"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/status"
utilfile "k8s.io/kubernetes/pkg/util/file"
@ -56,6 +57,7 @@ import (
"k8s.io/kubernetes/pkg/util/procfs"
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
utilversion "k8s.io/kubernetes/pkg/util/version"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
const (
@ -594,7 +596,7 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
}, time.Second, stopChan)
// Starts device plugin manager.
if err := cm.devicePluginHandler.Start(); err != nil {
if err := cm.devicePluginHandler.Start(deviceplugin.ActivePodsFunc(activePods)); err != nil {
return err
}
return nil
@ -615,14 +617,10 @@ func (cm *containerManagerImpl) setFsCapacity() error {
}
// TODO: move the GetResources logic to PodContainerManager.
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
opts := &kubecontainer.RunContainerOptions{}
// Gets devices, mounts, and envs from device plugin handler.
glog.V(3).Infof("Calling devicePluginHandler AllocateDevices")
err := cm.devicePluginHandler.Allocate(pod, container, activePods)
if err != nil {
return opts, err
}
// Allocate should already be called during predicateAdmitHandler.Admit(),
// just try to fetch device runtime information from cached state here
devOpts := cm.devicePluginHandler.GetDeviceRunContainerOptions(pod, container)
if devOpts == nil {
return opts, nil
@ -633,6 +631,10 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe
return opts, nil
}
// UpdatePluginResources forwards node and attrs to the device plugin handler's
// Allocate method and returns whatever error that call produces.
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.devicePluginHandler.Allocate(node, attrs)
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
cpuLimit := int64(0)

View File

@ -23,7 +23,9 @@ import (
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
type containerManagerStub struct{}
@ -71,10 +73,14 @@ func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerStub) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
// GetResources ignores its arguments and always returns an empty
// RunContainerOptions together with a nil error.
func (cm *containerManagerStub) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
// UpdatePluginResources is a no-op for the stub: both arguments are ignored
// and nil is always returned.
func (cm *containerManagerStub) UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
// InternalContainerLifecycle returns a new internalContainerLifecycleImpl
// wrapping the value returned by cpumanager.NewFakeManager().
func (cm *containerManagerStub) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager()}
}

View File

@ -27,8 +27,10 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
type unsupportedContainerManager struct {
@ -76,10 +78,14 @@ func (cm *unsupportedContainerManager) NewPodContainerManager() PodContainerMana
return &unsupportedPodContainerManager{}
}
func (cm *unsupportedContainerManager) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
// GetResources ignores its arguments and always returns an empty
// RunContainerOptions together with a nil error.
func (cm *unsupportedContainerManager) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
// UpdatePluginResources is a no-op for the unsupported container manager:
// both arguments are ignored and nil is always returned.
func (cm *unsupportedContainerManager) UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
// InternalContainerLifecycle returns a new internalContainerLifecycleImpl
// wrapping the value returned by cpumanager.NewFakeManager().
func (cm *unsupportedContainerManager) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager()}
}

View File

@ -22,6 +22,8 @@ go_library(
"//pkg/api/v1/helper:go_default_library",
"//pkg/kubelet/apis/deviceplugin/v1alpha:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",

View File

@ -29,19 +29,28 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// Handler defines the functions used to manage and access device plugin resources.
type Handler interface {
// Start starts device plugin registration service.
Start() error
Start(activePods ActivePodsFunc) error
// Devices returns all of registered devices keyed by resourceName.
Devices() map[string][]pluginapi.Device
// Allocate attempts to allocate all of required extended resources for
// the input container, issues an Allocate rpc request for each of such
// resources, processes their AllocateResponses, and updates the cached
// containerDevices on success.
Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) error
// Allocate scans through containers in the pod spec.
// If it finds the container requires device plugin resource, it:
// 1. Checks whether it already has this information in its cached state.
// 2. If not, it calls Allocate and populates its cached state afterwards.
// 3. If there is no cached state and Allocate fails, it returns an error.
// 4. Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource.
Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
@ -53,6 +62,10 @@ type HandlerImpl struct {
// TODO: consider to change this to RWMutex.
sync.Mutex
devicePluginManager Manager
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
// could be counted when updating allocated devices
activePods ActivePodsFunc
// devicePluginManagerMonitorCallback is used for testing only.
devicePluginManagerMonitorCallback MonitorCallback
// allDevices contains all of registered resourceNames and their exported device IDs.
@ -103,16 +116,21 @@ func NewHandlerImpl(updateCapacityFunc func(v1.ResourceList)) (*HandlerImpl, err
handler.devicePluginManager = mgr
handler.devicePluginManagerMonitorCallback = deviceManagerMonitorCallback
// Loads in allocatedDevices information from disk.
err = handler.readCheckpoint()
if err != nil {
glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err)
}
return handler, nil
}
// Start starts device plugin registration service.
func (h *HandlerImpl) Start() error {
// Start records the active-pods lister on the handler, attempts to restore
// device allocation state from the checkpoint, and then starts the device
// plugin manager.
//
// A checkpoint read failure is only logged as a warning; the returned error
// comes solely from devicePluginManager.Start.
func (h *HandlerImpl) Start(activePods ActivePodsFunc) error {
	h.activePods = activePods

	// Best effort: keep going even when the checkpoint cannot be read.
	if readErr := h.readCheckpoint(); readErr != nil {
		glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", readErr)
	}

	startErr := h.devicePluginManager.Start()
	return startErr
}
@ -166,11 +184,11 @@ func (h *HandlerImpl) devicesToAllocate(podUID, contName, resource string, requi
return devices, nil
}
// Allocate attempts to allocate all of required extended resources for
// the input container, issues an Allocate rpc request for each of such
// resources, processes their AllocateResponses, and updates the cached
// containerDevices on success.
func (h *HandlerImpl) Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) error {
// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
func (h *HandlerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container) error {
podUID := string(pod.UID)
contName := container.Name
allocatedDevicesUpdated := false
@ -184,7 +202,7 @@ func (h *HandlerImpl) Allocate(pod *v1.Pod, container *v1.Container, activePods
// Updates allocatedDevices to garbage collect any stranded resources
// before doing the device plugin allocation.
if !allocatedDevicesUpdated {
h.updateAllocatedDevices(activePods)
h.updateAllocatedDevices(h.activePods())
allocatedDevicesUpdated = true
}
allocDevices, err := h.devicesToAllocate(podUID, contName, resource, needed)
@ -226,6 +244,60 @@ func (h *HandlerImpl) Allocate(pod *v1.Pod, container *v1.Container, activePods
return h.writeCheckpoint()
}
// Allocate walks every init container and then every regular container of the
// pod carried in attrs, calling allocateContainerResources for each one. The
// first failure is returned unchanged and stops the walk.
//
// Once all containers succeed, the allocatable resources in node are
// sanitized, but only when the pod has an entry in podDevices; otherwise
// node is left untouched. In both cases nil is returned.
func (h *HandlerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	pod := attrs.Pod
	// TODO: Reuse devices between init containers and regular containers.
	containerGroups := [][]v1.Container{pod.Spec.InitContainers, pod.Spec.Containers}
	for _, group := range containerGroups {
		for i := range group {
			// Hand the callee a pointer to a per-iteration copy.
			container := group[i]
			if err := h.allocateContainerResources(pod, &container); err != nil {
				return err
			}
		}
	}

	// Only pods present in podDevices need nodeInfo adjusted.
	if _, hasDevices := h.podDevices[string(pod.UID)]; hasDevices {
		h.sanitizeNodeAllocatable(node)
	}
	return nil
}
// sanitizeNodeAllocatable scans through allocatedDevices in DevicePluginHandler
// and, if necessary, raises allocatableResource in nodeInfo so that it is at
// least equal to the allocated capacity. This allows pods that have already
// been scheduled on the node to pass GeneralPredicates admission checking even
// upon device plugin failure.
//
// node is only modified when at least one resource has to be raised: the
// adjustments are collected on a clone of the current allocatable resources,
// which is installed through SetAllocatableResource at the end.
func (h *HandlerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
	var newAllocatableResource *schedulercache.Resource
	allocatableResource := node.AllocatableResource()
	if allocatableResource.ScalarResources == nil {
		// Only the local value is changed here.
		// NOTE(review): presumably this guarantees the clone below carries a
		// non-nil map to write into — Clone is not visible here, confirm.
		allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
	}
	for resource, devices := range h.allocatedDevices {
		name := v1.ResourceName(resource)
		// Compare in int64: narrowing the stored quantity to int could
		// truncate it on platforms where int is 32 bits wide.
		needed := int64(devices.Len())
		if quant, ok := allocatableResource.ScalarResources[name]; ok && quant >= needed {
			continue
		}
		// Needs to update nodeInfo.AllocatableResource to make sure
		// NodeInfo.allocatableResource at least equal to the capacity already allocated.
		if newAllocatableResource == nil {
			newAllocatableResource = allocatableResource.Clone()
		}
		newAllocatableResource.ScalarResources[name] = needed
	}
	if newAllocatableResource != nil {
		node.SetAllocatableResource(newAllocatableResource)
	}
}
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.

View File

@ -19,6 +19,8 @@ package deviceplugin
import (
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// HandlerStub provides a simple stub implementation for Handler.
@ -30,7 +32,7 @@ func NewHandlerStub() (*HandlerStub, error) {
}
// Start simply returns nil.
func (h *HandlerStub) Start() error {
// Start ignores activePods and always returns nil.
func (h *HandlerStub) Start(activePods ActivePodsFunc) error {
return nil
}
@ -40,7 +42,7 @@ func (h *HandlerStub) Devices() map[string][]pluginapi.Device {
}
// Allocate simply returns nil.
func (h *HandlerStub) Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) error {
// Allocate ignores node and attrs and always returns nil.
func (h *HandlerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return nil
}

View File

@ -873,7 +873,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.AddPodSyncHandler(activeDeadlineHandler)
criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler))
klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler, klet.containerManager.UpdatePluginResources))
// apply functional Option's
for _, opt := range kubeDeps.Options {
opt(klet)

View File

@ -383,7 +383,7 @@ func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
// the container runtime to set parameters for launching a container.
func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Container, podIP string) (*kubecontainer.RunContainerOptions, bool, error) {
useClusterFirstPolicy := false
opts, err := kl.containerManager.GetResources(pod, container, kl.GetActivePods())
opts, err := kl.containerManager.GetResources(pod, container)
if err != nil {
return nil, false, err
}

View File

@ -29,6 +29,8 @@ import (
type getNodeAnyWayFuncType func() (*v1.Node, error)
type pluginResourceUpdateFuncType func(*schedulercache.NodeInfo, *PodAdmitAttributes) error
// AdmissionFailureHandler is an interface which defines how to deal with a failure to admit a pod.
// This allows for the graceful handling of pod admission failure.
type AdmissionFailureHandler interface {
@ -36,15 +38,17 @@ type AdmissionFailureHandler interface {
}
type predicateAdmitHandler struct {
getNodeAnyWayFunc getNodeAnyWayFuncType
admissionFailureHandler AdmissionFailureHandler
getNodeAnyWayFunc getNodeAnyWayFuncType
pluginResourceUpdateFunc pluginResourceUpdateFuncType
admissionFailureHandler AdmissionFailureHandler
}
var _ PodAdmitHandler = &predicateAdmitHandler{}
func NewPredicateAdmitHandler(getNodeAnyWayFunc getNodeAnyWayFuncType, admissionFailureHandler AdmissionFailureHandler) *predicateAdmitHandler {
// NewPredicateAdmitHandler builds a predicateAdmitHandler holding the given
// node getter, admission failure handler and plugin resource update callback.
func NewPredicateAdmitHandler(getNodeAnyWayFunc getNodeAnyWayFuncType, admissionFailureHandler AdmissionFailureHandler, pluginResourceUpdateFunc pluginResourceUpdateFuncType) *predicateAdmitHandler {
	handler := &predicateAdmitHandler{
		getNodeAnyWayFunc:        getNodeAnyWayFunc,
		pluginResourceUpdateFunc: pluginResourceUpdateFunc,
		admissionFailureHandler:  admissionFailureHandler,
	}
	return handler
}
@ -63,6 +67,16 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
pods := attrs.OtherPods
nodeInfo := schedulercache.NewNodeInfo(pods...)
nodeInfo.SetNode(node)
// Ensure the node has enough plugin resources for those required by the pod.
if err = w.pluginResourceUpdateFunc(nodeInfo, attrs); err != nil {
message := fmt.Sprintf("Update plugin resources failed due to %v, which is unexpected.", err)
glog.Warningf("Failed to admit pod %v - %s", format.Pod(pod), message)
return PodAdmitResult{
Admit: false,
Reason: "UnexpectedAdmissionError",
Message: message,
}
}
fit, reasons, err := predicates.GeneralPredicates(pod, nil, nodeInfo)
if err != nil {
message := fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", err)

View File

@ -255,6 +255,11 @@ func (n *NodeInfo) AllocatableResource() Resource {
return *n.allocatableResource
}
// SetAllocatableResource sets the allocatableResource information of given node.
// The pointer is stored as-is, without copying, so later changes made through
// it are visible to this NodeInfo.
// NOTE(review): AllocatableResource dereferences this pointer, so a nil
// argument looks unsafe — confirm callers never pass nil.
func (n *NodeInfo) SetAllocatableResource(allocatableResource *Resource) {
n.allocatableResource = allocatableResource
}
func (n *NodeInfo) Clone() *NodeInfo {
clone := &NodeInfo{
node: n.node,