mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-24 12:15:52 +00:00
use active pods instead of runtime pods in gpu manager
Signed-off-by: Vishnu kannan <vishnuk@google.com>
This commit is contained in:
parent
8ed9bff073
commit
ff158090b3
@ -48,7 +48,7 @@ const (
|
||||
|
||||
type activePodsLister interface {
|
||||
// Returns a list of active pods on the node.
|
||||
GetRunningPods() ([]*v1.Pod, error)
|
||||
GetActivePods() []*v1.Pod
|
||||
}
|
||||
|
||||
// nvidiaGPUManager manages nvidia gpu devices.
|
||||
@ -148,9 +148,7 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
|
||||
ngm.allocated = allocated
|
||||
} else {
|
||||
// update internal list of GPUs in use prior to allocating new GPUs.
|
||||
if err := ngm.updateAllocatedGPUs(); err != nil {
|
||||
return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
|
||||
}
|
||||
ngm.updateAllocatedGPUs()
|
||||
}
|
||||
// Check if GPUs have already been allocated. If so return them right away.
|
||||
// This can happen if a container restarts for example.
|
||||
@ -179,13 +177,10 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
|
||||
}
|
||||
|
||||
// updateAllocatedGPUs updates the list of GPUs in use.
|
||||
// It gets a list of running pods and then frees any GPUs that are bound to terminated pods.
|
||||
// It gets a list of active pods and then frees any GPUs that are bound to terminated pods.
|
||||
// Returns error on failure.
|
||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
|
||||
activePods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to list active pods: %v", err)
|
||||
}
|
||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
|
||||
activePods := ngm.activePodsLister.GetActivePods()
|
||||
activePodUids := sets.NewString()
|
||||
for _, pod := range activePods {
|
||||
activePodUids.Insert(string(pod.UID))
|
||||
@ -194,7 +189,6 @@ func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
|
||||
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
|
||||
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
|
||||
ngm.allocated.delete(podsToBeRemoved.List())
|
||||
return nil
|
||||
}
|
||||
|
||||
// discoverGPUs identifies all NVIDIA GPU devices available on the local node by walking the `/dev` directory.
|
||||
@ -224,10 +218,7 @@ func (ngm *nvidiaGPUManager) discoverGPUs() error {
|
||||
|
||||
// gpusInUse returns a list of GPUs in use along with the respective pods that are using them.
|
||||
func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
|
||||
pods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pods := ngm.activePodsLister.GetActivePods()
|
||||
type containerIdentifier struct {
|
||||
id string
|
||||
name string
|
||||
|
@ -32,8 +32,8 @@ type testActivePodsLister struct {
|
||||
activePods []*v1.Pod
|
||||
}
|
||||
|
||||
func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
|
||||
return tapl.activePods, nil
|
||||
func (tapl *testActivePodsLister) GetActivePods() []*v1.Pod {
|
||||
return tapl.activePods
|
||||
}
|
||||
|
||||
func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {
|
||||
|
@ -792,7 +792,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
||||
klet.AddPodSyncLoopHandler(activeDeadlineHandler)
|
||||
klet.AddPodSyncHandler(activeDeadlineHandler)
|
||||
|
||||
criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.getActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
|
||||
criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
|
||||
klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler))
|
||||
// apply functional Options
|
||||
for _, opt := range kubeDeps.Options {
|
||||
@ -1204,7 +1204,7 @@ func (kl *Kubelet) initializeModules() error {
|
||||
return fmt.Errorf("Kubelet failed to get node info: %v", err)
|
||||
}
|
||||
|
||||
if err := kl.containerManager.Start(node, kl.getActivePods); err != nil {
|
||||
if err := kl.containerManager.Start(node, kl.GetActivePods); err != nil {
|
||||
return fmt.Errorf("Failed to start ContainerManager %v", err)
|
||||
}
|
||||
|
||||
@ -1230,7 +1230,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
|
||||
glog.Fatalf("Failed to start cAdvisor %v", err)
|
||||
}
|
||||
// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
|
||||
kl.evictionManager.Start(kl, kl.getActivePods, kl, evictionMonitoringPeriod)
|
||||
kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
|
||||
}
|
||||
|
||||
// Run starts the kubelet reacting to config updates
|
||||
|
@ -76,8 +76,8 @@ func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
|
||||
return pods, nil
|
||||
}
|
||||
|
||||
// getActivePods returns non-terminal pods
|
||||
func (kl *Kubelet) getActivePods() []*v1.Pod {
|
||||
// GetActivePods returns non-terminal pods
|
||||
func (kl *Kubelet) GetActivePods() []*v1.Pod {
|
||||
allPods := kl.podManager.GetPods()
|
||||
activePods := kl.filterOutTerminatedPods(allPods)
|
||||
return activePods
|
||||
|
Loading…
Reference in New Issue
Block a user