kubelet: Migrate DRA Manager to contextual logging

Co-authored-by: Patrick Ohly <patrick.ohly@intel.com>
Author: Ed Bartosh, 2024-07-31 16:25:54 +03:00
Parent: f6c88abb2a
Commit: e1bc8defac
20 changed files with 91 additions and 80 deletions
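
The pattern applied throughout this commit is the standard contextual-logging migration: functions accept a context.Context, derive their logger from it with klog.FromContext, and pass the context on to callees instead of creating one locally with context.Background() or context.TODO(). A minimal, self-contained sketch of that pattern (illustrative only, not code from this commit; the prepare helper is hypothetical):

package main

import (
	"context"

	"k8s.io/klog/v2"
)

// prepare shows the callee side: take ctx, get the logger from it,
// and hand ctx on to anything that does I/O or logs.
func prepare(ctx context.Context, claimName string) error {
	logger := klog.FromContext(ctx)
	logger.V(5).Info("Preparing resource claim", "claim", claimName)
	return nil
}

func main() {
	// The caller attaches a logger to the context once; every callee
	// that uses klog.FromContext picks it up automatically.
	ctx := klog.NewContext(context.Background(), klog.Background())
	_ = prepare(ctx, "example-claim")
}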


@@ -151,6 +151,7 @@ linters-settings: # please keep this alphabetized
 contextual k8s.io/kubernetes/pkg/controller/.*
 contextual k8s.io/kubernetes/pkg/scheduler/.*
 contextual k8s.io/kubernetes/test/e2e/dra/.*
+contextual k8s.io/kubernetes/pkg/kubelet/cm/dra/.*
 # As long as contextual logging is alpha or beta, all WithName, WithValues,
 # NewContext calls have to go through klog. Once it is GA, we can lift


@@ -197,6 +197,7 @@ linters-settings: # please keep this alphabetized
 contextual k8s.io/kubernetes/pkg/controller/.*
 contextual k8s.io/kubernetes/pkg/scheduler/.*
 contextual k8s.io/kubernetes/test/e2e/dra/.*
+contextual k8s.io/kubernetes/pkg/kubelet/cm/dra/.*
 # As long as contextual logging is alpha or beta, all WithName, WithValues,
 # NewContext calls have to go through klog. Once it is GA, we can lift


@@ -200,6 +200,7 @@ linters-settings: # please keep this alphabetized
 contextual k8s.io/kubernetes/pkg/controller/.*
 contextual k8s.io/kubernetes/pkg/scheduler/.*
 contextual k8s.io/kubernetes/test/e2e/dra/.*
+contextual k8s.io/kubernetes/pkg/kubelet/cm/dra/.*
 # As long as contextual logging is alpha or beta, all WithName, WithValues,
 # NewContext calls have to go through klog. Once it is GA, we can lift


@@ -47,6 +47,7 @@ contextual k8s.io/kubernetes/cmd/kube-scheduler/.*
 contextual k8s.io/kubernetes/pkg/controller/.*
 contextual k8s.io/kubernetes/pkg/scheduler/.*
 contextual k8s.io/kubernetes/test/e2e/dra/.*
+contextual k8s.io/kubernetes/pkg/kubelet/cm/dra/.*
 # As long as contextual logging is alpha or beta, all WithName, WithValues,
 # NewContext calls have to go through klog. Once it is GA, we can lift
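
The four config hunks above add pkg/kubelet/cm/dra to the list of packages the logcheck linter treats as fully contextual, so direct calls to the global klog functions in that package are reported. Roughly, with hypothetical helpers for illustration:

package dra

import (
	"context"

	"k8s.io/klog/v2"
)

// logDirect is the style the linter now rejects in this package:
// a global klog call that ignores the caller's context.
func logDirect(claimName string) {
	klog.V(3).InfoS("Processing resource", "claim", claimName)
}

// logContextual is the accepted style: the logger comes from ctx.
func logContextual(ctx context.Context, claimName string) {
	logger := klog.FromContext(ctx)
	logger.V(3).Info("Processing resource", "claim", claimName)
}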


@@ -17,6 +17,7 @@ limitations under the License.
 package cm

 import (
+	"context"
 	"fmt"
 	"strconv"
 	"strings"
@@ -55,7 +56,7 @@ type ContainerManager interface {
 	// Runs the container manager's housekeeping.
 	// - Ensures that the Docker daemon is in a container.
 	// - Creates the system container where all non-containerized processes run.
-	Start(*v1.Node, ActivePodsFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService, bool) error
+	Start(context.Context, *v1.Node, ActivePodsFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService, bool) error

 	// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
 	// These cgroups include the system and Kubernetes services.
@@ -94,7 +95,7 @@ type ContainerManager interface {
 	// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
 	// extended resources required by container.
-	GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)
+	GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)

 	// UpdatePluginResources calls Allocate of device plugin handler for potential
 	// requests for device plugin resources, and returns an error if fails.
@@ -124,10 +125,10 @@ type ContainerManager interface {
 	GetNodeAllocatableAbsolute() v1.ResourceList

 	// PrepareDynamicResource prepares dynamic pod resources
-	PrepareDynamicResources(*v1.Pod) error
+	PrepareDynamicResources(context.Context, *v1.Pod) error

 	// UnprepareDynamicResources unprepares dynamic pod resources
-	UnprepareDynamicResources(*v1.Pod) error
+	UnprepareDynamicResources(context.Context, *v1.Pod) error

 	// PodMightNeedToUnprepareResources returns true if the pod with the given UID
 	// might need to unprepare resources.


@@ -553,19 +553,18 @@ func (cm *containerManagerImpl) Status() Status {
 	return cm.status
 }

-func (cm *containerManagerImpl) Start(node *v1.Node,
+func (cm *containerManagerImpl) Start(ctx context.Context, node *v1.Node,
 	activePods ActivePodsFunc,
 	sourcesReady config.SourcesReady,
 	podStatusProvider status.PodStatusProvider,
 	runtimeService internalapi.RuntimeService,
 	localStorageCapacityIsolation bool) error {
-	ctx := context.Background()
 	containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)

 	// Initialize DRA manager
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
-		err := cm.draManager.Start(dra.ActivePodsFunc(activePods), sourcesReady)
+		err := cm.draManager.Start(ctx, dra.ActivePodsFunc(activePods), sourcesReady)
 		if err != nil {
 			return fmt.Errorf("start dra manager error: %w", err)
 		}
@@ -655,13 +654,15 @@ func (cm *containerManagerImpl) GetPluginRegistrationHandler() cache.PluginHandl
 }

 // TODO: move the GetResources logic to PodContainerManager.
-func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
+func (cm *containerManagerImpl) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
+	logger := klog.FromContext(ctx)
 	opts := &kubecontainer.RunContainerOptions{}
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
 		resOpts, err := cm.draManager.GetResources(pod, container)
 		if err != nil {
 			return nil, err
 		}
+		logger.V(5).Info("Determined CDI devices for pod", "pod", klog.KObj(pod), "cdiDevices", resOpts.CDIDevices)
 		opts.CDIDevices = append(opts.CDIDevices, resOpts.CDIDevices...)
 	}
 	// Allocate should already be called during predicateAdmitHandler.Admit(),
@@ -1017,12 +1018,12 @@ func containerMemoryFromBlock(blocks []memorymanagerstate.Block) []*podresources
 	return containerMemories
 }

-func (cm *containerManagerImpl) PrepareDynamicResources(pod *v1.Pod) error {
-	return cm.draManager.PrepareResources(pod)
+func (cm *containerManagerImpl) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
+	return cm.draManager.PrepareResources(ctx, pod)
 }

-func (cm *containerManagerImpl) UnprepareDynamicResources(pod *v1.Pod) error {
-	return cm.draManager.UnprepareResources(pod)
+func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
+	return cm.draManager.UnprepareResources(ctx, pod)
 }

 func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {


@@ -17,6 +17,7 @@ limitations under the License.
 package cm

 import (
+	"context"
 	"fmt"

 	v1 "k8s.io/api/core/v1"
@@ -45,7 +46,7 @@ type containerManagerStub struct {

 var _ ContainerManager = &containerManagerStub{}

-func (cm *containerManagerStub) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
+func (cm *containerManagerStub) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
 	klog.V(2).InfoS("Starting stub container manager")
 	return nil
 }
@@ -110,7 +111,7 @@ func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
 	return &podContainerManagerStub{}
 }

-func (cm *containerManagerStub) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
+func (cm *containerManagerStub) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
 	return &kubecontainer.RunContainerOptions{}, nil
 }
@@ -170,11 +171,11 @@ func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList {
 	return nil
 }

-func (cm *containerManagerStub) PrepareDynamicResources(pod *v1.Pod) error {
+func (cm *containerManagerStub) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }

-func (cm *containerManagerStub) UnprepareDynamicResources(*v1.Pod) error {
+func (cm *containerManagerStub) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }


@@ -20,6 +20,7 @@ limitations under the License.
 package cm

 import (
+	"context"
 	"fmt"

 	"k8s.io/mount-utils"
@@ -39,7 +40,7 @@ type unsupportedContainerManager struct {

 var _ ContainerManager = &unsupportedContainerManager{}

-func (unsupportedContainerManager) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
+func (unsupportedContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
 	return fmt.Errorf("Container Manager is unsupported in this build")
 }


@@ -70,7 +70,7 @@ func (ra *noopWindowsResourceAllocator) Admit(attrs *lifecycle.PodAdmitAttribute
 	return admission.GetPodAdmitResult(nil)
 }

-func (cm *containerManagerImpl) Start(node *v1.Node,
+func (cm *containerManagerImpl) Start(ctx context.Context, node *v1.Node,
 	activePods ActivePodsFunc,
 	sourcesReady config.SourcesReady,
 	podStatusProvider status.PodStatusProvider,
@@ -88,7 +88,6 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
 		}
 	}

-	ctx := context.Background()
 	containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)

 	// Starts device manager.
@@ -189,7 +188,7 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
 	return &podContainerManagerStub{}
 }

-func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
+func (cm *containerManagerImpl) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
 	opts := &kubecontainer.RunContainerOptions{}
 	// Allocate should already be called during predicateAdmitHandler.Admit(),
 	// just try to fetch device runtime information from cached state here
@@ -275,11 +274,11 @@ func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.C
 	return nil
 }

-func (cm *containerManagerImpl) PrepareDynamicResources(pod *v1.Pod) error {
+func (cm *containerManagerImpl) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }

-func (cm *containerManagerImpl) UnprepareDynamicResources(*v1.Pod) error {
+func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }


@@ -68,8 +68,6 @@ type ManagerImpl struct {

 // NewManagerImpl creates a new manager.
 func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, nodeName types.NodeName) (*ManagerImpl, error) {
-	klog.V(2).InfoS("Creating DRA manager")
-
 	claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create claimInfo cache: %+v", err)
@@ -91,15 +89,16 @@ func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, n
 }

 // Start starts the reconcile loop of the manager.
-func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
+func (m *ManagerImpl) Start(ctx context.Context, activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
 	m.activePods = activePods
 	m.sourcesReady = sourcesReady
-	go wait.Until(func() { m.reconcileLoop() }, m.reconcilePeriod, wait.NeverStop)
+	go wait.UntilWithContext(ctx, func(ctx context.Context) { m.reconcileLoop(ctx) }, m.reconcilePeriod)
 	return nil
 }

 // reconcileLoop ensures that any stale state in the manager's claimInfoCache gets periodically reconciled.
-func (m *ManagerImpl) reconcileLoop() {
+func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
+	logger := klog.FromContext(ctx)
 	// Only once all sources are ready do we attempt to reconcile.
 	// This ensures that the call to m.activePods() below will succeed with
 	// the actual active pods list.
@@ -140,8 +139,8 @@ func (m *ManagerImpl) reconcileLoop() {
 	// Loop through all inactive pods and call UnprepareResources on them.
 	for _, podClaims := range inactivePodClaims {
-		if err := m.unprepareResources(podClaims.uid, podClaims.namespace, podClaims.claimNames); err != nil {
-			klog.ErrorS(err, "Unpreparing pod resources in reconcile loop", "podUID", podClaims.uid)
+		if err := m.unprepareResources(ctx, podClaims.uid, podClaims.namespace, podClaims.claimNames); err != nil {
+			logger.Info("Unpreparing pod resources in reconcile loop failed, will retry", "podUID", podClaims.uid, "err", err)
 		}
 	}
 }
@@ -150,12 +149,13 @@ func (m *ManagerImpl) reconcileLoop() {
 // for the input container, issue NodePrepareResources rpc requests
 // for each new resource requirement, process their responses and update the cached
 // containerResources on success.
-func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
+func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
+	logger := klog.FromContext(ctx)
 	batches := make(map[string][]*drapb.Claim)
 	resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
 	for i := range pod.Spec.ResourceClaims {
 		podClaim := &pod.Spec.ResourceClaims[i]
-		klog.V(3).InfoS("Processing resource", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
+		logger.V(3).Info("Processing resource", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
 		claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
 		if err != nil {
 			return fmt.Errorf("prepare resource claim: %v", err)
@@ -163,12 +163,12 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
 		if claimName == nil {
 			// Nothing to do.
-			klog.V(5).InfoS("No need to prepare resources, no claim generated", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
+			logger.V(5).Info("No need to prepare resources, no claim generated", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
 			continue
 		}
 		// Query claim object from the API server
 		resourceClaim, err := m.kubeClient.ResourceV1alpha3().ResourceClaims(pod.Namespace).Get(
-			context.TODO(),
+			ctx,
 			*claimName,
 			metav1.GetOptions{})
 		if err != nil {
@@ -198,9 +198,9 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
 				return fmt.Errorf("claim %s: %w", klog.KObj(resourceClaim), err)
 			}
 			claimInfo = m.cache.add(ci)
-			klog.V(6).InfoS("Created new claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
+			logger.V(6).Info("Created new claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
 		} else {
-			klog.V(6).InfoS("Found existing claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
+			logger.V(6).Info("Found existing claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
 		}

 		// Add a reference to the current pod in the claim info.
@@ -216,7 +216,7 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {

 		// If this claim is already prepared, there is no need to prepare it again.
 		if claimInfo.isPrepared() {
-			klog.V(5).InfoS("Resources already prepared", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim))
+			logger.V(5).Info("Resources already prepared", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim))
 			return nil
 		}
@@ -250,7 +250,7 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
 		if err != nil {
 			return fmt.Errorf("failed to get gRPC client for driver %s: %w", driverName, err)
 		}
-		response, err := client.NodePrepareResources(context.Background(), &drapb.NodePrepareResourcesRequest{Claims: claims})
+		response, err := client.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{Claims: claims})
 		if err != nil {
 			// General error unrelated to any particular claim.
 			return fmt.Errorf("NodePrepareResources failed: %w", err)
@@ -338,7 +338,6 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
 		// was generated for the referenced claim. There are valid use
 		// cases when this might happen, so we simply skip it.
 		if claimName == nil {
-			klog.V(5).InfoS("No CDI devices, no claim generated", "pod", klog.KObj(pod), "podClaimName", podClaim.Name)
 			continue
 		}
 		for _, claim := range container.Resources.Claims {
@@ -362,8 +361,6 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
 			}
 		}
 	}
-
-	klog.V(5).InfoS("Determined CDI devices for pod", "pod", klog.KObj(pod), "cdiDevices", cdiDevices)

 	return &ContainerInfo{CDIDevices: cdiDevices}, nil
 }
@@ -371,7 +368,7 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
 // This function is idempotent and may be called multiple times against the same pod.
 // As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
 // already been successfully unprepared.
-func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
+func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
 	var claimNames []string
 	for i := range pod.Spec.ResourceClaims {
 		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
@@ -386,10 +383,11 @@ func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
 		}
 		claimNames = append(claimNames, *claimName)
 	}
-	return m.unprepareResources(pod.UID, pod.Namespace, claimNames)
+	return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
 }

-func (m *ManagerImpl) unprepareResources(podUID types.UID, namespace string, claimNames []string) error {
+func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
+	logger := klog.FromContext(ctx)
 	batches := make(map[string][]*drapb.Claim)
 	claimNamesMap := make(map[types.UID]string)
 	for _, claimName := range claimNames {
@@ -445,7 +443,7 @@ func (m *ManagerImpl) unprepareResources(podUID types.UID, namespace string, cla
 		if err != nil {
 			return fmt.Errorf("get gRPC client for DRA driver %s: %w", driverName, err)
 		}
-		response, err := client.NodeUnprepareResources(context.Background(), &drapb.NodeUnprepareResourcesRequest{Claims: claims})
+		response, err := client.NodeUnprepareResources(ctx, &drapb.NodeUnprepareResourcesRequest{Claims: claims})
 		if err != nil {
 			// General error unrelated to any particular claim.
 			return fmt.Errorf("NodeUnprepareResources failed: %w", err)
@@ -473,7 +471,7 @@ func (m *ManagerImpl) unprepareResources(podUID types.UID, namespace string, cla
 	for _, claimName := range claimNamesMap {
 		claimInfo, _ := m.cache.get(claimName, namespace)
 		m.cache.delete(claimName, namespace)
-		klog.V(6).InfoS("Deleted claim info cache entry", "claim", klog.KRef(namespace, claimName), "claimInfoEntry", claimInfo)
+		logger.V(6).Info("Deleted claim info cache entry", "claim", klog.KRef(namespace, claimName), "claimInfoEntry", claimInfo)
 	}

 	// Atomically sync the cache back to the checkpoint.
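
Besides swapping global klog calls for a context-derived logger, the manager's reconcile loop above now uses wait.UntilWithContext, so it stops when the context passed to Start is cancelled instead of running until wait.NeverStop. A standalone sketch of that shape (assumed example, not code from this commit):

package main

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog/v2"
)

// reconcile stands in for the manager's reconcileLoop: on every tick it
// derives its logger from the context it is handed.
func reconcile(ctx context.Context) {
	klog.FromContext(ctx).V(6).Info("Reconciling claim info cache")
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	// UntilWithContext calls reconcile once per period and returns as soon
	// as ctx is done, which here happens after the timeout expires.
	wait.UntilWithContext(ctx, reconcile, time.Second)
}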


@@ -41,6 +41,7 @@ import (
 	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
 	"k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
 	"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
+	"k8s.io/kubernetes/test/utils/ktesting"
 )

 const (
@@ -538,6 +539,7 @@ func TestPrepareResources(t *testing.T) {
 		},
 	} {
 		t.Run(test.description, func(t *testing.T) {
+			tCtx := ktesting.Init(t)
 			cache, err := newClaimInfoCache(t.TempDir(), draManagerStateFileName)
 			if err != nil {
 				t.Fatalf("failed to newClaimInfoCache, err:%v", err)
@@ -549,11 +551,11 @@ func TestPrepareResources(t *testing.T) {
 			}

 			if test.claim != nil {
-				if _, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims(test.pod.Namespace).Create(context.Background(), test.claim, metav1.CreateOptions{}); err != nil {
+				if _, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims(test.pod.Namespace).Create(tCtx, test.claim, metav1.CreateOptions{}); err != nil {
 					t.Fatalf("failed to create ResourceClaim %s: %+v", test.claim.Name, err)
 				}
 				defer func() {
-					require.NoError(t, fakeKubeClient.ResourceV1alpha3().ResourceClaims(test.pod.Namespace).Delete(context.Background(), test.claim.Name, metav1.DeleteOptions{}))
+					require.NoError(t, fakeKubeClient.ResourceV1alpha3().ResourceClaims(test.pod.Namespace).Delete(tCtx, test.claim.Name, metav1.DeleteOptions{}))
 				}()
 			}
@@ -579,7 +581,7 @@ func TestPrepareResources(t *testing.T) {
 				manager.cache.add(test.claimInfo)
 			}

-			err = manager.PrepareResources(test.pod)
+			err = manager.PrepareResources(tCtx, test.pod)

 			assert.Equal(t, test.expectedPrepareCalls, draServerInfo.server.prepareResourceCalls.Load())
@@ -688,6 +690,7 @@ func TestUnprepareResources(t *testing.T) {
 		},
 	} {
 		t.Run(test.description, func(t *testing.T) {
+			tCtx := ktesting.Init(t)
 			cache, err := newClaimInfoCache(t.TempDir(), draManagerStateFileName)
 			if err != nil {
 				t.Fatalf("failed to create a new instance of the claimInfoCache, err: %v", err)
@@ -720,7 +723,7 @@ func TestUnprepareResources(t *testing.T) {
 				manager.cache.add(test.claimInfo)
 			}

-			err = manager.UnprepareResources(test.pod)
+			err = manager.UnprepareResources(tCtx, test.pod)

 			assert.Equal(t, test.expectedUnprepareCalls, draServerInfo.server.unprepareResourceCalls.Load())
@@ -866,6 +869,8 @@ func TestGetContainerClaimInfos(t *testing.T) {
 // TestParallelPrepareUnprepareResources calls PrepareResources and UnprepareResources APIs in parallel
 // to detect possible data races
 func TestParallelPrepareUnprepareResources(t *testing.T) {
+	tCtx := ktesting.Init(t)
+
 	// Setup and register fake DRA driver
 	draServerInfo, err := setupFakeDRADriverGRPCServer(false, nil, nil, nil)
 	if err != nil {
@@ -934,17 +939,17 @@ func TestParallelPrepareUnprepareResources(t *testing.T) {
 			}
 			claim := genTestClaim(claimName, driverName, deviceName, string(podUID))

-			if _, err = fakeKubeClient.ResourceV1alpha3().ResourceClaims(pod.Namespace).Create(context.Background(), claim, metav1.CreateOptions{}); err != nil {
+			if _, err = fakeKubeClient.ResourceV1alpha3().ResourceClaims(pod.Namespace).Create(tCtx, claim, metav1.CreateOptions{}); err != nil {
 				t.Errorf("failed to create ResourceClaim %s: %+v", claim.Name, err)
 				return
 			}

-			if err = manager.PrepareResources(pod); err != nil {
+			if err = manager.PrepareResources(tCtx, pod); err != nil {
 				t.Errorf("pod: %s: PrepareResources failed: %+v", pod.Name, err)
 				return
 			}

-			if err = manager.UnprepareResources(pod); err != nil {
+			if err = manager.UnprepareResources(tCtx, pod); err != nil {
 				t.Errorf("pod: %s: UnprepareResources failed: %+v", pod.Name, err)
 				return
 			}
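
The test changes above replace context.Background()/context.TODO() with a context from ktesting.Init, which is cancelled when the test finishes and carries a per-test logger. A rough sketch of the wiring (assumed example, not part of this commit):

package dra

import (
	"testing"

	"k8s.io/klog/v2"
	"k8s.io/kubernetes/test/utils/ktesting"
)

// TestContextualLoggingExample shows the pattern the updated tests follow:
// pass tCtx wherever a context.Context is expected, and code under test
// that calls klog.FromContext logs through the test's own output.
func TestContextualLoggingExample(t *testing.T) {
	tCtx := ktesting.Init(t)

	logger := klog.FromContext(tCtx)
	logger.Info("running with a test-scoped context")
}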


@@ -97,7 +97,7 @@ func setupFakeGRPCServer(version string) (string, tearDown, error) {
 }

 func TestGRPCConnIsReused(t *testing.T) {
-	ctx := ktesting.Init(t)
+	tCtx := ktesting.Init(t)

 	addr, teardown, err := setupFakeGRPCServer(v1alpha4Version)
 	if err != nil {
 		t.Fatal(err)
@@ -109,7 +109,7 @@ func TestGRPCConnIsReused(t *testing.T) {
 	m := sync.Mutex{}

 	p := &Plugin{
-		backgroundCtx: ctx,
+		backgroundCtx: tCtx,
 		endpoint: addr,
 		clientCallTimeout: defaultClientCallTimeout,
 	}
@@ -149,7 +149,8 @@ func TestGRPCConnIsReused(t *testing.T) {
 			},
 		},
 	}
-	_, err = client.NodePrepareResources(context.TODO(), req)
+
+	_, err = client.NodePrepareResources(tCtx, req)
 	assert.NoError(t, err)

 	client.mutex.Lock()
@@ -237,7 +238,7 @@ func TestNodeUnprepareResources(t *testing.T) {
 		},
 	} {
 		t.Run(test.description, func(t *testing.T) {
-			ctx := ktesting.Init(t)
+			tCtx := ktesting.Init(t)

 			addr, teardown, err := setupFakeGRPCServer(test.serverVersion)
 			if err != nil {
 				t.Fatal(err)
@@ -245,7 +246,7 @@ func TestNodeUnprepareResources(t *testing.T) {
 			defer teardown()

 			p := &Plugin{
-				backgroundCtx: ctx,
+				backgroundCtx: tCtx,
 				endpoint: addr,
 				clientCallTimeout: defaultClientCallTimeout,
 			}
@@ -269,7 +270,7 @@ func TestNodeUnprepareResources(t *testing.T) {
 				t.Fatal(err)
 			}

-			_, err = client.NodeUnprepareResources(context.TODO(), test.request)
+			_, err = client.NodeUnprepareResources(tCtx, test.request)
 			if err != nil {
 				t.Fatal(err)
 			}


@@ -20,7 +20,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"strings"
 	"time"

 	v1 "k8s.io/api/core/v1"
@@ -226,8 +225,6 @@ func (h *RegistrationHandler) DeRegisterPlugin(pluginName string) {

 // ValidatePlugin is called by kubelet's plugin watcher upon detection
 // of a new registration socket opened by DRA plugin.
 func (h *RegistrationHandler) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
-	klog.InfoS("Validate DRA plugin", "name", pluginName, "endpoint", endpoint, "versions", strings.Join(versions, ","))
-
 	_, err := h.validateVersions(pluginName, versions)
 	if err != nil {
 		return fmt.Errorf("invalid versions of plugin %s: %w", pluginName, err)


@@ -17,6 +17,8 @@ limitations under the License.
 package dra

 import (
+	"context"
+
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/kubernetes/pkg/kubelet/config"
@@ -27,14 +29,14 @@ import (
 type Manager interface {
 	// Start starts the reconcile loop of the manager.
 	// This will ensure that all claims are unprepared even if pods get deleted unexpectedly.
-	Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error
+	Start(ctx context.Context, activePods ActivePodsFunc, sourcesReady config.SourcesReady) error

 	// PrepareResources prepares resources for a pod.
 	// It communicates with the DRA resource plugin to prepare resources.
-	PrepareResources(pod *v1.Pod) error
+	PrepareResources(ctx context.Context, pod *v1.Pod) error

 	// UnprepareResources calls NodeUnprepareResource GRPC from DRA plugin to unprepare pod resources
-	UnprepareResources(pod *v1.Pod) error
+	UnprepareResources(ctx context.Context, pod *v1.Pod) error

 	// GetResources gets a ContainerInfo object from the claimInfo cache.
 	// This information is used by the caller to update a container config.


@@ -17,6 +17,7 @@ limitations under the License.
 package cm

 import (
+	"context"
 	"sync"

 	v1 "k8s.io/api/core/v1"
@@ -52,7 +53,7 @@ func NewFakeContainerManager() *FakeContainerManager {
 	}
 }

-func (cm *FakeContainerManager) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
+func (cm *FakeContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
 	cm.Lock()
 	defer cm.Unlock()
 	cm.CalledFunctions = append(cm.CalledFunctions, "Start")
@@ -144,7 +145,7 @@ func (cm *FakeContainerManager) NewPodContainerManager() PodContainerManager {
 	return cm.PodContainerManager
 }

-func (cm *FakeContainerManager) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
+func (cm *FakeContainerManager) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
 	cm.Lock()
 	defer cm.Unlock()
 	cm.CalledFunctions = append(cm.CalledFunctions, "GetResources")
@@ -243,11 +244,11 @@ func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList {
 	return nil
 }

-func (cm *FakeContainerManager) PrepareDynamicResources(pod *v1.Pod) error {
+func (cm *FakeContainerManager) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }

-func (cm *FakeContainerManager) UnprepareDynamicResources(*v1.Pod) error {
+func (cm *FakeContainerManager) UnprepareDynamicResources(context.Context, *v1.Pod) error {
 	return nil
 }


@@ -62,10 +62,10 @@ type RuntimeHelper interface {
 	GetOrCreateUserNamespaceMappings(pod *v1.Pod, runtimeHandler string) (*runtimeapi.UserNamespace, error)

 	// PrepareDynamicResources prepares resources for a pod.
-	PrepareDynamicResources(pod *v1.Pod) error
+	PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error

 	// UnprepareDynamicResources unprepares resources for a a pod.
-	UnprepareDynamicResources(pod *v1.Pod) error
+	UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error
 }
// ShouldContainerBeRestarted checks whether a container needs to be restarted. // ShouldContainerBeRestarted checks whether a container needs to be restarted.


@@ -107,10 +107,10 @@ func (f *FakeRuntimeHelper) GetOrCreateUserNamespaceMappings(pod *v1.Pod, runtim
 	}, nil
 }

-func (f *FakeRuntimeHelper) PrepareDynamicResources(pod *v1.Pod) error {
+func (f *FakeRuntimeHelper) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }

-func (f *FakeRuntimeHelper) UnprepareDynamicResources(pod *v1.Pod) error {
+func (f *FakeRuntimeHelper) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
 	return nil
 }


@@ -1561,7 +1561,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		os.Exit(1)
 	}
 	// containerManager must start after cAdvisor because it needs filesystem capacity information
-	if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil {
+	if err := kl.containerManager.Start(context.TODO(), node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil {
 		// Fail kubelet and rely on the babysitter to retry starting kubelet.
 		klog.ErrorS(err, "Failed to start ContainerManager")
 		os.Exit(1)
@@ -2075,7 +2075,7 @@ func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus
 	// and BEFORE the pod status is changed on the API server
 	// to avoid race conditions with the resource deallocation code in kubernetes core.
 	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
-		if err := kl.UnprepareDynamicResources(pod); err != nil {
+		if err := kl.UnprepareDynamicResources(ctx, pod); err != nil {
 			return err
 		}
 	}
@@ -3057,14 +3057,14 @@ func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {

 // PrepareDynamicResources calls the container Manager PrepareDynamicResources API
 // This method implements the RuntimeHelper interface
-func (kl *Kubelet) PrepareDynamicResources(pod *v1.Pod) error {
-	return kl.containerManager.PrepareDynamicResources(pod)
+func (kl *Kubelet) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
+	return kl.containerManager.PrepareDynamicResources(ctx, pod)
 }

 // UnprepareDynamicResources calls the container Manager UnprepareDynamicResources API
 // This method implements the RuntimeHelper interface
-func (kl *Kubelet) UnprepareDynamicResources(pod *v1.Pod) error {
-	return kl.containerManager.UnprepareDynamicResources(pod)
+func (kl *Kubelet) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
+	return kl.containerManager.UnprepareDynamicResources(ctx, pod)
 }

 func (kl *Kubelet) warnCgroupV1Usage() {


@@ -591,7 +591,7 @@ func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
 func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, container *v1.Container, podIP string, podIPs []string, imageVolumes kubecontainer.ImageVolumes) (*kubecontainer.RunContainerOptions, func(), error) {
 	supportsRRO := kl.runtimeClassSupportsRecursiveReadOnlyMounts(pod)

-	opts, err := kl.containerManager.GetResources(pod, container)
+	opts, err := kl.containerManager.GetResources(ctx, pod, container)
 	if err != nil {
 		return nil, nil, err
 	}


@@ -1138,7 +1138,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
 	// Prepare resources allocated by the Dynammic Resource Allocation feature for the pod
 	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
-		if err := m.runtimeHelper.PrepareDynamicResources(pod); err != nil {
+		if err := m.runtimeHelper.PrepareDynamicResources(ctx, pod); err != nil {
 			ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
 			if referr != nil {
 				klog.ErrorS(referr, "Couldn't make a ref to pod", "pod", klog.KObj(pod))