From 9d01f0bf52891d777078daafcd146b6eea1c98eb Mon Sep 17 00:00:00 2001
From: Jiawei Wang
Date: Wed, 26 Jan 2022 09:55:09 +0000
Subject: [PATCH] Count inline volumes for NodeVolumeLimits when CSI migration
 is enabled

Previously, when kube-scheduler scheduled a pod, it did not take in-tree
inline volumes into account when CSI migration was enabled. This could lead
to failures where a pod was scheduled to a node but volume attachment failed.
---
 .../framework/plugins/nodevolumelimits/csi.go | 56 ++++++++++++++-
 .../plugins/nodevolumelimits/csi_test.go      | 71 ++++++++++++++++++-
 2 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go b/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
index 55e53ed6eb6..8c33826d488 100644
--- a/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
+++ b/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
@@ -40,10 +40,12 @@ import (
 // and perform translations from InTree PV's to CSI
 type InTreeToCSITranslator interface {
 	IsPVMigratable(pv *v1.PersistentVolume) bool
+	IsInlineMigratable(vol *v1.Volume) bool
 	IsMigratableIntreePluginByName(inTreePluginName string) bool
 	GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
 	GetCSINameFromInTreeName(pluginName string) (string, error)
 	TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
+	TranslateInTreeInlineVolumeToCSI(volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error)
 }
 
 // CSILimits is a plugin that checks node volume limits.
@@ -136,6 +138,9 @@ func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v
 		maxVolumeLimit, ok := nodeVolumeLimits[v1.ResourceName(volumeLimitKey)]
 		if ok {
 			currentVolumeCount := attachedVolumeCount[volumeLimitKey]
+			klog.V(5).InfoS("Found plugin volume limits", "node", node.Name, "volumeLimitKey", volumeLimitKey,
+				"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
+				"pod", klog.KObj(pod))
 			if currentVolumeCount+count > int(maxVolumeLimit) {
 				return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
 			}
@@ -148,11 +153,11 @@ func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v
 func (pl *CSILimits) filterAttachableVolumes(
 	pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
 	for _, vol := range pod.Spec.Volumes {
-		// CSI volumes can only be used through a PVC.
 		pvcName := ""
 		isEphemeral := false
 		switch {
 		case vol.PersistentVolumeClaim != nil:
+			// Normal CSI volumes can only be used through a PVC.
 			pvcName = vol.PersistentVolumeClaim.ClaimName
 		case vol.Ephemeral != nil:
 			// Generic ephemeral inline volumes also use a PVC,
@@ -162,6 +167,15 @@ func (pl *CSILimits) filterAttachableVolumes(
 			pvcName = ephemeral.VolumeClaimName(pod, &vol)
 			isEphemeral = true
 		default:
+			// Inline volumes do not have a PVC.
+			// Need to check if CSI migration is enabled for this inline volume.
+			// - If the volume is migratable and CSI migration is enabled, it needs
+			//   to be counted as well.
+			// - If the volume is not migratable, it will be counted in the non-CSI filter.
+			if err := pl.checkAttachableInlineVolume(vol, csiNode, pod, result); err != nil {
+				return err
+			}
+
 			continue
 		}
 
@@ -204,6 +218,43 @@ func (pl *CSILimits) filterAttachableVolumes(
 	return nil
 }
 
+// checkAttachableInlineVolume takes an inline volume and adds it to the result map if the
+// volume is migratable and CSI migration for this plugin has been enabled.
+func (pl *CSILimits) checkAttachableInlineVolume(vol v1.Volume, csiNode *storagev1.CSINode,
+	pod *v1.Pod, result map[string]string) error {
+	if !pl.translator.IsInlineMigratable(&vol) {
+		return nil
+	}
+	// Check whether CSI migration has been enabled for the in-tree provisioner.
+	inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, &vol)
+	if err != nil {
+		return fmt.Errorf("looking up provisioner name for volume %v: %w", vol, err)
+	}
+	if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
+		klog.V(5).InfoS("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
+			"pod", klog.KObj(pod), "csiNode", csiNode.Name)
+		return nil
+	}
+	// Translate the in-tree volume to CSI.
+	translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(&vol, pod.Namespace)
+	if err != nil || translatedPV == nil {
+		return fmt.Errorf("converting volume(%v) from inline to csi: %w", vol, err)
+	}
+	driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
+	if err != nil {
+		return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
+	}
+	// TranslateInTreeInlineVolumeToCSI should translate the inline volume to CSI. If the CSI
+	// source is not set on the translated PV, the volume does not support inline; skip the count.
+	if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
+		return nil
+	}
+	volumeUniqueName := fmt.Sprintf("%s/%s", driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
+	volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
+	result[volumeUniqueName] = volumeLimitKey
+	return nil
+}
+
 // getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
 // If the PVC is from a migrated in-tree plugin, this function will return
 // the information of the CSI driver that the plugin has been migrated to.
@@ -308,6 +359,7 @@ func NewCSI(_ runtime.Object, handle framework.Handle, fts feature.Features) (fr
 	pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
 	csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister()
 	scLister := informerFactory.Storage().V1().StorageClasses().Lister()
+	csiTranslator := csitrans.New()
 
 	return &CSILimits{
 		csiNodeLister:        csiNodesLister,
@@ -315,7 +367,7 @@ func NewCSI(_ runtime.Object, handle framework.Handle, fts feature.Features) (fr
 		pvcLister:            pvcLister,
 		scLister:             scLister,
 		randomVolumeIDPrefix: rand.String(32),
-		translator:           csitrans.New(),
+		translator:           csiTranslator,
 	}, nil
 }
 
diff --git a/pkg/scheduler/framework/plugins/nodevolumelimits/csi_test.go b/pkg/scheduler/framework/plugins/nodevolumelimits/csi_test.go
index 0b68239c4cf..ea46d881a23 100644
--- a/pkg/scheduler/framework/plugins/nodevolumelimits/csi_test.go
+++ b/pkg/scheduler/framework/plugins/nodevolumelimits/csi_test.go
@@ -335,6 +335,32 @@ func TestCSILimits(t *testing.T) {
 			StorageClassName: &scName,
 		},
 	}
+	inTreeInlineVolPod := &v1.Pod{
+		Spec: v1.PodSpec{
+			Volumes: []v1.Volume{
+				{
+					VolumeSource: v1.VolumeSource{
+						AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
+							VolumeID: "aws-inline1",
+						},
+					},
+				},
+			},
+		},
+	}
+	inTreeInlineVolPodWithSameCSIVolumeID := &v1.Pod{
+		Spec: v1.PodSpec{
+			Volumes: []v1.Volume{
+				{
+					VolumeSource: v1.VolumeSource{
+						AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
+							VolumeID: "csi-ebs.csi.aws.com-1",
+						},
+					},
+				},
+			},
+		},
+	}
 
 	tests := []struct {
 		newPod *v1.Pod
@@ -502,6 +528,17 @@ func TestCSILimits(t *testing.T) {
 			limitSource:      "csinode",
 			test:             "should not count non-migratable in-tree volumes",
 		},
+		{
+			newPod:           inTreeInlineVolPod,
+			existingPods:     []*v1.Pod{inTreeTwoVolPod},
+			filterName:       "csi",
+			maxVols:          2,
+			driverNames:      []string{csilibplugins.AWSEBSInTreePluginName, ebsCSIDriverName},
+			migrationEnabled: true,
+			limitSource:      "csinode",
+			test:             "should count in-tree inline volumes if migration is enabled",
+			wantStatus:       framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded),
+		},
 		// mixed volumes
 		{
 			newPod:           inTreeOneVolPod,
@@ -514,6 +551,27 @@ func TestCSILimits(t *testing.T) {
 			test:             "should count in-tree and csi volumes if migration is enabled (when scheduling in-tree volumes)",
 			wantStatus:       framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded),
 		},
+		{
+			newPod:           inTreeInlineVolPod,
+			existingPods:     []*v1.Pod{csiEBSTwoVolPod, inTreeOneVolPod},
+			filterName:       "csi",
+			maxVols:          3,
+			driverNames:      []string{csilibplugins.AWSEBSInTreePluginName, ebsCSIDriverName},
+			migrationEnabled: true,
+			limitSource:      "csinode",
+			test:             "should count in-tree, inline and csi volumes if migration is enabled (when scheduling in-tree volumes)",
+			wantStatus:       framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded),
+		},
+		{
+			newPod:           inTreeInlineVolPodWithSameCSIVolumeID,
+			existingPods:     []*v1.Pod{csiEBSTwoVolPod, inTreeOneVolPod},
+			filterName:       "csi",
+			maxVols:          3,
+			driverNames:      []string{csilibplugins.AWSEBSInTreePluginName, ebsCSIDriverName},
+			migrationEnabled: true,
+			limitSource:      "csinode",
+			test:             "should not double-count an inline volume that translates to the same CSI volume ID as an existing csi volume (when migration is enabled)",
+		},
 		{
 			newPod:           csiEBSOneVolPod,
 			existingPods:     []*v1.Pod{inTreeTwoVolPod},
@@ -535,6 +593,16 @@ func TestCSILimits(t *testing.T) {
 			limitSource:      "csinode",
 			test:             "should not count in-tree and count csi volumes if migration is disabled (when scheduling csi volumes)",
 		},
+		{
+			newPod:           csiEBSOneVolPod,
+			existingPods:     []*v1.Pod{csiEBSTwoVolPod, inTreeTwoVolPod, inTreeInlineVolPod},
+			filterName:       "csi",
+			maxVols:          3,
+			driverNames:      []string{csilibplugins.AWSEBSInTreePluginName, ebsCSIDriverName},
+			migrationEnabled: false,
+			limitSource:      "csinode",
+			test:             "should not count in-tree or inline volumes; should count csi volumes if migration is disabled (when scheduling csi volumes)",
+		},
 		{
 			newPod:           inTreeOneVolPod,
 			existingPods:     []*v1.Pod{csiEBSTwoVolPod},
@@ -643,13 +711,14 @@ func TestCSILimits(t *testing.T) {
 				defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.CSIMigration, false)()
 				defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.CSIMigrationAWS, false)()
 			}
+			csiTranslator := csitrans.New()
 			p := &CSILimits{
 				csiNodeLister:        getFakeCSINodeLister(csiNode),
 				pvLister:             getFakeCSIPVLister(test.filterName, test.driverNames...),
 				pvcLister:            append(getFakeCSIPVCLister(test.filterName, scName, test.driverNames...), test.extraClaims...),
 				scLister:             getFakeCSIStorageClassLister(scName, test.driverNames[0]),
 				randomVolumeIDPrefix: rand.String(32),
-				translator:           csitrans.New(),
+				translator:           csiTranslator,
 			}
 			gotStatus := p.Filter(context.Background(), nil, test.newPod, node)
 			if !reflect.DeepEqual(gotStatus, test.wantStatus) {