Merge pull request #86601 from angao/no-disk-conflict

move NoDiskConflict predicate to its filter plugin
Authored by Kubernetes Prow Robot on 2019-12-25 15:27:38 -08:00; committed by GitHub
commit c84b1a8a10
5 changed files with 88 additions and 324 deletions
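The change retires the standalone NoDiskConflict fit predicate and moves its check into the VolumeRestrictions plugin's Filter extension point. For orientation, the sketch below (not part of the diff) shows the shape of that contract, using only names that appear in the plugin file at the end of this commit; the helper function, its package, and its exact wiring are illustrative assumptions.

package example // illustrative sketch only, not part of the commit

import (
	"context"

	v1 "k8s.io/api/core/v1"
	framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
	"k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)

// runFilter shows how a framework filter plugin is driven per candidate node:
// a nil or Success status means the pod fits; an Unschedulable status carries
// the rejection reason (ErrDiskConflict for VolumeRestrictions).
func runFilter(pl framework.FilterPlugin, pod *v1.Pod, ni *nodeinfo.NodeInfo) bool {
	status := pl.Filter(context.Background(), nil, pod, ni)
	return status.IsSuccess()
}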


@@ -147,73 +147,6 @@ func Ordering() []string {
// The failure information is given by the error.
type FitPredicate func(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error)
func isVolumeConflict(volume v1.Volume, pod *v1.Pod) bool {
// Fast path if there are no conflict-checking targets.
if volume.GCEPersistentDisk == nil && volume.AWSElasticBlockStore == nil && volume.RBD == nil && volume.ISCSI == nil {
return false
}
for _, existingVolume := range pod.Spec.Volumes {
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
return true
}
}
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
return true
}
}
if volume.ISCSI != nil && existingVolume.ISCSI != nil {
iqn := volume.ISCSI.IQN
eiqn := existingVolume.ISCSI.IQN
// Two iSCSI volumes are the same if they share the same IQN. As iSCSI volumes are of type
// RWO or ROX, only one read-write mount can be permitted. The same iSCSI volume mounted by
// multiple pods conflicts unless all pods mount it read-only.
if iqn == eiqn && !(volume.ISCSI.ReadOnly && existingVolume.ISCSI.ReadOnly) {
return true
}
}
if volume.RBD != nil && existingVolume.RBD != nil {
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
// Two RBD images are the same if they share at least one Ceph monitor, are in the same RADOS pool, and have the same image name.
// Only one read-write mount is permitted for the same RBD image;
// the same RBD image mounted by multiple pods conflicts unless all pods mount the image read-only.
if haveOverlap(mon, emon) && pool == epool && image == eimage && !(volume.RBD.ReadOnly && existingVolume.RBD.ReadOnly) {
return true
}
}
}
return false
}
// NoDiskConflict evaluates whether a pod can fit based on the volumes it requests and those
// already mounted on the node. If a volume already mounted on the node conflicts with one the
// pod requests, the pod can't be scheduled there.
// This is GCE PD, Amazon EBS, Ceph RBD and iSCSI specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods sharing at least one monitor with matching pool and image, unless both mount the image read-only
// - iSCSI forbids two pods sharing the same IQN, unless both mount the volume read-only
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
for _, v := range pod.Spec.Volumes {
for _, ev := range nodeInfo.Pods() {
if isVolumeConflict(v, ev) {
return false, []PredicateFailureReason{ErrDiskConflict}, nil
}
}
}
return true, nil, nil
}
// MaxPDVolumeCountChecker contains information to check the max number of volumes for a predicate.
type MaxPDVolumeCountChecker struct {
filter VolumeFilter
@@ -963,25 +896,6 @@ func PodFitsHostPortsPredicate(pod *v1.Pod, meta []*v1.ContainerPort, nodeInfo *
return true, nil, nil
}
// haveOverlap searches two arrays and returns true if they have at least one common element; returns false otherwise.
func haveOverlap(a1, a2 []string) bool {
if len(a1) > len(a2) {
a1, a2 = a2, a1
}
m := map[string]bool{}
for _, val := range a1 {
m[val] = true
}
for _, val := range a2 {
if _, ok := m[val]; ok {
return true
}
}
return false
}
// GeneralPredicates checks a group of predicates that the kubelet cares about.
// DEPRECATED: this exists only because kubelet uses it. We should change kubelet to execute the individual predicates it requires.
func GeneralPredicates(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
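For reference, here is a minimal sketch (not part of the diff) of the predicate removed above, exercised the way the deleted tests below do; the import paths are the ones used elsewhere in this commit, and the expected results follow the isVolumeConflict rules in this hunk.

package example // illustrative sketch only

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)

// gcePD builds a pod volume backed by the named GCE persistent disk.
func gcePD(name string, readOnly bool) v1.Volume {
	return v1.Volume{VolumeSource: v1.VolumeSource{
		GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{PDName: name, ReadOnly: readOnly},
	}}
}

func demoNoDiskConflict() (bool, bool) {
	readWrite := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{gcePD("pd-1", false)}}}
	readOnly := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{gcePD("pd-1", true)}}}

	// The same PD is already mounted read-write on the node: the pod does not fit
	// and the failure reason is ErrDiskConflict.
	fitsRW, _, _ := predicates.NoDiskConflict(readWrite, nil, schedulernodeinfo.NewNodeInfo(readWrite))

	// Every mount of the PD is read-only: no conflict, the pod fits.
	fitsRO, _, _ := predicates.NoDiskConflict(readOnly, nil, schedulernodeinfo.NewNodeInfo(readOnly))

	return fitsRW, fitsRO // false, true
}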


@@ -721,238 +721,6 @@ func TestPodFitsHostPorts(t *testing.T) {
}
}
func TestGCEDiskConflicts(t *testing.T) {
volState := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
PDName: "foo",
},
},
},
},
}
volState2 := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
PDName: "bar",
},
},
},
},
}
tests := []struct {
pod *v1.Pod
nodeInfo *schedulernodeinfo.NodeInfo
isOk bool
name string
}{
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(), true, "nothing"},
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "one state"},
{&v1.Pod{Spec: volState}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), false, "same state"},
{&v1.Pod{Spec: volState2}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "different state"},
}
expectedFailureReasons := []PredicateFailureReason{ErrDiskConflict}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ok, reasons, err := NoDiskConflict(test.pod, nil, test.nodeInfo)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
if !ok && !reflect.DeepEqual(reasons, expectedFailureReasons) {
t.Errorf("unexpected failure reasons: %v, want: %v", reasons, expectedFailureReasons)
}
if test.isOk && !ok {
t.Errorf("expected ok, got none. %v %s", test.pod, test.nodeInfo)
}
if !test.isOk && ok {
t.Errorf("expected no ok, got one. %v %s", test.pod, test.nodeInfo)
}
})
}
}
func TestAWSDiskConflicts(t *testing.T) {
volState := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
VolumeID: "foo",
},
},
},
},
}
volState2 := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
VolumeID: "bar",
},
},
},
},
}
tests := []struct {
pod *v1.Pod
nodeInfo *schedulernodeinfo.NodeInfo
isOk bool
name string
}{
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(), true, "nothing"},
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "one state"},
{&v1.Pod{Spec: volState}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), false, "same state"},
{&v1.Pod{Spec: volState2}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "different state"},
}
expectedFailureReasons := []PredicateFailureReason{ErrDiskConflict}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ok, reasons, err := NoDiskConflict(test.pod, nil, test.nodeInfo)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
if !ok && !reflect.DeepEqual(reasons, expectedFailureReasons) {
t.Errorf("unexpected failure reasons: %v, want: %v", reasons, expectedFailureReasons)
}
if test.isOk && !ok {
t.Errorf("expected ok, got none. %v %s", test.pod, test.nodeInfo)
}
if !test.isOk && ok {
t.Errorf("expected no ok, got one. %v %s", test.pod, test.nodeInfo)
}
})
}
}
func TestRBDDiskConflicts(t *testing.T) {
volState := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
RBD: &v1.RBDVolumeSource{
CephMonitors: []string{"a", "b"},
RBDPool: "foo",
RBDImage: "bar",
FSType: "ext4",
},
},
},
},
}
volState2 := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
RBD: &v1.RBDVolumeSource{
CephMonitors: []string{"c", "d"},
RBDPool: "foo",
RBDImage: "bar",
FSType: "ext4",
},
},
},
},
}
tests := []struct {
pod *v1.Pod
nodeInfo *schedulernodeinfo.NodeInfo
isOk bool
name string
}{
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(), true, "nothing"},
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "one state"},
{&v1.Pod{Spec: volState}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), false, "same state"},
{&v1.Pod{Spec: volState2}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "different state"},
}
expectedFailureReasons := []PredicateFailureReason{ErrDiskConflict}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ok, reasons, err := NoDiskConflict(test.pod, nil, test.nodeInfo)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
if !ok && !reflect.DeepEqual(reasons, expectedFailureReasons) {
t.Errorf("unexpected failure reasons: %v, want: %v", reasons, expectedFailureReasons)
}
if test.isOk && !ok {
t.Errorf("expected ok, got none. %v %s", test.pod, test.nodeInfo)
}
if !test.isOk && ok {
t.Errorf("expected no ok, got one. %v %s", test.pod, test.nodeInfo)
}
})
}
}
func TestISCSIDiskConflicts(t *testing.T) {
volState := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
ISCSI: &v1.ISCSIVolumeSource{
TargetPortal: "127.0.0.1:3260",
IQN: "iqn.2016-12.server:storage.target01",
FSType: "ext4",
Lun: 0,
},
},
},
},
}
volState2 := v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
ISCSI: &v1.ISCSIVolumeSource{
TargetPortal: "127.0.0.1:3260",
IQN: "iqn.2017-12.server:storage.target01",
FSType: "ext4",
Lun: 0,
},
},
},
},
}
tests := []struct {
pod *v1.Pod
nodeInfo *schedulernodeinfo.NodeInfo
isOk bool
name string
}{
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(), true, "nothing"},
{&v1.Pod{}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "one state"},
{&v1.Pod{Spec: volState}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), false, "same state"},
{&v1.Pod{Spec: volState2}, schedulernodeinfo.NewNodeInfo(&v1.Pod{Spec: volState}), true, "different state"},
}
expectedFailureReasons := []PredicateFailureReason{ErrDiskConflict}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
ok, reasons, err := NoDiskConflict(test.pod, nil, test.nodeInfo)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
if !ok && !reflect.DeepEqual(reasons, expectedFailureReasons) {
t.Errorf("unexpected failure reasons: %v, want: %v", reasons, expectedFailureReasons)
}
if test.isOk && !ok {
t.Errorf("expected ok, got none. %v %s", test.pod, test.nodeInfo)
}
if !test.isOk && ok {
t.Errorf("expected no ok, got one. %v %s", test.pod, test.nodeInfo)
}
})
}
}
// TODO: Add test case for RequiredDuringSchedulingRequiredDuringExecution after it's implemented.
func TestPodFitsSelector(t *testing.T) {
tests := []struct {
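One detail worth noting in TestRBDDiskConflicts above: the "different state" fixture keeps the same pool ("foo") and image ("bar") and differs only in its monitor list, so it is accepted solely because haveOverlap finds no common monitor. The sketch below (not part of the diff) isolates that check; haveOverlap is copied verbatim from the predicate hunk earlier in this commit.

package example // illustrative sketch only

// haveOverlap is copied from the removed predicates code above: it reports
// whether the two string slices share at least one element.
func haveOverlap(a1, a2 []string) bool {
	if len(a1) > len(a2) {
		a1, a2 = a2, a1
	}
	m := map[string]bool{}
	for _, val := range a1 {
		m[val] = true
	}
	for _, val := range a2 {
		if _, ok := m[val]; ok {
			return true
		}
	}
	return false
}

// With the RBD test fixtures: disjoint monitor sets mean no conflict even though
// pool and image match; identical monitor sets would conflict because neither
// mount is read-only.
var differentState = haveOverlap([]string{"a", "b"}, []string{"c", "d"}) // false -> pod fits
var sameState = haveOverlap([]string{"a", "b"}, []string{"a", "b"})      // true  -> ErrDiskConflict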


@@ -117,7 +117,12 @@ func init() {
)
// Fit is determined by non-conflicting disk volumes.
scheduler.RegisterFitPredicate(predicates.NoDiskConflictPred, predicates.NoDiskConflict)
scheduler.RegisterFitPredicateFactory(
predicates.NoDiskConflictPred,
func(args scheduler.AlgorithmFactoryArgs) predicates.FitPredicate {
return nil
},
)
// GeneralPredicates are the predicates that are enforced by all Kubernetes components
// (e.g. kubelet and all schedulers)
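For contrast, the sketch below (illustrative only; identifiers come from the hunk above, import paths are assumed) shows what a factory that still executed the legacy predicate would look like. Returning nil instead, as the new registration does, keeps NoDiskConflictPred as a known predicate name for existing providers and policy configurations while leaving the actual check to the VolumeRestrictions Filter plugin at the end of this commit.

package example // illustrative sketch only; import paths assumed

import (
	"k8s.io/kubernetes/pkg/scheduler"
	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
)

func registerLegacyNoDiskConflict() {
	// A factory that still ran the old predicate function; the commit returns
	// nil from the factory instead, disabling the legacy code path.
	scheduler.RegisterFitPredicateFactory(
		predicates.NoDiskConflictPred,
		func(args scheduler.AlgorithmFactoryArgs) predicates.FitPredicate {
			return predicates.NoDiskConflict
		},
	)
}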


@@ -7,7 +7,6 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/scheduler/algorithm/predicates:go_default_library",
"//pkg/scheduler/framework/plugins/migration:go_default_library",
"//pkg/scheduler/framework/v1alpha1:go_default_library",
"//pkg/scheduler/nodeinfo:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",


@@ -22,7 +22,6 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/migration"
framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)
@@ -40,11 +39,90 @@ func (pl *VolumeRestrictions) Name() string {
return Name
}
func isVolumeConflict(volume v1.Volume, pod *v1.Pod) bool {
// Fast path if there are no conflict-checking targets.
if volume.GCEPersistentDisk == nil && volume.AWSElasticBlockStore == nil && volume.RBD == nil && volume.ISCSI == nil {
return false
}
for _, existingVolume := range pod.Spec.Volumes {
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
return true
}
}
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
return true
}
}
if volume.ISCSI != nil && existingVolume.ISCSI != nil {
iqn := volume.ISCSI.IQN
eiqn := existingVolume.ISCSI.IQN
// Two iSCSI volumes are the same if they share the same IQN. As iSCSI volumes are of type
// RWO or ROX, only one read-write mount can be permitted. The same iSCSI volume mounted by
// multiple pods conflicts unless all pods mount it read-only.
if iqn == eiqn && !(volume.ISCSI.ReadOnly && existingVolume.ISCSI.ReadOnly) {
return true
}
}
if volume.RBD != nil && existingVolume.RBD != nil {
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
// Two RBD images are the same if they share at least one Ceph monitor, are in the same RADOS pool, and have the same image name.
// Only one read-write mount is permitted for the same RBD image;
// the same RBD image mounted by multiple pods conflicts unless all pods mount the image read-only.
if haveOverlap(mon, emon) && pool == epool && image == eimage && !(volume.RBD.ReadOnly && existingVolume.RBD.ReadOnly) {
return true
}
}
}
return false
}
// haveOverlap searches two arrays and returns true if they have at least one common element; returns false otherwise.
func haveOverlap(a1, a2 []string) bool {
if len(a1) > len(a2) {
a1, a2 = a2, a1
}
m := map[string]bool{}
for _, val := range a1 {
m[val] = true
}
for _, val := range a2 {
if _, ok := m[val]; ok {
return true
}
}
return false
}
// Filter is invoked at the filter extension point.
// It evaluates whether a pod can fit based on the volumes it requests and those already mounted
// on the node. If a volume already mounted on the node conflicts with one the pod requests,
// the pod can't be scheduled there.
// This is GCE PD, Amazon EBS, Ceph RBD and iSCSI specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods sharing at least one monitor with matching pool and image, unless both mount the image read-only
// - iSCSI forbids two pods sharing the same IQN, unless both mount the volume read-only
func (pl *VolumeRestrictions) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *nodeinfo.NodeInfo) *framework.Status {
// metadata is not needed for NoDiskConflict
_, reasons, err := predicates.NoDiskConflict(pod, nil, nodeInfo)
return migration.PredicateResultToFrameworkStatus(reasons, err)
for _, v := range pod.Spec.Volumes {
for _, ev := range nodeInfo.Pods() {
if isVolumeConflict(v, ev) {
return framework.NewStatus(framework.Unschedulable, predicates.ErrDiskConflict.GetReason())
}
}
}
return nil
}
// New initializes a new plugin and returns it.
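A minimal usage sketch of the Filter method above, mirroring the removed GCE test case and complementing the pre-move sketch after the first hunk (not part of the diff; the package name is assumed, and the zero-value plugin is used only because Filter reads no receiver state):

package volumerestrictions // assumed package name for the plugin file above

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
	"k8s.io/kubernetes/pkg/scheduler/nodeinfo"
)

// demoFilter rejects a pod whose GCE PD is already mounted read-write on the node.
func demoFilter() {
	pd := v1.Volume{VolumeSource: v1.VolumeSource{
		GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{PDName: "pd-1"},
	}}
	incoming := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{pd}}}
	existing := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{pd}}}

	pl := &VolumeRestrictions{} // zero value: Filter above uses no receiver state
	status := pl.Filter(context.Background(), nil, incoming, nodeinfo.NewNodeInfo(existing))

	fmt.Println(status.Code() == framework.Unschedulable, status.Message()) // true, the ErrDiskConflict reason
}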