diff --git a/staging/src/k8s.io/legacy-cloud-providers/aws/BUILD b/staging/src/k8s.io/legacy-cloud-providers/aws/BUILD
index f8c82a9a819..7805a85b498 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/aws/BUILD
+++ b/staging/src/k8s.io/legacy-cloud-providers/aws/BUILD
@@ -84,6 +84,7 @@ go_test(
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
+        "//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
         "//staging/src/k8s.io/cloud-provider/volume:go_default_library",
         "//vendor/github.com/aws/aws-sdk-go/aws:go_default_library",
         "//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
diff --git a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go
index 541ee26011d..8417e8e5bb8 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go
@@ -57,7 +57,7 @@ import (
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
 	"k8s.io/client-go/pkg/version"
 	"k8s.io/client-go/tools/record"
-	cloudprovider "k8s.io/cloud-provider"
+	"k8s.io/cloud-provider"
 	nodehelpers "k8s.io/cloud-provider/node/helpers"
 	servicehelpers "k8s.io/cloud-provider/service/helpers"
 	cloudvolume "k8s.io/cloud-provider/volume"
@@ -205,6 +205,16 @@ const volumeAttachmentStuck = "VolumeAttachmentStuck"
 // Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
 const nodeWithImpairedVolumes = "NodeWithImpairedVolumes"
 
+const (
+	// These constants help to identify if a node is a master or a minion
+	labelKeyNodeRole    = "kubernetes.io/role"
+	nodeMasterRole      = "master"
+	nodeMinionRole      = "node"
+	labelKeyNodeMaster  = "node-role.kubernetes.io/master"
+	labelKeyNodeCompute = "node-role.kubernetes.io/compute"
+	labelKeyNodeMinion  = "node-role.kubernetes.io/node"
+)
+
 const (
 	// volumeAttachmentConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
 	volumeAttachmentStatusConsecutiveErrorLimit = 10
@@ -1598,64 +1608,30 @@ func (c *Cloud) InstanceType(ctx context.Context, nodeName types.NodeName) (stri
 // GetCandidateZonesForDynamicVolume retrieves a list of all the zones in which nodes are running
 // It currently involves querying all instances
 func (c *Cloud) GetCandidateZonesForDynamicVolume() (sets.String, error) {
-	// We don't currently cache this; it is currently used only in volume
-	// creation which is expected to be a comparatively rare occurrence.
+	zones := sets.NewString()
 
-	// TODO: Caching / expose v1.Nodes to the cloud provider?
-	// TODO: We could also query for subnets, I think
-
-	// Note: It is more efficient to call the EC2 API twice with different tag
-	// filters than to call it once with a tag filter that results in a logical
-	// OR. For really large clusters the logical OR will result in EC2 API rate
-	// limiting.
-	instances := []*ec2.Instance{}
-
-	baseFilters := []*ec2.Filter{newEc2Filter("instance-state-name", "running")}
-
-	filters := c.tagging.addFilters(baseFilters)
-	di, err := c.describeInstances(filters)
+	// TODO: list from cache?
+	nodes, err := c.kubeClient.CoreV1().Nodes().List(metav1.ListOptions{})
 	if err != nil {
+		klog.Errorf("Failed to get nodes from api server: %#v", err)
 		return nil, err
 	}
 
-	instances = append(instances, di...)
-
-	if c.tagging.usesLegacyTags {
-		filters = c.tagging.addLegacyFilters(baseFilters)
-		di, err = c.describeInstances(filters)
-		if err != nil {
-			return nil, err
-		}
-
-		instances = append(instances, di...)
-	}
-
-	if len(instances) == 0 {
-		return nil, fmt.Errorf("no instances returned")
-	}
-
-	zones := sets.NewString()
-
-	for _, instance := range instances {
-		// We skip over master nodes, if the installation tool labels them with one of the well-known master labels
-		// This avoids creating a volume in a zone where only the master is running - e.g. #34583
-		// This is a short-term workaround until the scheduler takes care of zone selection
-		master := false
-		for _, tag := range instance.Tags {
-			tagKey := aws.StringValue(tag.Key)
-			if awsTagNameMasterRoles.Has(tagKey) {
-				master = true
-			}
-		}
-
-		if master {
-			klog.V(4).Infof("Ignoring master instance %q in zone discovery", aws.StringValue(instance.InstanceId))
+	for _, n := range nodes.Items {
+		if !c.isNodeReady(&n) {
+			klog.V(4).Infof("Ignoring not ready node %q in zone discovery", n.Name)
 			continue
 		}
-
-		if instance.Placement != nil {
-			zone := aws.StringValue(instance.Placement.AvailabilityZone)
-			zones.Insert(zone)
+		// In some cluster provisioning software a node can be both a minion and a master. Therefore we whitelist
+		// nodes here and only filter out nodes that are not minions AND are explicitly labeled as masters.
+		if c.isMinionNode(&n) || !c.isMasterNode(&n) {
+			if zone, ok := n.Labels[v1.LabelZoneFailureDomain]; ok {
+				zones.Insert(zone)
+			} else {
+				klog.Warningf("Node %s does not have a zone label, ignoring it for zone discovery.", n.Name)
+			}
+		} else {
+			klog.V(4).Infof("Ignoring master node %q in zone discovery", n.Name)
 		}
 	}
 
@@ -1663,6 +1639,48 @@ func (c *Cloud) GetCandidateZonesForDynamicVolume() (sets.String, error) {
 	return zones, nil
 }
 
+// isNodeReady checks the node's conditions and returns true if NodeReady is set to true
+func (c *Cloud) isNodeReady(node *v1.Node) bool {
+	for _, c := range node.Status.Conditions {
+		if c.Type == v1.NodeReady {
+			return c.Status == v1.ConditionTrue
+		}
+	}
+	return false
+}
+
+// isMasterNode checks if the node is labeled as a master
+func (c *Cloud) isMasterNode(node *v1.Node) bool {
+	// A master node has one or more of the following labels:
+	//
+	//	kubernetes.io/role: "master"
+	//	node-role.kubernetes.io/master: ""
+	//	node-role.kubernetes.io/master: "true"
+	if val, ok := node.Labels[labelKeyNodeMaster]; ok && val != "false" {
+		return true
+	} else if role, ok := node.Labels[labelKeyNodeRole]; ok && role == nodeMasterRole {
+		return true
+	}
+	return false
+}
+
+// isMinionNode checks if the node is labeled as a minion
+func (c *Cloud) isMinionNode(node *v1.Node) bool {
+	// A minion node has one or more of the following labels:
+	//
+	//	kubernetes.io/role: "node"
+	//	node-role.kubernetes.io/compute: "true"
+	//	node-role.kubernetes.io/node: ""
+	if val, ok := node.Labels[labelKeyNodeMinion]; ok && val != "false" {
+		return true
+	} else if val, ok := node.Labels[labelKeyNodeCompute]; ok && val != "false" {
+		return true
+	} else if role, ok := node.Labels[labelKeyNodeRole]; ok && role == nodeMinionRole {
+		return true
+	}
+	return false
+}
+
 // GetZone implements Zones.GetZone
 func (c *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
 	return cloudprovider.Zone{
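
Reviewer note, outside the patch: the sketch below is a minimal, self-contained illustration of the whitelist rule added above, using plain label maps instead of *v1.Node. The file and helper names are invented; only the label keys and the include condition mirror the constants and logic introduced in aws.go. The rule is: a node is excluded from zone discovery only when it is NOT labeled as a minion AND IS explicitly labeled as a master.

// zone_filter_sketch.go -- illustrative only, not part of the patch.
package main

import "fmt"

// isMaster mirrors isMasterNode from aws.go, on a plain label map.
func isMaster(labels map[string]string) bool {
	if v, ok := labels["node-role.kubernetes.io/master"]; ok && v != "false" {
		return true
	}
	return labels["kubernetes.io/role"] == "master"
}

// isMinion mirrors isMinionNode from aws.go, on a plain label map.
func isMinion(labels map[string]string) bool {
	if v, ok := labels["node-role.kubernetes.io/node"]; ok && v != "false" {
		return true
	}
	if v, ok := labels["node-role.kubernetes.io/compute"]; ok && v != "false" {
		return true
	}
	return labels["kubernetes.io/role"] == "node"
}

func main() {
	cases := []map[string]string{
		{"node-role.kubernetes.io/master": ""}, // master only -> excluded
		{"kubernetes.io/role": "node"},         // minion only -> included
		{"node-role.kubernetes.io/master": "true", "node-role.kubernetes.io/compute": "true"}, // both roles -> included
		{}, // unlabeled -> included
	}
	for _, labels := range cases {
		included := isMinion(labels) || !isMaster(labels)
		fmt.Printf("labels=%v included=%v\n", labels, included)
	}
}

Running it prints included=false only for the master-only case, which matches the expectations encoded in the unit test that follows.
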
diff --git a/staging/src/k8s.io/legacy-cloud-providers/aws/aws_test.go b/staging/src/k8s.io/legacy-cloud-providers/aws/aws_test.go
index 46494dc57ad..7ad3180a318 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/aws/aws_test.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/aws/aws_test.go
@@ -36,6 +36,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/kubernetes/fake"
 	cloudvolume "k8s.io/cloud-provider/volume"
 )
 
@@ -1834,6 +1835,118 @@ func TestCreateDisk(t *testing.T) {
 	awsServices.ec2.(*MockedFakeEC2).AssertExpectations(t)
 }
 
+func TestGetCandidateZonesForDynamicVolume(t *testing.T) {
+	tests := []struct {
+		name          string
+		labels        map[string]string
+		ready         bool
+		expectedZones sets.String
+	}{
+		{
+			name:          "master node with role label",
+			labels:        map[string]string{labelKeyNodeRole: nodeMasterRole, v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString(),
+		},
+		{
+			name:          "master node with empty master label",
+			labels:        map[string]string{labelKeyNodeMaster: "", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString(),
+		},
+		{
+			name:          "master node with master label set to true",
+			labels:        map[string]string{labelKeyNodeMaster: "true", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString(),
+		},
+		{
+			name:          "master node with master label set to false",
+			labels:        map[string]string{labelKeyNodeMaster: "false", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+		{
+			name:          "minion node with role label",
+			labels:        map[string]string{labelKeyNodeRole: nodeMinionRole, v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+		{
+			name:          "minion node with minion label",
+			labels:        map[string]string{labelKeyNodeMinion: "", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+		{
+			name:          "minion node with compute label",
+			labels:        map[string]string{labelKeyNodeCompute: "true", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+		{
+			name:          "node that is both master and minion",
+			labels:        map[string]string{labelKeyNodeMaster: "true", labelKeyNodeCompute: "true", v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+		{
+			name:          "node not ready",
+			labels:        map[string]string{v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         false,
+			expectedZones: sets.NewString(),
+		},
+		{
+			name:          "node with no zone label",
+			labels:        map[string]string{},
+			ready:         true,
+			expectedZones: sets.NewString(),
+		},
+		{
+			name:          "node with no role label",
+			labels:        map[string]string{v1.LabelZoneFailureDomain: "us-east-1a"},
+			ready:         true,
+			expectedZones: sets.NewString("us-east-1a"),
+		},
+	}
+
+	for i, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			awsServices := newMockedFakeAWSServices(TestClusterID)
+			c, _ := newAWSCloud(CloudConfig{}, awsServices)
+			c.kubeClient = fake.NewSimpleClientset()
+			nodeName := fmt.Sprintf("node-%d", i)
+			_, err := c.kubeClient.CoreV1().Nodes().Create(&v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   nodeName,
+					Labels: test.labels,
+				},
+				Status: genNodeStatus(test.ready),
+			})
+			assert.Nil(t, err)
+			zones, err := c.GetCandidateZonesForDynamicVolume()
+			assert.Nil(t, err)
+			assert.Equal(t, test.expectedZones, zones)
+		})
+	}
+}
+
+func genNodeStatus(ready bool) v1.NodeStatus {
+	status := v1.ConditionFalse
+	if ready {
+		status = v1.ConditionTrue
+	}
+	ret := v1.NodeStatus{
+		Conditions: []v1.NodeCondition{
+			{
+				Type:   v1.NodeReady,
+				Status: status,
+			},
+		},
+	}
+	return ret
+}
+
 func newMockedFakeAWSServices(id string) *FakeAWSServices {
 	s := NewFakeAWSServices(id)
 	s.ec2 = &MockedFakeEC2{FakeEC2Impl: s.ec2.(*FakeEC2Impl)}
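
Reviewer note, outside the patch: a hedged end-to-end sketch of the new code path, using the same fake clientset the test above wires in. It assumes the client-go version vendored on this branch, where Create/List take no context argument (the patch itself calls List(metav1.ListOptions{})). zonesFromNodes is an invented helper that mirrors only the Ready check and the zone-label lookup; the master/minion filtering is omitted for brevity.

// zone_discovery_sketch.go -- illustrative only, not part of the patch.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
)

// zonesFromNodes lists nodes from the API server, skips nodes that are not
// Ready, and collects the failure-domain zone label of the rest.
func zonesFromNodes(client kubernetes.Interface) (sets.String, error) {
	zones := sets.NewString()
	nodes, err := client.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	for _, n := range nodes.Items {
		ready := false
		for _, cond := range n.Status.Conditions {
			if cond.Type == v1.NodeReady {
				ready = cond.Status == v1.ConditionTrue
			}
		}
		if !ready {
			continue
		}
		if zone, ok := n.Labels[v1.LabelZoneFailureDomain]; ok {
			zones.Insert(zone)
		}
	}
	return zones, nil
}

func main() {
	// The fake clientset serves objects from memory, exactly as in the unit test above.
	client := fake.NewSimpleClientset(&v1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "node-a",
			Labels: map[string]string{v1.LabelZoneFailureDomain: "us-east-1a"},
		},
		Status: v1.NodeStatus{
			Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}},
		},
	})
	zones, err := zonesFromNodes(client)
	fmt.Println(zones.List(), err) // [us-east-1a] <nil>
}

The point of the sketch is the design change in this PR: candidate-zone discovery now depends only on node objects from the API server (readiness condition plus the failure-domain label), instead of EC2 DescribeInstances calls and instance tags.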