From 22d719018aaaab838223a2da38e13a7a65d83a43 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Sat, 20 Feb 2016 14:30:35 -0500 Subject: [PATCH] AWS: Recover if tags missing on security group In the AWS API (generally) we tag things we create, and then we filter to find them. However, creation & tagging are typically two separate calls. So there is a chance that we will create an object, but fail to tag it. We fix this (done here in the case of security groups, but we can do this more generally) by retrieving the resource without a tag filter. If the retrieved resource has the correct tags, great. If it has the tags for another cluster, that's a problem, and we raise an error. If it has no tags at all, we add the tags. This only works where the resource is uniquely named (or we can otherwise retrieve it uniquely). For security groups, the SG name comes from the service UUID, so that's unique. Fixes #11324 --- pkg/cloudprovider/providers/aws/aws.go | 104 +++++++++++++++++-------- 1 file changed, 71 insertions(+), 33 deletions(-) diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go index 9df66a2b768..f7d0c328c79 100644 --- a/pkg/cloudprovider/providers/aws/aws.go +++ b/pkg/cloudprovider/providers/aws/aws.go @@ -1367,17 +1367,7 @@ func (s *AWSCloud) CreateDisk(volumeOptions *VolumeOptions) (string, error) { // apply tags if volumeOptions.Tags != nil { - tags := []*ec2.Tag{} - for k, v := range *volumeOptions.Tags { - tag := &ec2.Tag{} - tag.Key = aws.String(k) - tag.Value = aws.String(v) - tags = append(tags, tag) - } - tagRequest := &ec2.CreateTagsInput{} - tagRequest.Resources = []*string{&awsID} - tagRequest.Tags = tags - if _, err := s.createTags(tagRequest); err != nil { + if err := s.createTags(awsID, *volumeOptions.Tags); err != nil { // delete the volume and hope it succeeds _, delerr := s.DeleteDisk(volumeName) if delerr != nil { @@ -1682,6 +1672,36 @@ func (s *AWSCloud) removeSecurityGroupIngress(securityGroupId string, removePerm return true, nil } +// Ensure that a resource has the correct tags +// If it has no tags, we assume that this was a problem caused by an error in between creation and tagging, +// and we add the tags. If it has a different cluster's tags, that is an error. +func (s *AWSCloud) ensureClusterTags(resourceID string, tags []*ec2.Tag) error { + actualTags := make(map[string]string) + for _, tag := range tags { + actualTags[aws.StringValue(tag.Key)] = aws.StringValue(tag.Value) + } + + addTags := make(map[string]string) + for k, expected := range s.filterTags { + actual := actualTags[k] + if actual == expected { + continue + } + if actual == "" { + glog.Warningf("Resource %q was missing expected cluster tag %q. Will add (with value %q)", resourceID, k, expected) + addTags[k] = expected + } else { + return fmt.Errorf("resource %q has tag belonging to another cluster: %q=%q (expected %q)", resourceID, k, actual, expected) + } + } + + if err := s.createTags(resourceID, addTags); err != nil { + return fmt.Errorf("error adding missing tags to resource %q: %v", resourceID, err) + } + + return nil +} + // Makes sure the security group exists // Returns the security group id or error func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID string) (string, error) { @@ -1695,7 +1715,12 @@ func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID st newEc2Filter("group-name", name), newEc2Filter("vpc-id", vpcID), } - request.Filters = s.addFilters(filters) + // Note that we do _not_ add our tag filters; group-name + vpc-id is the EC2 primary key. + // However, we do check that it matches our tags. + // If it doesn't have any tags, we tag it; this is how we recover if we failed to tag before. + // If it has a different cluster's tags, that is an error. + // This shouldn't happen because name is expected to be globally unique (UUID derived) + request.Filters = filters securityGroups, err := s.ec2.DescribeSecurityGroups(request) if err != nil { @@ -1706,7 +1731,12 @@ func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID st if len(securityGroups) > 1 { glog.Warning("Found multiple security groups with name:", name) } - return orEmpty(securityGroups[0].GroupId), nil + err := s.ensureClusterTags(aws.StringValue(securityGroups[0].GroupId), securityGroups[0].Tags) + if err != nil { + return "", err + } + + return aws.StringValue(securityGroups[0].GroupId), nil } createRequest := &ec2.CreateSecurityGroupInput{} @@ -1738,22 +1768,13 @@ func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID st return "", fmt.Errorf("created security group, but id was not returned: %s", name) } - tags := []*ec2.Tag{} - for k, v := range s.filterTags { - tag := &ec2.Tag{} - tag.Key = aws.String(k) - tag.Value = aws.String(v) - tags = append(tags, tag) - } - - if len(tags) > 0 { - tagRequest := &ec2.CreateTagsInput{} - tagRequest.Resources = []*string{&groupID} - tagRequest.Tags = tags - if _, err := s.createTags(tagRequest); err != nil { - // Not clear how to recover fully from this; we're OK because we don't match on tags, but that is a little odd - return "", fmt.Errorf("error tagging security group: %v", err) - } + err := s.createTags(groupID, s.filterTags) + if err != nil { + // If we retry, ensureClusterTags will recover from this - it + // will add the missing tags. We could delete the security + // group here, but that doesn't feel like the right thing, as + // the caller is likely to retry the create + return "", fmt.Errorf("error tagging security group: %v", err) } return groupID, nil } @@ -1761,15 +1782,32 @@ func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID st // createTags calls EC2 CreateTags, but adds retry-on-failure logic // We retry mainly because if we create an object, we cannot tag it until it is "fully created" (eventual consistency) // The error code varies though (depending on what we are tagging), so we simply retry on all errors -func (s *AWSCloud) createTags(request *ec2.CreateTagsInput) (*ec2.CreateTagsOutput, error) { +func (s *AWSCloud) createTags(resourceID string, tags map[string]string) error { + if tags == nil || len(tags) == 0 { + return nil + } + + var awsTags []*ec2.Tag + for k, v := range tags { + tag := &ec2.Tag{ + Key: aws.String(k), + Value: aws.String(v), + } + awsTags = append(awsTags, tag) + } + + request := &ec2.CreateTagsInput{} + request.Resources = []*string{&resourceID} + request.Tags = awsTags + // TODO: We really should do exponential backoff here attempt := 0 maxAttempts := 60 for { - response, err := s.ec2.CreateTags(request) + _, err := s.ec2.CreateTags(request) if err == nil { - return response, err + return nil } // We could check that the error is retryable, but the error code changes based on what we are tagging @@ -1777,7 +1815,7 @@ func (s *AWSCloud) createTags(request *ec2.CreateTagsInput) (*ec2.CreateTagsOutp attempt++ if attempt > maxAttempts { glog.Warningf("Failed to create tags (too many attempts): %v", err) - return response, err + return err } glog.V(2).Infof("Failed to create tags; will retry. Error was %v", err) time.Sleep(1 * time.Second)