From d422a92e661209a23fcfec8039b132fce1eecea5 Mon Sep 17 00:00:00 2001 From: Nick Turner Date: Mon, 20 Jul 2020 10:00:26 -0700 Subject: [PATCH] Fix ECR provider startup latency * Before this change, even on non-AWS platforms, the Enabled() check attempts to make calls to the metadata endpoint when the session and credentials are initialized (in order to determine if the provider should be initialized at all). * This can cause latency because the SDK times out and retries -- up to 20 seconds of latency has been observed on non-AWS platforms when the metadata IP was blocked with an iptables rule. * Instead, check once if we are running on an EC2 platform, first trying to find the EC2 UUID in system files, and second attempting to get credentials. * Add a benchmark test that includes intialization and the credential check. --- pkg/credentialprovider/aws/aws_credentials.go | 90 +++++++++++++++++-- .../aws/aws_credentials_test.go | 17 +++- 2 files changed, 96 insertions(+), 11 deletions(-) diff --git a/pkg/credentialprovider/aws/aws_credentials.go b/pkg/credentialprovider/aws/aws_credentials.go index 8f85237e8f3..a0eef38594a 100644 --- a/pkg/credentialprovider/aws/aws_credentials.go +++ b/pkg/credentialprovider/aws/aws_credentials.go @@ -20,7 +20,9 @@ import ( "encoding/base64" "errors" "fmt" + "io/ioutil" "net/url" + "os" "regexp" "strings" "sync" @@ -38,13 +40,19 @@ import ( "k8s.io/kubernetes/pkg/credentialprovider" ) -var ecrPattern = regexp.MustCompile(`^(\d{12})\.dkr\.ecr(\-fips)?\.([a-zA-Z0-9][a-zA-Z0-9-_]*)\.amazonaws\.com(\.cn)?$`) +var ( + ecrPattern = regexp.MustCompile(`^(\d{12})\.dkr\.ecr(\-fips)?\.([a-zA-Z0-9][a-zA-Z0-9-_]*)\.amazonaws\.com(\.cn)?$`) + once sync.Once + isEC2 bool +) // init registers a credential provider for each registryURLTemplate and creates // an ECR token getter factory with a new cache to store token getters func init() { credentialprovider.RegisterCredentialProvider("amazon-ecr", - newECRProvider(&ecrTokenGetterFactory{cache: make(map[string]tokenGetter)})) + newECRProvider(&ecrTokenGetterFactory{cache: make(map[string]tokenGetter)}, + ec2ValidationImpl, + )) } // ecrProvider is a DockerConfigProvider that gets and refreshes tokens @@ -52,20 +60,74 @@ func init() { type ecrProvider struct { cache cache.Store getterFactory tokenGetterFactory + isEC2 ec2ValidationFunc } var _ credentialprovider.DockerConfigProvider = &ecrProvider{} -func newECRProvider(getterFactory tokenGetterFactory) *ecrProvider { +func newECRProvider(getterFactory tokenGetterFactory, isEC2 ec2ValidationFunc) *ecrProvider { return &ecrProvider{ cache: cache.NewExpirationStore(stringKeyFunc, &ecrExpirationPolicy{}), getterFactory: getterFactory, + isEC2: isEC2, } } -// Enabled implements DockerConfigProvider.Enabled. Enabled is true if AWS -// credentials are found. +// Enabled implements DockerConfigProvider.Enabled. func (p *ecrProvider) Enabled() bool { + return true +} + +type ec2ValidationFunc func() bool + +// ec2ValidationImpl returns true if we detect +// an EC2 vm based on checking for the EC2 system UUID, the asset tag (for nitro +// instances), or instance credentials if the UUID is not present. +// Ref: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/identify_ec2_instances.html +func ec2ValidationImpl() bool { + return tryValidateEC2UUID() || tryValidateEC2Creds() +} + +func tryValidateEC2UUID() bool { + hypervisor_uuid := "/sys/hypervisor/uuid" + product_uuid := "/sys/devices/virtual/dmi/id/product_uuid" + asset_tag := "/sys/devices/virtual/dmi/id/board_asset_tag" + + if _, err := os.Stat(hypervisor_uuid); err == nil { + b, err := ioutil.ReadFile(hypervisor_uuid) + if err != nil { + klog.Errorf("error checking if this is an EC2 instance: %v", err) + } else if strings.HasPrefix(string(b), "EC2") || strings.HasPrefix(string(b), "ec2") { + klog.V(5).Infof("found 'ec2' in uuid %v from %v, enabling legacy AWS credential provider", string(b), hypervisor_uuid) + return true + } + } + + if _, err := os.Stat(product_uuid); err == nil { + b, err := ioutil.ReadFile(product_uuid) + if err != nil { + klog.Errorf("error checking if this is an EC2 instance: %v", err) + } else if strings.HasPrefix(string(b), "EC2") || strings.HasPrefix(string(b), "ec2") { + klog.V(5).Infof("found 'ec2' in uuid %v from %v, enabling legacy AWS credential provider", string(b), product_uuid) + return true + } + } + + if _, err := os.Stat(asset_tag); err == nil { + b, err := ioutil.ReadFile(asset_tag) + s := strings.TrimSpace(string(b)) + if err != nil { + klog.Errorf("error checking if this is an EC2 instance: %v", err) + } else if strings.HasPrefix(s, "i-") && len(s) == 19 { + // Instance ID's are 19 characters plus newline + klog.V(5).Infof("found instance ID in %v from %v, enabling legacy AWS credential provider", string(b), asset_tag) + return true + } + } + return false +} + +func tryValidateEC2Creds() bool { sess, err := session.NewSessionWithOptions(session.Options{ SharedConfigState: session.SharedConfigEnable, }) @@ -77,6 +139,7 @@ func (p *ecrProvider) Enabled() bool { klog.Errorf("while getting AWS credentials %v", err) return false } + klog.V(5).Infof("found aws credentials, enabling legacy AWS credential provider") return true } @@ -85,12 +148,25 @@ func (p *ecrProvider) Enabled() bool { func (p *ecrProvider) Provide(image string) credentialprovider.DockerConfig { parsed, err := parseRepoURL(image) if err != nil { - klog.V(3).Info(err) + return credentialprovider.DockerConfig{} + } + + // To prevent the AWS SDK from causing latency on non-aws platforms, only test if we are on + // EC2 or have access to credentials once. Attempt to do it without network calls by checking + // for certain EC2-specific files. Otherwise, we ask the SDK to initialize a session to see if + // credentials are available. On non-aws platforms, especially when a metadata endpoint is blocked, + // this has been shown to cause 20 seconds of latency due to SDK retries + // (see https://github.com/kubernetes/kubernetes/issues/92162) + once.Do(func() { + isEC2 = p.isEC2() + }) + + if !isEC2 { return credentialprovider.DockerConfig{} } if cfg, exists := p.getFromCache(parsed); exists { - klog.V(6).Infof("Got ECR credentials from cache for %s", parsed.registry) + klog.V(3).Infof("Got ECR credentials from cache for %s", parsed.registry) return cfg } klog.V(3).Info("unable to get ECR credentials from cache, checking ECR API") diff --git a/pkg/credentialprovider/aws/aws_credentials_test.go b/pkg/credentialprovider/aws/aws_credentials_test.go index 8598d553a65..a299d9ac109 100644 --- a/pkg/credentialprovider/aws/aws_credentials_test.go +++ b/pkg/credentialprovider/aws/aws_credentials_test.go @@ -154,6 +154,10 @@ func TestParseRepoURLFail(t *testing.T) { } } +func isAlwaysEC2() bool { + return true +} + func TestECRProvide(t *testing.T) { registry := "123456789012.dkr.ecr.lala-land-1.amazonaws.com" otherRegistries := []string{ @@ -168,7 +172,7 @@ func TestECRProvide(t *testing.T) { password: password, endpoint: registry, }, - }) + }, isAlwaysEC2) keyring := &credentialprovider.BasicDockerKeyring{} keyring.Add(p.Provide(image)) @@ -213,7 +217,7 @@ func TestECRProvideCached(t *testing.T) { endpoint: registry, randomizePassword: true, }, - }) + }, isAlwaysEC2) image1 := path.Join(registry, "foo/bar") image2 := path.Join(registry, "bar/baz") keyring := &credentialprovider.BasicDockerKeyring{} @@ -266,7 +270,7 @@ func TestChinaECRProvide(t *testing.T) { password: password, endpoint: registry, }, - }) + }, isAlwaysEC2) keyring := &credentialprovider.BasicDockerKeyring{} keyring.Add(p.Provide(image)) // Verify that we get the expected username/password combo for @@ -310,7 +314,7 @@ func TestChinaECRProvideCached(t *testing.T) { endpoint: registry, randomizePassword: true, }, - }) + }, isAlwaysEC2) image := path.Join(registry, "foo/bar") keyring := &credentialprovider.BasicDockerKeyring{} keyring.Add(p.Provide(image)) @@ -331,3 +335,8 @@ func TestChinaECRProvideCached(t *testing.T) { t.Errorf("cached credentials do not match") } } + +func BenchmarkSetupLatency(b *testing.B) { + p := newECRProvider(&ecrTokenGetterFactory{cache: make(map[string]tokenGetter)}, ec2ValidationImpl) + _ = p.Enabled() +}