From 0e5401c93940126beac45264aa056507b0950075 Mon Sep 17 00:00:00 2001 From: Nic Cope Date: Wed, 27 Jul 2022 14:44:49 -0700 Subject: [PATCH 1/4] Disable the etcd3 client logger This logger is responsible for 20% of the API server's memory usage when many CRDs are installed. See the below issue for more context. https://github.com/kubernetes/kubernetes/issues/111476 Signed-off-by: Nic Cope --- .../apiserver/pkg/storage/storagebackend/factory/etcd3.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go index 07b61d357a0..214e930140b 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go @@ -30,6 +30,7 @@ import ( "go.etcd.io/etcd/client/pkg/v3/transport" clientv3 "go.etcd.io/etcd/client/v3" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" + "go.uber.org/zap" "google.golang.org/grpc" "k8s.io/apimachinery/pkg/runtime" @@ -223,6 +224,10 @@ var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, e DialOptions: dialOptions, Endpoints: c.ServerList, TLS: tlsConfig, + + // This logger uses a significant amount of memory when many CRDs (i.e. + // 1,000+) are added to the API server, so we disable it. + Logger: zap.NewNop(), } return clientv3.New(cfg) From 0c81eabb853e581abbcb37ebf094af3316e1012e Mon Sep 17 00:00:00 2001 From: Nic Cope Date: Thu, 28 Jul 2022 19:51:55 -0700 Subject: [PATCH 2/4] Share a single etcd3 client logger across all clients Currently the API server creates one etcd client per CRD. If clients aren't provided a logger they'll each create their own. These loggers can account for ~20% of API server memory consumption on a cluster with hundreds of CRDs. Signed-off-by: Nic Cope --- .../storage/storagebackend/factory/etcd3.go | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go index 214e930140b..72473d61c57 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go @@ -27,10 +27,12 @@ import ( "time" grpcprom "github.com/grpc-ecosystem/go-grpc-prometheus" + "go.etcd.io/etcd/client/pkg/v3/logutil" "go.etcd.io/etcd/client/pkg/v3/transport" clientv3 "go.etcd.io/etcd/client/v3" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "go.uber.org/zap" + "go.uber.org/zap/zapcore" "google.golang.org/grpc" "k8s.io/apimachinery/pkg/runtime" @@ -64,6 +66,14 @@ const ( dbMetricsMonitorJitter = 0.5 ) +// TODO(negz): Stop using a package scoped logger. At the time of writing we're +// creating an etcd client for each CRD. We need to pass each etcd client a +// logger or each client will create its own, which comes with a significant +// memory cost (around 20% of the API server's memory when hundreds of CRDs are +// present). The correct fix here is to not create a client per CRD. See +// https://github.com/kubernetes/kubernetes/issues/111476 for more. +var etcd3ClientLogger *zap.Logger + func init() { // grpcprom auto-registers (via an init function) their client metrics, since we are opting out of // using the global prometheus registry and using our own wrapped global registry, @@ -71,6 +81,12 @@ func init() { // For reference: https://github.com/kubernetes/kubernetes/pull/81387 legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics) dbMetricsMonitors = make(map[string]struct{}) + + l, err := logutil.CreateDefaultZapLogger(zapcore.InfoLevel) + if err != nil { + l = zap.NewNop() + } + etcd3ClientLogger = l } func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) { @@ -217,6 +233,7 @@ var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, e } dialOptions = append(dialOptions, grpc.WithContextDialer(dialer)) } + cfg := clientv3.Config{ DialTimeout: dialTimeout, DialKeepAliveTime: keepaliveTime, @@ -224,10 +241,7 @@ var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, e DialOptions: dialOptions, Endpoints: c.ServerList, TLS: tlsConfig, - - // This logger uses a significant amount of memory when many CRDs (i.e. - // 1,000+) are added to the API server, so we disable it. - Logger: zap.NewNop(), + Logger: etcd3ClientLogger, } return clientv3.New(cfg) From f54d2606336e2e8130339d2a0bc04fac6906aa78 Mon Sep 17 00:00:00 2001 From: Nic Cope Date: Fri, 29 Jul 2022 14:24:17 -0700 Subject: [PATCH 3/4] Give etcd client logger a name Logic copied from https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/client.go#L374 Signed-off-by: Nic Cope --- .../apiserver/pkg/storage/storagebackend/factory/etcd3.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go index 72473d61c57..541502900eb 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go @@ -86,7 +86,7 @@ func init() { if err != nil { l = zap.NewNop() } - etcd3ClientLogger = l + etcd3ClientLogger = l.Named("etcd-client") } func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) { From c1aa7a0fe73cbcab8e70f7b73a845ae9394f9a71 Mon Sep 17 00:00:00 2001 From: Nic Cope Date: Fri, 29 Jul 2022 14:26:31 -0700 Subject: [PATCH 4/4] Copy etcd client debug level logic from upstream Replicated from https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/logger.go#L47 The logic of this function doesn't make a lot of sense to me, but copying it will avoid any behaviour change. Signed-off-by: Nic Cope --- .../storage/storagebackend/factory/etcd3.go | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go index 541502900eb..8a6c0ae57f2 100644 --- a/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go +++ b/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go @@ -19,8 +19,10 @@ package factory import ( "context" "fmt" + "log" "net" "net/url" + "os" "path" "strings" "sync" @@ -82,13 +84,29 @@ func init() { legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics) dbMetricsMonitors = make(map[string]struct{}) - l, err := logutil.CreateDefaultZapLogger(zapcore.InfoLevel) + l, err := logutil.CreateDefaultZapLogger(etcdClientDebugLevel()) if err != nil { l = zap.NewNop() } etcd3ClientLogger = l.Named("etcd-client") } +// etcdClientDebugLevel translates ETCD_CLIENT_DEBUG into zap log level. +// NOTE(negz): This is a copy of a private etcd client function: +// https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/logger.go#L47 +func etcdClientDebugLevel() zapcore.Level { + envLevel := os.Getenv("ETCD_CLIENT_DEBUG") + if envLevel == "" || envLevel == "true" { + return zapcore.InfoLevel + } + var l zapcore.Level + if err := l.Set(envLevel); err == nil { + log.Printf("Deprecated env ETCD_CLIENT_DEBUG value. Using default level: 'info'") + return zapcore.InfoLevel + } + return l +} + func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) { timeout := storagebackend.DefaultHealthcheckTimeout if c.HealthcheckTimeout != time.Duration(0) {