From b16323e37ce8b30b1767eb9ab56db7836ba83aa8 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Mon, 24 May 2021 23:18:02 +0900 Subject: [PATCH 1/4] New feature gate: KubeletInUserNamespace Enables support for running kubelet in a user namespace. The user namespace has to be created before running kubelet. All the node components such as CRI need to be running in the same user namespace. See kubernetes/enhancements PR 1371 (merged) and issue 2033. Signed-off-by: Akihiro Suda --- pkg/features/kube_features.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 7f32ad1db5a..ced805c7f0e 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -761,6 +761,14 @@ const ( // // Allows clients to request a duration for certificates issued via the Kubernetes CSR API. CSRDuration featuregate.Feature = "CSRDuration" + + // owner: @AkihiroSuda + // alpha: v1.22 + // + // Enables support for running kubelet in a user namespace. + // The user namespace has to be created before running kubelet. + // All the node components such as CRI need to be running in the same user namespace. 
+ KubeletInUserNamespace featuregate.Feature = "KubeletInUserNamespace" ) func init() { @@ -875,6 +883,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS ReadWriteOncePod: {Default: false, PreRelease: featuregate.Alpha}, CSRDuration: {Default: true, PreRelease: featuregate.Beta}, DelegateFSGroupToCSIDriver: {Default: false, PreRelease: featuregate.Alpha}, + KubeletInUserNamespace: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: From dbe015513971b69b2f69c02de1bb5960fe230b51 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Tue, 21 Aug 2018 16:45:04 +0900 Subject: [PATCH 2/4] kubelet/cm: ignore sysctl error when running in userns When kubelet runs in a user namespace and the KubeletInUserNamespace feature gate is enabled, errors that occur while setting the following sysctl values are ignored: - vm.overcommit_memory - vm.panic_on_oom - kernel.panic - kernel.panic_on_oops - kernel.keys.root_maxkeys - kernel.keys.root_maxbytes Signed-off-by: Akihiro Suda --- pkg/kubelet/cm/container_manager_linux.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index d98162ec8a8..03f5aa36503 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -39,6 +39,7 @@ import ( utilpath "k8s.io/utils/path" libcontainerdevices "github.com/opencontainers/runc/libcontainer/devices" + libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" utilerrors "k8s.io/apimachinery/pkg/util/errors" @@ -455,6 +456,13 @@ func setupKernelTunables(option KernelTunableBehavior) error { klog.V(2).InfoS("Updating kernel flag", "flag", flag, "expectedValue", expectedValue, "actualValue", val) err = sysctl.SetSysctl(flag, expectedValue) if err != nil { + if libcontaineruserns.RunningInUserNS() { + if
utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletInUserNamespace) { + klog.V(2).InfoS("Updating kernel flag failed (running in UserNS, ignoring)", "flag", flag, "err", err) + continue + } + klog.ErrorS(err, "Updating kernel flag failed (Hint: enable KubeletInUserNamespace feature flag to ignore the error)", "flag", flag) + } errList = append(errList, err) } } From 192790c52fed05a865ef5e5e35f57236cd23e9c1 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Thu, 23 Aug 2018 14:14:44 +0900 Subject: [PATCH 3/4] kube-proxy: allow running in userns Ignore an error while setting RLIMIT_NOFILE when running in a user namespace with the KubeletInUserNamespace feature gate enabled. Signed-off-by: Akihiro Suda --- pkg/proxy/userspace/proxier.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/proxy/userspace/proxier.go b/pkg/proxy/userspace/proxier.go index b9c14d28a4e..381fa842a2f 100644 --- a/pkg/proxy/userspace/proxier.go +++ b/pkg/proxy/userspace/proxier.go @@ -26,14 +26,17 @@ import ( "sync/atomic" "time" + libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" utilnet "k8s.io/apimachinery/pkg/util/net" "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" servicehelper "k8s.io/cloud-provider/service/helpers" "k8s.io/klog/v2" + kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/proxy" "k8s.io/kubernetes/pkg/proxy/config" utilproxy "k8s.io/kubernetes/pkg/proxy/util" @@ -231,7 +234,11 @@ func NewCustomProxier(loadBalancer LoadBalancer, listenIP net.IP, iptables iptab err = setRLimit(64 * 1000) if err != nil { - return nil, fmt.Errorf("failed to set open file handler limit: %v", err) + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletInUserNamespace) && libcontaineruserns.RunningInUserNS() { + klog.V(2).InfoS("Failed to set open file handler limit to 64000 (running in UserNS, ignoring)", "err", err) +
} else { + return nil, fmt.Errorf("failed to set open file handler limit to 64000: %w", err) + } } proxyPorts := newPortAllocator(pr) From 26e83ac4d4398ed94ed5391e4faed54824ed9a4d Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Mon, 24 May 2021 23:35:22 +0900 Subject: [PATCH 4/4] kubelet: ignore /dev/kmsg error when running in userns oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error, when running with sysctl value `kernel.dmesg_restrict=1`. The error is negligible for KubeletInUserNamespace. Signed-off-by: Akihiro Suda --- pkg/kubelet/kubelet.go | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 43cad89dba0..4cf6fe201ab 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -34,6 +34,7 @@ import ( "k8s.io/client-go/informers" cadvisorapi "github.com/google/cadvisor/info/v1" + libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns" "k8s.io/mount-utils" "k8s.io/utils/integer" @@ -481,7 +482,19 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, oomWatcher, err := oomwatcher.NewWatcher(kubeDeps.Recorder) if err != nil { - return nil, err + if libcontaineruserns.RunningInUserNS() { + if utilfeature.DefaultFeatureGate.Enabled(features.KubeletInUserNamespace) { + // oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error, + // when running in a user namespace with sysctl value `kernel.dmesg_restrict=1`. 
+ klog.V(2).InfoS("Failed to create an oomWatcher (running in UserNS, ignoring)", "err", err) + oomWatcher = nil + } else { + klog.ErrorS(err, "Failed to create an oomWatcher (running in UserNS, Hint: enable KubeletInUserNamespace feature flag to ignore the error)") + return nil, err + } + } else { + return nil, err + } } clusterDNS := make([]net.IP, 0, len(kubeCfg.ClusterDNS)) @@ -1360,8 +1373,10 @@ func (kl *Kubelet) initializeModules() error { } // Start out of memory watcher. - if err := kl.oomWatcher.Start(kl.nodeRef); err != nil { - return fmt.Errorf("failed to start OOM watcher %v", err) + if kl.oomWatcher != nil { + if err := kl.oomWatcher.Start(kl.nodeRef); err != nil { + return fmt.Errorf("failed to start OOM watcher: %w", err) + } } // Start resource analyzer