Merge pull request #28697 from Random-Liu/fix-kube-proxy-panic

Automatic merge from submit-queue Prevent kube-proxy from panicking when sysfs is mounted as read-only. Fixes https://github.com/kubernetes/kubernetes/issues/25543. This PR: * Checks the permission of sysfs before setting conntrack hashsize, and returns an error "readOnlySysFSError" if sysfs is readonly. As far as I know, this is the only place we need write permission to sysfs, CMIIW. * Updates a new node condition 'RuntimeUnhealthy' with a specific reason, message and hint to the administrator about the remediation. I think this should be an acceptable fix for now. Node problem detector is designed to integrate with different problem daemons, but **the main logic is in the problem detection phase**. After the problem is detected, what node problem detector does is also simply updating a node condition. If we let kube-proxy pass the problem to node problem detector and let node problem detector update the node condition, it looks like an unnecessary hop. The logic in kube-proxy won't be different from this PR, but node problem detector will have to open an unsafe door to other pods because of the lack of an authentication mechanism. It is a bit hard to test this PR, because we don't really have a bad docker in hand. I can only manually test it: * If I manually change the code to let it return `readOnlySysFSError`, the node condition will be updated: ``` NetworkUnavailable False Mon, 01 Jan 0001 00:00:00 +0000 Fri, 08 Jul 2016 01:36:41 -0700 RouteCreated RouteController created a route OutOfDisk False Fri, 08 Jul 2016 01:37:36 -0700 Fri, 08 Jul 2016 01:34:49 -0700 KubeletHasSufficientDisk kubelet has sufficient disk space available MemoryPressure False Fri, 08 Jul 2016 01:37:36 -0700 Fri, 08 Jul 2016 01:34:49 -0700 KubeletHasSufficientMemory kubelet has sufficient memory available Ready True Fri, 08 Jul 2016 01:37:36 -0700 Fri, 08 Jul 2016 01:35:26 -0700 KubeletReady kubelet is posting ready status. 
WARNING: CPU hardcapping unsupported RuntimeUnhealthy True Fri, 08 Jul 2016 01:35:31 -0700 Fri, 08 Jul 2016 01:35:31 -0700 ReadOnlySysFS Docker unexpectedly mounts sysfs as read-only for privileged container (docker issue #24000). This causes the critical system components of Kubernetes not properly working. To remedy this please restart the docker daemon. KernelDeadlock False Fri, 08 Jul 2016 01:37:39 -0700 Fri, 08 Jul 2016 01:35:34 -0700 KernelHasNoDeadlock kernel has no deadlock Addresses: 10.240.0.3,104.155.176.101 ``` * If not, the node condition `RuntimeUnhealthy` won't appear. * If I run the permission checking code in an unprivileged container, it does return `readOnlySysFSError`. I'm not sure whether we want to mark the node as `Unschedulable` when this happens, which only needs a few lines of change. I can do that if we think we should. I'll add some unit tests if we think this fix is acceptable. /cc @bprashanth @dchen1107 @matchstick @thockin @alex-mohr Mark P1 to match the original issue. [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/.github/PULL_REQUEST_TEMPLATE.md?pixel)]()
2025-09-06 11:42:14 +00:00 · 2016-07-10 23:42:43 -07:00
parent d6e84cc2e9 4246853211
commit 6462f82243
2 changed files with 53 additions and 2 deletions
--- a/cmd/kube-proxy/app/conntrack.go
+++ b/cmd/kube-proxy/app/conntrack.go
@@ -17,11 +17,13 @@ limitations under the License.
 package app

 import (
+	"errors"
 	"io/ioutil"
 	"strconv"

 	"github.com/golang/glog"

+	"k8s.io/kubernetes/pkg/util/mount"
 	"k8s.io/kubernetes/pkg/util/sysctl"
 )

@@ -32,11 +34,25 @@ type Conntracker interface {

 type realConntracker struct{}

+var readOnlySysFSError = errors.New("ReadOnlySysFS")
+
 func (realConntracker) SetMax(max int) error {
 	glog.Infof("Setting nf_conntrack_max to %d", max)
 	if err := sysctl.SetSysctl("net/netfilter/nf_conntrack_max", max); err != nil {
 		return err
 	}
+	// sysfs is expected to be mounted as 'rw'. However, it may be unexpectedly mounted as
+	// 'ro' by docker because of a known docker issue (https://github.com/docker/docker/issues/24000).
+	// Setting conntrack will fail when sysfs is readonly. When that happens, we don't set conntrack
+	// hashsize and return a special error readOnlySysFSError here. The caller should deal with
+	// readOnlySysFSError differently.
+	writable, err := isSysFSWritable()
+	if err != nil {
+		return err
+	}
+	if !writable {
+		return readOnlySysFSError
+	}
 	// TODO: generify this and sysctl to a new sysfs.WriteInt()
 	glog.Infof("Setting conntrack hashsize to %d", max/4)
 	return ioutil.WriteFile("/sys/module/nf_conntrack/parameters/hashsize", []byte(strconv.Itoa(max/4)), 0640)
@@ -46,3 +62,27 @@ func (realConntracker) SetTCPEstablishedTimeout(seconds int) error {
 	glog.Infof("Setting nf_conntrack_tcp_timeout_established to %d", seconds)
 	return sysctl.SetSysctl("net/netfilter/nf_conntrack_tcp_timeout_established", seconds)
 }
+
+// isSysFSWritable reports whether the first listed mount whose device is "sysfs" has 'rw' as its first option; a listing failure is returned as the error.
+func isSysFSWritable() (bool, error) {
+	const permWritable = "rw"
+	const sysfsDevice = "sysfs"
+	m := mount.New()
+	mountPoints, err := m.List()
+	if err != nil {
+		glog.Errorf("failed to list mount points: %v", err)
+		return false, err
+	}
+	for _, mountPoint := range mountPoints {
+		if mountPoint.Device != sysfsDevice {
+			continue
+		}
+		// Writable only when 'rw' is the first mount option; otherwise log the entry and stop searching.
+		if len(mountPoint.Opts) > 0 && mountPoint.Opts[0] == permWritable {
+			return true, nil
+		}
+		glog.Errorf("sysfs is not writable: %+v", mountPoint)
+		break
+	}
+	return false, nil
+}
--- a/cmd/kube-proxy/app/server.go
+++ b/cmd/kube-proxy/app/server.go
@@ -299,8 +299,19 @@ func (s *ProxyServer) Run() error {
 	// Tune conntrack, if requested
 	if s.Conntracker != nil {
 		if s.Config.ConntrackMax > 0 {
-			if err := s.Conntracker.SetMax(int(s.Config.ConntrackMax)); err != nil {
-				return err
+			err := s.Conntracker.SetMax(int(s.Config.ConntrackMax))
+			if err != nil {
+				if err != readOnlySysFSError {
+					return err
+				}
+				// readOnlySysFSError is caused by a known docker issue (https://github.com/docker/docker/issues/24000),
+				// the only remediation we know is to restart the docker daemon.
+				// Here we'll send a node event with specific reason and message, the
+				// administrator should decide whether and how to handle this issue,
+				// whether to drain the node and restart docker.
+				// TODO(random-liu): Remove this when the docker bug is fixed.
+				const message = "DOCKER RESTART NEEDED (docker issue #24000): /sys is read-only: can't raise conntrack limits, problems may arise later."
+				s.Recorder.Eventf(s.Config.NodeRef, api.EventTypeWarning, err.Error(), message)
 			}
 		}
 		if s.Config.ConntrackTCPEstablishedTimeout.Duration > 0 {