From a89c2bdaf13ee3013a266724dd07a5518014caf0 Mon Sep 17 00:00:00 2001
From: Antonio Ojea
Date: Tue, 7 Jan 2020 21:22:09 +0100
Subject: [PATCH] fix flakes on e2e test TCP CLOSE_WAIT timeout

It turns out that the e2e test was not using the timeout meant to hold
the CLOSE_WAIT status, so the test was flaky depending on how fast it
checked the conntrack table.

This PR replaces the dependency on ssh with a pod that checks the
conntrack entries on the host in a loop, making the test more robust
and reducing the flakiness caused by race conditions and/or ssh issues.

It also fixes a bug in the command used to grep the conntrack entry,
where the error was swallowed if no conntrack entry was found.

---
 test/e2e/network/BUILD         |   1 -
 test/e2e/network/kube_proxy.go | 191 +++++++++++++++++++++------------
 2 files changed, 123 insertions(+), 69 deletions(-)

diff --git a/test/e2e/network/BUILD b/test/e2e/network/BUILD
index 10d9af48453..d978c363bc1 100644
--- a/test/e2e/network/BUILD
+++ b/test/e2e/network/BUILD
@@ -80,7 +80,6 @@ go_library(
         "//test/e2e/framework/skipper:go_default_library",
         "//test/e2e/framework/ssh:go_default_library",
         "//test/e2e/network/scale:go_default_library",
-        "//test/images/agnhost/net/nat:go_default_library",
         "//test/utils:go_default_library",
         "//test/utils/image:go_default_library",
         "//vendor/github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud:go_default_library",
diff --git a/test/e2e/network/kube_proxy.go b/test/e2e/network/kube_proxy.go
index 635b4198193..5c455cba5b9 100644
--- a/test/e2e/network/kube_proxy.go
+++ b/test/e2e/network/kube_proxy.go
@@ -18,22 +18,22 @@ package network

 import (
 	"context"
-	"encoding/json"
+	"encoding/hex"
 	"fmt"
 	"math"
+	"net"
 	"strconv"
 	"strings"
 	"time"

 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
-	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-	"k8s.io/kubernetes/test/images/agnhost/net/nat"
 	imageutils "k8s.io/kubernetes/test/utils/image"

 	"github.com/onsi/ginkgo"
@@ -44,15 +44,15 @@ var kubeProxyE2eImage = imageutils.GetE2EImage(imageutils.Agnhost)

 var _ = SIGDescribe("Network", func() {
 	const (
-		testDaemonHTTPPort    = 11301
-		testDaemonTCPPort     = 11302
-		timeoutSeconds        = 10
-		postFinTimeoutSeconds = 5
+		testDaemonHTTPPort     = 11301
+		testDaemonTCPPort      = 11302
+		deadlineTimeoutSeconds = 5
+		postFinTimeoutSeconds  = 15
 	)

 	fr := framework.NewDefaultFramework("network")

-	ginkgo.It("should set TCP CLOSE_WAIT timeout", func() {
+	ginkgo.It("should set TCP CLOSE_WAIT timeout [Privileged]", func() {
 		nodes, err := e2enode.GetBoundedReadySchedulableNodes(fr.ClientSet, 2)
 		framework.ExpectNoError(err)
 		if len(nodes.Items) < 2 {
@@ -83,16 +83,63 @@ var _ = SIGDescribe("Network", func() {

 		zero := int64(0)

+		// Create a pod to check the conntrack entries on the host node.
+		// It mounts the host /proc/net folder to be able to access
+		// the nf_conntrack file with the host conntrack entries.
+		privileged := true
+
+		hostExecPod := &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "e2e-net-exec",
+				Namespace: fr.Namespace.Name,
+				Labels:    map[string]string{"app": "e2e-net-exec"},
+			},
+			Spec: v1.PodSpec{
+				HostNetwork: true,
+				NodeName:    clientNodeInfo.name,
+				Containers: []v1.Container{
+					{
+						Name:            "e2e-net-exec",
+						Image:           kubeProxyE2eImage,
+						ImagePullPolicy: "Always",
+						Args:            []string{"pause"},
+						VolumeMounts:
[]v1.VolumeMount{ + { + Name: "proc-net", + MountPath: "/rootfs/proc/net", + ReadOnly: true, + }, + }, + SecurityContext: &v1.SecurityContext{ + Privileged: &privileged, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "proc-net", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/proc/net", + }, + }, + }, + }, + TerminationGracePeriodSeconds: &zero, + }, + } + fr.PodClient().CreateSync(hostExecPod) + defer fr.PodClient().DeleteSync(hostExecPod.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout) + // Some distributions (Ubuntu 16.04 etc.) don't support the proc file. - _, err = e2essh.IssueSSHCommandWithResult( - "ls /proc/net/nf_conntrack", - framework.TestContext.Provider, - clientNodeInfo.node) + _, err = framework.RunHostCmd(fr.Namespace.Name, "e2e-net-exec", + "ls /rootfs/proc/net/nf_conntrack") if err != nil && strings.Contains(err.Error(), "No such file or directory") { e2eskipper.Skipf("The node %s does not support /proc/net/nf_conntrack", clientNodeInfo.name) } framework.ExpectNoError(err) + // Create the client and server pods clientPodSpec := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "e2e-net-client", @@ -107,7 +154,13 @@ var _ = SIGDescribe("Network", func() { Image: kubeProxyE2eImage, ImagePullPolicy: "Always", Args: []string{ - "net", "--serve", fmt.Sprintf("0.0.0.0:%d", testDaemonHTTPPort), + "net", + "--runner", "nat-closewait-client", + "--options", + fmt.Sprintf(`{"RemoteAddr":"%v", "PostFinTimeoutSeconds":%v, "TimeoutSeconds":%v, "LeakConnection":true}`, + net.JoinHostPort(serverNodeInfo.nodeIP, strconv.Itoa(testDaemonTCPPort)), + postFinTimeoutSeconds, + deadlineTimeoutSeconds), }, }, }, @@ -132,7 +185,7 @@ var _ = SIGDescribe("Network", func() { "net", "--runner", "nat-closewait-server", "--options", - fmt.Sprintf(`{"LocalAddr":"0.0.0.0:%v", "PostFindTimeoutSeconds":%v}`, + fmt.Sprintf(`{"LocalAddr":":%v", "PostFinTimeoutSeconds":%v}`, testDaemonTCPPort, postFinTimeoutSeconds), }, @@ -156,72 +209,55 @@ var _ = SIGDescribe("Network", func() { kubeProxyE2eImage)) fr.PodClient().CreateSync(serverPodSpec) + // The server should be listening before spawning the client pod + <-time.After(time.Duration(2) * time.Second) + // Connect to the server and leak the connection ginkgo.By(fmt.Sprintf( - "Launching a client daemon on node %v (node ip: %v, image: %v)", + "Launching a client connection on node %v (node ip: %v, image: %v)", clientNodeInfo.name, clientNodeInfo.nodeIP, kubeProxyE2eImage)) fr.PodClient().CreateSync(clientPodSpec) - ginkgo.By("Make client connect") - - options := nat.CloseWaitClientOptions{ - RemoteAddr: fmt.Sprintf("%v:%v", - serverNodeInfo.nodeIP, testDaemonTCPPort), - TimeoutSeconds: timeoutSeconds, - PostFinTimeoutSeconds: 0, - LeakConnection: true, - } - - jsonBytes, err := json.Marshal(options) - framework.ExpectNoError(err, "could not marshal") - - cmd := fmt.Sprintf( - `curl -X POST http://localhost:%v/run/nat-closewait-client -d `+ - `'%v' 2>/dev/null`, - testDaemonHTTPPort, - string(jsonBytes)) - framework.RunHostCmdOrDie(fr.Namespace.Name, "e2e-net-client", cmd) - - <-time.After(time.Duration(1) * time.Second) - ginkgo.By("Checking /proc/net/nf_conntrack for the timeout") - // If test flakes occur here, then this check should be performed - // in a loop as there may be a race with the client connecting. 
-		e2essh.IssueSSHCommandWithResult(
-			fmt.Sprintf("sudo cat /proc/net/nf_conntrack | grep 'dport=%v'",
-				testDaemonTCPPort),
-			framework.TestContext.Provider,
-			clientNodeInfo.node)
-
-		// Timeout in seconds is available as the fifth column from
-		// /proc/net/nf_conntrack.
-		result, err := e2essh.IssueSSHCommandWithResult(
-			fmt.Sprintf(
-				"sudo cat /proc/net/nf_conntrack "+
-					"| grep 'CLOSE_WAIT.*dst=%v.*dport=%v' "+
-					"| tail -n 1"+
-					"| awk '{print $5}' ",
-				serverNodeInfo.nodeIP,
-				testDaemonTCPPort),
-			framework.TestContext.Provider,
-			clientNodeInfo.node)
-		framework.ExpectNoError(err)
-
-		timeoutSeconds, err := strconv.Atoi(strings.TrimSpace(result.Stdout))
-		framework.ExpectNoError(err)
-
 		// These must be synchronized from the default values set in
 		// pkg/apis/../defaults.go ConntrackTCPCloseWaitTimeout. The
 		// current defaults are hidden in the initialization code.
 		const epsilonSeconds = 60
 		const expectedTimeoutSeconds = 60 * 60
-
-		framework.Logf("conntrack entry timeout was: %v, expected: %v",
-			timeoutSeconds, expectedTimeoutSeconds)
-
-		gomega.Expect(math.Abs(float64(timeoutSeconds - expectedTimeoutSeconds))).Should(
-			gomega.BeNumerically("<", (epsilonSeconds)))
+		// The conntrack file uses the expanded IPv6 format.
+		ip := fullIPv6(net.ParseIP(serverNodeInfo.nodeIP))
+		// Obtain the corresponding conntrack entry on the host, checking
+		// the nf_conntrack file from the pod e2e-net-exec.
+		// It retries in a loop if the entry is not found.
+		cmd := fmt.Sprintf("cat /rootfs/proc/net/nf_conntrack "+
+			"| grep -m 1 'CLOSE_WAIT.*dst=%v.*dport=%v' ",
+			ip, testDaemonTCPPort)
+		if err := wait.PollImmediate(deadlineTimeoutSeconds*time.Second, postFinTimeoutSeconds*time.Second, func() (bool, error) {
+			result, err := framework.RunHostCmd(fr.Namespace.Name, "e2e-net-exec", cmd)
+			// Retry if we can't obtain the conntrack entry.
+			if err != nil {
+				framework.Logf("failed to obtain conntrack entry: %v %v", result, err)
+				return false, nil
+			}
+			framework.Logf("conntrack entry for node %v and port %v: %v", serverNodeInfo.nodeIP, testDaemonTCPPort, result)
+			// Timeout in seconds is available as the fifth column of
+			// the matched entry in /proc/net/nf_conntrack.
+			line := strings.Fields(result)
+			if len(line) < 5 {
+				return false, fmt.Errorf("conntrack entry does not have a timeout field: %v", line)
+			}
+			timeoutSeconds, err := strconv.Atoi(line[4])
+			if err != nil {
+				return false, fmt.Errorf("failed to convert matched timeout %s to integer: %v", line[4], err)
+			}
+			if math.Abs(float64(timeoutSeconds-expectedTimeoutSeconds)) < epsilonSeconds {
+				return true, nil
+			}
+			return false, fmt.Errorf("wrong TCP CLOSE_WAIT timeout: %v expected: %v", timeoutSeconds, expectedTimeoutSeconds)
+		}); err != nil {
+			framework.Failf("did not find a valid conntrack entry for port %d on node %s: %v", testDaemonTCPPort, serverNodeInfo.nodeIP, err)
+		}
 	})

 	// Regression test for #74839, where:
@@ -338,3 +374,22 @@ var _ = SIGDescribe("Network", func() {
 		}
 	})
 })
+
+// fullIPv6 returns the string representation of the given IP; for IPv6
+// addresses it returns the expanded format used by /proc/net/nf_conntrack.
+// credit https://stackoverflow.com/a/52003106/4532704
+func fullIPv6(ip net.IP) string {
+	if ip.To4() == nil {
+		dst := make([]byte, hex.EncodedLen(len(ip)))
+		_ = hex.Encode(dst, ip)
+		return string(dst[0:4]) + ":" +
+			string(dst[4:8]) + ":" +
+			string(dst[8:12]) + ":" +
+			string(dst[12:16]) + ":" +
+			string(dst[16:20]) + ":" +
+			string(dst[20:24]) + ":" +
+			string(dst[24:28]) + ":" +
+			string(dst[28:])
+	}
+	return ip.String()
+}
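
Note: the retry loop above keys off the layout of /proc/net/nf_conntrack,
where an entry's remaining timeout in seconds is the fifth
whitespace-separated field. A minimal standalone sketch of that parsing
step, run against a fabricated sample entry (all addresses, ports, and
counters below are illustrative, not taken from a real run):

    package main

    import (
    	"fmt"
    	"strconv"
    	"strings"
    )

    // sampleEntry is a fabricated /proc/net/nf_conntrack line; the fifth
    // field (3600 here) is the remaining timeout in seconds.
    const sampleEntry = "ipv4     2 tcp      6 3600 CLOSE_WAIT " +
    	"src=10.240.0.4 dst=10.240.0.5 sport=54321 dport=11302 " +
    	"src=10.240.0.5 dst=10.240.0.4 sport=11302 dport=54321 [ASSURED] use=1"

    // parseConntrackTimeout mirrors the strings.Fields/strconv.Atoi logic
    // the test applies to the first matching CLOSE_WAIT entry.
    func parseConntrackTimeout(entry string) (int, error) {
    	fields := strings.Fields(entry)
    	if len(fields) < 5 {
    		return 0, fmt.Errorf("conntrack entry does not have a timeout field: %v", fields)
    	}
    	return strconv.Atoi(fields[4])
    }

    func main() {
    	timeoutSeconds, err := parseConntrackTimeout(sampleEntry)
    	if err != nil {
    		panic(err)
    	}
    	// Prints 3600, matching the default ConntrackTCPCloseWaitTimeout
    	// of one hour that the test asserts against.
    	fmt.Println(timeoutSeconds)
    }

This is also why the grep pattern expands the server IP first:
nf_conntrack prints IPv6 addresses fully expanded, so
fullIPv6(net.ParseIP("fd00::1")) yields
"fd00:0000:0000:0000:0000:0000:0000:0001" before it is interpolated into
the dst= match.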
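
For reviewers cross-checking the --options payloads: the JSON strings now
built inline correspond to the options structs from
test/images/agnhost/net/nat that the deleted code marshaled. A sketch of
the client-side payload under that assumption (the struct below is a
local stand-in for illustration, and the server IP is made up):

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"net"
    )

    // closeWaitClientOptions is a local stand-in for the
    // nat.CloseWaitClientOptions struct the old code imported from
    // test/images/agnhost/net/nat.
    type closeWaitClientOptions struct {
    	RemoteAddr            string
    	TimeoutSeconds        int
    	PostFinTimeoutSeconds int
    	LeakConnection        bool
    }

    func main() {
    	// Same values the test now interpolates with fmt.Sprintf;
    	// 10.240.0.5 stands in for serverNodeInfo.nodeIP.
    	options := closeWaitClientOptions{
    		RemoteAddr:            net.JoinHostPort("10.240.0.5", "11302"),
    		TimeoutSeconds:        5,  // deadlineTimeoutSeconds
    		PostFinTimeoutSeconds: 15, // postFinTimeoutSeconds
    		LeakConnection:        true,
    	}
    	payload, err := json.Marshal(options)
    	if err != nil {
    		panic(err)
    	}
    	// Prints the same JSON document the test passes via --options.
    	fmt.Println(string(payload))
    }

LeakConnection:true is what parks the client side in CLOSE_WAIT long
enough for the conntrack check above to observe the entry.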