fix flakes on e2e test TCP CLOSE_WAIT timeout

It turns out that the e2e test was not using the timeout meant to
hold the connection in CLOSE_WAIT, so the test was flaky depending
on how fast it checked the conntrack table.

This PR replaces the dependency on ssh with a pod that checks the conntrack
entries on the host in a loop, making the test more robust
and reducing the flakiness due to race conditions and/or ssh issues.
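
The new check boils down to a retry loop run through the host-network pod;
a condensed sketch of the pattern (a fragment from inside the test: fr, ip
and testDaemonTCPPort come from the diff below, checkEntry stands in for
the timeout parsing, and the poll intervals are only illustrative):

    cmd := fmt.Sprintf("cat /rootfs/proc/net/nf_conntrack "+
        "| grep -m 1 'CLOSE_WAIT.*dst=%v.*dport=%v' ", ip, testDaemonTCPPort)
    err := wait.PollImmediate(5*time.Second, 15*time.Second, func() (bool, error) {
        out, err := framework.RunHostCmd(fr.Namespace.Name, "e2e-net-exec", cmd)
        if err != nil {
            // no entry yet (or the exec failed): keep retrying instead of failing
            return false, nil
        }
        // checkEntry stands in for extracting the timeout field and comparing
        // it against the expected CLOSE_WAIT timeout, as done in the diff below
        return checkEntry(out), nil
    })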

It also fixes a bug in the command used to grep the conntrack entry,
where the error was swallowed if no conntrack entry was found.
Antonio Ojea 2020-01-07 21:22:09 +01:00
parent 788a073c79
commit a89c2bdaf1
2 changed files with 123 additions and 69 deletions


@@ -80,7 +80,6 @@ go_library(
"//test/e2e/framework/skipper:go_default_library",
"//test/e2e/framework/ssh:go_default_library",
"//test/e2e/network/scale:go_default_library",
"//test/images/agnhost/net/nat:go_default_library",
"//test/utils:go_default_library",
"//test/utils/image:go_default_library",
"//vendor/github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud:go_default_library",


@@ -18,22 +18,22 @@ package network
import (
"context"
"encoding/json"
"encoding/hex"
"fmt"
"math"
"net"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
"k8s.io/kubernetes/test/images/agnhost/net/nat"
imageutils "k8s.io/kubernetes/test/utils/image"
"github.com/onsi/ginkgo"
@@ -44,15 +44,15 @@ var kubeProxyE2eImage = imageutils.GetE2EImage(imageutils.Agnhost)
var _ = SIGDescribe("Network", func() {
const (
testDaemonHTTPPort = 11301
testDaemonTCPPort = 11302
timeoutSeconds = 10
postFinTimeoutSeconds = 5
testDaemonHTTPPort = 11301
testDaemonTCPPort = 11302
deadlineTimeoutSeconds = 5
postFinTimeoutSeconds = 15
)
fr := framework.NewDefaultFramework("network")
ginkgo.It("should set TCP CLOSE_WAIT timeout", func() {
ginkgo.It("should set TCP CLOSE_WAIT timeout [Privileged]", func() {
nodes, err := e2enode.GetBoundedReadySchedulableNodes(fr.ClientSet, 2)
framework.ExpectNoError(err)
if len(nodes.Items) < 2 {
@@ -83,16 +83,63 @@ var _ = SIGDescribe("Network", func() {
zero := int64(0)
// Create a pod to check the conntrack entries on the host node
// It mounts the host /proc/net folder to be able to access
// the nf_conntrack file with the host conntrack entries
privileged := true
hostExecPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "e2e-net-exec",
Namespace: fr.Namespace.Name,
Labels: map[string]string{"app": "e2e-net-exec"},
},
Spec: v1.PodSpec{
HostNetwork: true,
NodeName: clientNodeInfo.name,
Containers: []v1.Container{
{
Name: "e2e-net-exec",
Image: kubeProxyE2eImage,
ImagePullPolicy: "Always",
Args: []string{"pause"},
VolumeMounts: []v1.VolumeMount{
{
Name: "proc-net",
MountPath: "/rootfs/proc/net",
ReadOnly: true,
},
},
SecurityContext: &v1.SecurityContext{
Privileged: &privileged,
},
},
},
Volumes: []v1.Volume{
{
Name: "proc-net",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/proc/net",
},
},
},
},
TerminationGracePeriodSeconds: &zero,
},
}
fr.PodClient().CreateSync(hostExecPod)
defer fr.PodClient().DeleteSync(hostExecPod.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
// Some distributions (Ubuntu 16.04 etc.) don't support the proc file.
_, err = e2essh.IssueSSHCommandWithResult(
"ls /proc/net/nf_conntrack",
framework.TestContext.Provider,
clientNodeInfo.node)
_, err = framework.RunHostCmd(fr.Namespace.Name, "e2e-net-exec",
"ls /rootfs/proc/net/nf_conntrack")
if err != nil && strings.Contains(err.Error(), "No such file or directory") {
e2eskipper.Skipf("The node %s does not support /proc/net/nf_conntrack", clientNodeInfo.name)
}
framework.ExpectNoError(err)
// Create the client and server pods
clientPodSpec := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "e2e-net-client",
@@ -107,7 +154,13 @@ var _ = SIGDescribe("Network", func() {
Image: kubeProxyE2eImage,
ImagePullPolicy: "Always",
Args: []string{
"net", "--serve", fmt.Sprintf("0.0.0.0:%d", testDaemonHTTPPort),
"net",
"--runner", "nat-closewait-client",
"--options",
fmt.Sprintf(`{"RemoteAddr":"%v", "PostFinTimeoutSeconds":%v, "TimeoutSeconds":%v, "LeakConnection":true}`,
net.JoinHostPort(serverNodeInfo.nodeIP, strconv.Itoa(testDaemonTCPPort)),
postFinTimeoutSeconds,
deadlineTimeoutSeconds),
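// Roughly: LeakConnection keeps the client socket open after the server's
// FIN, so the connection stays in CLOSE_WAIT long enough for the conntrack
// entry to be inspected; the exact option semantics are defined by the
// nat-closewait runners under test/images/agnhost/net/nat.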
},
},
},
@@ -132,7 +185,7 @@ var _ = SIGDescribe("Network", func() {
"net",
"--runner", "nat-closewait-server",
"--options",
fmt.Sprintf(`{"LocalAddr":"0.0.0.0:%v", "PostFindTimeoutSeconds":%v}`,
fmt.Sprintf(`{"LocalAddr":":%v", "PostFinTimeoutSeconds":%v}`,
testDaemonTCPPort,
postFinTimeoutSeconds),
},
@@ -156,72 +209,55 @@ var _ = SIGDescribe("Network", func() {
kubeProxyE2eImage))
fr.PodClient().CreateSync(serverPodSpec)
// The server should be listening before spawning the client pod
<-time.After(time.Duration(2) * time.Second)
// Connect to the server and leak the connection
ginkgo.By(fmt.Sprintf(
"Launching a client daemon on node %v (node ip: %v, image: %v)",
"Launching a client connection on node %v (node ip: %v, image: %v)",
clientNodeInfo.name,
clientNodeInfo.nodeIP,
kubeProxyE2eImage))
fr.PodClient().CreateSync(clientPodSpec)
ginkgo.By("Make client connect")
options := nat.CloseWaitClientOptions{
RemoteAddr: fmt.Sprintf("%v:%v",
serverNodeInfo.nodeIP, testDaemonTCPPort),
TimeoutSeconds: timeoutSeconds,
PostFinTimeoutSeconds: 0,
LeakConnection: true,
}
jsonBytes, err := json.Marshal(options)
framework.ExpectNoError(err, "could not marshal")
cmd := fmt.Sprintf(
`curl -X POST http://localhost:%v/run/nat-closewait-client -d `+
`'%v' 2>/dev/null`,
testDaemonHTTPPort,
string(jsonBytes))
framework.RunHostCmdOrDie(fr.Namespace.Name, "e2e-net-client", cmd)
<-time.After(time.Duration(1) * time.Second)
ginkgo.By("Checking /proc/net/nf_conntrack for the timeout")
// If test flakes occur here, then this check should be performed
// in a loop as there may be a race with the client connecting.
e2essh.IssueSSHCommandWithResult(
fmt.Sprintf("sudo cat /proc/net/nf_conntrack | grep 'dport=%v'",
testDaemonTCPPort),
framework.TestContext.Provider,
clientNodeInfo.node)
// Timeout in seconds is available as the fifth column from
// /proc/net/nf_conntrack.
result, err := e2essh.IssueSSHCommandWithResult(
fmt.Sprintf(
"sudo cat /proc/net/nf_conntrack "+
"| grep 'CLOSE_WAIT.*dst=%v.*dport=%v' "+
"| tail -n 1"+
"| awk '{print $5}' ",
serverNodeInfo.nodeIP,
testDaemonTCPPort),
framework.TestContext.Provider,
clientNodeInfo.node)
framework.ExpectNoError(err)
timeoutSeconds, err := strconv.Atoi(strings.TrimSpace(result.Stdout))
framework.ExpectNoError(err)
// These must be synchronized from the default values set in
// pkg/apis/../defaults.go ConntrackTCPCloseWaitTimeout. The
// current defaults are hidden in the initialization code.
const epsilonSeconds = 60
const expectedTimeoutSeconds = 60 * 60
framework.Logf("conntrack entry timeout was: %v, expected: %v",
timeoutSeconds, expectedTimeoutSeconds)
gomega.Expect(math.Abs(float64(timeoutSeconds - expectedTimeoutSeconds))).Should(
gomega.BeNumerically("<", (epsilonSeconds)))
// the conntrack file uses the IPv6 expanded format
ip := fullIPv6(net.ParseIP(serverNodeInfo.nodeIP))
// Obtain the corresponding conntrack entry on the host by checking
// the nf_conntrack file from the pod e2e-net-exec.
// It retries in a loop if the entry is not found.
cmd := fmt.Sprintf("cat /rootfs/proc/net/nf_conntrack "+
"| grep -m 1 'CLOSE_WAIT.*dst=%v.*dport=%v' ",
ip, testDaemonTCPPort)
if err := wait.PollImmediate(deadlineTimeoutSeconds*time.Second, postFinTimeoutSeconds*time.Second, func() (bool, error) {
result, err := framework.RunHostCmd(fr.Namespace.Name, "e2e-net-exec", cmd)
// retry if we can't obtain the conntrack entry
if err != nil {
framework.Logf("failed to obtain conntrack entry: %v %v", result, err)
return false, nil
}
framework.Logf("conntrack entry for node %v and port %v: %v", serverNodeInfo.nodeIP, testDaemonTCPPort, result)
// Timeout in seconds is available as the fifth column of
// the matched entry in /proc/net/nf_conntrack.
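// An nf_conntrack entry looks roughly like (values purely illustrative):
//   ipv4  2 tcp  6 3597 CLOSE_WAIT src=10.244.1.5 dst=10.244.2.3 sport=43210 dport=11302 ...
// so line[4] below holds the remaining timeout in seconds.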
line := strings.Fields(result)
if len(line) < 5 {
return false, fmt.Errorf("conntrack entry does not have a timeout field: %v", line)
}
timeoutSeconds, err := strconv.Atoi(line[4])
if err != nil {
return false, fmt.Errorf("failed to convert matched timeout %s to integer: %v", line[4], err)
}
if math.Abs(float64(timeoutSeconds-expectedTimeoutSeconds)) < epsilonSeconds {
return true, nil
}
return false, fmt.Errorf("wrong TCP CLOSE_WAIT timeout: %v expected: %v", timeoutSeconds, expectedTimeoutSeconds)
}); err != nil {
framework.Failf("no conntrack entry for port %d on node %s", testDaemonTCPPort, serverNodeInfo.nodeIP)
}
})
// Regression test for #74839, where:
@@ -338,3 +374,22 @@ var _ = SIGDescribe("Network", func() {
}
})
})
// fullIPv6 returns the string representation of the given IP;
// for IPv6 addresses it returns the expanded address format.
// credit https://stackoverflow.com/a/52003106/4532704
func fullIPv6(ip net.IP) string {
if ip.To4() == nil {
dst := make([]byte, hex.EncodedLen(len(ip)))
_ = hex.Encode(dst, ip)
return string(dst[0:4]) + ":" +
string(dst[4:8]) + ":" +
string(dst[8:12]) + ":" +
string(dst[12:16]) + ":" +
string(dst[16:20]) + ":" +
string(dst[20:24]) + ":" +
string(dst[24:28]) + ":" +
string(dst[28:])
}
return ip.String()
}
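
A quick illustration of what fullIPv6 produces (example addresses, not taken
from the test):

    fullIPv6(net.ParseIP("2001:db8::1")) // "2001:0db8:0000:0000:0000:0000:0000:0001", the expanded form nf_conntrack prints
    fullIPv6(net.ParseIP("10.0.0.1"))    // "10.0.0.1", IPv4 addresses are returned unchanged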