Merge pull request #56718 from foxish/fix-network-partition-test-gce-2

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Fix for the network partition tests

Fix https://github.com/kubernetes/kubernetes/issues/56416

The underlying issue was that after cluster upgrade, the nodes talk to the master using the in-cluster IP.
The IPTables rules used for blocking were thus far only effective when the nodes used the external network interface.

Reasoning: 

api-server.log [from gce upgrade cluster](https://storage.googleapis.com/kubernetes-jenkins/logs/ci-kubernetes-e2e-gce-stable1-beta-upgrade-cluster-new/35/artifacts/bootstrap-e2e-master/kube-apiserver.log)

> I1201 13:56:34.287956       5 wrap.go:42] PATCH /api/v1/nodes/bootstrap-e2e-minion-group-hv6p/status: (18.100082ms) 200 [[node-problem-detector/v1.4.0 (linux/amd64) kubernetes/$Format] **10.128.0.4:53766**]
> I1201 13:56:34.287956       5 wrap.go:42] PATCH /api/v1/nodes/bootstrap-e2e-minion-group-hv6p/status: (18.100082ms) 200 [[node-problem-detector/v1.4.0 (linux/amd64) kubernetes/$Format] **10.128.0.4:53766**]
> I1201 13:56:34.515042       5 wrap.go:42] PATCH /api/v1/nodes/bootstrap-e2e-master/status: (4.327563ms) 200 [[kubelet/v1.9.0 (linux/amd64) kubernetes/e067596] **10.128.0.2:41898**]

api-server.log [from gce serial](https://storage.googleapis.com/kubernetes-jenkins/logs/ci-kubernetes-e2e-gce-cos-k8sbeta-serial/70/artifacts/test-34cf3ed1e3-master/kube-apiserver.log)

> I1201 15:59:46.863961       5 wrap.go:42] GET /api/v1/nodes/test-34cf3ed1e3-minion-group-zr99?resourceVersion=0: (926.753µs) 200 [[kubelet/v1.9.0 (linux/amd64) kubernetes/e067596] **104.154.254.154:40220**]
> I1201 15:59:46.881810       5 wrap.go:42] PATCH /api/v1/nodes/test-34cf3ed1e3-minion-group-zr99/status: (10.157704ms) 200 [[kubelet/v1.9.0 (linux/amd64) kubernetes/e067596] **104.154.254.154:40220**]

The underlying issue is one of cluster setup - but we can make the test more resilient with this change.

cc @krzyzacy @spiffxp @enisoc @jberkus @kubernetes/sig-autoscaling-misc
This commit is contained in:
Kubernetes Submit Queue 2017-12-01 15:31:22 -08:00 committed by GitHub
commit a5d2a025b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 3 deletions

View File

@ -935,7 +935,9 @@ func TestHitNodesFromOutsideWithCount(externalIP string, httpPort int32, timeout
// This function executes commands on a node so it will work only for some
// environments.
func TestUnderTemporaryNetworkFailure(c clientset.Interface, ns string, node *v1.Node, testFunc func()) {
host := GetNodeExternalIP(node)
externalIP := GetNodeExternalIP(node)
internalIP := GetNodeInternalIP(node)
master := GetMasterAddress(c)
By(fmt.Sprintf("block network traffic from node %s to the master", node.Name))
defer func() {
@ -944,14 +946,16 @@ func TestUnderTemporaryNetworkFailure(c clientset.Interface, ns string, node *v1
// had been inserted. (yes, we could look at the error code and ssh error
// separately, but I prefer to stay on the safe side).
By(fmt.Sprintf("Unblock network traffic from node %s to the master", node.Name))
UnblockNetwork(host, master)
UnblockNetwork(externalIP, master)
UnblockNetwork(internalIP, master)
}()
Logf("Waiting %v to ensure node %s is ready before beginning test...", resizeNodeReadyTimeout, node.Name)
if !WaitForNodeToBe(c, node.Name, v1.NodeReady, true, resizeNodeReadyTimeout) {
Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
}
BlockNetwork(host, master)
BlockNetwork(externalIP, master)
BlockNetwork(internalIP, master)
Logf("Waiting %v for node %s to be not ready after simulated network failure", resizeNodeNotReadyTimeout, node.Name)
if !WaitForNodeToBe(c, node.Name, v1.NodeReady, false, resizeNodeNotReadyTimeout) {

View File

@ -4925,6 +4925,23 @@ func GetNodeExternalIP(node *v1.Node) string {
return host
}
// GetNodeInternalIP returns node internal IP concatenated with port 22 for ssh
// e.g. 1.2.3.4:22
func GetNodeInternalIP(node *v1.Node) string {
Logf("Getting internal IP address for %s", node.Name)
host := ""
for _, a := range node.Status.Addresses {
if a.Type == v1.NodeInternalIP {
host = net.JoinHostPort(a.Address, sshPort)
break
}
}
if host == "" {
Failf("Couldn't get the internal IP of host %s with addresses %v", node.Name, node.Status.Addresses)
}
return host
}
// SimpleGET executes a get on the given url, returns error if non-200 returned.
func SimpleGET(c *http.Client, url, host string) (string, error) {
req, err := http.NewRequest("GET", url, nil)