From 6ee2b7bc4e2d4ca9e7877c3ee7fedba1b0103eec Mon Sep 17 00:00:00 2001
From: Jeff Lowdermilk
Date: Thu, 26 May 2016 09:42:47 -0700
Subject: [PATCH] Fix some gce-only tests to run on gke as well

DaemonRestart kubelet test, and Services apiserver restart test.
---
 test/e2e/cluster_upgrade.go      | 333 +---------------------------
 test/e2e/daemon_restart.go       |   7 +-
 test/e2e/framework/nodes_util.go | 358 +++++++++++++++++++++++++++++++
 test/e2e/framework/util.go       |  74 ++++++-
 test/e2e/restart.go              |  93 +-------
 test/e2e/service.go              |   5 +-
 6 files changed, 453 insertions(+), 417 deletions(-)
 create mode 100644 test/e2e/framework/nodes_util.go

diff --git a/test/e2e/cluster_upgrade.go b/test/e2e/cluster_upgrade.go
index 66ca5d6ed2a..676309e4637 100644
--- a/test/e2e/cluster_upgrade.go
+++ b/test/e2e/cluster_upgrade.go
@@ -17,14 +17,9 @@ limitations under the License.
 package e2e

 import (
-	"bytes"
 	"fmt"
-	"io"
-	"os"
-	"os/exec"
 	"path"
 	"strings"
-	"time"

 	"k8s.io/kubernetes/pkg/api"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
@@ -47,7 +42,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -63,7 +58,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -77,7 +72,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -93,9 +88,9 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -109,9 +104,9 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -127,7 +122,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 // GKE. See hack/get-build.sh for more information.
func realVersion(s string) (string, error) { framework.Logf(fmt.Sprintf("Getting real version for %q", s)) - v, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "hack/get-build.sh"), "-v", s) + v, _, err := framework.RunCmd(path.Join(framework.TestContext.RepoRoot, "hack/get-build.sh"), "-v", s) if err != nil { return v, err } @@ -135,128 +130,6 @@ func realVersion(s string) (string, error) { return strings.TrimPrefix(strings.TrimSpace(v), "v"), nil } -// The following upgrade functions are passed into the framework below and used -// to do the actual upgrades. -var masterUpgrade = func(v string) error { - switch framework.TestContext.Provider { - case "gce": - return masterUpgradeGCE(v) - case "gke": - return masterUpgradeGKE(v) - default: - return fmt.Errorf("masterUpgrade() is not implemented for provider %s", framework.TestContext.Provider) - } -} - -func masterUpgradeGCE(rawV string) error { - v := "v" + rawV - _, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-M", v) - return err -} - -func masterUpgradeGKE(v string) error { - framework.Logf("Upgrading master to %q", v) - _, _, err := runCmd("gcloud", "container", - fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID), - fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone), - "clusters", - "upgrade", - framework.TestContext.CloudConfig.Cluster, - "--master", - fmt.Sprintf("--cluster-version=%s", v), - "--quiet") - return err -} - -var nodeUpgrade = func(f *framework.Framework, v string) error { - // Perform the upgrade. - var err error - switch framework.TestContext.Provider { - case "gce": - err = nodeUpgradeGCE(v) - case "gke": - err = nodeUpgradeGKE(v) - default: - err = fmt.Errorf("nodeUpgrade() is not implemented for provider %s", framework.TestContext.Provider) - } - if err != nil { - return err - } - - // Wait for it to complete and validate nodes are healthy. - // - // TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in - // GKE; the operation shouldn't return until they all are. - framework.Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout) - if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, framework.TestContext.CloudConfig.NumNodes); err != nil { - return err - } - return nil -} - -func nodeUpgradeGCE(rawV string) error { - // TODO(ihmccreery) This code path should be identical to how a user - // would trigger a node update; right now it's very different. 
- v := "v" + rawV - - framework.Logf("Getting the node template before the upgrade") - tmplBefore, err := migTemplate() - if err != nil { - return fmt.Errorf("error getting the node template before the upgrade: %v", err) - } - - framework.Logf("Preparing node upgrade by creating new instance template for %q", v) - stdout, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-P", v) - if err != nil { - cleanupNodeUpgradeGCE(tmplBefore) - return fmt.Errorf("error preparing node upgrade: %v", err) - } - tmpl := strings.TrimSpace(stdout) - - framework.Logf("Performing a node upgrade to %q; waiting at most %v per node", tmpl, restartPerNodeTimeout) - if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil { - cleanupNodeUpgradeGCE(tmplBefore) - return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err) - } - return nil -} - -func cleanupNodeUpgradeGCE(tmplBefore string) { - framework.Logf("Cleaning up any unused node templates") - tmplAfter, err := migTemplate() - if err != nil { - framework.Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore) - return - } - if tmplBefore == tmplAfter { - // The node upgrade failed so there's no need to delete - // anything. - framework.Logf("Node template %s is still in use; not cleaning up", tmplBefore) - return - } - framework.Logf("Deleting node template %s", tmplBefore) - if _, _, err := retryCmd("gcloud", "compute", "instance-templates", - fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID), - "delete", - tmplBefore); err != nil { - framework.Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err) - framework.Logf("May have leaked instance template %q", tmplBefore) - } -} - -func nodeUpgradeGKE(v string) error { - framework.Logf("Upgrading nodes to %q", v) - _, _, err := runCmd("gcloud", "container", - fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID), - fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone), - "clusters", - "upgrade", - framework.TestContext.CloudConfig.Cluster, - fmt.Sprintf("--cluster-version=%s", v), - "--quiet") - return err -} - func testServiceUpBeforeAndAfter(f *framework.Framework, sem *chaosmonkey.Semaphore) { testService(f, sem, false) } @@ -355,193 +228,3 @@ func checkNodesVersions(c *client.Client, want string) error { } return nil } - -// retryCmd runs cmd using args and retries it for up to framework.SingleCallTimeout if -// it returns an error. It returns stdout and stderr. -func retryCmd(command string, args ...string) (string, string, error) { - var err error - stdout, stderr := "", "" - wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) { - stdout, stderr, err = runCmd(command, args...) - if err != nil { - framework.Logf("Got %v", err) - return false, nil - } - return true, nil - }) - return stdout, stderr, err -} - -// runCmd runs cmd using args and returns its stdout and stderr. It also outputs -// cmd's stdout and stderr to their respective OS streams. -// -// TODO(ihmccreery) This function should either be moved into util.go or -// removed; other e2e's use bare exe.Command. -func runCmd(command string, args ...string) (string, string, error) { - framework.Logf("Running %s %v", command, args) - var bout, berr bytes.Buffer - cmd := exec.Command(command, args...) 
-	// We also output to the OS stdout/stderr to aid in debugging in case cmd
-	// hangs and never returns before the test gets killed.
-	//
-	// This creates some ugly output because gcloud doesn't always provide
-	// newlines.
-	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
-	cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
-	err := cmd.Run()
-	stdout, stderr := bout.String(), berr.String()
-	if err != nil {
-		return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
-			command, args, err, stdout, stderr)
-	}
-	return stdout, stderr, nil
-}
-
-// migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
-// instance template named tmpl, and waits up to nt times the number of nodes
-// for it to complete.
-func migRollingUpdate(tmpl string, nt time.Duration) error {
-	framework.Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
-	id, err := migRollingUpdateStart(tmpl, nt)
-	if err != nil {
-		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
-	}
-
-	framework.Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
-	if err := migRollingUpdatePoll(id, nt); err != nil {
-		return fmt.Errorf("err waiting until update completed: %v", err)
-	}
-
-	return nil
-}
-
-// migTemplate (GCE-only) returns the name of the MIG template that the
-// nodes of the cluster use.
-func migTemplate() (string, error) {
-	var errLast error
-	var templ string
-	key := "instanceTemplate"
-	if wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-		// TODO(mikedanese): make this hit the compute API directly instead of
-		// shelling out to gcloud.
-		// An `instance-groups managed describe` call outputs what we want to stdout.
-		output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			"describe",
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			framework.TestContext.CloudConfig.NodeInstanceGroup)
-		if err != nil {
-			errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
-			return false, nil
-		}
-
-		// The 'describe' call probably succeeded; parse the output and try to
-		// find the line that looks like "instanceTemplate: url/to/<templ>" and
-		// return <templ>.
-		if val := framework.ParseKVLines(output, key); len(val) > 0 {
-			url := strings.Split(val, "/")
-			templ = url[len(url)-1]
-			framework.Logf("MIG group %s using template: %s", framework.TestContext.CloudConfig.NodeInstanceGroup, templ)
-			return true, nil
-		}
-		errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
-		return false, nil
-	}) != nil {
-		return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast)
-	}
-	return templ, nil
-}
-
-// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
-// as the new template, waiting up to nt per node, and returns the ID of that
-// update.
-func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
-	var errLast error
-	var id string
-	prefix, suffix := "Started [", "]."
-	if err := wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-		// TODO(mikedanese): make this hit the compute API directly instead of
-		// shelling out to gcloud.
-		// NOTE(mikedanese): If you are changing this gcloud command, update
-		// cluster/gce/upgrade.sh to match this EXACTLY.
-		// A `rolling-updates start` call outputs what we want to stderr.
-		_, output, err := retryCmd("gcloud", "alpha", "compute",
-			"rolling-updates",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			"start",
-			// Required args.
-			fmt.Sprintf("--group=%s", framework.TestContext.CloudConfig.NodeInstanceGroup),
-			fmt.Sprintf("--template=%s", templ),
-			// Optional args to fine-tune behavior.
-			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
-			// NOTE: We can speed up this process by increasing
-			//       --max-num-concurrent-instances.
-			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
-			fmt.Sprintf("--max-num-failed-instances=%d", 0),
-			fmt.Sprintf("--min-instance-update-time=%ds", 0))
-		if err != nil {
-			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
-			return false, nil
-		}
-
-		// The 'start' call probably succeeded; parse the output and try to find
-		// the line that looks like "Started [url/to/<id>]." and return <id>.
-		for _, line := range strings.Split(output, "\n") {
-			// As a sanity check, ensure the line starts with prefix and ends
-			// with suffix.
-			if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
-				continue
-			}
-			url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
-			id = url[len(url)-1]
-			framework.Logf("Started MIG rolling update; ID: %s", id)
-			return true, nil
-		}
-		errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
-			prefix, suffix, output)
-		return false, nil
-	}); err != nil {
-		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
-	}
-	return id, nil
-}
-
-// migRollingUpdatePoll (CKE/GKE-only) polls the progress of the MIG rolling
-// update with ID id until it is complete. It returns an error if this takes
-// longer than nt times the number of nodes.
-func migRollingUpdatePoll(id string, nt time.Duration) error {
-	// Two keys and a val.
-	status, progress, done := "status", "statusMessage", "ROLLED_OUT"
-	start, timeout := time.Now(), nt*time.Duration(framework.TestContext.CloudConfig.NumNodes)
-	var errLast error
-	framework.Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
-	if wait.Poll(restartPoll, timeout, func() (bool, error) {
-		// A `rolling-updates describe` call outputs what we want to stdout.
-		output, _, err := retryCmd("gcloud", "alpha", "compute",
-			"rolling-updates",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			"describe",
-			id)
-		if err != nil {
-			errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
-			framework.Logf("%v", errLast)
-			return false, nil
-		}
-
-		// The 'describe' call probably succeeded; parse the output and try to
-		// find the line that looks like "status: <status>" and see whether it's
-		// done.
-		framework.Logf("Waiting for MIG rolling update: %s (%v elapsed)",
-			framework.ParseKVLines(output, progress), time.Since(start))
-		if st := framework.ParseKVLines(output, status); st == done {
-			return true, nil
-		}
-		return false, nil
-	}) != nil {
-		return fmt.Errorf("timeout waiting %v for MIG rolling update to complete.
Last error: %v", timeout, errLast) - } - framework.Logf("MIG rolling update complete after %v", time.Since(start)) - return nil -} diff --git a/test/e2e/daemon_restart.go b/test/e2e/daemon_restart.go index 762d6157554..43ef6749c9c 100644 --- a/test/e2e/daemon_restart.go +++ b/test/e2e/daemon_restart.go @@ -199,8 +199,7 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() { BeforeEach(func() { // These tests require SSH - // TODO(11834): Enable this test in GKE once experimental API there is switched on - framework.SkipUnlessProviderIs("gce", "aws") + framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...) ns = f.Namespace.Name // All the restart tests need an rc and a watch on pods of the rc. @@ -252,6 +251,8 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() { It("Controller Manager should not create/delete replicas across restart", func() { + // Requires master ssh access. + framework.SkipUnlessProviderIs("gce", "aws") restarter := NewRestartConfig( framework.GetMasterHost(), "kube-controller", ports.ControllerManagerPort, restartPollInterval, restartTimeout) restarter.restart() @@ -281,6 +282,8 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() { It("Scheduler should continue assigning pods to nodes across restart", func() { + // Requires master ssh access. + framework.SkipUnlessProviderIs("gce", "aws") restarter := NewRestartConfig( framework.GetMasterHost(), "kube-scheduler", ports.SchedulerPort, restartPollInterval, restartTimeout) diff --git a/test/e2e/framework/nodes_util.go b/test/e2e/framework/nodes_util.go new file mode 100644 index 00000000000..e2f9aae82bb --- /dev/null +++ b/test/e2e/framework/nodes_util.go @@ -0,0 +1,358 @@ +/* +Copyright 2014 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package framework + +import ( + "fmt" + "path" + "strings" + "time" + + "k8s.io/kubernetes/pkg/api" + client "k8s.io/kubernetes/pkg/client/unversioned" + "k8s.io/kubernetes/pkg/fields" + "k8s.io/kubernetes/pkg/util/wait" +) + +// The following upgrade functions are passed into the framework below and used +// to do the actual upgrades. 
+var MasterUpgrade = func(v string) error { + switch TestContext.Provider { + case "gce": + return masterUpgradeGCE(v) + case "gke": + return masterUpgradeGKE(v) + default: + return fmt.Errorf("MasterUpgrade() is not implemented for provider %s", TestContext.Provider) + } +} + +func masterUpgradeGCE(rawV string) error { + v := "v" + rawV + _, _, err := RunCmd(path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-M", v) + return err +} + +func masterUpgradeGKE(v string) error { + Logf("Upgrading master to %q", v) + _, _, err := RunCmd("gcloud", "container", + fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID), + fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone), + "clusters", + "upgrade", + TestContext.CloudConfig.Cluster, + "--master", + fmt.Sprintf("--cluster-version=%s", v), + "--quiet") + return err +} + +var NodeUpgrade = func(f *Framework, v string) error { + // Perform the upgrade. + var err error + switch TestContext.Provider { + case "gce": + err = nodeUpgradeGCE(v) + case "gke": + err = nodeUpgradeGKE(v) + default: + err = fmt.Errorf("NodeUpgrade() is not implemented for provider %s", TestContext.Provider) + } + if err != nil { + return err + } + + // Wait for it to complete and validate nodes are healthy. + // + // TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in + // GKE; the operation shouldn't return until they all are. + Logf("Waiting up to %v for all nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout) + if _, err := CheckNodesReady(f.Client, RestartNodeReadyAgainTimeout, TestContext.CloudConfig.NumNodes); err != nil { + return err + } + return nil +} + +func nodeUpgradeGCE(rawV string) error { + // TODO(ihmccreery) This code path should be identical to how a user + // would trigger a node update; right now it's very different. + v := "v" + rawV + + Logf("Getting the node template before the upgrade") + tmplBefore, err := MigTemplate() + if err != nil { + return fmt.Errorf("error getting the node template before the upgrade: %v", err) + } + + Logf("Preparing node upgrade by creating new instance template for %q", v) + stdout, _, err := RunCmd(path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-P", v) + if err != nil { + cleanupNodeUpgradeGCE(tmplBefore) + return fmt.Errorf("error preparing node upgrade: %v", err) + } + tmpl := strings.TrimSpace(stdout) + + Logf("Performing a node upgrade to %q; waiting at most %v per node", tmpl, RestartPerNodeTimeout) + if err := MigRollingUpdate(tmpl, RestartPerNodeTimeout); err != nil { + cleanupNodeUpgradeGCE(tmplBefore) + return fmt.Errorf("error doing node upgrade via a MigRollingUpdate to %s: %v", tmpl, err) + } + return nil +} + +// MigRollingUpdate starts a MIG rolling update, upgrading the nodes to a new +// instance template named tmpl, and waits up to nt times the number of nodes +// for it to complete. +func MigRollingUpdate(tmpl string, nt time.Duration) error { + Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl)) + id, err := migRollingUpdateStart(tmpl, nt) + if err != nil { + return fmt.Errorf("couldn't start the MIG rolling update: %v", err) + } + + Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id)) + if err := migRollingUpdatePoll(id, nt); err != nil { + return fmt.Errorf("err waiting until update completed: %v", err) + } + + return nil +} + +// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ +// as the new template, waiting up to nt per node, and returns the ID of that +// update. 
+func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
+	var errLast error
+	var id string
+	prefix, suffix := "Started [", "]."
+	if err := wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
+		// TODO(mikedanese): make this hit the compute API directly instead of
+		// shelling out to gcloud.
+		// NOTE(mikedanese): If you are changing this gcloud command, update
+		// cluster/gce/upgrade.sh to match this EXACTLY.
+		// A `rolling-updates start` call outputs what we want to stderr.
+		_, output, err := retryCmd("gcloud", "alpha", "compute",
+			"rolling-updates",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			"start",
+			// Required args.
+			fmt.Sprintf("--group=%s", TestContext.CloudConfig.NodeInstanceGroup),
+			fmt.Sprintf("--template=%s", templ),
+			// Optional args to fine-tune behavior.
+			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
+			// NOTE: We can speed up this process by increasing
+			//       --max-num-concurrent-instances.
+			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
+			fmt.Sprintf("--max-num-failed-instances=%d", 0),
+			fmt.Sprintf("--min-instance-update-time=%ds", 0))
+		if err != nil {
+			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
+			return false, nil
+		}
+
+		// The 'start' call probably succeeded; parse the output and try to find
+		// the line that looks like "Started [url/to/<id>]." and return <id>.
+		for _, line := range strings.Split(output, "\n") {
+			// As a sanity check, ensure the line starts with prefix and ends
+			// with suffix.
+			if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
+				continue
+			}
+			url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
+			id = url[len(url)-1]
+			Logf("Started MIG rolling update; ID: %s", id)
+			return true, nil
+		}
+		errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
+			prefix, suffix, output)
+		return false, nil
+	}); err != nil {
+		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
+	}
+	return id, nil
+}
+
+// migRollingUpdatePoll (GCE/GKE-only) polls the progress of the MIG rolling
+// update with ID id until it is complete. It returns an error if this takes
+// longer than nt times the number of nodes.
+func migRollingUpdatePoll(id string, nt time.Duration) error {
+	// Two keys and a val.
+	status, progress, done := "status", "statusMessage", "ROLLED_OUT"
+	start, timeout := time.Now(), nt*time.Duration(TestContext.CloudConfig.NumNodes)
+	var errLast error
+	Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
+	if wait.Poll(RestartPoll, timeout, func() (bool, error) {
+		// A `rolling-updates describe` call outputs what we want to stdout.
+		output, _, err := retryCmd("gcloud", "alpha", "compute",
+			"rolling-updates",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			"describe",
+			id)
+		if err != nil {
+			errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
+			Logf("%v", errLast)
+			return false, nil
+		}
+
+		// The 'describe' call probably succeeded; parse the output and try to
+		// find the line that looks like "status: <status>" and see whether it's
+		// done.
+ Logf("Waiting for MIG rolling update: %s (%v elapsed)", + ParseKVLines(output, progress), time.Since(start)) + if st := ParseKVLines(output, status); st == done { + return true, nil + } + return false, nil + }) != nil { + return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast) + } + Logf("MIG rolling update complete after %v", time.Since(start)) + return nil +} + +func cleanupNodeUpgradeGCE(tmplBefore string) { + Logf("Cleaning up any unused node templates") + tmplAfter, err := MigTemplate() + if err != nil { + Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore) + return + } + if tmplBefore == tmplAfter { + // The node upgrade failed so there's no need to delete + // anything. + Logf("Node template %s is still in use; not cleaning up", tmplBefore) + return + } + Logf("Deleting node template %s", tmplBefore) + if _, _, err := retryCmd("gcloud", "compute", "instance-templates", + fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID), + "delete", + tmplBefore); err != nil { + Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err) + Logf("May have leaked instance template %q", tmplBefore) + } +} + +func nodeUpgradeGKE(v string) error { + Logf("Upgrading nodes to %q", v) + _, _, err := RunCmd("gcloud", "container", + fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID), + fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone), + "clusters", + "upgrade", + TestContext.CloudConfig.Cluster, + fmt.Sprintf("--cluster-version=%s", v), + "--quiet") + return err +} + +// CheckNodesReady waits up to nt for expect nodes accessed by c to be ready, +// returning an error if this doesn't happen in time. It returns the names of +// nodes it finds. +func CheckNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) { + // First, keep getting all of the nodes until we get the number we expect. + var nodeList *api.NodeList + var errLast error + start := time.Now() + found := wait.Poll(Poll, nt, func() (bool, error) { + // A rolling-update (GCE/GKE implementation of restart) can complete before the apiserver + // knows about all of the nodes. Thus, we retry the list nodes call + // until we get the expected number of nodes. + nodeList, errLast = c.Nodes().List(api.ListOptions{ + FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector()}) + if errLast != nil { + return false, nil + } + if len(nodeList.Items) != expect { + errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)", + expect, len(nodeList.Items), time.Since(start)) + Logf("%v", errLast) + return false, nil + } + return true, nil + }) == nil + nodeNames := make([]string, len(nodeList.Items)) + for i, n := range nodeList.Items { + nodeNames[i] = n.ObjectMeta.Name + } + if !found { + return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v", + expect, nt, errLast) + } + Logf("Successfully found %d nodes", expect) + + // Next, ensure in parallel that all the nodes are ready. We subtract the + // time we spent waiting above. + timeout := nt - time.Since(start) + result := make(chan bool, len(nodeList.Items)) + for _, n := range nodeNames { + n := n + go func() { result <- WaitForNodeToBeReady(c, n, timeout) }() + } + failed := false + // TODO(mbforbes): Change to `for range` syntax once we support only Go + // >= 1.4. 
+	for i := range nodeList.Items {
+		_ = i
+		if !<-result {
+			failed = true
+		}
+	}
+	if failed {
+		return nodeNames, fmt.Errorf("at least one node failed to be ready")
+	}
+	return nodeNames, nil
+}
+
+// MigTemplate (GCE-only) returns the name of the MIG template that the
+// nodes of the cluster use.
+func MigTemplate() (string, error) {
+	var errLast error
+	var templ string
+	key := "instanceTemplate"
+	if wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
+		// TODO(mikedanese): make this hit the compute API directly instead of
+		// shelling out to gcloud.
+		// An `instance-groups managed describe` call outputs what we want to stdout.
+		output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			"describe",
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			TestContext.CloudConfig.NodeInstanceGroup)
+		if err != nil {
+			errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
+			return false, nil
+		}
+
+		// The 'describe' call probably succeeded; parse the output and try to
+		// find the line that looks like "instanceTemplate: url/to/<templ>" and
+		// return <templ>.
+		if val := ParseKVLines(output, key); len(val) > 0 {
+			url := strings.Split(val, "/")
+			templ = url[len(url)-1]
+			Logf("MIG group %s using template: %s", TestContext.CloudConfig.NodeInstanceGroup, templ)
+			return true, nil
+		}
+		errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
+		return false, nil
+	}) != nil {
+		return "", fmt.Errorf("MigTemplate() failed with last error: %v", errLast)
+	}
+	return templ, nil
+}
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 1e4f1ab690c..36af6cd797b 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -134,6 +134,22 @@
 	// When these values are updated, also update cmd/kubelet/app/options/options.go
 	currentPodInfraContainerImageName    = "gcr.io/google_containers/pause"
 	currentPodInfraContainerImageVersion = "3.0"
+
+	// How long each node is given during a process that restarts all nodes
+	// before the test is considered failed. (Note that the total time to
+	// restart all nodes will be this number times the number of nodes.)
+	RestartPerNodeTimeout = 5 * time.Minute
+
+	// How often to Poll the status of a restart.
+	RestartPoll = 20 * time.Second
+
+	// How long a node is allowed to become "Ready" after it is restarted before
+	// the test is considered failed.
+	RestartNodeReadyAgainTimeout = 5 * time.Minute
+
+	// How long a pod is allowed to become "running" and "ready" after a node
+	// restart before test is considered failed.
+	RestartPodReadyAgainTimeout = 5 * time.Minute
 )

 // Label allocated to the image puller static pod that runs on each node
@@ -3505,13 +3521,29 @@ func RestartKubeProxy(host string) error {
 	return nil
 }

-func RestartApiserver() error {
+func RestartApiserver(c *client.Client) error {
 	// TODO: Make it work for all providers.
 	if !ProviderIs("gce", "gke", "aws") {
 		return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
 	}
+	if ProviderIs("gce", "aws") {
+		return sshRestartMaster()
+	}
+	// GKE doesn't allow ssh access, so use a same-version master
+	// upgrade to tear down/recreate master.
+ v, err := c.ServerVersion() + if err != nil { + return err + } + return masterUpgradeGKE(v.GitVersion[1:]) // strip leading 'v' +} + +func sshRestartMaster() error { + if !ProviderIs("gce", "aws") { + return fmt.Errorf("unsupported provider: %s", TestContext.Provider) + } var command string - if ProviderIs("gce", "gke") { + if ProviderIs("gce") { command = "sudo docker ps | grep /kube-apiserver | cut -d ' ' -f 1 | xargs sudo docker kill" } else { command = "sudo /etc/init.d/kube-apiserver restart" @@ -4127,3 +4159,41 @@ func GetPodsInNamespace(c *client.Client, ns string, ignoreLabels map[string]str } return filtered, nil } + +// RunCmd runs cmd using args and returns its stdout and stderr. It also outputs +// cmd's stdout and stderr to their respective OS streams. +func RunCmd(command string, args ...string) (string, string, error) { + Logf("Running %s %v", command, args) + var bout, berr bytes.Buffer + cmd := exec.Command(command, args...) + // We also output to the OS stdout/stderr to aid in debugging in case cmd + // hangs and never returns before the test gets killed. + // + // This creates some ugly output because gcloud doesn't always provide + // newlines. + cmd.Stdout = io.MultiWriter(os.Stdout, &bout) + cmd.Stderr = io.MultiWriter(os.Stderr, &berr) + err := cmd.Run() + stdout, stderr := bout.String(), berr.String() + if err != nil { + return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q", + command, args, err, stdout, stderr) + } + return stdout, stderr, nil +} + +// retryCmd runs cmd using args and retries it for up to SingleCallTimeout if +// it returns an error. It returns stdout and stderr. +func retryCmd(command string, args ...string) (string, string, error) { + var err error + stdout, stderr := "", "" + wait.Poll(Poll, SingleCallTimeout, func() (bool, error) { + stdout, stderr, err = RunCmd(command, args...) + if err != nil { + Logf("Got %v", err) + return false, nil + } + return true, nil + }) + return stdout, stderr, err +} diff --git a/test/e2e/restart.go b/test/e2e/restart.go index 57b50ec86fc..c216fd1c902 100644 --- a/test/e2e/restart.go +++ b/test/e2e/restart.go @@ -21,7 +21,6 @@ import ( "time" "k8s.io/kubernetes/pkg/api" - client "k8s.io/kubernetes/pkg/client/unversioned" "k8s.io/kubernetes/pkg/fields" "k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/pkg/util/wait" @@ -31,24 +30,6 @@ import ( . "github.com/onsi/gomega" ) -const ( - // How long each node is given during a process that restarts all nodes - // before the test is considered failed. (Note that the total time to - // restart all nodes will be this number times the number of nodes.) - restartPerNodeTimeout = 5 * time.Minute - - // How often to framework.Poll the statues of a restart. - restartPoll = 20 * time.Second - - // How long a node is allowed to become "Ready" after it is restarted before - // the test is considered failed. - restartNodeReadyAgainTimeout = 5 * time.Minute - - // How long a pod is allowed to become "running" and "ready" after a node - // restart before test is considered failed. 
- restartPodReadyAgainTimeout = 5 * time.Minute -) - var _ = framework.KubeDescribe("Restart [Disruptive]", func() { f := framework.NewDefaultFramework("restart") var ps *framework.PodStore @@ -71,7 +52,7 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() { nn := framework.TestContext.CloudConfig.NumNodes By("ensuring all nodes are ready") - nodeNamesBefore, err := checkNodesReady(f.Client, framework.NodeReadyInitialTimeout, nn) + nodeNamesBefore, err := framework.CheckNodesReady(f.Client, framework.NodeReadyInitialTimeout, nn) Expect(err).NotTo(HaveOccurred()) framework.Logf("Got the following nodes before restart: %v", nodeNamesBefore) @@ -87,11 +68,11 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() { } By("restarting all of the nodes") - err = restartNodes(framework.TestContext.Provider, restartPerNodeTimeout) + err = restartNodes(framework.TestContext.Provider, framework.RestartPerNodeTimeout) Expect(err).NotTo(HaveOccurred()) By("ensuring all nodes are ready after the restart") - nodeNamesAfter, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, nn) + nodeNamesAfter, err := framework.CheckNodesReady(f.Client, framework.RestartNodeReadyAgainTimeout, nn) Expect(err).NotTo(HaveOccurred()) framework.Logf("Got the following nodes after restart: %v", nodeNamesAfter) @@ -108,10 +89,10 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() { // across node restarts. By("ensuring the same number of pods are running and ready after restart") podCheckStart := time.Now() - podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout) + podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), framework.RestartPodReadyAgainTimeout) Expect(err).NotTo(HaveOccurred()) - remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart) - if !framework.CheckPodsRunningReadyOrSucceeded(f.Client, ns, podNamesAfter, remaining) { + remaining := framework.RestartPodReadyAgainTimeout - time.Since(podCheckStart) + if !framework.CheckPodsRunningReady(f.Client, ns, podNamesAfter, remaining) { framework.Failf("At least one pod wasn't running and ready after the restart.") } }) @@ -144,64 +125,6 @@ func waitForNPods(ps *framework.PodStore, expect int, timeout time.Duration) ([] return podNames, nil } -// checkNodesReady waits up to nt for expect nodes accessed by c to be ready, -// returning an error if this doesn't happen in time. It returns the names of -// nodes it finds. -func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) { - // First, keep getting all of the nodes until we get the number we expect. - var nodeList *api.NodeList - var errLast error - start := time.Now() - found := wait.Poll(framework.Poll, nt, func() (bool, error) { - // A rolling-update (GCE/GKE implementation of restart) can complete before the apiserver - // knows about all of the nodes. Thus, we retry the list nodes call - // until we get the expected number of nodes. 
- nodeList, errLast = c.Nodes().List(api.ListOptions{ - FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector()}) - if errLast != nil { - return false, nil - } - if len(nodeList.Items) != expect { - errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)", - expect, len(nodeList.Items), time.Since(start)) - framework.Logf("%v", errLast) - return false, nil - } - return true, nil - }) == nil - nodeNames := make([]string, len(nodeList.Items)) - for i, n := range nodeList.Items { - nodeNames[i] = n.ObjectMeta.Name - } - if !found { - return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v", - expect, nt, errLast) - } - framework.Logf("Successfully found %d nodes", expect) - - // Next, ensure in parallel that all the nodes are ready. We subtract the - // time we spent waiting above. - timeout := nt - time.Since(start) - result := make(chan bool, len(nodeList.Items)) - for _, n := range nodeNames { - n := n - go func() { result <- framework.WaitForNodeToBeReady(c, n, timeout) }() - } - failed := false - // TODO(mbforbes): Change to `for range` syntax once we support only Go - // >= 1.4. - for i := range nodeList.Items { - _ = i - if !<-result { - failed = true - } - } - if failed { - return nodeNames, fmt.Errorf("at least one node failed to be ready") - } - return nodeNames, nil -} - // restartNodes uses provider to do a restart of all nodes in the cluster, // allowing up to nt per node. func restartNodes(provider string, nt time.Duration) error { @@ -235,9 +158,9 @@ func restartNodes(provider string, nt time.Duration) error { // done func migRollingUpdateSelf(nt time.Duration) error { By("getting the name of the template for the managed instance group") - tmpl, err := migTemplate() + tmpl, err := framework.MigTemplate() if err != nil { return fmt.Errorf("couldn't get MIG template name: %v", err) } - return migRollingUpdate(tmpl, nt) + return framework.MigRollingUpdate(tmpl, nt) } diff --git a/test/e2e/service.go b/test/e2e/service.go index a87d4e75dd7..f121d5a7043 100644 --- a/test/e2e/service.go +++ b/test/e2e/service.go @@ -331,8 +331,7 @@ var _ = framework.KubeDescribe("Services", func() { It("should work after restarting apiserver [Disruptive]", func() { // TODO: use the ServiceTestJig here - // TODO: framework.RestartApiserver doesn't work in GKE - fix it and reenable this test. - framework.SkipUnlessProviderIs("gce") + framework.SkipUnlessProviderIs("gce", "gke") ns := f.Namespace.Name numPods, servicePort := 3, 80 @@ -351,7 +350,7 @@ var _ = framework.KubeDescribe("Services", func() { framework.ExpectNoError(verifyServeHostnameServiceUp(c, ns, host, podNames1, svc1IP, servicePort)) // Restart apiserver - if err := framework.RestartApiserver(); err != nil { + if err := framework.RestartApiserver(c); err != nil { framework.Failf("error restarting apiserver: %v", err) } if err := framework.WaitForApiserverUp(c); err != nil {
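
Notes (reviewer sketches; not part of the diff above — any name, value, or sample output not present in the patch is an illustrative assumption).

migRollingUpdateStart recovers the rolling-update ID by scanning gcloud's stderr for a line shaped like "Started [url/to/<id>]." and keeping the last path segment of the URL. A self-contained sketch of that parse; the sample URL is hypothetical, and HasPrefix/HasSuffix stand in for the patch's strings.Index checks:

```go
package main

import (
	"fmt"
	"strings"
)

// parseUpdateID mirrors the parsing in migRollingUpdateStart: find a line of
// the form "Started [<url>]." and return the last path segment of <url>.
func parseUpdateID(output string) (string, bool) {
	prefix, suffix := "Started [", "]."
	for _, line := range strings.Split(output, "\n") {
		if !strings.HasPrefix(line, prefix) || !strings.HasSuffix(line, suffix) {
			continue
		}
		url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
		return url[len(url)-1], true
	}
	return "", false
}

func main() {
	// Hypothetical stderr line; the real URL shape depends on the gcloud release.
	out := "Started [https://www.googleapis.com/replicapool/v1beta2/projects/p/zones/z/rollingUpdates/abcd-1234]."
	if id, ok := parseUpdateID(out); ok {
		fmt.Println(id) // abcd-1234
	}
}
```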
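MigTemplate and migRollingUpdatePoll both lean on framework.ParseKVLines, whose implementation is not part of this patch. Judging from the call sites ("instanceTemplate: url/to/<templ>", "status: <status>"), it maps a key to the value of the first matching "key: value" line; a minimal sketch under that assumption:

```go
package main

import (
	"fmt"
	"strings"
)

// parseKVLines sketches the behavior this patch assumes of
// framework.ParseKVLines: return the value of the first "key: value" line
// whose key matches, or "" if none does.
func parseKVLines(output, key string) string {
	for _, line := range strings.Split(output, "\n") {
		fields := strings.SplitN(strings.TrimSpace(line), ":", 2)
		if len(fields) == 2 && strings.TrimSpace(fields[0]) == key {
			return strings.TrimSpace(fields[1])
		}
	}
	return ""
}

func main() {
	// Hypothetical `rolling-updates describe` output.
	out := "kind: replicapoolupdater#rollingUpdate\nstatus: ROLLED_OUT"
	fmt.Println(parseKVLines(out, "status")) // ROLLED_OUT
}
```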
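migRollingUpdatePoll budgets nt per node and scales the overall deadline by cluster size, so the wait.Poll ceiling grows with TestContext.CloudConfig.NumNodes. A worked example using the 5-minute RestartPerNodeTimeout this patch adds to util.go and a hypothetical 3-node cluster:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Deadline arithmetic from migRollingUpdatePoll; 3 nodes is hypothetical,
	// the real value comes from TestContext.CloudConfig.NumNodes.
	nt := 5 * time.Minute // RestartPerNodeTimeout
	timeout := nt * time.Duration(3)
	fmt.Println(timeout) // prints "15m0s"
}
```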
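CheckNodesReady fans the per-node readiness checks out to one goroutine each and drains a buffered channel of results; the `for i := range ... { _ = i ... }` collector exists only because the tree still supports Go < 1.4 (see the TODO(mbforbes) comment). A runnable sketch of the same fan-out using the `for range` form the TODO asks for; checkReady stands in for framework.WaitForNodeToBeReady:

```go
package main

import "fmt"

// checkReady stands in for framework.WaitForNodeToBeReady.
func checkReady(name string) bool { return true }

func main() {
	nodes := []string{"node-a", "node-b", "node-c"} // hypothetical node names
	result := make(chan bool, len(nodes))           // buffered so no sender blocks
	for _, n := range nodes {
		n := n // capture the loop variable for the goroutine
		go func() { result <- checkReady(n) }()
	}
	failed := false
	for range nodes { // receive exactly one result per node
		if !<-result {
			failed = true
		}
	}
	fmt.Println("any node failed:", failed)
}
```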
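RunCmd's io.MultiWriter setup is what lets the suite both stream gcloud output live (useful when a command hangs and the test gets killed) and hand the captured text back to callers like MigTemplate for parsing. The pattern in isolation:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"os/exec"
)

func main() {
	// Tee the child's stdout: os.Stdout sees it live, bout keeps a copy.
	var bout bytes.Buffer
	cmd := exec.Command("echo", "hello") // any command; echo assumed available
	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, "run failed:", err)
		return
	}
	fmt.Printf("captured: %q\n", bout.String())
}
```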
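RestartApiserver now takes a client because the GKE path has no SSH route to the master: it reads the server's GitVersion (a string like "v1.3.0", hence the [1:] slice to drop the leading "v") and drives masterUpgradeGKE at the same version, tearing down and recreating the master. Tests consume it the way the updated service.go does:

```go
// Usage mirroring the updated Services test; c is the test's *client.Client.
if err := framework.RestartApiserver(c); err != nil {
	framework.Failf("error restarting apiserver: %v", err)
}
if err := framework.WaitForApiserverUp(c); err != nil {
	framework.Failf("error waiting for apiserver up: %v", err)
}
```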
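Finally, the daemon_restart change separates two distinct requirements: the suite-wide BeforeEach now skips based on framework.ProvidersWithSSH (node-level SSH), while the controller-manager and scheduler cases keep an explicit "gce", "aws" guard because those two also need SSH to the master, which GKE does not allow. ProvidersWithSSH is defined elsewhere in the framework; assuming it is a plain string slice, the guard is just a variadic expansion:

```go
// Hypothetical shape of the framework variable (its exact contents are not
// shown in this patch):
var ProvidersWithSSH = []string{"gce", "gke", "aws"}

// The BeforeEach guard then expands the slice into the variadic call:
//   framework.SkipUnlessProviderIs(ProvidersWithSSH...)
```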