From dc79cc82ced892e47b5aa5ea7a8edebf1f553877 Mon Sep 17 00:00:00 2001 From: Joe Finney Date: Wed, 28 Sep 2016 09:36:24 -0700 Subject: [PATCH] Make the restart test restart the nodes without a mig rolling update. --- test/e2e/framework/nodes_util.go | 112 ------------------------------- test/e2e/restart.go | 70 ++++++++++--------- 2 files changed, 34 insertions(+), 148 deletions(-) diff --git a/test/e2e/framework/nodes_util.go b/test/e2e/framework/nodes_util.go index 64a90f10ebd..1b706d5926d 100644 --- a/test/e2e/framework/nodes_util.go +++ b/test/e2e/framework/nodes_util.go @@ -94,118 +94,6 @@ func nodeUpgradeGCE(rawV string) error { return err } -// MigRollingUpdate starts a MIG rolling update, upgrading the nodes to a new -// instance template named tmpl, and waits up to nt times the number of nodes -// for it to complete. -func MigRollingUpdate(tmpl string, nt time.Duration) error { - Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl)) - id, err := migRollingUpdateStart(tmpl, nt) - if err != nil { - return fmt.Errorf("couldn't start the MIG rolling update: %v", err) - } - - Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id)) - if err := migRollingUpdatePoll(id, nt); err != nil { - return fmt.Errorf("err waiting until update completed: %v", err) - } - - return nil -} - -// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ -// as the new template, waiting up to nt per node, and returns the ID of that -// update. -func migRollingUpdateStart(templ string, nt time.Duration) (string, error) { - var errLast error - var id string - prefix, suffix := "Started [", "]." - if err := wait.Poll(Poll, SingleCallTimeout, func() (bool, error) { - // TODO(mikedanese): make this hit the compute API directly instead of - // shelling out to gcloud. - // NOTE(mikedanese): If you are changing this gcloud command, update - // cluster/gce/upgrade.sh to match this EXACTLY. 
- // A `rolling-updates start` call outputs what we want to stderr. - _, output, err := retryCmd("gcloud", "alpha", "compute", - "rolling-updates", - fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID), - fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone), - "start", - // Required args. - fmt.Sprintf("--group=%s", TestContext.CloudConfig.NodeInstanceGroup), - fmt.Sprintf("--template=%s", templ), - // Optional args to fine-tune behavior. - fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())), - // NOTE: We can speed up this process by increasing - // --max-num-concurrent-instances. - fmt.Sprintf("--max-num-concurrent-instances=%d", 1), - fmt.Sprintf("--max-num-failed-instances=%d", 0), - fmt.Sprintf("--min-instance-update-time=%ds", 0)) - if err != nil { - errLast = fmt.Errorf("rolling-updates call failed with err: %v", err) - return false, nil - } - - // The 'start' call probably succeeded; parse the output and try to find - // the line that looks like "Started [url/to/<id>]." and return <id>. - for _, line := range strings.Split(output, "\n") { - // As a sanity check, ensure the line starts with prefix and ends - // with suffix. - if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) { - continue - } - url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/") - id = url[len(url)-1] - Logf("Started MIG rolling update; ID: %s", id) - return true, nil - } - errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s", - prefix, suffix, output) - return false, nil - }); err != nil { - return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast) - } - return id, nil -} - -// migRollingUpdatePoll (CKE/GKE-only) polls the progress of the MIG rolling -// update with ID id until it is complete. It returns an error if this takes -// longer than nt times the number of nodes. 
-func migRollingUpdatePoll(id string, nt time.Duration) error { - // Two keys and a val. - status, progress, done := "status", "statusMessage", "ROLLED_OUT" - start, timeout := time.Now(), nt*time.Duration(TestContext.CloudConfig.NumNodes) - var errLast error - Logf("Waiting up to %v for MIG rolling update to complete.", timeout) - if wait.Poll(RestartPoll, timeout, func() (bool, error) { - // A `rolling-updates describe` call outputs what we want to stdout. - output, _, err := retryCmd("gcloud", "alpha", "compute", - "rolling-updates", - fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID), - fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone), - "describe", - id) - if err != nil { - errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err) - Logf("%v", errLast) - return false, nil - } - - // The 'describe' call probably succeeded; parse the output and try to - // find the line that looks like "status: <status>" and see whether it's - // done. - Logf("Waiting for MIG rolling update: %s (%v elapsed)", - ParseKVLines(output, progress), time.Since(start)) - if st := ParseKVLines(output, status); st == done { - return true, nil - } - return false, nil - }) != nil { - return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. 
Last error: %v", timeout, errLast) - } - Logf("MIG rolling update complete after %v", time.Since(start)) - return nil -} - func cleanupNodeUpgradeGCE(tmplBefore string) { Logf("Cleaning up any unused node templates") tmplAfter, err := MigTemplate() diff --git a/test/e2e/restart.go b/test/e2e/restart.go index 683390dafde..32f917738ec 100644 --- a/test/e2e/restart.go +++ b/test/e2e/restart.go @@ -68,7 +68,7 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() { } By("restarting all of the nodes") - err = restartNodes(framework.TestContext.Provider, framework.RestartPerNodeTimeout) + err = restartNodes(f, nodeNamesBefore) Expect(err).NotTo(HaveOccurred()) By("ensuring all nodes are ready after the restart") @@ -125,42 +125,40 @@ func waitForNPods(ps *framework.PodStore, expect int, timeout time.Duration) ([] return podNames, nil } -// restartNodes uses provider to do a restart of all nodes in the cluster, -// allowing up to nt per node. -func restartNodes(provider string, nt time.Duration) error { - switch provider { - case "gce", "gke": - return migRollingUpdateSelf(nt) - default: - return fmt.Errorf("restartNodes(...) not implemented for %s", provider) +func restartNodes(f *framework.Framework, nodeNames []string) error { + // List old boot IDs. + oldBootIDs := make(map[string]string) + for _, name := range nodeNames { + node, err := f.Client.Nodes().Get(name) + if err != nil { + return fmt.Errorf("error getting node info before reboot: %s", err) + } + oldBootIDs[name] = node.Status.NodeInfo.BootID } -} - -// TODO(marekbiskup): Switch this to MIG recreate-instances. This can be done -// with the following bash, but needs to be written in Go: -// -// # Step 1: Get instance names. -// list=$(gcloud compute instance-groups --project=${PROJECT} --zone=${ZONE} instances --group=${GROUP} list) -// i="" -// for l in $list; do -// i="${l##*/},${i}" -// done -// -// # Step 2: Start the recreate. 
-// output=$(gcloud compute instance-groups managed --project=${PROJECT} --zone=${ZONE} recreate-instances ${GROUP} --instance="${i}") -// op=${output##*:} -// -// # Step 3: Wait until it's complete. -// status="" -// while [[ "${status}" != "DONE" ]]; do -// output=$(gcloud compute instance-groups managed --zone="${ZONE}" get-operation ${op} | grep status) -// status=${output##*:} -// done -func migRollingUpdateSelf(nt time.Duration) error { - By("getting the name of the template for the managed instance group") - tmpl, err := framework.MigTemplate() + // Reboot the nodes. + args := []string{ + "compute", + fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID), + "instances", + "reset", + } + args = append(args, nodeNames...) + args = append(args, fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone)) + stdout, stderr, err := framework.RunCmd("gcloud", args...) if err != nil { - return fmt.Errorf("couldn't get MIG template name: %v", err) + return fmt.Errorf("error restarting nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr) } - return framework.MigRollingUpdate(tmpl, nt) + // Wait for their boot IDs to change. + for _, name := range nodeNames { + if err := wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) { + node, err := f.Client.Nodes().Get(name) + if err != nil { + return false, fmt.Errorf("error getting node info after reboot: %s", err) + } + return node.Status.NodeInfo.BootID != oldBootIDs[name], nil + }); err != nil { + return fmt.Errorf("error waiting for node %s boot ID to change: %s", name, err) + } + } + return nil }