Merge pull request #26360 from jlowdermilk/skip-audit

Automatic merge from submit-queue Fix some gce-only tests to run on gke as well Enable "Services should work after restarting apiserver [Disruptive]" and DaemonRestart tests, except the 2 that require master ssh access. Move restart/upgrade related test helpers into their own file in framework package.
2025-09-14 21:53:52 +00:00 · 2016-05-31 10:01:26 -07:00
parent 38181bb3fb 6ee2b7bc4e
commit 5762ebfc63
6 changed files with 453 additions and 417 deletions
--- a/test/e2e/cluster_upgrade.go
+++ b/test/e2e/cluster_upgrade.go
@@ -17,14 +17,9 @@ limitations under the License.
 package e2e

 import (
-	"bytes"
 	"fmt"
-	"io"
-	"os"
-	"os/exec"
 	"path"
 	"strings"
-	"time"

 	"k8s.io/kubernetes/pkg/api"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
@@ -47,7 +42,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -63,7 +58,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -77,7 +72,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -93,9 +88,9 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -109,9 +104,9 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 			cm := chaosmonkey.New(func() {
 				v, err := realVersion(framework.TestContext.UpgradeTarget)
 				framework.ExpectNoError(err)
-				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(framework.MasterUpgrade(v))
 				framework.ExpectNoError(checkMasterVersion(f.Client, v))
-				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(framework.NodeUpgrade(f, v))
 				framework.ExpectNoError(checkNodesVersions(f.Client, v))
 			})
 			cm.Register(func(sem *chaosmonkey.Semaphore) {
@@ -127,7 +122,7 @@ var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
 // GKE.  See hack/get-build.sh for more information.
 func realVersion(s string) (string, error) {
 	framework.Logf(fmt.Sprintf("Getting real version for %q", s))
-	v, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "hack/get-build.sh"), "-v", s)
+	v, _, err := framework.RunCmd(path.Join(framework.TestContext.RepoRoot, "hack/get-build.sh"), "-v", s)
 	if err != nil {
 		return v, err
 	}
@@ -135,128 +130,6 @@ func realVersion(s string) (string, error) {
 	return strings.TrimPrefix(strings.TrimSpace(v), "v"), nil
 }

-// The following upgrade functions are passed into the framework below and used
-// to do the actual upgrades.
-var masterUpgrade = func(v string) error {
-	switch framework.TestContext.Provider {
-	case "gce":
-		return masterUpgradeGCE(v)
-	case "gke":
-		return masterUpgradeGKE(v)
-	default:
-		return fmt.Errorf("masterUpgrade() is not implemented for provider %s", framework.TestContext.Provider)
-	}
-}
-
-func masterUpgradeGCE(rawV string) error {
-	v := "v" + rawV
-	_, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-M", v)
-	return err
-}
-
-func masterUpgradeGKE(v string) error {
-	framework.Logf("Upgrading master to %q", v)
-	_, _, err := runCmd("gcloud", "container",
-		fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-		fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-		"clusters",
-		"upgrade",
-		framework.TestContext.CloudConfig.Cluster,
-		"--master",
-		fmt.Sprintf("--cluster-version=%s", v),
-		"--quiet")
-	return err
-}
-
-var nodeUpgrade = func(f *framework.Framework, v string) error {
-	// Perform the upgrade.
-	var err error
-	switch framework.TestContext.Provider {
-	case "gce":
-		err = nodeUpgradeGCE(v)
-	case "gke":
-		err = nodeUpgradeGKE(v)
-	default:
-		err = fmt.Errorf("nodeUpgrade() is not implemented for provider %s", framework.TestContext.Provider)
-	}
-	if err != nil {
-		return err
-	}
-
-	// Wait for it to complete and validate nodes are healthy.
-	//
-	// TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
-	// GKE; the operation shouldn't return until they all are.
-	framework.Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout)
-	if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, framework.TestContext.CloudConfig.NumNodes); err != nil {
-		return err
-	}
-	return nil
-}
-
-func nodeUpgradeGCE(rawV string) error {
-	// TODO(ihmccreery) This code path should be identical to how a user
-	// would trigger a node update; right now it's very different.
-	v := "v" + rawV
-
-	framework.Logf("Getting the node template before the upgrade")
-	tmplBefore, err := migTemplate()
-	if err != nil {
-		return fmt.Errorf("error getting the node template before the upgrade: %v", err)
-	}
-
-	framework.Logf("Preparing node upgrade by creating new instance template for %q", v)
-	stdout, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-P", v)
-	if err != nil {
-		cleanupNodeUpgradeGCE(tmplBefore)
-		return fmt.Errorf("error preparing node upgrade: %v", err)
-	}
-	tmpl := strings.TrimSpace(stdout)
-
-	framework.Logf("Performing a node upgrade to %q; waiting at most %v per node", tmpl, restartPerNodeTimeout)
-	if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil {
-		cleanupNodeUpgradeGCE(tmplBefore)
-		return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err)
-	}
-	return nil
-}
-
-func cleanupNodeUpgradeGCE(tmplBefore string) {
-	framework.Logf("Cleaning up any unused node templates")
-	tmplAfter, err := migTemplate()
-	if err != nil {
-		framework.Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore)
-		return
-	}
-	if tmplBefore == tmplAfter {
-		// The node upgrade failed so there's no need to delete
-		// anything.
-		framework.Logf("Node template %s is still in use; not cleaning up", tmplBefore)
-		return
-	}
-	framework.Logf("Deleting node template %s", tmplBefore)
-	if _, _, err := retryCmd("gcloud", "compute", "instance-templates",
-		fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-		"delete",
-		tmplBefore); err != nil {
-		framework.Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err)
-		framework.Logf("May have leaked instance template %q", tmplBefore)
-	}
-}
-
-func nodeUpgradeGKE(v string) error {
-	framework.Logf("Upgrading nodes to %q", v)
-	_, _, err := runCmd("gcloud", "container",
-		fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-		fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-		"clusters",
-		"upgrade",
-		framework.TestContext.CloudConfig.Cluster,
-		fmt.Sprintf("--cluster-version=%s", v),
-		"--quiet")
-	return err
-}
-
 func testServiceUpBeforeAndAfter(f *framework.Framework, sem *chaosmonkey.Semaphore) {
 	testService(f, sem, false)
 }
@@ -355,193 +228,3 @@ func checkNodesVersions(c *client.Client, want string) error {
 	}
 	return nil
 }
-
-// retryCmd runs cmd using args and retries it for up to framework.SingleCallTimeout if
-// it returns an error. It returns stdout and stderr.
-func retryCmd(command string, args ...string) (string, string, error) {
-	var err error
-	stdout, stderr := "", ""
-	wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-		stdout, stderr, err = runCmd(command, args...)
-		if err != nil {
-			framework.Logf("Got %v", err)
-			return false, nil
-		}
-		return true, nil
-	})
-	return stdout, stderr, err
-}
-
-// runCmd runs cmd using args and returns its stdout and stderr. It also outputs
-// cmd's stdout and stderr to their respective OS streams.
-//
-// TODO(ihmccreery) This function should either be moved into util.go or
-// removed; other e2e's use bare exe.Command.
-func runCmd(command string, args ...string) (string, string, error) {
-	framework.Logf("Running %s %v", command, args)
-	var bout, berr bytes.Buffer
-	cmd := exec.Command(command, args...)
-	// We also output to the OS stdout/stderr to aid in debugging in case cmd
-	// hangs and never returns before the test gets killed.
-	//
-	// This creates some ugly output because gcloud doesn't always provide
-	// newlines.
-	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
-	cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
-	err := cmd.Run()
-	stdout, stderr := bout.String(), berr.String()
-	if err != nil {
-		return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
-			command, args, err, stdout, stderr)
-	}
-	return stdout, stderr, nil
-}
-
-// migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
-// instance template named tmpl, and waits up to nt times the number of nodes
-// for it to complete.
-func migRollingUpdate(tmpl string, nt time.Duration) error {
-	framework.Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
-	id, err := migRollingUpdateStart(tmpl, nt)
-	if err != nil {
-		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
-	}
-
-	framework.Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
-	if err := migRollingUpdatePoll(id, nt); err != nil {
-		return fmt.Errorf("err waiting until update completed: %v", err)
-	}
-
-	return nil
-}
-
-// migTemplate (GCE-only) returns the name of the MIG template that the
-// nodes of the cluster use.
-func migTemplate() (string, error) {
-	var errLast error
-	var templ string
-	key := "instanceTemplate"
-	if wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-		// TODO(mikedanese): make this hit the compute API directly instead of
-		// shelling out to gcloud.
-		// An `instance-groups managed describe` call outputs what we want to stdout.
-		output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			"describe",
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			framework.TestContext.CloudConfig.NodeInstanceGroup)
-		if err != nil {
-			errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
-			return false, nil
-		}
-
-		// The 'describe' call probably succeeded; parse the output and try to
-		// find the line that looks like "instanceTemplate: url/to/<templ>" and
-		// return <templ>.
-		if val := framework.ParseKVLines(output, key); len(val) > 0 {
-			url := strings.Split(val, "/")
-			templ = url[len(url)-1]
-			framework.Logf("MIG group %s using template: %s", framework.TestContext.CloudConfig.NodeInstanceGroup, templ)
-			return true, nil
-		}
-		errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
-		return false, nil
-	}) != nil {
-		return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast)
-	}
-	return templ, nil
-}
-
-// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
-// as the new template, waiting up to nt per node, and returns the ID of that
-// update.
-func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
-	var errLast error
-	var id string
-	prefix, suffix := "Started [", "]."
-	if err := wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-		// TODO(mikedanese): make this hit the compute API directly instead of
-		//                 shelling out to gcloud.
-		// NOTE(mikedanese): If you are changing this gcloud command, update
-		//                 cluster/gce/upgrade.sh to match this EXACTLY.
-		// A `rolling-updates start` call outputs what we want to stderr.
-		_, output, err := retryCmd("gcloud", "alpha", "compute",
-			"rolling-updates",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			"start",
-			// Required args.
-			fmt.Sprintf("--group=%s", framework.TestContext.CloudConfig.NodeInstanceGroup),
-			fmt.Sprintf("--template=%s", templ),
-			// Optional args to fine-tune behavior.
-			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
-			// NOTE: We can speed up this process by increasing
-			//       --max-num-concurrent-instances.
-			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
-			fmt.Sprintf("--max-num-failed-instances=%d", 0),
-			fmt.Sprintf("--min-instance-update-time=%ds", 0))
-		if err != nil {
-			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
-			return false, nil
-		}
-
-		// The 'start' call probably succeeded; parse the output and try to find
-		// the line that looks like "Started [url/to/<id>]." and return <id>.
-		for _, line := range strings.Split(output, "\n") {
-			// As a sanity check, ensure the line starts with prefix and ends
-			// with suffix.
-			if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
-				continue
-			}
-			url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
-			id = url[len(url)-1]
-			framework.Logf("Started MIG rolling update; ID: %s", id)
-			return true, nil
-		}
-		errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
-			prefix, suffix, output)
-		return false, nil
-	}); err != nil {
-		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
-	}
-	return id, nil
-}
-
-// migRollingUpdatePoll (CKE/GKE-only) polls the progress of the MIG rolling
-// update with ID id until it is complete. It returns an error if this takes
-// longer than nt times the number of nodes.
-func migRollingUpdatePoll(id string, nt time.Duration) error {
-	// Two keys and a val.
-	status, progress, done := "status", "statusMessage", "ROLLED_OUT"
-	start, timeout := time.Now(), nt*time.Duration(framework.TestContext.CloudConfig.NumNodes)
-	var errLast error
-	framework.Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
-	if wait.Poll(restartPoll, timeout, func() (bool, error) {
-		// A `rolling-updates describe` call outputs what we want to stdout.
-		output, _, err := retryCmd("gcloud", "alpha", "compute",
-			"rolling-updates",
-			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-			fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone),
-			"describe",
-			id)
-		if err != nil {
-			errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
-			framework.Logf("%v", errLast)
-			return false, nil
-		}
-
-		// The 'describe' call probably succeeded; parse the output and try to
-		// find the line that looks like "status: <status>" and see whether it's
-		// done.
-		framework.Logf("Waiting for MIG rolling update: %s (%v elapsed)",
-			framework.ParseKVLines(output, progress), time.Since(start))
-		if st := framework.ParseKVLines(output, status); st == done {
-			return true, nil
-		}
-		return false, nil
-	}) != nil {
-		return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
-	}
-	framework.Logf("MIG rolling update complete after %v", time.Since(start))
-	return nil
-}
--- a/test/e2e/daemon_restart.go
+++ b/test/e2e/daemon_restart.go
@@ -199,8 +199,7 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() {

 	BeforeEach(func() {
 		// These tests require SSH
-		// TODO(11834): Enable this test in GKE once experimental API there is switched on
-		framework.SkipUnlessProviderIs("gce", "aws")
+		framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
 		ns = f.Namespace.Name

 		// All the restart tests need an rc and a watch on pods of the rc.
@@ -252,6 +251,8 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() {

 	It("Controller Manager should not create/delete replicas across restart", func() {

+		// Requires master ssh access.
+		framework.SkipUnlessProviderIs("gce", "aws")
 		restarter := NewRestartConfig(
 			framework.GetMasterHost(), "kube-controller", ports.ControllerManagerPort, restartPollInterval, restartTimeout)
 		restarter.restart()
@@ -281,6 +282,8 @@ var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() {

 	It("Scheduler should continue assigning pods to nodes across restart", func() {

+		// Requires master ssh access.
+		framework.SkipUnlessProviderIs("gce", "aws")
 		restarter := NewRestartConfig(
 			framework.GetMasterHost(), "kube-scheduler", ports.SchedulerPort, restartPollInterval, restartTimeout)

--- a/test/e2e/framework/nodes_util.go
+++ b/test/e2e/framework/nodes_util.go
@@ -0,0 +1,358 @@
+/*
+Copyright 2014 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"fmt"
+	"path"
+	"strings"
+	"time"
+
+	"k8s.io/kubernetes/pkg/api"
+	client "k8s.io/kubernetes/pkg/client/unversioned"
+	"k8s.io/kubernetes/pkg/fields"
+	"k8s.io/kubernetes/pkg/util/wait"
+)
+
+// The following upgrade functions are passed into the framework below and used
+// to do the actual upgrades.
+var MasterUpgrade = func(v string) error {
+	switch TestContext.Provider {
+	case "gce":
+		return masterUpgradeGCE(v)
+	case "gke":
+		return masterUpgradeGKE(v)
+	default:
+		return fmt.Errorf("MasterUpgrade() is not implemented for provider %s", TestContext.Provider)
+	}
+}
+
+func masterUpgradeGCE(rawV string) error {
+	v := "v" + rawV
+	_, _, err := RunCmd(path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-M", v)
+	return err
+}
+
+func masterUpgradeGKE(v string) error {
+	Logf("Upgrading master to %q", v)
+	_, _, err := RunCmd("gcloud", "container",
+		fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+		fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+		"clusters",
+		"upgrade",
+		TestContext.CloudConfig.Cluster,
+		"--master",
+		fmt.Sprintf("--cluster-version=%s", v),
+		"--quiet")
+	return err
+}
+
+var NodeUpgrade = func(f *Framework, v string) error {
+	// Perform the upgrade.
+	var err error
+	switch TestContext.Provider {
+	case "gce":
+		err = nodeUpgradeGCE(v)
+	case "gke":
+		err = nodeUpgradeGKE(v)
+	default:
+		err = fmt.Errorf("NodeUpgrade() is not implemented for provider %s", TestContext.Provider)
+	}
+	if err != nil {
+		return err
+	}
+
+	// Wait for it to complete and validate nodes are healthy.
+	//
+	// TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
+	// GKE; the operation shouldn't return until they all are.
+	Logf("Waiting up to %v for all nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout)
+	if _, err := CheckNodesReady(f.Client, RestartNodeReadyAgainTimeout, TestContext.CloudConfig.NumNodes); err != nil {
+		return err
+	}
+	return nil
+}
+
+func nodeUpgradeGCE(rawV string) error {
+	// TODO(ihmccreery) This code path should be identical to how a user
+	// would trigger a node update; right now it's very different.
+	v := "v" + rawV
+
+	Logf("Getting the node template before the upgrade")
+	tmplBefore, err := MigTemplate()
+	if err != nil {
+		return fmt.Errorf("error getting the node template before the upgrade: %v", err)
+	}
+
+	Logf("Preparing node upgrade by creating new instance template for %q", v)
+	stdout, _, err := RunCmd(path.Join(TestContext.RepoRoot, "cluster/gce/upgrade.sh"), "-P", v)
+	if err != nil {
+		cleanupNodeUpgradeGCE(tmplBefore)
+		return fmt.Errorf("error preparing node upgrade: %v", err)
+	}
+	tmpl := strings.TrimSpace(stdout)
+
+	Logf("Performing a node upgrade to %q; waiting at most %v per node", tmpl, RestartPerNodeTimeout)
+	if err := MigRollingUpdate(tmpl, RestartPerNodeTimeout); err != nil {
+		cleanupNodeUpgradeGCE(tmplBefore)
+		return fmt.Errorf("error doing node upgrade via a MigRollingUpdate to %s: %v", tmpl, err)
+	}
+	return nil
+}
+
+// MigRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
+// instance template named tmpl, and waits up to nt times the number of nodes
+// for it to complete.
+func MigRollingUpdate(tmpl string, nt time.Duration) error {
+	Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
+	id, err := migRollingUpdateStart(tmpl, nt)
+	if err != nil {
+		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
+	}
+
+	Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
+	if err := migRollingUpdatePoll(id, nt); err != nil {
+		return fmt.Errorf("err waiting until update completed: %v", err)
+	}
+
+	return nil
+}
+
+// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
+// as the new template, waiting up to nt per node, and returns the ID of that
+// update.
+func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
+	var errLast error
+	var id string
+	prefix, suffix := "Started [", "]."
+	if err := wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
+		// TODO(mikedanese): make this hit the compute API directly instead of
+		//                 shelling out to gcloud.
+		// NOTE(mikedanese): If you are changing this gcloud command, update
+		//                 cluster/gce/upgrade.sh to match this EXACTLY.
+		// A `rolling-updates start` call outputs what we want to stderr.
+		_, output, err := retryCmd("gcloud", "alpha", "compute",
+			"rolling-updates",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			"start",
+			// Required args.
+			fmt.Sprintf("--group=%s", TestContext.CloudConfig.NodeInstanceGroup),
+			fmt.Sprintf("--template=%s", templ),
+			// Optional args to fine-tune behavior.
+			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
+			// NOTE: We can speed up this process by increasing
+			//       --max-num-concurrent-instances.
+			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
+			fmt.Sprintf("--max-num-failed-instances=%d", 0),
+			fmt.Sprintf("--min-instance-update-time=%ds", 0))
+		if err != nil {
+			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
+			return false, nil
+		}
+
+		// The 'start' call probably succeeded; parse the output and try to find
+		// the line that looks like "Started [url/to/<id>]." and return <id>.
+		for _, line := range strings.Split(output, "\n") {
+			// As a sanity check, ensure the line starts with prefix and ends
+			// with suffix.
+			if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
+				continue
+			}
+			url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
+			id = url[len(url)-1]
+			Logf("Started MIG rolling update; ID: %s", id)
+			return true, nil
+		}
+		errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
+			prefix, suffix, output)
+		return false, nil
+	}); err != nil {
+		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
+	}
+	return id, nil
+}
+
+// migRollingUpdatePoll (CKE/GKE-only) polls the progress of the MIG rolling
+// update with ID id until it is complete. It returns an error if this takes
+// longer than nt times the number of nodes.
+func migRollingUpdatePoll(id string, nt time.Duration) error {
+	// Two keys and a val.
+	status, progress, done := "status", "statusMessage", "ROLLED_OUT"
+	start, timeout := time.Now(), nt*time.Duration(TestContext.CloudConfig.NumNodes)
+	var errLast error
+	Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
+	if wait.Poll(RestartPoll, timeout, func() (bool, error) {
+		// A `rolling-updates describe` call outputs what we want to stdout.
+		output, _, err := retryCmd("gcloud", "alpha", "compute",
+			"rolling-updates",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			"describe",
+			id)
+		if err != nil {
+			errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
+			Logf("%v", errLast)
+			return false, nil
+		}
+
+		// The 'describe' call probably succeeded; parse the output and try to
+		// find the line that looks like "status: <status>" and see whether it's
+		// done.
+		Logf("Waiting for MIG rolling update: %s (%v elapsed)",
+			ParseKVLines(output, progress), time.Since(start))
+		if st := ParseKVLines(output, status); st == done {
+			return true, nil
+		}
+		return false, nil
+	}) != nil {
+		return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
+	}
+	Logf("MIG rolling update complete after %v", time.Since(start))
+	return nil
+}
+
+func cleanupNodeUpgradeGCE(tmplBefore string) {
+	Logf("Cleaning up any unused node templates")
+	tmplAfter, err := MigTemplate()
+	if err != nil {
+		Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore)
+		return
+	}
+	if tmplBefore == tmplAfter {
+		// The node upgrade failed so there's no need to delete
+		// anything.
+		Logf("Node template %s is still in use; not cleaning up", tmplBefore)
+		return
+	}
+	Logf("Deleting node template %s", tmplBefore)
+	if _, _, err := retryCmd("gcloud", "compute", "instance-templates",
+		fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+		"delete",
+		tmplBefore); err != nil {
+		Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err)
+		Logf("May have leaked instance template %q", tmplBefore)
+	}
+}
+
+func nodeUpgradeGKE(v string) error {
+	Logf("Upgrading nodes to %q", v)
+	_, _, err := RunCmd("gcloud", "container",
+		fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+		fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+		"clusters",
+		"upgrade",
+		TestContext.CloudConfig.Cluster,
+		fmt.Sprintf("--cluster-version=%s", v),
+		"--quiet")
+	return err
+}
+
+// CheckNodesReady waits up to nt for expect nodes accessed by c to be ready,
+// returning an error if this doesn't happen in time. It returns the names of
+// nodes it finds.
+func CheckNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) {
+	// First, keep getting all of the nodes until we get the number we expect.
+	var nodeList *api.NodeList
+	var errLast error
+	start := time.Now()
+	found := wait.Poll(Poll, nt, func() (bool, error) {
+		// A rolling-update (GCE/GKE implementation of restart) can complete before the apiserver
+		// knows about all of the nodes. Thus, we retry the list nodes call
+		// until we get the expected number of nodes.
+		nodeList, errLast = c.Nodes().List(api.ListOptions{
+			FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector()})
+		if errLast != nil {
+			return false, nil
+		}
+		if len(nodeList.Items) != expect {
+			errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
+				expect, len(nodeList.Items), time.Since(start))
+			Logf("%v", errLast)
+			return false, nil
+		}
+		return true, nil
+	}) == nil
+	nodeNames := make([]string, len(nodeList.Items))
+	for i, n := range nodeList.Items {
+		nodeNames[i] = n.ObjectMeta.Name
+	}
+	if !found {
+		return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
+			expect, nt, errLast)
+	}
+	Logf("Successfully found %d nodes", expect)
+
+	// Next, ensure in parallel that all the nodes are ready. We subtract the
+	// time we spent waiting above.
+	timeout := nt - time.Since(start)
+	result := make(chan bool, len(nodeList.Items))
+	for _, n := range nodeNames {
+		n := n
+		go func() { result <- WaitForNodeToBeReady(c, n, timeout) }()
+	}
+	failed := false
+	// TODO(mbforbes): Change to `for range` syntax once we support only Go
+	// >= 1.4.
+	for i := range nodeList.Items {
+		_ = i
+		if !<-result {
+			failed = true
+		}
+	}
+	if failed {
+		return nodeNames, fmt.Errorf("at least one node failed to be ready")
+	}
+	return nodeNames, nil
+}
+
+// MigTemplate (GCE-only) returns the name of the MIG template that the
+// nodes of the cluster use.
+func MigTemplate() (string, error) {
+	var errLast error
+	var templ string
+	key := "instanceTemplate"
+	if wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
+		// TODO(mikedanese): make this hit the compute API directly instead of
+		// shelling out to gcloud.
+		// An `instance-groups managed describe` call outputs what we want to stdout.
+		output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
+			fmt.Sprintf("--project=%s", TestContext.CloudConfig.ProjectID),
+			"describe",
+			fmt.Sprintf("--zone=%s", TestContext.CloudConfig.Zone),
+			TestContext.CloudConfig.NodeInstanceGroup)
+		if err != nil {
+			errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
+			return false, nil
+		}
+
+		// The 'describe' call probably succeeded; parse the output and try to
+		// find the line that looks like "instanceTemplate: url/to/<templ>" and
+		// return <templ>.
+		if val := ParseKVLines(output, key); len(val) > 0 {
+			url := strings.Split(val, "/")
+			templ = url[len(url)-1]
+			Logf("MIG group %s using template: %s", TestContext.CloudConfig.NodeInstanceGroup, templ)
+			return true, nil
+		}
+		errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
+		return false, nil
+	}) != nil {
+		return "", fmt.Errorf("MigTemplate() failed with last error: %v", errLast)
+	}
+	return templ, nil
+}
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -134,6 +134,22 @@ const (
 	// When these values are updated, also update cmd/kubelet/app/options/options.go
 	currentPodInfraContainerImageName    = "gcr.io/google_containers/pause"
 	currentPodInfraContainerImageVersion = "3.0"
+
+	// How long each node is given during a process that restarts all nodes
+	// before the test is considered failed. (Note that the total time to
+	// restart all nodes will be this number times the number of nodes.)
+	RestartPerNodeTimeout = 5 * time.Minute
+
+	// How often to Poll the statues of a restart.
+	RestartPoll = 20 * time.Second
+
+	// How long a node is allowed to become "Ready" after it is restarted before
+	// the test is considered failed.
+	RestartNodeReadyAgainTimeout = 5 * time.Minute
+
+	// How long a pod is allowed to become "running" and "ready" after a node
+	// restart before test is considered failed.
+	RestartPodReadyAgainTimeout = 5 * time.Minute
 )

 // Label allocated to the image puller static pod that runs on each node
@@ -3517,13 +3533,29 @@ func RestartKubeProxy(host string) error {
 	return nil
 }

-func RestartApiserver() error {
+func RestartApiserver(c *client.Client) error {
 	// TODO: Make it work for all providers.
 	if !ProviderIs("gce", "gke", "aws") {
 		return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
 	}
+	if ProviderIs("gce", "aws") {
+		return sshRestartMaster()
+	}
+	// GKE doesn't allow ssh accesss, so use a same-version master
+	// upgrade to teardown/recreate master.
+	v, err := c.ServerVersion()
+	if err != nil {
+		return err
+	}
+	return masterUpgradeGKE(v.GitVersion[1:]) // strip leading 'v'
+}
+
+func sshRestartMaster() error {
+	if !ProviderIs("gce", "aws") {
+		return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
+	}
 	var command string
-	if ProviderIs("gce", "gke") {
+	if ProviderIs("gce") {
 		command = "sudo docker ps | grep /kube-apiserver | cut -d ' ' -f 1 | xargs sudo docker kill"
 	} else {
 		command = "sudo /etc/init.d/kube-apiserver restart"
@@ -4139,3 +4171,41 @@ func GetPodsInNamespace(c *client.Client, ns string, ignoreLabels map[string]str
 	}
 	return filtered, nil
 }
+
+// RunCmd runs cmd using args and returns its stdout and stderr. It also outputs
+// cmd's stdout and stderr to their respective OS streams.
+func RunCmd(command string, args ...string) (string, string, error) {
+	Logf("Running %s %v", command, args)
+	var bout, berr bytes.Buffer
+	cmd := exec.Command(command, args...)
+	// We also output to the OS stdout/stderr to aid in debugging in case cmd
+	// hangs and never returns before the test gets killed.
+	//
+	// This creates some ugly output because gcloud doesn't always provide
+	// newlines.
+	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
+	cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
+	err := cmd.Run()
+	stdout, stderr := bout.String(), berr.String()
+	if err != nil {
+		return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
+			command, args, err, stdout, stderr)
+	}
+	return stdout, stderr, nil
+}
+
+// retryCmd runs cmd using args and retries it for up to SingleCallTimeout if
+// it returns an error. It returns stdout and stderr.
+func retryCmd(command string, args ...string) (string, string, error) {
+	var err error
+	stdout, stderr := "", ""
+	wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
+		stdout, stderr, err = RunCmd(command, args...)
+		if err != nil {
+			Logf("Got %v", err)
+			return false, nil
+		}
+		return true, nil
+	})
+	return stdout, stderr, err
+}
--- a/test/e2e/restart.go
+++ b/test/e2e/restart.go
@@ -21,7 +21,6 @@ import (
 	"time"

 	"k8s.io/kubernetes/pkg/api"
-	client "k8s.io/kubernetes/pkg/client/unversioned"
 	"k8s.io/kubernetes/pkg/fields"
 	"k8s.io/kubernetes/pkg/labels"
 	"k8s.io/kubernetes/pkg/util/wait"
@@ -31,24 +30,6 @@ import (
 	. "github.com/onsi/gomega"
 )

-const (
-	// How long each node is given during a process that restarts all nodes
-	// before the test is considered failed. (Note that the total time to
-	// restart all nodes will be this number times the number of nodes.)
-	restartPerNodeTimeout = 5 * time.Minute
-
-	// How often to framework.Poll the statues of a restart.
-	restartPoll = 20 * time.Second
-
-	// How long a node is allowed to become "Ready" after it is restarted before
-	// the test is considered failed.
-	restartNodeReadyAgainTimeout = 5 * time.Minute
-
-	// How long a pod is allowed to become "running" and "ready" after a node
-	// restart before test is considered failed.
-	restartPodReadyAgainTimeout = 5 * time.Minute
-)
-
 var _ = framework.KubeDescribe("Restart [Disruptive]", func() {
 	f := framework.NewDefaultFramework("restart")
 	var ps *framework.PodStore
@@ -71,7 +52,7 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() {
 		nn := framework.TestContext.CloudConfig.NumNodes

 		By("ensuring all nodes are ready")
-		nodeNamesBefore, err := checkNodesReady(f.Client, framework.NodeReadyInitialTimeout, nn)
+		nodeNamesBefore, err := framework.CheckNodesReady(f.Client, framework.NodeReadyInitialTimeout, nn)
 		Expect(err).NotTo(HaveOccurred())
 		framework.Logf("Got the following nodes before restart: %v", nodeNamesBefore)

@@ -87,11 +68,11 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() {
 		}

 		By("restarting all of the nodes")
-		err = restartNodes(framework.TestContext.Provider, restartPerNodeTimeout)
+		err = restartNodes(framework.TestContext.Provider, framework.RestartPerNodeTimeout)
 		Expect(err).NotTo(HaveOccurred())

 		By("ensuring all nodes are ready after the restart")
-		nodeNamesAfter, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, nn)
+		nodeNamesAfter, err := framework.CheckNodesReady(f.Client, framework.RestartNodeReadyAgainTimeout, nn)
 		Expect(err).NotTo(HaveOccurred())
 		framework.Logf("Got the following nodes after restart: %v", nodeNamesAfter)

@@ -108,10 +89,10 @@ var _ = framework.KubeDescribe("Restart [Disruptive]", func() {
 		// across node restarts.
 		By("ensuring the same number of pods are running and ready after restart")
 		podCheckStart := time.Now()
-		podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
+		podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), framework.RestartPodReadyAgainTimeout)
 		Expect(err).NotTo(HaveOccurred())
-		remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
-		if !framework.CheckPodsRunningReadyOrSucceeded(f.Client, ns, podNamesAfter, remaining) {
+		remaining := framework.RestartPodReadyAgainTimeout - time.Since(podCheckStart)
+		if !framework.CheckPodsRunningReady(f.Client, ns, podNamesAfter, remaining) {
 			framework.Failf("At least one pod wasn't running and ready after the restart.")
 		}
 	})
@@ -144,64 +125,6 @@ func waitForNPods(ps *framework.PodStore, expect int, timeout time.Duration) ([]
 	return podNames, nil
 }

-// checkNodesReady waits up to nt for expect nodes accessed by c to be ready,
-// returning an error if this doesn't happen in time. It returns the names of
-// nodes it finds.
-func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) {
-	// First, keep getting all of the nodes until we get the number we expect.
-	var nodeList *api.NodeList
-	var errLast error
-	start := time.Now()
-	found := wait.Poll(framework.Poll, nt, func() (bool, error) {
-		// A rolling-update (GCE/GKE implementation of restart) can complete before the apiserver
-		// knows about all of the nodes. Thus, we retry the list nodes call
-		// until we get the expected number of nodes.
-		nodeList, errLast = c.Nodes().List(api.ListOptions{
-			FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector()})
-		if errLast != nil {
-			return false, nil
-		}
-		if len(nodeList.Items) != expect {
-			errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
-				expect, len(nodeList.Items), time.Since(start))
-			framework.Logf("%v", errLast)
-			return false, nil
-		}
-		return true, nil
-	}) == nil
-	nodeNames := make([]string, len(nodeList.Items))
-	for i, n := range nodeList.Items {
-		nodeNames[i] = n.ObjectMeta.Name
-	}
-	if !found {
-		return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
-			expect, nt, errLast)
-	}
-	framework.Logf("Successfully found %d nodes", expect)
-
-	// Next, ensure in parallel that all the nodes are ready. We subtract the
-	// time we spent waiting above.
-	timeout := nt - time.Since(start)
-	result := make(chan bool, len(nodeList.Items))
-	for _, n := range nodeNames {
-		n := n
-		go func() { result <- framework.WaitForNodeToBeReady(c, n, timeout) }()
-	}
-	failed := false
-	// TODO(mbforbes): Change to `for range` syntax once we support only Go
-	// >= 1.4.
-	for i := range nodeList.Items {
-		_ = i
-		if !<-result {
-			failed = true
-		}
-	}
-	if failed {
-		return nodeNames, fmt.Errorf("at least one node failed to be ready")
-	}
-	return nodeNames, nil
-}
-
 // restartNodes uses provider to do a restart of all nodes in the cluster,
 // allowing up to nt per node.
 func restartNodes(provider string, nt time.Duration) error {
@@ -235,9 +158,9 @@ func restartNodes(provider string, nt time.Duration) error {
 //   done
 func migRollingUpdateSelf(nt time.Duration) error {
 	By("getting the name of the template for the managed instance group")
-	tmpl, err := migTemplate()
+	tmpl, err := framework.MigTemplate()
 	if err != nil {
 		return fmt.Errorf("couldn't get MIG template name: %v", err)
 	}
-	return migRollingUpdate(tmpl, nt)
+	return framework.MigRollingUpdate(tmpl, nt)
 }
--- a/test/e2e/service.go
+++ b/test/e2e/service.go
@@ -331,8 +331,7 @@ var _ = framework.KubeDescribe("Services", func() {

 	It("should work after restarting apiserver [Disruptive]", func() {
 		// TODO: use the ServiceTestJig here
-		// TODO: framework.RestartApiserver doesn't work in GKE - fix it and reenable this test.
-		framework.SkipUnlessProviderIs("gce")
+		framework.SkipUnlessProviderIs("gce", "gke")

 		ns := f.Namespace.Name
 		numPods, servicePort := 3, 80
@@ -351,7 +350,7 @@ var _ = framework.KubeDescribe("Services", func() {
 		framework.ExpectNoError(verifyServeHostnameServiceUp(c, ns, host, podNames1, svc1IP, servicePort))

 		// Restart apiserver
-		if err := framework.RestartApiserver(); err != nil {
+		if err := framework.RestartApiserver(c); err != nil {
 			framework.Failf("error restarting apiserver: %v", err)
 		}
 		if err := framework.WaitForApiserverUp(c); err != nil {