From 775f2ef9a0d8bf224fd767cf5161b0c42263ad4d Mon Sep 17 00:00:00 2001 From: Matt Liggett Date: Tue, 25 Apr 2017 15:06:24 -0700 Subject: [PATCH] Respect PDBs during GCE node upgrades. Respect PDBs during node upgrades and add test coverage to the ServiceTest upgrade test. Modified that test so that we include pod anti-affinity constraints and a PDB. --- cluster/gce/upgrade.sh | 149 ++++++++++++++++++++--------- test/e2e/framework/service_util.go | 75 ++++++++++++++- test/e2e/upgrades/services.go | 14 ++- 3 files changed, 191 insertions(+), 47 deletions(-) diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh index 4425e99c53c..ad84f1ff090 100755 --- a/cluster/gce/upgrade.sh +++ b/cluster/gce/upgrade.sh @@ -343,51 +343,114 @@ function do-node-upgrade() { --zones="${ZONE}" \ --regexp="${group}" \ --format='value(instanceTemplate)' || true)) - echo "== Calling rolling-update for ${group}. ==" >&2 - update=$(gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - start \ - --group="${group}" \ - --template="${template_name}" \ - --instance-startup-timeout=300s \ - --max-num-concurrent-instances=1 \ - --max-num-failed-instances=0 \ - --min-instance-update-time=0s 2>&1) && update_rc=$? || update_rc=$? - - if [[ "${update_rc}" != 0 ]]; then - echo "== FAILED to start rolling-update: ==" - echo "${update}" - echo " This may be due to a preexisting rolling-update;" - echo " see https://github.com/kubernetes/kubernetes/issues/33113 for details." - echo " All rolling-updates in project ${PROJECT} zone ${ZONE}:" - gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - list || true - return ${update_rc} + set_instance_template_out=$(gcloud compute instance-groups managed set-instance-template "${group}" \ + --template="${template_name}" \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1) && set_instance_template_rc=$? || set_instance_template_rc=$? + if [[ "${set_instance_template_rc}" != 0 ]]; then + echo "== FAILED to set-instance-template for ${group} to ${template_name} ==" + echo "${set_instance_template_out}" + return ${set_instance_template_rc} fi - - id=$(echo "${update}" | grep "Started" | cut -d '/' -f 11 | cut -d ']' -f 1) - updates+=("${id}") - done - - echo "== Waiting for Upgrading nodes to be finished. ==" >&2 - # Wait until rolling updates are finished. - for update in ${updates[@]}; do - while true; do - result=$(gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - describe \ - ${update} \ - --format='value(status)' || true) - if [ $result = "ROLLED_OUT" ]; then - echo "Rolling update ${update} is ${result} state and finished." - break + instances=() + instances+=($(gcloud compute instance-groups managed list-instances "${group}" \ + --format='value(instance)' \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1)) && list_instances_rc=$? || list_instances_rc=$? + if [[ "${list_instances_rc}" != 0 ]]; then + echo "== FAILED to list instances in group ${group} ==" + echo "${instances}" + return ${list_instances_rc} + fi + for instance in ${instances[@]}; do + # Cache instance id for later + instance_id=$(gcloud compute instances describe "${instance}" \ + --format='get(id)' \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$? + if [[ "${describe_rc}" != 0 ]]; then + echo "== FAILED to describe ${instance} ==" + echo "${instance_id}" + return ${describe_rc} fi - echo "Rolling update ${update} is still in ${result} state." 
-      sleep 10
+
+      # Drain node
+      echo "== Draining ${instance}. ==" >&2
+      "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
+        && drain_rc=$? || drain_rc=$?
+      if [[ "${drain_rc}" != 0 ]]; then
+        echo "== FAILED to drain ${instance} =="
+        return ${drain_rc}
+      fi
+
+      # Recreate instance
+      echo "== Recreating instance ${instance}. ==" >&2
+      recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
+        --project="${PROJECT}" \
+        --zone="${ZONE}" \
+        --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
+      if [[ "${recreate_rc}" != 0 ]]; then
+        echo "== FAILED to recreate ${instance} =="
+        echo "${recreate}"
+        return ${recreate_rc}
+      fi
+
+      # Wait for instance to be recreated
+      echo "== Waiting for instance ${instance} to be recreated. ==" >&2
+      while true; do
+        new_instance_id=$(gcloud compute instances describe "${instance}" \
+          --format='get(id)' \
+          --project="${PROJECT}" \
+          --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+        if [[ "${describe_rc}" != 0 ]]; then
+          echo "== FAILED to describe ${instance} =="
+          echo "${new_instance_id}"
+          echo " (Will retry.)"
+        elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
+          echo -n .
+        else
+          echo "Instance ${instance} recreated."
+          break
+        fi
+        sleep 1
+      done
+
+      # Wait for k8s node object to reflect new instance id
+      echo "== Waiting for new node to be added to k8s. ==" >&2
+      while true; do
+        external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
+        if [[ "${kubectl_rc}" != 0 ]]; then
+          echo "== FAILED to get node ${instance} =="
+          echo "${external_id}"
+          echo " (Will retry.)"
+        elif [[ "${external_id}" == "${new_instance_id}" ]]; then
+          echo "Node ${instance} recreated."
+          break
+        elif [[ "${external_id}" == "${instance_id}" ]]; then
+          echo -n .
+        else
+          echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
+          echo " (Will retry.)"
+        fi
+        sleep 1
+      done
+
+      # Wait for the node to not have SchedulingDisabled=True and also to have
+      # Ready=True.
+      echo "== Waiting for ${instance} to become ready. ==" >&2
==" >&2 + while true; do + cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}') + ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}') + if [[ "${cordoned}" == 'True' ]]; then + echo "Node ${instance} is still not ready: SchedulingDisabled=${ready}" + elif [[ "${ready}" != 'True' ]]; then + echo "Node ${instance} is still not ready: Ready=${ready}" + else + echo "Node ${instance} Ready=${ready}" + break + fi + sleep 1 + done done done diff --git a/test/e2e/framework/service_util.go b/test/e2e/framework/service_util.go index a1bce8f1f26..13813f55ef3 100644 --- a/test/e2e/framework/service_util.go +++ b/test/e2e/framework/service_util.go @@ -34,6 +34,7 @@ import ( "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/kubernetes/pkg/api/v1" + policyv1beta1 "k8s.io/kubernetes/pkg/apis/policy/v1beta1" "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset" "k8s.io/kubernetes/pkg/client/retry" @@ -510,6 +511,8 @@ func (j *ServiceTestJig) WaitForLoadBalancerDestroyOrFail(namespace, name string // this jig, but does not actually create the RC. The default RC has the same // name as the jig and runs the "netexec" container. func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationController { + var replicas int32 = 1 + rc := &v1.ReplicationController{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, @@ -517,7 +520,7 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll Labels: j.Labels, }, Spec: v1.ReplicationControllerSpec{ - Replicas: func(i int) *int32 { x := int32(i); return &x }(1), + Replicas: &replicas, Selector: j.Labels, Template: &v1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -548,6 +551,59 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll return rc } +func (j *ServiceTestJig) AddRCAntiAffinity(rc *v1.ReplicationController) { + var replicas int32 = 2 + + rc.Spec.Replicas = &replicas + if rc.Spec.Template.Spec.Affinity == nil { + rc.Spec.Template.Spec.Affinity = &v1.Affinity{} + } + if rc.Spec.Template.Spec.Affinity.PodAntiAffinity == nil { + rc.Spec.Template.Spec.Affinity.PodAntiAffinity = &v1.PodAntiAffinity{} + } + rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append( + rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, + v1.PodAffinityTerm{ + LabelSelector: &metav1.LabelSelector{MatchLabels: j.Labels}, + Namespaces: nil, + TopologyKey: "kubernetes.io/hostname", + }) +} + +func (j *ServiceTestJig) CreatePDBOrFail(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget { + pdb := j.newPDBTemplate(namespace, rc) + newPdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Create(pdb) + if err != nil { + Failf("Failed to create PDB %q %v", pdb.Name, err) + } + if err := j.waitForPdbReady(namespace); err != nil { + Failf("Failed waiting for PDB to be ready: %v", err) + } + + return newPdb +} + +// newPDBTemplate returns the default policyv1beta1.PodDisruptionBudget object for +// this jig, but does not actually create the PDB. The default PDB specifies a +// MinAvailable of N-1 and matches the pods created by the RC. 
+func (j *ServiceTestJig) newPDBTemplate(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget { + minAvailable := intstr.FromInt(int(*rc.Spec.Replicas) - 1) + + pdb := &policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: j.Name, + Labels: j.Labels, + }, + Spec: policyv1beta1.PodDisruptionBudgetSpec{ + MinAvailable: &minAvailable, + Selector: &metav1.LabelSelector{MatchLabels: j.Labels}, + }, + } + + return pdb +} + // RunOrFail creates a ReplicationController and Pod(s) and waits for the // Pod(s) to be running. Callers can provide a function to tweak the RC object // before it is created. @@ -558,7 +614,7 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati } result, err := j.Client.Core().ReplicationControllers(namespace).Create(rc) if err != nil { - Failf("Failed to created RC %q: %v", rc.Name, err) + Failf("Failed to create RC %q: %v", rc.Name, err) } pods, err := j.waitForPodsCreated(namespace, int(*(rc.Spec.Replicas))) if err != nil { @@ -570,6 +626,21 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati return result } +func (j *ServiceTestJig) waitForPdbReady(namespace string) error { + timeout := 2 * time.Minute + for start := time.Now(); time.Since(start) < timeout; time.Sleep(2 * time.Second) { + pdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Get(j.Name, metav1.GetOptions{}) + if err != nil { + return err + } + if pdb.Status.PodDisruptionsAllowed > 0 { + return nil + } + } + + return fmt.Errorf("Timeout waiting for PDB %q to be ready", j.Name) +} + func (j *ServiceTestJig) waitForPodsCreated(namespace string, replicas int) ([]string, error) { timeout := 2 * time.Minute // List the pods, making sure we observe all the replicas. diff --git a/test/e2e/upgrades/services.go b/test/e2e/upgrades/services.go index 015c1fd96fc..6d170c0d4f7 100644 --- a/test/e2e/upgrades/services.go +++ b/test/e2e/upgrades/services.go @@ -36,6 +36,8 @@ type ServiceUpgradeTest struct { func (ServiceUpgradeTest) Name() string { return "service-upgrade" } +func shouldTestPDBs() bool { return framework.ProviderIs("gce", "gke") } + // Setup creates a service with a load balancer and makes sure it's reachable. func (t *ServiceUpgradeTest) Setup(f *framework.Framework) { serviceName := "service-test" @@ -55,7 +57,12 @@ func (t *ServiceUpgradeTest) Setup(f *framework.Framework) { svcPort := int(tcpService.Spec.Ports[0].Port) By("creating pod to be part of service " + serviceName) - jig.RunOrFail(ns.Name, nil) + rc := jig.RunOrFail(ns.Name, jig.AddRCAntiAffinity) + + if shouldTestPDBs() { + By("creating a PodDisruptionBudget to cover the ReplicationController") + jig.CreatePDBOrFail(ns.Name, rc) + } // Hit it once before considering ourselves ready By("hitting the pod through the service's LoadBalancer") @@ -72,6 +79,9 @@ func (t *ServiceUpgradeTest) Test(f *framework.Framework, done <-chan struct{}, switch upgrade { case MasterUpgrade: t.test(f, done, true) + case NodeUpgrade: + // Node upgrades should test during disruption only on GCE/GKE for now. 
+ t.test(f, done, shouldTestPDBs()) default: t.test(f, done, false) } @@ -87,7 +97,7 @@ func (t *ServiceUpgradeTest) test(f *framework.Framework, done <-chan struct{}, // Continuous validation By("continuously hitting the pod through the service's LoadBalancer") wait.Until(func() { - t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.Poll) + t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.LoadBalancerLagTimeoutDefault) }, framework.Poll, done) } else { // Block until upgrade is done