From 775f2ef9a0d8bf224fd767cf5161b0c42263ad4d Mon Sep 17 00:00:00 2001 From: Matt Liggett Date: Tue, 25 Apr 2017 15:06:24 -0700 Subject: [PATCH] Respect PDBs during GCE node upgrades. Respect PDBs during node upgrades and add test coverage to the ServiceTest upgrade test. Modified that test so that we include pod anti-affinity constraints and a PDB. --- cluster/gce/upgrade.sh | 149 ++++++++++++++++++++--------- test/e2e/framework/service_util.go | 75 ++++++++++++++- test/e2e/upgrades/services.go | 14 ++- 3 files changed, 191 insertions(+), 47 deletions(-) diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh index 4425e99c53c..ad84f1ff090 100755 --- a/cluster/gce/upgrade.sh +++ b/cluster/gce/upgrade.sh @@ -343,51 +343,114 @@ function do-node-upgrade() { --zones="${ZONE}" \ --regexp="${group}" \ --format='value(instanceTemplate)' || true)) - echo "== Calling rolling-update for ${group}. ==" >&2 - update=$(gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - start \ - --group="${group}" \ - --template="${template_name}" \ - --instance-startup-timeout=300s \ - --max-num-concurrent-instances=1 \ - --max-num-failed-instances=0 \ - --min-instance-update-time=0s 2>&1) && update_rc=$? || update_rc=$? - - if [[ "${update_rc}" != 0 ]]; then - echo "== FAILED to start rolling-update: ==" - echo "${update}" - echo " This may be due to a preexisting rolling-update;" - echo " see https://github.com/kubernetes/kubernetes/issues/33113 for details." - echo " All rolling-updates in project ${PROJECT} zone ${ZONE}:" - gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - list || true - return ${update_rc} + set_instance_template_out=$(gcloud compute instance-groups managed set-instance-template "${group}" \ + --template="${template_name}" \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1) && set_instance_template_rc=$? || set_instance_template_rc=$? + if [[ "${set_instance_template_rc}" != 0 ]]; then + echo "== FAILED to set-instance-template for ${group} to ${template_name} ==" + echo "${set_instance_template_out}" + return ${set_instance_template_rc} fi - - id=$(echo "${update}" | grep "Started" | cut -d '/' -f 11 | cut -d ']' -f 1) - updates+=("${id}") - done - - echo "== Waiting for Upgrading nodes to be finished. ==" >&2 - # Wait until rolling updates are finished. - for update in ${updates[@]}; do - while true; do - result=$(gcloud alpha compute rolling-updates \ - --project="${PROJECT}" \ - --zone="${ZONE}" \ - describe \ - ${update} \ - --format='value(status)' || true) - if [ $result = "ROLLED_OUT" ]; then - echo "Rolling update ${update} is ${result} state and finished." - break + instances=() + instances+=($(gcloud compute instance-groups managed list-instances "${group}" \ + --format='value(instance)' \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1)) && list_instances_rc=$? || list_instances_rc=$? + if [[ "${list_instances_rc}" != 0 ]]; then + echo "== FAILED to list instances in group ${group} ==" + echo "${instances}" + return ${list_instances_rc} + fi + for instance in ${instances[@]}; do + # Cache instance id for later + instance_id=$(gcloud compute instances describe "${instance}" \ + --format='get(id)' \ + --project="${PROJECT}" \ + --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$? + if [[ "${describe_rc}" != 0 ]]; then + echo "== FAILED to describe ${instance} ==" + echo "${instance_id}" + return ${describe_rc} fi - echo "Rolling update ${update} is still in ${result} state." 
-      sleep 10
+
+      # Drain node
+      echo "== Draining ${instance}. ==" >&2
+      "${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
+        && drain_rc=$? || drain_rc=$?
+      if [[ "${drain_rc}" != 0 ]]; then
+        echo "== FAILED to drain ${instance} =="
+        return ${drain_rc}
+      fi
+
+      # Recreate instance
+      echo "== Recreating instance ${instance}. ==" >&2
+      recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
+        --project="${PROJECT}" \
+        --zone="${ZONE}" \
+        --instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
+      if [[ "${recreate_rc}" != 0 ]]; then
+        echo "== FAILED to recreate ${instance} =="
+        echo "${recreate}"
+        return ${recreate_rc}
+      fi
+
+      # Wait for instance to be recreated
+      echo "== Waiting for instance ${instance} to be recreated. ==" >&2
+      while true; do
+        new_instance_id=$(gcloud compute instances describe "${instance}" \
+          --format='get(id)' \
+          --project="${PROJECT}" \
+          --zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
+        if [[ "${describe_rc}" != 0 ]]; then
+          echo "== FAILED to describe ${instance} =="
+          echo "${new_instance_id}"
+          echo " (Will retry.)"
+        elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
+          echo -n .
+        else
+          echo "Instance ${instance} recreated."
+          break
+        fi
+        sleep 1
+      done
+
+      # Wait for k8s node object to reflect new instance id
+      echo "== Waiting for new node to be added to k8s. ==" >&2
+      while true; do
+        external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
+        if [[ "${kubectl_rc}" != 0 ]]; then
+          echo "== FAILED to get node ${instance} =="
+          echo "${external_id}"
+          echo " (Will retry.)"
+        elif [[ "${external_id}" == "${new_instance_id}" ]]; then
+          echo "Node ${instance} recreated."
+          break
+        elif [[ "${external_id}" == "${instance_id}" ]]; then
+          echo -n .
+        else
+          echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
+          echo " (Will retry.)"
+        fi
+        sleep 1
+      done
+
+      # Wait for the node to not have SchedulingDisabled=True and also to have
+      # Ready=True.
+      echo "== Waiting for ${instance} to become ready. ==" >&2
==" >&2 + while true; do + cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}') + ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}') + if [[ "${cordoned}" == 'True' ]]; then + echo "Node ${instance} is still not ready: SchedulingDisabled=${ready}" + elif [[ "${ready}" != 'True' ]]; then + echo "Node ${instance} is still not ready: Ready=${ready}" + else + echo "Node ${instance} Ready=${ready}" + break + fi + sleep 1 + done done done diff --git a/test/e2e/framework/service_util.go b/test/e2e/framework/service_util.go index a1bce8f1f26..13813f55ef3 100644 --- a/test/e2e/framework/service_util.go +++ b/test/e2e/framework/service_util.go @@ -34,6 +34,7 @@ import ( "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/kubernetes/pkg/api/v1" + policyv1beta1 "k8s.io/kubernetes/pkg/apis/policy/v1beta1" "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset" "k8s.io/kubernetes/pkg/client/retry" @@ -510,6 +511,8 @@ func (j *ServiceTestJig) WaitForLoadBalancerDestroyOrFail(namespace, name string // this jig, but does not actually create the RC. The default RC has the same // name as the jig and runs the "netexec" container. func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationController { + var replicas int32 = 1 + rc := &v1.ReplicationController{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, @@ -517,7 +520,7 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll Labels: j.Labels, }, Spec: v1.ReplicationControllerSpec{ - Replicas: func(i int) *int32 { x := int32(i); return &x }(1), + Replicas: &replicas, Selector: j.Labels, Template: &v1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -548,6 +551,59 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll return rc } +func (j *ServiceTestJig) AddRCAntiAffinity(rc *v1.ReplicationController) { + var replicas int32 = 2 + + rc.Spec.Replicas = &replicas + if rc.Spec.Template.Spec.Affinity == nil { + rc.Spec.Template.Spec.Affinity = &v1.Affinity{} + } + if rc.Spec.Template.Spec.Affinity.PodAntiAffinity == nil { + rc.Spec.Template.Spec.Affinity.PodAntiAffinity = &v1.PodAntiAffinity{} + } + rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append( + rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, + v1.PodAffinityTerm{ + LabelSelector: &metav1.LabelSelector{MatchLabels: j.Labels}, + Namespaces: nil, + TopologyKey: "kubernetes.io/hostname", + }) +} + +func (j *ServiceTestJig) CreatePDBOrFail(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget { + pdb := j.newPDBTemplate(namespace, rc) + newPdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Create(pdb) + if err != nil { + Failf("Failed to create PDB %q %v", pdb.Name, err) + } + if err := j.waitForPdbReady(namespace); err != nil { + Failf("Failed waiting for PDB to be ready: %v", err) + } + + return newPdb +} + +// newPDBTemplate returns the default policyv1beta1.PodDisruptionBudget object for +// this jig, but does not actually create the PDB. The default PDB specifies a +// MinAvailable of N-1 and matches the pods created by the RC. 
+func (j *ServiceTestJig) newPDBTemplate(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget { + minAvailable := intstr.FromInt(int(*rc.Spec.Replicas) - 1) + + pdb := &policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: j.Name, + Labels: j.Labels, + }, + Spec: policyv1beta1.PodDisruptionBudgetSpec{ + MinAvailable: &minAvailable, + Selector: &metav1.LabelSelector{MatchLabels: j.Labels}, + }, + } + + return pdb +} + // RunOrFail creates a ReplicationController and Pod(s) and waits for the // Pod(s) to be running. Callers can provide a function to tweak the RC object // before it is created. @@ -558,7 +614,7 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati } result, err := j.Client.Core().ReplicationControllers(namespace).Create(rc) if err != nil { - Failf("Failed to created RC %q: %v", rc.Name, err) + Failf("Failed to create RC %q: %v", rc.Name, err) } pods, err := j.waitForPodsCreated(namespace, int(*(rc.Spec.Replicas))) if err != nil { @@ -570,6 +626,21 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati return result } +func (j *ServiceTestJig) waitForPdbReady(namespace string) error { + timeout := 2 * time.Minute + for start := time.Now(); time.Since(start) < timeout; time.Sleep(2 * time.Second) { + pdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Get(j.Name, metav1.GetOptions{}) + if err != nil { + return err + } + if pdb.Status.PodDisruptionsAllowed > 0 { + return nil + } + } + + return fmt.Errorf("Timeout waiting for PDB %q to be ready", j.Name) +} + func (j *ServiceTestJig) waitForPodsCreated(namespace string, replicas int) ([]string, error) { timeout := 2 * time.Minute // List the pods, making sure we observe all the replicas. diff --git a/test/e2e/upgrades/services.go b/test/e2e/upgrades/services.go index 015c1fd96fc..6d170c0d4f7 100644 --- a/test/e2e/upgrades/services.go +++ b/test/e2e/upgrades/services.go @@ -36,6 +36,8 @@ type ServiceUpgradeTest struct { func (ServiceUpgradeTest) Name() string { return "service-upgrade" } +func shouldTestPDBs() bool { return framework.ProviderIs("gce", "gke") } + // Setup creates a service with a load balancer and makes sure it's reachable. func (t *ServiceUpgradeTest) Setup(f *framework.Framework) { serviceName := "service-test" @@ -55,7 +57,12 @@ func (t *ServiceUpgradeTest) Setup(f *framework.Framework) { svcPort := int(tcpService.Spec.Ports[0].Port) By("creating pod to be part of service " + serviceName) - jig.RunOrFail(ns.Name, nil) + rc := jig.RunOrFail(ns.Name, jig.AddRCAntiAffinity) + + if shouldTestPDBs() { + By("creating a PodDisruptionBudget to cover the ReplicationController") + jig.CreatePDBOrFail(ns.Name, rc) + } // Hit it once before considering ourselves ready By("hitting the pod through the service's LoadBalancer") @@ -72,6 +79,9 @@ func (t *ServiceUpgradeTest) Test(f *framework.Framework, done <-chan struct{}, switch upgrade { case MasterUpgrade: t.test(f, done, true) + case NodeUpgrade: + // Node upgrades should test during disruption only on GCE/GKE for now. 
+ t.test(f, done, shouldTestPDBs()) default: t.test(f, done, false) } @@ -87,7 +97,7 @@ func (t *ServiceUpgradeTest) test(f *framework.Framework, done <-chan struct{}, // Continuous validation By("continuously hitting the pod through the service's LoadBalancer") wait.Until(func() { - t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.Poll) + t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.LoadBalancerLagTimeoutDefault) }, framework.Poll, done) } else { // Block until upgrade is done