Respect PDBs during GCE node upgrades.

Respect PDBs during node upgrades and add test coverage to the
ServiceTest upgrade test. The test now includes pod anti-affinity
constraints and a PDB.
Matt Liggett 2017-04-25 15:06:24 -07:00
parent 68dd748ba1
commit 775f2ef9a0
3 changed files with 191 additions and 47 deletions
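For context on why draining (rather than simply recreating instances) makes the upgrade respect PodDisruptionBudgets: drain removes pods through the eviction subresource, and the API server refuses an eviction that would take a budget below its MinAvailable. Below is a minimal, hedged sketch of that call using the in-tree versioned clientset and policy/v1beta1 types that the test changes in this commit import; the package name, the evictPod helper, and its arguments are placeholders, and the exact expansion method may differ by release.

package example // hypothetical package; any file vendoring these k8s.io paths works

import (
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    policyv1beta1 "k8s.io/kubernetes/pkg/apis/policy/v1beta1"
    "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
)

// evictPod asks the API server to evict a single pod. Unlike a plain delete,
// this request is checked against any PodDisruptionBudget covering the pod
// and is rejected while the budget has no disruptions to spare.
func evictPod(c clientset.Interface, ns, podName string) error {
    eviction := &policyv1beta1.Eviction{
        ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: ns},
    }
    // Sketch only: posts to the pod's "eviction" subresource via the
    // generated policy client's expansion method.
    return c.Policy().Evictions(ns).Evict(eviction)
}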


@@ -343,51 +343,114 @@ function do-node-upgrade() {
--zones="${ZONE}" \
--regexp="${group}" \
--format='value(instanceTemplate)' || true))
echo "== Calling rolling-update for ${group}. ==" >&2
update=$(gcloud alpha compute rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
start \
--group="${group}" \
--template="${template_name}" \
--instance-startup-timeout=300s \
--max-num-concurrent-instances=1 \
--max-num-failed-instances=0 \
--min-instance-update-time=0s 2>&1) && update_rc=$? || update_rc=$?
if [[ "${update_rc}" != 0 ]]; then
echo "== FAILED to start rolling-update: =="
echo "${update}"
echo " This may be due to a preexisting rolling-update;"
echo " see https://github.com/kubernetes/kubernetes/issues/33113 for details."
echo " All rolling-updates in project ${PROJECT} zone ${ZONE}:"
gcloud alpha compute rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
list || true
return ${update_rc}
set_instance_template_out=$(gcloud compute instance-groups managed set-instance-template "${group}" \
--template="${template_name}" \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1) && set_instance_template_rc=$? || set_instance_template_rc=$?
if [[ "${set_instance_template_rc}" != 0 ]]; then
echo "== FAILED to set-instance-template for ${group} to ${template_name} =="
echo "${set_instance_template_out}"
return ${set_instance_template_rc}
fi
id=$(echo "${update}" | grep "Started" | cut -d '/' -f 11 | cut -d ']' -f 1)
updates+=("${id}")
done
echo "== Waiting for Upgrading nodes to be finished. ==" >&2
# Wait until rolling updates are finished.
for update in ${updates[@]}; do
while true; do
result=$(gcloud alpha compute rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
describe \
${update} \
--format='value(status)' || true)
if [ $result = "ROLLED_OUT" ]; then
echo "Rolling update ${update} is ${result} state and finished."
break
instances=()
instances+=($(gcloud compute instance-groups managed list-instances "${group}" \
--format='value(instance)' \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1)) && list_instances_rc=$? || list_instances_rc=$?
if [[ "${list_instances_rc}" != 0 ]]; then
echo "== FAILED to list instances in group ${group} =="
echo "${instances}"
return ${list_instances_rc}
fi
for instance in ${instances[@]}; do
# Cache instance id for later
instance_id=$(gcloud compute instances describe "${instance}" \
--format='get(id)' \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
if [[ "${describe_rc}" != 0 ]]; then
echo "== FAILED to describe ${instance} =="
echo "${instance_id}"
return ${describe_rc}
fi
echo "Rolling update ${update} is still in ${result} state."
sleep 10
# Drain node
echo "== Draining ${instance}. == " >&2
"${KUBE_ROOT}/cluster/kubectl.sh" drain --delete-local-data --force --ignore-daemonsets "${instance}" \
&& drain_rc=$? || drain_rc=$?
if [[ "${drain_rc}" != 0 ]]; then
echo "== FAILED to drain ${instance} =="
return ${drain_rc}
fi
# Recreate instance
echo "== Recreating instance ${instance}. ==" >&2
recreate=$(gcloud compute instance-groups managed recreate-instances "${group}" \
--project="${PROJECT}" \
--zone="${ZONE}" \
--instances="${instance}" 2>&1) && recreate_rc=$? || recreate_rc=$?
if [[ "${recreate_rc}" != 0 ]]; then
echo "== FAILED to recreate ${instance} =="
echo "${recreate}"
return ${recreate_rc}
fi
# Wait for instance to be recreated
echo "== Waiting for instance ${instance} to be recreated. ==" >&2
while true; do
new_instance_id=$(gcloud compute instances describe "${instance}" \
--format='get(id)' \
--project="${PROJECT}" \
--zone="${ZONE}" 2>&1) && describe_rc=$? || describe_rc=$?
if [[ "${describe_rc}" != 0 ]]; then
echo "== FAILED to describe ${instance} =="
echo "${new_instance_id}"
echo " (Will retry.)"
elif [[ "${new_instance_id}" == "${instance_id}" ]]; then
echo -n .
else
echo "Instance ${instance} recreated."
break
fi
sleep 1
done
# Wait for k8s node object to reflect new instance id
echo "== Waiting for new node to be added to k8s. ==" >&2
while true; do
external_id=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output=jsonpath='{.spec.externalID}' 2>&1) && kubectl_rc=$? || kubectl_rc=$?
if [[ "${kubectl_rc}" != 0 ]]; then
echo "== FAILED to get node ${instance} =="
echo "${external_id}"
echo " (Will retry.)"
elif [[ "${external_id}" == "${new_instance_id}" ]]; then
echo "Node ${instance} recreated."
break
elif [[ "${external_id}" == "${instance_id}" ]]; then
echo -n .
else
echo "Unexpected external_id '${external_id}' matches neither old ('${instance_id}') nor new ('${new_instance_id}')."
echo " (Will retry.)"
fi
sleep 1
done
# Wait for the node to not have SchedulingDisabled=True and also to have
# Ready=True.
echo "== Waiting for ${instance} to become ready. ==" >&2
while true; do
cordoned=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "SchedulingDisabled")].status}')
ready=$("${KUBE_ROOT}/cluster/kubectl.sh" get node "${instance}" --output='jsonpath={.status.conditions[?(@.type == "Ready")].status}')
if [[ "${cordoned}" == 'True' ]]; then
echo "Node ${instance} is still not ready: SchedulingDisabled=${ready}"
elif [[ "${ready}" != 'True' ]]; then
echo "Node ${instance} is still not ready: Ready=${ready}"
else
echo "Node ${instance} Ready=${ready}"
break
fi
sleep 1
done
done
done
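One practical consequence of the per-node loop above, paired with the e2e changes below: with two anti-affine replicas and a MinAvailable of one, draining a node evicts at most one pod, and the budget blocks the next node's drain until the replacement pod is healthy elsewhere. A hedged sketch of how a test could check that gate, reusing only client calls that already appear in the test diff below (`c`, `ns`, and `name` are placeholder arguments):

package example // hypothetical, same vendored import paths as the sketch above

import (
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
)

// drainCurrentlyBlocked reports whether the budget would reject another
// eviction right now; it mirrors the polling done by waitForPdbReady below.
func drainCurrentlyBlocked(c clientset.Interface, ns, name string) (bool, error) {
    pdb, err := c.Policy().PodDisruptionBudgets(ns).Get(name, metav1.GetOptions{})
    if err != nil {
        return false, err
    }
    // Zero allowed disruptions means the next eviction (and hence the next
    // node drain above) would currently be refused for covered pods.
    return pdb.Status.PodDisruptionsAllowed == 0, nil
}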


@@ -34,6 +34,7 @@ import (
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/kubernetes/pkg/api/v1"
policyv1beta1 "k8s.io/kubernetes/pkg/apis/policy/v1beta1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
"k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
"k8s.io/kubernetes/pkg/client/retry"
@@ -510,6 +511,8 @@ func (j *ServiceTestJig) WaitForLoadBalancerDestroyOrFail(namespace, name string
// this jig, but does not actually create the RC. The default RC has the same
// name as the jig and runs the "netexec" container.
func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationController {
var replicas int32 = 1
rc := &v1.ReplicationController{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
@@ -517,7 +520,7 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll
Labels: j.Labels,
},
Spec: v1.ReplicationControllerSpec{
Replicas: func(i int) *int32 { x := int32(i); return &x }(1),
Replicas: &replicas,
Selector: j.Labels,
Template: &v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
@@ -548,6 +551,59 @@ func (j *ServiceTestJig) newRCTemplate(namespace string) *v1.ReplicationControll
return rc
}
func (j *ServiceTestJig) AddRCAntiAffinity(rc *v1.ReplicationController) {
var replicas int32 = 2
rc.Spec.Replicas = &replicas
if rc.Spec.Template.Spec.Affinity == nil {
rc.Spec.Template.Spec.Affinity = &v1.Affinity{}
}
if rc.Spec.Template.Spec.Affinity.PodAntiAffinity == nil {
rc.Spec.Template.Spec.Affinity.PodAntiAffinity = &v1.PodAntiAffinity{}
}
rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append(
rc.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution,
v1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{MatchLabels: j.Labels},
Namespaces: nil,
TopologyKey: "kubernetes.io/hostname",
})
}
func (j *ServiceTestJig) CreatePDBOrFail(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget {
pdb := j.newPDBTemplate(namespace, rc)
newPdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Create(pdb)
if err != nil {
Failf("Failed to create PDB %q %v", pdb.Name, err)
}
if err := j.waitForPdbReady(namespace); err != nil {
Failf("Failed waiting for PDB to be ready: %v", err)
}
return newPdb
}
// newPDBTemplate returns the default policyv1beta1.PodDisruptionBudget object for
// this jig, but does not actually create the PDB. The default PDB specifies a
// MinAvailable of N-1 and matches the pods created by the RC.
func (j *ServiceTestJig) newPDBTemplate(namespace string, rc *v1.ReplicationController) *policyv1beta1.PodDisruptionBudget {
minAvailable := intstr.FromInt(int(*rc.Spec.Replicas) - 1)
pdb := &policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: j.Name,
Labels: j.Labels,
},
Spec: policyv1beta1.PodDisruptionBudgetSpec{
MinAvailable: &minAvailable,
Selector: &metav1.LabelSelector{MatchLabels: j.Labels},
},
}
return pdb
}
// RunOrFail creates a ReplicationController and Pod(s) and waits for the
// Pod(s) to be running. Callers can provide a function to tweak the RC object
// before it is created.
@@ -558,7 +614,7 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati
}
result, err := j.Client.Core().ReplicationControllers(namespace).Create(rc)
if err != nil {
Failf("Failed to created RC %q: %v", rc.Name, err)
Failf("Failed to create RC %q: %v", rc.Name, err)
}
pods, err := j.waitForPodsCreated(namespace, int(*(rc.Spec.Replicas)))
if err != nil {
@@ -570,6 +626,21 @@ func (j *ServiceTestJig) RunOrFail(namespace string, tweak func(rc *v1.Replicati
return result
}
func (j *ServiceTestJig) waitForPdbReady(namespace string) error {
timeout := 2 * time.Minute
for start := time.Now(); time.Since(start) < timeout; time.Sleep(2 * time.Second) {
pdb, err := j.Client.Policy().PodDisruptionBudgets(namespace).Get(j.Name, metav1.GetOptions{})
if err != nil {
return err
}
if pdb.Status.PodDisruptionsAllowed > 0 {
return nil
}
}
return fmt.Errorf("Timeout waiting for PDB %q to be ready", j.Name)
}
func (j *ServiceTestJig) waitForPodsCreated(namespace string, replicas int) ([]string, error) {
timeout := 2 * time.Minute
// List the pods, making sure we observe all the replicas.


@@ -36,6 +36,8 @@ type ServiceUpgradeTest struct {
func (ServiceUpgradeTest) Name() string { return "service-upgrade" }
func shouldTestPDBs() bool { return framework.ProviderIs("gce", "gke") }
// Setup creates a service with a load balancer and makes sure it's reachable.
func (t *ServiceUpgradeTest) Setup(f *framework.Framework) {
serviceName := "service-test"
@@ -55,7 +57,12 @@ func (t *ServiceUpgradeTest) Setup(f *framework.Framework) {
svcPort := int(tcpService.Spec.Ports[0].Port)
By("creating pod to be part of service " + serviceName)
jig.RunOrFail(ns.Name, nil)
rc := jig.RunOrFail(ns.Name, jig.AddRCAntiAffinity)
if shouldTestPDBs() {
By("creating a PodDisruptionBudget to cover the ReplicationController")
jig.CreatePDBOrFail(ns.Name, rc)
}
// Hit it once before considering ourselves ready
By("hitting the pod through the service's LoadBalancer")
@@ -72,6 +79,9 @@ func (t *ServiceUpgradeTest) Test(f *framework.Framework, done <-chan struct{},
switch upgrade {
case MasterUpgrade:
t.test(f, done, true)
case NodeUpgrade:
// Node upgrades should test during disruption only on GCE/GKE for now.
t.test(f, done, shouldTestPDBs())
default:
t.test(f, done, false)
}
@@ -87,7 +97,7 @@ func (t *ServiceUpgradeTest) test(f *framework.Framework, done <-chan struct{},
// Continuous validation
By("continuously hitting the pod through the service's LoadBalancer")
wait.Until(func() {
t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.Poll)
t.jig.TestReachableHTTP(t.tcpIngressIP, t.svcPort, framework.LoadBalancerLagTimeoutDefault)
}, framework.Poll, done)
} else {
// Block until upgrade is done