From d01998f5fa4146d8f2d5f71a10b48b1199e33293 Mon Sep 17 00:00:00 2001
From: Jerzy Szczepkowski
Date: Tue, 11 Oct 2016 14:52:51 +0200
Subject: [PATCH] Fixed e2e tests for HA master.

A set of fixes that allows the HA master e2e tests to pass for
removal/addition of master replicas.
---
 cluster/gce/debian/master-helper.sh |  2 +-
 cluster/gce/gci/master-helper.sh    |  2 +-
 cluster/gce/util.sh                 | 52 +++++++++++++++++++++++------
 cluster/validate-cluster.sh         |  7 +++-
 hack/e2e-internal/e2e-add-master.sh |  1 +
 test/e2e/framework/util.go          | 37 ++++++++++++++++++++
 test/e2e/ha_master.go               | 22 +++++++-----
 7 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/cluster/gce/debian/master-helper.sh b/cluster/gce/debian/master-helper.sh
index 52607aa9a40..72097b96182 100755
--- a/cluster/gce/debian/master-helper.sh
+++ b/cluster/gce/debian/master-helper.sh
@@ -56,7 +56,7 @@ function replicate-master-instance() {
   ETCD_CA_KEY="$(echo "${kube_env}" | grep "ETCD_CA_KEY" | sed "s/^.*: '//" | sed "s/'$//")"
   ETCD_CA_CERT="$(echo "${kube_env}" | grep "ETCD_CA_CERT" | sed "s/^.*: '//" | sed "s/'$//")"
 
-  create-etcd-certs "${ETCD_CA_CERT}" "${ETCD_CA_KEY}"
+  create-etcd-certs "${REPLICA_NAME}" "${ETCD_CA_CERT}" "${ETCD_CA_KEY}"
 
   kube_env="$(echo "${kube_env}" | grep -v "ETCD_PEER_KEY")"
   kube_env="$(echo -e "${kube_env}\nETCD_PEER_KEY: '${ETCD_PEER_KEY_BASE64}'")"
diff --git a/cluster/gce/gci/master-helper.sh b/cluster/gce/gci/master-helper.sh
index e83bb18ef29..e430aa0a6fe 100755
--- a/cluster/gce/gci/master-helper.sh
+++ b/cluster/gce/gci/master-helper.sh
@@ -52,7 +52,7 @@ function replicate-master-instance() {
   ETCD_CA_KEY="$(echo "${kube_env}" | grep "ETCD_CA_KEY" | sed "s/^.*: '//" | sed "s/'$//")"
   ETCD_CA_CERT="$(echo "${kube_env}" | grep "ETCD_CA_CERT" | sed "s/^.*: '//" | sed "s/'$//")"
 
-  create-etcd-certs "${ETCD_CA_CERT}" "${ETCD_CA_KEY}"
+  create-etcd-certs "${REPLICA_NAME}" "${ETCD_CA_CERT}" "${ETCD_CA_KEY}"
 
   kube_env="$(echo "${kube_env}" | grep -v "ETCD_PEER_KEY")"
   kube_env="$(echo -e "${kube_env}\nETCD_PEER_KEY: '${ETCD_PEER_KEY_BASE64}'")"
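
Note: both master-helper variants now pass "${REPLICA_NAME}" as the new first
argument of create-etcd-certs (defined in cluster/gce/util.sh below), so each
replica's etcd peer certificate is issued for that replica's own hostname
instead of the shared MASTER_NAME. A quick sanity check on a replica, assuming
the cfssl output path used by create-etcd-certs (a sketch, not part of the
patch):

    # Expect the replica's own name in the subject/SAN, not the bare MASTER_NAME.
    openssl x509 -in "${KUBE_TEMP}/cfssl/etcd.pem" -noout -subject -text \
      | grep -A1 'Subject Alternative Name'
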
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh
index 5602fd63bdd..9c79c955964 100755
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@@ -747,8 +747,9 @@ function get-master-disk-size() {
 #   KUBE_TEMP: temporary directory
 #
 # Args:
-#   $1: CA certificate
-#   $2: CA key
+#   $1: host name
+#   $2: CA certificate
+#   $3: CA key
 #
 # If CA cert/key is empty, the function will also generate certs for CA.
 #
@@ -759,8 +760,9 @@ function get-master-disk-size() {
 #   ETCD_PEER_CERT_BASE64
 #
 function create-etcd-certs {
-  local ca_cert=${1:-}
-  local ca_key=${2:-}
+  local host=${1}
+  local ca_cert=${2:-}
+  local ca_key=${3:-}
 
   mkdir -p "${KUBE_TEMP}/cfssl"
   pushd "${KUBE_TEMP}/cfssl"
@@ -810,8 +812,8 @@ EOF
     ./cfssl gencert -initca ca-csr.json | ./cfssljson -bare ca -
   fi
 
-  echo '{"CN":"'"${MASTER_NAME}"'","hosts":[""],"key":{"algo":"ecdsa","size":256}}' \
-    | ./cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client-server -hostname="${MASTER_NAME}" - \
+  echo '{"CN":"'"${host}"'","hosts":[""],"key":{"algo":"ecdsa","size":256}}' \
+    | ./cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client-server -hostname="${host}" - \
     | ./cfssljson -bare etcd
 
   ETCD_CA_KEY_BASE64=$(cat "ca-key.pem" | base64 | tr -d '\r\n')
@@ -878,7 +880,7 @@ function create-master() {
   MASTER_ADVERTISE_ADDRESS="${MASTER_RESERVED_IP}"
 
   create-certs "${MASTER_RESERVED_IP}"
-  create-etcd-certs
+  create-etcd-certs ${MASTER_NAME}
 
   # Sets MASTER_ROOT_DISK_SIZE that is used by create-master-instance
   get-master-root-disk-size
@@ -904,7 +906,7 @@ function add-replica-to-etcd() {
     --project "${PROJECT}" \
     --zone "${EXISTING_MASTER_ZONE}" \
     --command \
-      "curl localhost:${client_port}/v2/members -XPOST -H \"Content-Type: application/json\" -d '{\"peerURLs\":[\"https://${REPLICA_NAME}:${internal_port}\"]}'"
+      "curl localhost:${client_port}/v2/members -XPOST -H \"Content-Type: application/json\" -d '{\"peerURLs\":[\"https://${REPLICA_NAME}:${internal_port}\"]}' -s"
   return $?
 }
 
@@ -1381,7 +1383,7 @@ function kube-down() {
   if [[ "${REMAINING_MASTER_COUNT}" == "1" ]]; then
     if gcloud compute forwarding-rules describe "${MASTER_NAME}" --region "${REGION}" --project "${PROJECT}" &>/dev/null; then
       detect-master
-      local REMAINING_REPLICA_NAME="$(get-replica-name)"
+      local REMAINING_REPLICA_NAME="$(get-all-replica-names)"
       local REMAINING_REPLICA_ZONE=$(gcloud compute instances list "${REMAINING_REPLICA_NAME}" \
         --project "${PROJECT}" --format="value(zone)")
       gcloud compute forwarding-rules delete \
@@ -1476,6 +1478,21 @@ function kube-down() {
     # If there are no more remaining master replicas, we should update kubeconfig.
     export CONTEXT="${PROJECT}_${INSTANCE_PREFIX}"
     clear-kubeconfig
+  else
+    # If some master replicas remain: the cluster has been changed, so we need to re-validate it.
+    echo "... calling validate-cluster" >&2
+    # Override errexit
+    (validate-cluster) && validate_result="$?" || validate_result="$?"
+
+    # We have two different failure modes from validate-cluster:
+    # - 1: fatal error - the cluster won't be working correctly
+    # - 2: weak error - something went wrong, but the cluster will probably still work correctly
+    # We just print an error message in case 2).
+    if [[ "${validate_result}" == "1" ]]; then
+      exit 1
+    elif [[ "${validate_result}" == "2" ]]; then
+      echo "...ignoring non-fatal errors in validate-cluster" >&2
+    fi
   fi
   set -e
 }
@@ -1511,6 +1528,19 @@ function get-all-replica-names() {
     --format "value(name)" | tr "\n" "," | sed 's/,$//')
 }
 
+# Prints the number of master replicas in all zones.
+#
+# Assumed vars:
+#   MASTER_NAME
+function get-master-replicas-count() {
+  detect-project
+  local num_masters=$(gcloud compute instances list \
+    --project "${PROJECT}" \
+    --regexp "$(get-replica-name-regexp)" \
+    --format "value(zone)" | wc -l)
+  echo -n "${num_masters}"
+}
+
 # Prints regexp for full master machine name. In a cluster with replicated master,
 # VM names may either be MASTER_NAME or MASTER_NAME with a suffix for a replica.
 function get-replica-name-regexp() {
@@ -1786,7 +1816,7 @@ function test-setup() {
   # Detect the project into $PROJECT if it isn't set
   detect-project
 
-  if [[ ${MULTIZONE:-} == "true" ]]; then
+  if [[ ${MULTIZONE:-} == "true" && -n ${E2E_ZONES:-} ]]; then
     for KUBE_GCE_ZONE in ${E2E_ZONES}
     do
       KUBE_GCE_ZONE="${KUBE_GCE_ZONE}" KUBE_USE_EXISTING_MASTER="${KUBE_USE_EXISTING_MASTER:-}" "${KUBE_ROOT}/cluster/kube-up.sh"
@@ -1843,7 +1873,7 @@ function test-teardown() {
   delete-firewall-rules \
     "${NODE_TAG}-${INSTANCE_PREFIX}-http-alt" \
     "${NODE_TAG}-${INSTANCE_PREFIX}-nodeports"
-  if [[ ${MULTIZONE:-} == "true" ]]; then
+  if [[ ${MULTIZONE:-} == "true" && -n ${E2E_ZONES:-} ]]; then
     local zones=( ${E2E_ZONES} )
     # tear them down in reverse order, finally tearing down the master too
     for ((zone_num=${#zones[@]}-1; zone_num>0; zone_num--))
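
Note: kube-down runs under set -e, so the subshell-plus-assignment line in the
new else branch is what lets validate-cluster fail without aborting teardown:
exactly one of the two assignments always runs, and the captured exit code
distinguishes a fatal failure (1) from a weak one (2). A distilled sketch of
the same idiom, with illustrative messages:

    #!/usr/bin/env bash
    set -o errexit
    # The subshell isolates the failure; the &&/|| pair guarantees the
    # assignment executes whether validate-cluster succeeds or fails.
    (validate-cluster) && validate_result="$?" || validate_result="$?"
    case "${validate_result}" in
      1) echo "fatal: cluster is not healthy" >&2; exit 1 ;;
      2) echo "ignoring non-fatal validation errors" >&2 ;;
    esac
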
diff --git a/cluster/validate-cluster.sh b/cluster/validate-cluster.sh
index c10d6148467..d3727a5bb69 100755
--- a/cluster/validate-cluster.sh
+++ b/cluster/validate-cluster.sh
@@ -52,7 +52,12 @@ CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-
 
 EXPECTED_NUM_NODES="${NUM_NODES}"
 if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
-  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1))
+  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
+    NUM_MASTERS=$(get-master-replicas-count)
+  else
+    NUM_MASTERS=1
+  fi
+  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+NUM_MASTERS))
 fi
 REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
 # Make several attempts to deal with slow cluster birth.
diff --git a/hack/e2e-internal/e2e-add-master.sh b/hack/e2e-internal/e2e-add-master.sh
index 9d64a997c4b..039855726b3 100755
--- a/hack/e2e-internal/e2e-add-master.sh
+++ b/hack/e2e-internal/e2e-add-master.sh
@@ -20,6 +20,7 @@
 if [[ ! -z "${1:-}" ]]; then
   export KUBE_GCE_ZONE="${1}"
 fi
 export KUBE_REPLICATE_EXISTING_MASTER=true
+export MULTIZONE=true
 
 source "${KUBE_ROOT}/hack/e2e-internal/e2e-up.sh"
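
Note: with REGISTER_MASTER_KUBELET=true every master registers itself as a
node, so validate-cluster must expect one node per master replica rather than
a flat +1. On GCE the replica count comes from the new
get-master-replicas-count helper; other providers still assume a single
master. For example, with NUM_NODES=3 and two master replicas:

    NUM_MASTERS=$(get-master-replicas-count)          # 2 in this example
    EXPECTED_NUM_NODES=$((NUM_NODES + NUM_MASTERS))   # 3 + 2 = 5

e2e-add-master.sh additionally exports MULTIZONE=true, presumably so the
cluster scripts treat the replica's kube-up in another zone as growing an
existing multi-zone cluster rather than a fresh single-zone bring-up.
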
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index e49adaa4c0a..149ad803018 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -4011,6 +4011,43 @@ func WaitForClusterSize(c clientset.Interface, size int, timeout time.Duration)
     return fmt.Errorf("timeout waiting %v for cluster size to be %d", timeout, size)
 }
 
+// WaitForMasters waits until the cluster has the desired number of ready masters in it.
+func WaitForMasters(masterPrefix string, c clientset.Interface, size int, timeout time.Duration) error {
+    for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
+        nodes, err := c.Core().Nodes().List(api.ListOptions{})
+        if err != nil {
+            Logf("Failed to list nodes: %v", err)
+            continue
+        }
+
+        // Filter out nodes that are not master replicas
+        FilterNodes(nodes, func(node api.Node) bool {
+            res, err := regexp.Match(masterPrefix+"(-...)?", ([]byte)(node.Name))
+            if err != nil {
+                Logf("Failed to match regexp to node name: %v", err)
+                return false
+            }
+            return res
+        })
+
+        numNodes := len(nodes.Items)
+
+        // Filter out not-ready nodes.
+        FilterNodes(nodes, func(node api.Node) bool {
+            return IsNodeConditionSetAsExpected(&node, api.NodeReady, true)
+        })
+
+        numReady := len(nodes.Items)
+
+        if numNodes == size && numReady == size {
+            Logf("Cluster has reached the desired number of masters %d", size)
+            return nil
+        }
+        Logf("Waiting for the number of masters to be %d; current: %d, not ready: %d", size, numNodes, numNodes-numReady)
+    }
+    return fmt.Errorf("timeout waiting %v for the number of masters to be %d", timeout, size)
+}
+
 // GetHostExternalAddress gets the node for a pod and returns the first External
 // address. Returns an error if the node the pod is on doesn't have an External
 // address.
diff --git a/test/e2e/ha_master.go b/test/e2e/ha_master.go
index 932f92a90cc..4f29cb0b7ae 100644
--- a/test/e2e/ha_master.go
+++ b/test/e2e/ha_master.go
@@ -23,6 +23,7 @@ import (
     "path"
     "strconv"
     "strings"
+    "time"
 
     . "github.com/onsi/ginkgo"
     clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
@@ -63,7 +64,6 @@ func createNewRC(c clientset.Interface, ns string, name string) {
 func verifyNumberOfMasterReplicas(expected int) {
     output, err := exec.Command("gcloud", "compute", "instances", "list",
         "--project="+framework.TestContext.CloudConfig.ProjectID,
-        "--zones="+framework.TestContext.CloudConfig.Zone,
         "--regexp="+framework.TestContext.CloudConfig.MasterName+"(-...)?",
         "--filter=status=RUNNING",
         "--format=[no-heading]").CombinedOutput()
@@ -73,7 +73,7 @@ func verifyNumberOfMasterReplicas(expected int) {
     replicas := bytes.Count(output, newline)
     framework.Logf("Num master replicas/expected: %d/%d", replicas, expected)
     if replicas != expected {
-        framework.Failf("Wrong number of master replicas")
+        framework.Failf("Wrong number of master replicas: %d, expected: %d", replicas, expected)
     }
 }
 
@@ -131,6 +131,8 @@ var _ = framework.KubeDescribe("HA-master [Feature:HAMaster]", func() {
         for _, zone := range additionalReplicaZones {
             removeMasterReplica(zone)
         }
+        framework.WaitForMasters(framework.TestContext.CloudConfig.MasterName, c, 1, 10*time.Minute)
+        verifyNumberOfMasterReplicas(1)
     })
 
     type Action int
@@ -151,6 +153,7 @@ var _ = framework.KubeDescribe("HA-master [Feature:HAMaster]", func() {
             additionalReplicaZones = removeZoneFromZones(additionalReplicaZones, zone)
         }
         verifyNumberOfMasterReplicas(len(additionalReplicaZones) + 1)
+        framework.WaitForMasters(framework.TestContext.CloudConfig.MasterName, c, len(additionalReplicaZones)+1, 10*time.Minute)
 
         // Verify that API server works correctly with HA master.
         rcName := "ha-master-" + strconv.Itoa(len(existingRCs))
@@ -159,16 +162,19 @@ var _ = framework.KubeDescribe("HA-master [Feature:HAMaster]", func() {
         verifyRCs(c, ns, existingRCs)
     }
 
-    It("pods survive addition/removal same zone [Slow]", func() {
+    It("survive addition/removal replicas same zone [Serial][Disruptive]", func() {
         zone := framework.TestContext.CloudConfig.Zone
         step(None, "")
-        step(AddReplica, zone)
-        step(AddReplica, zone)
-        step(RemoveReplica, zone)
-        step(RemoveReplica, zone)
+        numAdditionalReplicas := 2
+        for i := 0; i < numAdditionalReplicas; i++ {
+            step(AddReplica, zone)
+        }
+        for i := 0; i < numAdditionalReplicas; i++ {
+            step(RemoveReplica, zone)
+        }
     })
 
-    It("pods survive addition/removal different zones [Slow]", func() {
+    It("survive addition/removal replicas different zones [Serial][Disruptive]", func() {
         zone := framework.TestContext.CloudConfig.Zone
         region := findRegionForZone(zone)
         zones := findZonesForRegion(region)
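
Note: verifyNumberOfMasterReplicas above and get-master-replicas-count in
cluster/gce/util.sh now agree on one source of truth: a project-wide gcloud
listing matched against MASTER_NAME plus an optional replica suffix (the
--zones restriction is gone, so replicas in any zone are counted). An
equivalent shell one-liner, assuming PROJECT and MASTER_NAME are set:

    # Count RUNNING master VMs across all zones of the project.
    gcloud compute instances list \
      --project "${PROJECT}" \
      --regexp "${MASTER_NAME}(-...)?" \
      --filter "status=RUNNING" \
      --format "value(name)" | wc -l
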