diff --git a/hack/e2e-internal/build-release.sh b/hack/e2e-internal/build-release.sh
index 019abe3adad..a27333d9419 100755
--- a/hack/e2e-internal/build-release.sh
+++ b/hack/e2e-internal/build-release.sh
@@ -21,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
diff --git a/hack/e2e-internal/e2e-cluster-size.sh b/hack/e2e-internal/e2e-cluster-size.sh
index d1544fa58cd..9e2149d9f7f 100755
--- a/hack/e2e-internal/e2e-cluster-size.sh
+++ b/hack/e2e-internal/e2e-cluster-size.sh
@@ -21,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
diff --git a/hack/e2e-internal/e2e-down.sh b/hack/e2e-internal/e2e-down.sh
index 9b175215b92..cf56a7f5ed5 100755
--- a/hack/e2e-internal/e2e-down.sh
+++ b/hack/e2e-internal/e2e-down.sh
@@ -21,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
diff --git a/hack/e2e-internal/e2e-push.sh b/hack/e2e-internal/e2e-push.sh
index 38fb988c180..02ded0aac50 100755
--- a/hack/e2e-internal/e2e-push.sh
+++ b/hack/e2e-internal/e2e-push.sh
@@ -21,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
diff --git a/hack/e2e-internal/e2e-status.sh b/hack/e2e-internal/e2e-status.sh
index e62e4a1b862..4dcf4503ba8 100755
--- a/hack/e2e-internal/e2e-status.sh
+++ b/hack/e2e-internal/e2e-status.sh
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# e2e-status checks that the status of a cluster is acceptable for running
-# e2e tests.
 set -o errexit
 set -o nounset
 set -o pipefail
@@ -23,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
@@ -34,49 +32,3 @@ source "${KUBE_VERSION_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
 prepare-e2e
 
 ${KUBECTL} version
-
-# Before running tests, ensure that all pods are 'Running'. Tests can timeout
-# and fail because the test pods don't run in time. The problem is that the pods
-# that a cluster runs on startup take too long to start running, with sequential
-# Docker pulls of large images being the culprit. These startup pods block the
-# test pods from running.
-
-# Settings:
-# timeout is in seconds; 1200 = 20 minutes.
-timeout=1200
-# pause is how many seconds to sleep between pod get calls.
-pause=5
-# min_pods is the minimum number of pods we require.
-min_pods=1
-
-# Check pod statuses.
-deadline=$(($(date '+%s')+${timeout}))
-echo "Waiting at most ${timeout} seconds for all pods to be 'Running'" >&2
-all_running=0
-until [[ ${all_running} == 1 ]]; do
-  if [[ "$(date '+%s')" -ge "${deadline}" ]]; then
-    echo "All pods never 'Running' in time." >&2
-    exit 1
-  fi
-  statuses=($(${KUBECTL} get pods --template='{{range.items}}{{.status.phase}} {{end}}' --api-version=v1beta3))
-
-  # Ensure that we have enough pods.
-  echo "Found ${#statuses[@]} pods with statuses: ${statuses[@]}" >&2
-  if [[ ${#statuses[@]} -lt ${min_pods} ]]; then
-    continue
-  fi
-
-  # Then, ensure all pods found are 'Running'.
-  found_running=1
-  for status in "${statuses[@]}"; do
-    if [[ "${status}" != "Running" ]]; then
-      # If we find a pod that isn't 'Running', sleep here to avoid delaying
-      # other code paths (where all pods are 'Running').
-      found_running=0
-      sleep ${pause}
-      break
-    fi
-  done
-  all_running=${found_running}
-done
-echo "All pods are 'Running'" >&2
diff --git a/hack/e2e-internal/e2e-up.sh b/hack/e2e-internal/e2e-up.sh
index 4a68c61462f..c39593da3df 100755
--- a/hack/e2e-internal/e2e-up.sh
+++ b/hack/e2e-internal/e2e-up.sh
@@ -21,7 +21,7 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
 
 : ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
+: ${KUBECTL:=${KUBE_VERSION_ROOT}/cluster/kubectl.sh}
 : ${KUBE_CONFIG_FILE:="config-test.sh"}
 
 export KUBECTL KUBE_CONFIG_FILE
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index 4d8e134f21a..75b36b5c82a 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -22,7 +22,9 @@ import (
 	"path"
 	"strings"
 	"testing"
+	"time"
 
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/clientcmd"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/cloudprovider"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
@@ -33,6 +35,28 @@ import (
 	"github.com/onsi/gomega"
 )
 
+const (
+	// podStartupTimeout is the time to allow all pods in the cluster to become
+	// running and ready before any e2e tests run. It includes pulling all of
+	// the pods (as of 5/18/15 this is 8 pods).
+	podStartupTimeout = 10 * time.Minute
+
+	// minStartupPods is the minimum number of pods that will allow
+	// waitForPodsRunningReady(...) to succeed. More verbosely, that function
+	// checks that all pods in the cluster are both in a phase of "running" and
+	// have a condition of "ready": "true". It aims to ensure that the cluster's
+	// pods are fully healthy before beginning e2e tests. However, if there were
+	// only 0 pods, it would technically pass if there wasn't a required minimum
+	// number of pods. We expect every cluster to come up with some number of
+	// pods (which in practice is more than this number), so we have this
+	// minimum here as a sanity check to make sure that there are actually pods
+	// on the cluster (i.e. preventing a possible race with kube-addons). This
+	// does *not* mean that the function will succeed as soon as minStartupPods
+	// are found to be running and ready; it ensures that *all* pods it finds
+	// are running and ready. This is the minimum number it must find.
+	minStartupPods = 1
+)
+
 var (
 	cloudConfig = &testContext.CloudConfig
 
@@ -92,6 +116,15 @@ func TestE2E(t *testing.T) {
 	}
 	gomega.RegisterFailHandler(ginkgo.Fail)
+
+	// Ensure all pods are running and ready before starting tests (otherwise,
+	// cluster infrastructure pods that are being pulled or started can block
+	// test pods from running, and tests that ensure all pods are running and
+	// ready will fail).
+	if err := waitForPodsRunningReady(api.NamespaceDefault, minStartupPods, podStartupTimeout); err != nil {
+		glog.Fatalf("Error waiting for all pods to be running and ready: %v", err)
+	}
+
 	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
 	var r []ginkgo.Reporter
 	if *reportDir != "" {
diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go
index a9ee269b7bf..04323894904 100644
--- a/test/e2e/reboot.go
+++ b/test/e2e/reboot.go
@@ -190,25 +190,6 @@ func rebootNode(c *client.Client, provider, name string, result chan bool) {
 	result <- true
 }
 
-// podRunningReady is the checker function passed to waitForPodCondition(...)
-// (found in util.go). It ensures that the pods' phase is running and that the
-// ready condition is true.
-func podRunningReady(p *api.Pod) (bool, error) {
-	// Check the phase is running.
-	if p.Status.Phase != api.PodRunning {
-		return false, fmt.Errorf("want pod %s on %s to be %v but was %v",
-			p.ObjectMeta.Name, p.Spec.Host, api.PodRunning, p.Status.Phase)
-	}
-	// Check the ready condition is true.
-	for _, cond := range p.Status.Conditions {
-		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
-			return true, nil
-		}
-	}
-	return false, fmt.Errorf("pod %s on %s didn't have condition %v, %v; conditions: %v",
-		p.ObjectMeta.Name, p.Spec.Host, api.PodReady, api.ConditionTrue, p.Status.Conditions)
-}
-
 // checkPodsRunning returns whether all pods whose names are listed in podNames
 // are running.
 func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
diff --git a/test/e2e/util.go b/test/e2e/util.go
index 1da46386698..60f60443826 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -105,6 +105,83 @@ func providerIs(providers ...string) bool {
 
 type podCondition func(pod *api.Pod) (bool, error)
 
+// podReady returns whether pod has a condition of Ready with a status of true.
+func podReady(pod *api.Pod) bool {
+	for _, cond := range pod.Status.Conditions {
+		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
+// logPodStates logs all pod states for debugging.
+func logPodStates(c *client.Client, ns string) {
+	podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
+	if err != nil {
+		Logf("Error getting pods for logPodStates(...): %v", err)
+		return
+	}
+	Logf("Phase and conditions for all pods in namespace '%s':", ns)
+	for _, pod := range podList.Items {
+		Logf("- pod '%s' on '%s' has phase '%v' and conditions %v",
+			pod.ObjectMeta.Name, pod.Spec.Host, pod.Status.Phase, pod.Status.Conditions)
+	}
+}
+
+// podRunningReady checks whether pod p's phase is running and it has a ready
+// condition of status true.
+func podRunningReady(p *api.Pod) (bool, error) {
+	// Check the phase is running.
+	if p.Status.Phase != api.PodRunning {
+		return false, fmt.Errorf("want pod '%s' on '%s' to be '%v' but was '%v'",
+			p.ObjectMeta.Name, p.Spec.Host, api.PodRunning, p.Status.Phase)
+	}
+	// Check the ready condition is true.
+	if !podReady(p) {
+		return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v",
+			p.ObjectMeta.Name, p.Spec.Host, api.PodReady, api.ConditionTrue, p.Status.Conditions)
+
+	}
+	return true, nil
+}
+
+// waitForPodsRunningReady waits up to timeout to ensure that all pods in
+// namespace ns are running and ready, requiring that it finds at least minPods.
+// It has separate behavior from other 'wait for' pods functions in that it re-
+// queries the list of pods on every iteration. This is useful, for example, in
+// cluster startup, because the number of pods increases while waiting.
+func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) error {
+	c, err := loadClient()
+	if err != nil {
+		return err
+	}
+	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
+		timeout, minPods, ns)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) {
+		// We get the new list of pods in every iteration because more pods come
+		// online during startup and we want to ensure they are also checked.
+		podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
+		if err != nil {
+			Logf("Error getting pods in namespace '%s': %v", ns, err)
+			continue
+		}
+		nOk := 0
+		for _, pod := range podList.Items {
+			if res, err := podRunningReady(&pod); res && err == nil {
+				nOk++
+			}
+		}
+		Logf("%d / %d pods in namespace '%s' are running and ready (%v elapsed)",
+			nOk, len(podList.Items), ns, time.Since(start))
+		if nOk == len(podList.Items) && nOk >= minPods {
+			return nil
+		}
+	}
+	logPodStates(c, ns)
+	return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
+}
+
 func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
 	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
@@ -117,9 +194,10 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeo
 		if done {
 			return err
 		}
-		Logf("Waiting for pod %s in namespace %s status to be %q (found %q) (%v)", podName, ns, desc, pod.Status.Phase, time.Since(start))
+		Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)",
+			podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
 	}
-	return fmt.Errorf("gave up waiting for pod %s to be %s after %v", podName, desc, timeout)
+	return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout)
 }
 
 // createNS should be used by every test, note that we append a common prefix to the provided test name.
@@ -149,7 +227,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
 func waitForPodNotPending(c *client.Client, ns, podName string) error {
 	return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		if pod.Status.Phase != api.PodPending {
-			Logf("Saw pod %s in namespace %s out of pending state (found %q)", podName, ns, pod.Status.Phase)
+			Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase)
 			return true, nil
 		}
 		return false, nil
@@ -162,17 +240,17 @@ func waitForPodSuccessInNamespace(c *client.Client, podName string, contName str
 		// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
 		ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
 		if !ok {
-			Logf("No Status.Info for container %s in pod %s yet", contName, podName)
+			Logf("No Status.Info for container '%s' in pod '%s' yet", contName, podName)
 		} else {
 			if ci.State.Termination != nil {
 				if ci.State.Termination.ExitCode == 0 {
 					By("Saw pod success")
 					return true, nil
 				} else {
-					return true, fmt.Errorf("pod %s terminated with failure: %+v", podName, ci.State.Termination)
+					return true, fmt.Errorf("pod '%s' terminated with failure: %+v", podName, ci.State.Termination)
 				}
 			} else {
-				Logf("Nil State.Termination for container %s in pod %s in namespace %s so far", contName, podName, namespace)
+				Logf("Nil State.Termination for container '%s' in pod '%s' in namespace '%s' so far", contName, podName, namespace)
 			}
 		}
 		return false, nil