Revert "Revert "Added metrics/debug gathering methods to utils and used them in density ...""

This reverts commit 70500a64a7.
Author: Robert Rati
Date:   2015-05-26 10:24:46 -04:00
Parent: 1845ca88fc
Commit: 13b8d947fc
5 changed files with 333 additions and 95 deletions


@@ -19,8 +19,10 @@ package e2e
 import (
 	"bytes"
 	"fmt"
 	"io/ioutil"
+	"math"
 	"math/rand"
+	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -132,6 +134,16 @@ func (s *podStore) Stop() {
 	close(s.stopCh)
 }
 
+type RCConfig struct {
+	Client        *client.Client
+	Image         string
+	Name          string
+	Namespace     string
+	PollInterval  int
+	PodStatusFile *os.File
+	Replicas      int
+}
+
 func Logf(format string, a ...interface{}) {
 	fmt.Fprintf(GinkgoWriter, "INFO: "+format+"\n", a...)
 }
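
Callers therefore switch from positional arguments to building an RCConfig. A minimal sketch of the new call pattern; the concrete values (image, interval, replica count) and the error handling are invented for illustration, not taken from this diff:

	cfg := RCConfig{
		Client:       c,                  // e2e client for the cluster under test
		Image:        "kubernetes/pause", // example image; density tests use a pause-style image
		Name:         "density-test-rc",
		Namespace:    ns,
		PollInterval: 10,  // seconds between pod-list snapshots
		Replicas:     100,
		// PodStatusFile is optional; when non-nil, RunRC appends one CSV line
		// of pod-state counts per snapshot.
	}
	if err := RunRC(cfg); err != nil {
		Logf("RunRC failed: %v", err)
	}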
@@ -703,12 +715,20 @@ func Diff(oldPods []*api.Pod, curPods []*api.Pod) PodDiff {
 // It waits for all pods it spawns to become "Running".
 // It's the caller's responsibility to clean up externally (i.e. use the
 // namespace lifecycle for handling cleanup).
-func RunRC(c *client.Client, name string, ns, image string, replicas int) error {
+func RunRC(config RCConfig) error {
 	var last int
+	c := config.Client
+	name := config.Name
+	ns := config.Namespace
+	image := config.Image
+	replicas := config.Replicas
+	interval := config.PollInterval
+	maxContainerFailures := int(math.Max(1.0, float64(replicas)*.01))
 	current := 0
 	same := 0
+	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
+	podLists := newFifoQueue()
 
 	By(fmt.Sprintf("%v Creating replication controller %s", time.Now(), name))
 	rc := &api.ReplicationController{
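
newFifoQueue, podLists.Push/Pop/Len/Reset, and the item.value/item.createTime fields used below come from a small queue type added in one of the other changed files of this commit, so its definition does not appear in this file's diff. A sketch of what that type must look like, inferred purely from its usage here (field types and the lock are assumptions):

	// Sketch only: the real fifoQueue ships in another file of this commit.
	type queueItem struct {
		createTime string      // timestamp taken at Push, printed in the CSV status line
		value      interface{} // here: []*api.Pod snapshots
	}

	type fifoQueue struct {
		lock  sync.Mutex
		items []queueItem
	}

	func newFifoQueue() *fifoQueue { return &fifoQueue{} }

	func (q *fifoQueue) Push(v interface{}) {
		q.lock.Lock()
		defer q.lock.Unlock()
		q.items = append(q.items, queueItem{createTime: time.Now().String(), value: v})
	}

	// Pop is only called after checking Len() > 0, so it need not handle empty queues.
	func (q *fifoQueue) Pop() queueItem {
		q.lock.Lock()
		defer q.lock.Unlock()
		item := q.items[0]
		q.items = q.items[1:]
		return item
	}

	func (q *fifoQueue) Len() int { q.lock.Lock(); defer q.lock.Unlock(); return len(q.items) }

	func (q *fifoQueue) Reset() { q.lock.Lock(); defer q.lock.Unlock(); q.items = nil }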
@@ -741,32 +761,49 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 		return fmt.Errorf("Error creating replication controller: %v", err)
 	}
 	Logf("%v Created replication controller with name: %v, namespace: %v, replica count: %v", time.Now(), rc.Name, ns, rc.Spec.Replicas)
-	By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
-	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
 	podStore := newPodStore(c, ns, label, fields.Everything())
 	defer podStore.Stop()
-	pods := podStore.List()
-	current = len(pods)
-	failCount := 24
+
+	// Create a routine to query for the list of pods
+	stop := make(chan struct{})
+	go func(stop <-chan struct{}, n string, ns string, l labels.Selector, i int) {
+		for {
+			select {
+			case <-stop:
+				return
+			default:
+				podLists.Push(podStore.List())
+				time.Sleep(time.Duration(i) * time.Second)
+			}
+		}
+	}(stop, name, ns, label, interval)
+	defer close(stop)
+
+	By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
+	failCount := int(120.0 / float64(interval))
 	for same < failCount && current < replicas {
-		Logf("%v Controller %s: Found %d pods out of %d", time.Now(), name, current, replicas)
-		if last < current {
-			same = 0
-		} else if last == current {
-			same++
-		} else if current < last {
-			return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
-		}
-		if same >= failCount {
-			return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
-		}
-		last = current
-		time.Sleep(5 * time.Second)
-		pods = podStore.List()
-		current = len(pods)
+		time.Sleep(time.Duration(float32(interval)*1.1) * time.Second)
+
+		// Greedily read all existing entries in the queue until
+		// all pods are found submitted or the queue is empty
+		for podLists.Len() > 0 && current < replicas {
+			item := podLists.Pop()
+			pods := item.value.([]*api.Pod)
+			current = len(pods)
+			Logf("%v Controller %s: Found %d pods out of %d", time.Now(), name, current, replicas)
+			if last < current {
+				same = 0
+			} else if last == current {
+				same++
+			} else if current < last {
+				return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
+			}
+			if same >= failCount {
+				return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
+			}
+			last = current
+		}
 	}
 	if current != replicas {
 		return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas)
@@ -776,77 +813,92 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 	By(fmt.Sprintf("%v Waiting for all %d replicas to be running with a max container failures of %d", time.Now(), replicas, maxContainerFailures))
 	same = 0
 	last = 0
-	failCount = 20
+	failCount = int(100.0 / float64(interval))
 	current = 0
-	oldPods := make([]*api.Pod, 0)
+	var oldPods []*api.Pod
+	podLists.Reset()
+	foundAllPods := false
 	for same < failCount && current < replicas {
-		current = 0
-		waiting := 0
-		pending := 0
-		unknown := 0
-		inactive := 0
-		failedContainers := 0
-		time.Sleep(5 * time.Second)
-		currentPods := podStore.List()
-		for _, p := range currentPods {
-			if p.Status.Phase == api.PodRunning {
-				current++
-				for _, v := range FailedContainers(*p) {
-					failedContainers = failedContainers + v.restarts
-				}
-			} else if p.Status.Phase == api.PodPending {
-				if p.Spec.NodeName == "" {
-					waiting++
-				} else {
-					pending++
-				}
-			} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
-				inactive++
-			} else if p.Status.Phase == api.PodUnknown {
-				unknown++
-			}
-		}
-		Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
-		if len(currentPods) != len(pods) {
-			// This failure mode includes:
-			// kubelet is dead, so node controller deleted pods and rc creates more
-			// - diagnose by noting the pod diff below.
-			// pod is unhealthy, so replication controller creates another to take its place
-			// - diagnose by comparing the previous "2 Pod states" lines for inactive pods
-			errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods), len(pods))
-			Logf("%v, pods that changed since the last iteration:", errorStr)
-			Diff(oldPods, currentPods).Print(util.NewStringSet())
-			return fmt.Errorf(errorStr)
-		}
-		if last < current {
-			same = 0
-		} else if last == current {
-			same++
-		} else if current < last {
-			// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
-			errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
-			Logf("%v, pods that changed since the last iteration:", errorStr)
-			Diff(oldPods, currentPods).Print(util.NewStringSet())
-			return fmt.Errorf(errorStr)
-		}
-		if same >= failCount {
-			// Most times this happens because a few nodes have kubelet problems, and their pods are
-			// stuck in pending.
-			errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
-			Logf("%v, pods currently in pending:", errorStr)
-			Diff(currentPods, make([]*api.Pod, 0)).Print(util.NewStringSet(string(api.PodRunning)))
-			return fmt.Errorf(errorStr)
-		}
-		last = current
-		oldPods = currentPods
-		if failedContainers > maxContainerFailures {
-			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
-		}
+		time.Sleep(time.Duration(float32(interval)*1.1) * time.Second)
+
+		// Greedily read all existing entries in the queue until
+		// either all pods are running or the queue is empty
+		for podLists.Len() > 0 && current < replicas {
+			item := podLists.Pop()
+			current = 0
+			waiting := 0
+			pending := 0
+			unknown := 0
+			inactive := 0
+			failedContainers := 0
+			currentPods := item.value.([]*api.Pod)
+			for _, p := range currentPods {
+				if p.Status.Phase == api.PodRunning {
+					current++
+					for _, v := range FailedContainers(p) {
+						failedContainers = failedContainers + v.restarts
+					}
+				} else if p.Status.Phase == api.PodPending {
+					if p.Spec.NodeName == "" {
+						waiting++
+					} else {
+						pending++
+					}
+				} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
+					inactive++
+				} else if p.Status.Phase == api.PodUnknown {
+					unknown++
+				}
+			}
+			Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
+			if config.PodStatusFile != nil {
+				fmt.Fprintf(config.PodStatusFile, "%s, %d, running, %d, pending, %d, waiting, %d, inactive, %d, unknown\n", item.createTime, current, pending, waiting, inactive, unknown)
+			}
+			if foundAllPods && len(currentPods) != len(oldPods) {
+				// This failure mode includes:
+				// kubelet is dead, so node controller deleted pods and rc creates more
+				// - diagnose by noting the pod diff below.
+				// pod is unhealthy, so replication controller creates another to take its place
+				// - diagnose by comparing the previous "2 Pod states" lines for inactive pods
+				errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods), len(oldPods))
+				Logf("%v, pods that changed since the last iteration:", errorStr)
+				Diff(oldPods, currentPods).Print(util.NewStringSet())
+				return fmt.Errorf(errorStr)
+			}
+			if last < current {
+				same = 0
+			} else if last == current {
+				same++
+			} else if current < last {
+				// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
+				errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
+				Logf("%v, pods that changed since the last iteration:", errorStr)
+				Diff(oldPods, currentPods).Print(util.NewStringSet())
+				return fmt.Errorf(errorStr)
+			}
+			if same >= failCount {
+				// Most times this happens because a few nodes have kubelet problems, and their pods are
+				// stuck in pending.
+				errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
+				Logf("%v, pods currently in pending:", errorStr)
+				Diff(currentPods, make([]*api.Pod, 0)).Print(util.NewStringSet(string(api.PodRunning)))
+				return fmt.Errorf(errorStr)
+			}
+			if !foundAllPods {
+				foundAllPods = len(currentPods) == replicas
+			}
+			last = current
+			oldPods = currentPods
+			if failedContainers > maxContainerFailures {
+				return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
+			}
+		}
 	}
 	if current != replicas {
@@ -908,7 +960,7 @@ func DeleteRC(c *client.Client, ns, name string) error {
 // information for containers that have failed or been restarted.
 // A map is returned where the key is the containerID and the value is a
 // struct containing the restart and failure information
-func FailedContainers(pod api.Pod) map[string]ContainerFailures {
+func FailedContainers(pod *api.Pod) map[string]ContainerFailures {
 	var state ContainerFailures
 	states := make(map[string]ContainerFailures)
@@ -1048,7 +1100,7 @@ func (a LatencyMetricByLatency) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 func (a LatencyMetricByLatency) Less(i, j int) bool { return a[i].Latency < a[j].Latency }
 
 func ReadLatencyMetrics(c *client.Client) ([]LatencyMetric, error) {
-	body, err := c.Get().AbsPath("/metrics").DoRaw()
+	body, err := getMetrics(c)
 	if err != nil {
 		return nil, err
 	}
@@ -1118,3 +1170,74 @@ func HighLatencyRequests(c *client.Client, threshold time.Duration, ignoredResou
 	return len(badMetrics), nil
 }
+
+// Retrieve metrics information
+func getMetrics(c *client.Client) (string, error) {
+	body, err := c.Get().AbsPath("/metrics").DoRaw()
+	if err != nil {
+		return "", err
+	}
+	return string(body), nil
+}
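
getMetrics returns the master's /metrics body verbatim, i.e. Prometheus text exposition format; the entries ReadLatencyMetrics extracts from it look roughly like the line below (the metric name matches apiserver instrumentation of this era, and the value is invented):

	apiserver_request_latencies_summary{resource="pods",verb="POST",quantile="0.99"} 127532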
+
+// Retrieve debug information
+func getDebugInfo(c *client.Client) (map[string]string, error) {
+	data := make(map[string]string)
+	for _, key := range []string{"block", "goroutine", "heap", "threadcreate"} {
+		resp, err := http.Get(c.Get().AbsPath(fmt.Sprintf("debug/pprof/%s", key)).URL().String() + "?debug=2")
+		if err != nil {
+			Logf("Warning: Error trying to fetch %s debug data: %v", key, err)
+			continue
+		}
+		body, err := ioutil.ReadAll(resp.Body)
+		resp.Body.Close()
+		if err != nil {
+			Logf("Warning: Error trying to read %s debug data: %v", key, err)
+			continue
+		}
+		data[key] = string(body)
+	}
+	return data, nil
+}
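
Each iteration fetches one of the master's pprof endpoints as text, i.e. a URL of the form https://&lt;master&gt;/debug/pprof/goroutine?debug=2; debug=2 selects the human-readable dump (for the goroutine profile, full stack traces) rather than the binary profile, which is what makes the result worth writing to a .txt file.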
+
+func writePerfData(c *client.Client, dirName string, postfix string) error {
+	fname := fmt.Sprintf("%s/metrics_%s.txt", dirName, postfix)
+	handler, err := os.Create(fname)
+	if err != nil {
+		return fmt.Errorf("Error creating file '%s': %v", fname, err)
+	}
+
+	metrics, err := getMetrics(c)
+	if err != nil {
+		return fmt.Errorf("Error retrieving metrics: %v", err)
+	}
+
+	_, err = handler.WriteString(metrics)
+	if err != nil {
+		return fmt.Errorf("Error writing metrics: %v", err)
+	}
+
+	err = handler.Close()
+	if err != nil {
+		return fmt.Errorf("Error closing '%s': %v", fname, err)
+	}
+
+	debug, err := getDebugInfo(c)
+	if err != nil {
+		return fmt.Errorf("Error retrieving debug information: %v", err)
+	}
+
+	for key, value := range debug {
+		fname := fmt.Sprintf("%s/%s_%s.txt", dirName, key, postfix)
+		handler, err = os.Create(fname)
+		if err != nil {
+			return fmt.Errorf("Error creating file '%s': %v", fname, err)
+		}
+		_, err = handler.WriteString(value)
+		if err != nil {
+			return fmt.Errorf("Error writing %s: %v", key, err)
+		}
+		err = handler.Close()
+		if err != nil {
+			return fmt.Errorf("Error closing '%s': %v", fname, err)
+		}
+	}
+	return nil
+}
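
writePerfData is consumed by the density test changes elsewhere in this commit, which are not shown in this file's diff. An illustrative call, with the output directory and postfix invented for the example:

	// Hypothetical call site and output directory; the real wiring is in
	// the density test changes that are part of this commit.
	if err := writePerfData(c, "/tmp/e2e-perf", fmt.Sprintf("density-%d", replicas)); err != nil {
		Logf("Error writing perf data: %v", err)
	}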