From 70500a64a71ecf78de7a6d3adf50950a6c771405 Mon Sep 17 00:00:00 2001
From: Filip Grzadkowski
Date: Fri, 22 May 2015 13:00:46 -0700
Subject: [PATCH] Revert "Added metrics/debug gathering methods to utils and
 used them in density ..."

---
 test/e2e/density.go    |  34 +----
 test/e2e/fifo_queue.go |  75 ----------
 test/e2e/load.go       |   8 +-
 test/e2e/scale.go      |  10 +-
 test/e2e/util.go       | 310 +++++++++++++----------------------
 5 files changed, 106 insertions(+), 331 deletions(-)
 delete mode 100644 test/e2e/fifo_queue.go

diff --git a/test/e2e/density.go b/test/e2e/density.go
index 90c02056977..6a5fe3e67b8 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -19,7 +19,6 @@ package e2e
 import (
 	"fmt"
 	"math"
-	"os"
 	"strconv"
 	"time"
 
@@ -46,7 +45,6 @@ var _ = Describe("Density", func() {
 	var minionCount int
 	var RCName string
 	var ns string
-	var uuid string
 
 	BeforeEach(func() {
 		var err error
@@ -59,9 +57,6 @@ var _ = Describe("Density", func() {
 		nsForTesting, err := createTestingNS("density", c)
 		ns = nsForTesting.Name
 		expectNoError(err)
-		uuid = string(util.NewUUID())
-		expectNoError(os.Mkdir(uuid, 0777))
-		expectNoError(writePerfData(c, uuid, "before"))
 	})
 
 	AfterEach(func() {
@@ -81,8 +76,6 @@ var _ = Describe("Density", func() {
 			Failf("Couldn't delete ns %s", err)
 		}
 
-		expectNoError(writePerfData(c, uuid, "after"))
-
 		// Verify latency metrics
 		// TODO: Update threshold to 1s once we reach this goal
 		// TODO: We should reset metrics before the test. Currently previous tests influence latency metrics.
@@ -96,18 +89,16 @@ var _ = Describe("Density", func() {
 	type Density struct {
 		skip          bool
 		podsPerMinion int
-		/* Controls how often the apiserver is polled for pods */
-		interval int
 	}
 
 	densityTests := []Density{
 		// This test should always run, even if larger densities are skipped.
-		{podsPerMinion: 3, skip: false, interval: 10},
-		{podsPerMinion: 30, skip: false, interval: 10},
+		{podsPerMinion: 3, skip: false},
+		{podsPerMinion: 30, skip: false},
 		// More than 30 pods per node is outside our v1.0 goals.
 		// We might want to enable those tests in the future.
-		{podsPerMinion: 50, skip: true, interval: 10},
-		{podsPerMinion: 100, skip: true, interval: 1},
+		{podsPerMinion: 50, skip: true},
+		{podsPerMinion: 100, skip: true},
 	}
 
 	for _, testArg := range densityTests {
@@ -121,19 +112,8 @@ var _ = Describe("Density", func() {
 		itArg := testArg
 		It(name, func() {
 			totalPods := itArg.podsPerMinion * minionCount
-			RCName = "density" + strconv.Itoa(totalPods) + "-" + uuid
-			fileHndl, err := os.Create(fmt.Sprintf("%s/pod_states.csv", uuid))
-			expectNoError(err)
-			defer fileHndl.Close()
-
-			config := RCConfig{Client: c,
-				Image:         "gcr.io/google_containers/pause:go",
-				Name:          RCName,
-				Namespace:     ns,
-				PollInterval:  itArg.interval,
-				PodStatusFile: fileHndl,
-				Replicas:      totalPods,
-			}
+			nameStr := strconv.Itoa(totalPods) + "-" + string(util.NewUUID())
+			RCName = "my-hostname-density" + nameStr
 
 			// Create a listener for events.
 			events := make([](*api.Event), 0)
@@ -159,7 +139,7 @@ var _ = Describe("Density", func() {
 
 			// Start the replication controller.
 			startTime := time.Now()
-			expectNoError(RunRC(config))
+			expectNoError(RunRC(c, RCName, ns, "gcr.io/google_containers/pause:go", totalPods))
 			e2eStartupTime := time.Now().Sub(startTime)
 			Logf("E2E startup time for %d pods: %v", totalPods, e2eStartupTime)
diff --git a/test/e2e/fifo_queue.go b/test/e2e/fifo_queue.go
deleted file mode 100644
index 0942c37df72..00000000000
--- a/test/e2e/fifo_queue.go
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-Copyright 2014 The Kubernetes Authors All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package e2e
-
-import (
-	"sync"
-	"time"
-)
-
-type QueueItem struct {
-	createTime string
-	value      interface{}
-}
-
-type QueueItems struct {
-	pos   int
-	mutex *sync.Mutex
-	list  []QueueItem
-}
-
-type FifoQueue QueueItems
-
-func (fq *FifoQueue) Push(elem interface{}) {
-	fq.mutex.Lock()
-	fq.list = append(fq.list, QueueItem{time.Now().String(), elem})
-	fq.mutex.Unlock()
-}
-
-func (fq *FifoQueue) Pop() QueueItem {
-	fq.mutex.Lock()
-	var val QueueItem
-	if len(fq.list)-1 >= fq.pos {
-		val = fq.list[fq.pos]
-		fq.pos++
-	}
-	fq.mutex.Unlock()
-	return val
-}
-
-func (fq FifoQueue) Len() int {
-	return len(fq.list[fq.pos:])
-}
-
-func (fq *FifoQueue) First() QueueItem {
-	return fq.list[fq.pos]
-}
-
-func (fq *FifoQueue) Last() QueueItem {
-	return fq.list[len(fq.list)-1]
-}
-
-func (fq *FifoQueue) Reset() {
-	fq.pos = 0
-}
-
-func newFifoQueue() *FifoQueue {
-	tmp := new(FifoQueue)
-	tmp.mutex = &sync.Mutex{}
-	tmp.pos = 0
-	return tmp
-}
diff --git a/test/e2e/load.go b/test/e2e/load.go
index 482382f0b5d..8e3dad52431 100644
--- a/test/e2e/load.go
+++ b/test/e2e/load.go
@@ -120,13 +120,7 @@ func playWithRC(c *client.Client, wg *sync.WaitGroup, ns, name string, size int)
 	// Once every 1-2 minutes perform resize of RC.
 	for start := time.Now(); time.Since(start) < simulationTime; time.Sleep(time.Duration(60+rand.Intn(60)) * time.Second) {
 		if !rcExist {
-			config := RCConfig{Client: c,
-				Name:      name,
-				Namespace: ns,
-				Image:     image,
-				Replicas:  size,
-			}
-			expectNoError(RunRC(config), fmt.Sprintf("creating rc %s in namespace %s", name, ns))
+			expectNoError(RunRC(c, name, ns, image, size), fmt.Sprintf("creating rc %s in namespace %s", name, ns))
 			rcExist = true
 		}
 		// Resize RC to a random size between 0.5x and 1.5x of the original size.
diff --git a/test/e2e/scale.go b/test/e2e/scale.go
index 52f1496b6fb..a787c28f731 100644
--- a/test/e2e/scale.go
+++ b/test/e2e/scale.go
@@ -107,15 +107,7 @@ var _ = Describe("Scale", func() {
 			for i := 0; i < itArg.rcsPerThread; i++ {
 				name := "my-short-lived-pod" + string(util.NewUUID())
 				n := itArg.podsPerMinion * minionCount
-
-				config := RCConfig{Client: c,
-					Name:      name,
-					Namespace: ns,
-					Image:     "gcr.io/google_containers/pause:go",
-					Replicas:  n,
-				}
-
-				expectNoError(RunRC(config))
+				expectNoError(RunRC(c, name, ns, "gcr.io/google_containers/pause:go", n))
 				podsLaunched += n
 				Logf("Launched %v pods so far...", podsLaunched)
 				err := DeleteRC(c, ns, name)
diff --git a/test/e2e/util.go b/test/e2e/util.go
index ea3386422e7..d45dbaf0eb3 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -22,7 +22,6 @@ import (
 	"io/ioutil"
 	"math"
 	"math/rand"
-	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -87,16 +86,6 @@ type ContainerFailures struct {
 	restarts int
 }
 
-type RCConfig struct {
-	Client        *client.Client
-	Image         string
-	Name          string
-	Namespace     string
-	PollInterval  int
-	PodStatusFile *os.File
-	Replicas      int
-}
-
 func Logf(format string, a ...interface{}) {
 	fmt.Fprintf(GinkgoWriter, "INFO: "+format+"\n", a...)
 }
@@ -596,16 +585,16 @@ func (p PodDiff) Print(ignorePhases util.StringSet) {
 }
 
 // Diff computes a PodDiff given 2 lists of pods.
-func Diff(oldPods []api.Pod, curPods []api.Pod) PodDiff {
+func Diff(oldPods *api.PodList, curPods *api.PodList) PodDiff {
 	podInfoMap := PodDiff{}
 
 	// New pods will show up in the curPods list but not in oldPods. They have oldhostname/phase == nonexist.
-	for _, pod := range curPods {
+	for _, pod := range curPods.Items {
 		podInfoMap[pod.Name] = &podInfo{hostname: pod.Spec.Host, phase: string(pod.Status.Phase), oldHostname: nonExist, oldPhase: nonExist}
 	}
 
 	// Deleted pods will show up in the oldPods list but not in curPods. They have a hostname/phase == nonexist.
-	for _, pod := range oldPods {
+	for _, pod := range oldPods.Items {
 		if info, ok := podInfoMap[pod.Name]; ok {
 			info.oldHostname, info.oldPhase = pod.Spec.Host, string(pod.Status.Phase)
 		} else {
@@ -619,20 +608,12 @@ func Diff(oldPods []api.Pod, curPods []api.Pod) PodDiff {
 // It will wait for all pods it spawns to become "Running".
 // It's the caller's responsibility to clean up externally (i.e. use the
 // namespace lifecycle for handling cleanup).
-func RunRC(config RCConfig) error {
+func RunRC(c *client.Client, name string, ns, image string, replicas int) error {
 	var last int
-	c := config.Client
-	name := config.Name
-	ns := config.Namespace
-	image := config.Image
-	replicas := config.Replicas
-	interval := config.PollInterval
 
 	maxContainerFailures := int(math.Max(1.0, float64(replicas)*.01))
 	current := 0
 	same := 0
-	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
-	podLists := newFifoQueue()
 
 	By(fmt.Sprintf("Creating replication controller %s", name))
 	rc := &api.ReplicationController{
@@ -666,52 +647,35 @@
 	}
 	Logf("Created replication controller with name: %v, namespace: %v, replica count: %v", rc.Name, ns, rc.Spec.Replicas)
 
-	// Create a routine to query for the list of pods
-	stop := make(chan struct{})
-	go func(stop <-chan struct{}, n string, ns string, l labels.Selector, i int) {
-		for {
-			select {
-			case <-stop:
-				return
-			default:
-				p, err := c.Pods(ns).List(l, fields.Everything())
-				if err != nil {
-					Logf("Warning: Failed to get pod list: %v", err)
-				} else {
-					podLists.Push(p.Items)
-				}
-				time.Sleep(time.Duration(i) * time.Second)
-			}
-		}
-	}(stop, name, ns, label, interval)
-	defer close(stop)
-
 	By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
-	failCount := int(25 / interval)
+	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
+	pods, err := listPods(c, ns, label, fields.Everything())
+	if err != nil {
+		return fmt.Errorf("Error listing pods: %v", err)
+	}
+	current = len(pods.Items)
+	failCount := 5
 	for same < failCount && current < replicas {
-		time.Sleep(time.Duration(float32(interval)*1.1) * time.Second)
-
-		// Greedily read all existing entries in the queue until
-		// all pods are found submitted or the queue is empty
-		for podLists.Len() > 0 && current < replicas {
-			item := podLists.Pop()
-			pods := item.value.([]api.Pod)
-			current = len(pods)
-			Logf("Controller %s: Found %d pods out of %d", name, current, replicas)
-			if last < current {
-				same = 0
-			} else if last == current {
-				same++
-			} else if current < last {
-				return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
-			}
-
-			if same >= failCount {
-				return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
-			}
-
-			last = current
+		Logf("Controller %s: Found %d pods out of %d", name, current, replicas)
+		if last < current {
+			same = 0
+		} else if last == current {
+			same++
+		} else if current < last {
+			return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
 		}
+
+		if same >= failCount {
+			return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
+		}
+
+		last = current
+		time.Sleep(5 * time.Second)
+		pods, err = listPods(c, ns, label, fields.Everything())
+		if err != nil {
+			return fmt.Errorf("Error listing pods: %v", err)
+		}
+		current = len(pods.Items)
 	}
 	if current != replicas {
 		return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas)
@@ -721,91 +685,82 @@ func RunRC(config RCConfig) error {
 	By(fmt.Sprintf("Waiting for all %d replicas to be running with a max container failures of %d", replicas, maxContainerFailures))
 	same = 0
 	last = 0
-	failCount = int(100 / interval)
+	failCount = 10
 	current = 0
-	var oldPods []api.Pod
-	podLists.Reset()
-	foundAllPods := false
+	oldPods := &api.PodList{}
 	for same < failCount && current < replicas {
-		time.Sleep(time.Duration(float32(interval)*1.1) * time.Second)
+		current = 0
+		waiting := 0
+		pending := 0
+		unknown := 0
+		inactive := 0
+		failedContainers := 0
+		time.Sleep(10 * time.Second)
 
-		// Greedily read all existing entries in the queue until
-		// either all pods are running or the queue is empty
-		for podLists.Len() > 0 && current < replicas {
-			item := podLists.Pop()
-			current = 0
-			waiting := 0
-			pending := 0
-			unknown := 0
-			inactive := 0
-			failedContainers := 0
-			currentPods := item.value.([]api.Pod)
-			for _, p := range currentPods {
-				if p.Status.Phase == api.PodRunning {
-					current++
-					for _, v := range FailedContainers(p) {
-						failedContainers = failedContainers + v.restarts
-					}
-				} else if p.Status.Phase == api.PodPending {
-					if p.Spec.Host == "" {
-						waiting++
-					} else {
-						pending++
-					}
-				} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
-					inactive++
-				} else if p.Status.Phase == api.PodUnknown {
-					unknown++
+		// TODO: Use a reflector both to put less strain on the cluster and
+		// for more clarity.
+		currentPods, err := listPods(c, ns, label, fields.Everything())
+		if err != nil {
+			return fmt.Errorf("Error listing pods: %v", err)
+		}
+		for _, p := range currentPods.Items {
+			if p.Status.Phase == api.PodRunning {
+				current++
+				for _, v := range FailedContainers(p) {
+					failedContainers = failedContainers + v.restarts
 				}
+			} else if p.Status.Phase == api.PodPending {
+				if p.Spec.Host == "" {
+					waiting++
+				} else {
+					pending++
+				}
+			} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
+				inactive++
+			} else if p.Status.Phase == api.PodUnknown {
+				unknown++
 			}
-			Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
-			if config.PodStatusFile != nil {
-				fmt.Fprintf(config.PodStatusFile, "%s, %d, running, %d, pending, %d, waiting, %d, inactive, %d, unknown\n", item.createTime, current, pending, waiting, inactive, unknown)
-			}
+		}
+		Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
 
-			if foundAllPods && len(currentPods) != len(oldPods) {
+		if len(currentPods.Items) != len(pods.Items) {
 
-				// This failure mode includes:
-				// kubelet is dead, so node controller deleted pods and rc creates more
-				//   - diagnose by noting the pod diff below.
+			// pod is unhealthy, so replication controller creates another to take its place
+			//   - diagnose by comparing the previous "2 Pod states" lines for inactive pods
+			errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods.Items), len(pods.Items))
+			Logf("%v, pods that changed since the last iteration:", errorStr)
+			Diff(oldPods, currentPods).Print(util.NewStringSet())
+			return fmt.Errorf(errorStr)
+		}
+		if last < current {
+			same = 0
+		} else if last == current {
+			same++
+		} else if current < last {
 
-				// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
-				errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
-				Logf("%v, pods that changed since the last iteration:", errorStr)
-				Diff(oldPods, currentPods).Print(util.NewStringSet())
-				return fmt.Errorf(errorStr)
-			}
-			if same >= failCount {
+			// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
+			errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
+			Logf("%v, pods that changed since the last iteration:", errorStr)
+			Diff(oldPods, currentPods).Print(util.NewStringSet())
+			return fmt.Errorf(errorStr)
+		}
+		if same >= failCount {
 
-				// Most times this happens because a few nodes have kubelet problems, and their pods are
-				// stuck in pending.
-				errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
-				Logf("%v, pods currently in pending:", errorStr)
-				Diff(currentPods, make([]api.Pod, 0)).Print(util.NewStringSet(string(api.PodRunning)))
-				return fmt.Errorf(errorStr)
-			}
+			// Most times this happens because a few nodes have kubelet problems, and their pods are
+			// stuck in pending.
+			errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
+			Logf("%v, pods currently in pending:", errorStr)
+			Diff(currentPods, &api.PodList{}).Print(util.NewStringSet(string(api.PodRunning)))
+			return fmt.Errorf(errorStr)
+		}
+		last = current
+		oldPods = currentPods
 
-			if !foundAllPods {
-				foundAllPods = len(currentPods) == replicas
-			}
-			last = current
-			oldPods = currentPods
-
-			if failedContainers > maxContainerFailures {
-				return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
-			}
+		if failedContainers > maxContainerFailures {
+			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
 		}
 	}
 	if current != replicas {
@@ -1059,7 +1014,7 @@ type LatencyMetric struct {
 }
 
 func ReadLatencyMetrics(c *client.Client) ([]LatencyMetric, error) {
-	body, err := getMetrics(c)
+	body, err := c.Get().AbsPath("/metrics").DoRaw()
 	if err != nil {
 		return nil, err
 	}
@@ -1115,74 +1070,3 @@ func HighLatencyRequests(c *client.Client, threshold time.Duration, ignoredResou
 
 	return len(badMetrics), nil
 }
-
-// Retrieve metrics information
-func getMetrics(c *client.Client) (string, error) {
-	body, err := c.Get().AbsPath("/metrics").DoRaw()
-	if err != nil {
-		return "", err
-	}
-	return string(body), nil
-}
-
-// Retrieve debug information
-func getDebugInfo(c *client.Client) (map[string]string, error) {
-	data := make(map[string]string)
-	for _, key := range []string{"block", "goroutine", "heap", "threadcreate"} {
-		resp, err := http.Get(c.Get().AbsPath(fmt.Sprintf("debug/pprof/%s", key)).URL().String() + "?debug=2")
-		body, err := ioutil.ReadAll(resp.Body)
-		resp.Body.Close()
-		if err != nil {
-			Logf("Warning: Error trying to fetch %s debug data: %v", key, err)
-		}
-		data[key] = string(body)
-	}
-	return data, nil
-}
-
-func writePerfData(c *client.Client, dirName string, postfix string) error {
-	fname := fmt.Sprintf("%s/metrics_%s.txt", dirName, postfix)
-
-	handler, err := os.Create(fname)
-	if err != nil {
-		return fmt.Errorf("Error creating file '%s': %v", fname, err)
-	}
-
-	metrics, err := getMetrics(c)
-	if err != nil {
-		return fmt.Errorf("Error retrieving metrics: %v", err)
-	}
-
-	_, err = handler.WriteString(metrics)
-	if err != nil {
-		return fmt.Errorf("Error writing metrics: %v", err)
-	}
-
-	err = handler.Close()
-	if err != nil {
-		return fmt.Errorf("Error closing '%s': %v", fname, err)
-	}
-
-	debug, err := getDebugInfo(c)
-	if err != nil {
-		return fmt.Errorf("Error retrieving debug information: %v", err)
-	}
-
-	for key, value := range debug {
-		fname := fmt.Sprintf("%s/%s_%s.txt", dirName, key, postfix)
-		handler, err = os.Create(fname)
-		if err != nil {
-			return fmt.Errorf("Error creating file '%s': %v", fname, err)
-		}
-		_, err = handler.WriteString(value)
-		if err != nil {
-			return fmt.Errorf("Error writing %s: %v", key, err)
-		}
-
-		err = handler.Close()
-		if err != nil {
-			return fmt.Errorf("Error closing '%s': %v", fname, err)
-		}
-	}
-	return nil
-}
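
For reference, a minimal sketch (not part of the patch) of the call pattern this revert restores: callers no longer build an RCConfig value, they pass the client and parameters positionally. The wrapper runPauseRC below is hypothetical; only the RunRC(c, name, ns, image, replicas) and DeleteRC(c, ns, name) signatures, the pause image, and the "my-short-lived-pod" naming come from the hunks above.

// runPauseRC is illustrative only: it launches a replication controller of
// pause containers via the post-revert RunRC helper and then cleans up.
func runPauseRC(c *client.Client, ns string, replicas int) error {
	name := "my-short-lived-pod" + string(util.NewUUID())
	// RunRC blocks until all replicas report Running, or returns an error
	// after repeated checks without progress.
	if err := RunRC(c, name, ns, "gcr.io/google_containers/pause:go", replicas); err != nil {
		return err
	}
	// Per RunRC's doc comment, cleanup is the caller's responsibility.
	return DeleteRC(c, ns, name)
}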