Created a fifo queue containing results from listing pods and checks from the queue to generate finer granularity #7572
Robert Rati
2015-05-06 12:53:54 -04:00
parent bd3306c845
commit ec5e7093b6
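
The change follows a producer/consumer pattern: a background goroutine lists pods once a second and pushes each timestamped result onto a mutex-guarded FIFO queue, and the RunRC check loops drain that queue so every observed snapshot is logged and compared, rather than one sample per long sleep. Below is a minimal, self-contained sketch of that pattern, not the test code itself; the names mirror the diff, but the integer payload, the intervals, and the value-typed mutex are simplifications assumed for illustration.

package main

import (
    "fmt"
    "sync"
    "time"
)

// queueItem pairs a payload with the time it was captured.
type queueItem struct {
    createTime string
    value      interface{}
}

// fifoQueue is a mutex-guarded FIFO; Pop advances a read position
// instead of re-slicing, mirroring the FifoQueue added in this commit.
type fifoQueue struct {
    pos   int
    mutex sync.Mutex
    list  []queueItem
}

func (fq *fifoQueue) Push(elem interface{}) {
    fq.mutex.Lock()
    defer fq.mutex.Unlock()
    fq.list = append(fq.list, queueItem{time.Now().String(), elem})
}

// Pop returns the zero queueItem when nothing is left to read.
func (fq *fifoQueue) Pop() queueItem {
    fq.mutex.Lock()
    defer fq.mutex.Unlock()
    var val queueItem
    if fq.pos < len(fq.list) {
        val = fq.list[fq.pos]
        fq.pos++
    }
    return val
}

func (fq *fifoQueue) Len() int {
    fq.mutex.Lock()
    defer fq.mutex.Unlock()
    return len(fq.list) - fq.pos
}

func main() {
    var q fifoQueue
    stop := make(chan struct{})

    // Producer: stands in for the goroutine that lists pods once a second.
    go func() {
        for i := 0; ; i++ {
            select {
            case <-stop:
                return
            default:
                q.Push(i) // the real code pushes the pod list here
                time.Sleep(100 * time.Millisecond)
            }
        }
    }()

    // Consumer: stands in for the check loop that drains accumulated snapshots.
    for round := 0; round < 3; round++ {
        time.Sleep(500 * time.Millisecond)
        for q.Len() > 0 {
            item := q.Pop()
            fmt.Printf("%s: snapshot %v\n", item.createTime, item.value)
        }
    }
    close(stop)
}
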


@@ -27,6 +27,7 @@ import (
     "path/filepath"
     "strconv"
     "strings"
+    "sync"
     "time"

     "code.google.com/p/go-uuid/uuid"
@@ -74,6 +75,59 @@ type ContainerFailures struct {
     restarts int
 }

+type QueueItem struct {
+    createTime string
+    value      interface{}
+}
+
+type QueueItems struct {
+    pos   int
+    mutex *sync.Mutex
+    list  []QueueItem
+}
+
+type FifoQueue QueueItems
+
+func (fq *FifoQueue) Push(elem interface{}) {
+    fq.mutex.Lock()
+    fq.list = append(fq.list, QueueItem{time.Now().String(), elem})
+    fq.mutex.Unlock()
+}
+
+func (fq *FifoQueue) Pop() QueueItem {
+    fq.mutex.Lock()
+    var val QueueItem
+    if len(fq.list)-1 >= fq.pos {
+        val = fq.list[fq.pos]
+        fq.pos++
+    }
+    fq.mutex.Unlock()
+    return val
+}
+
+func (fq FifoQueue) Len() int {
+    return len(fq.list[fq.pos:])
+}
+
+func (fq *FifoQueue) First() QueueItem {
+    return fq.list[fq.pos]
+}
+
+func (fq *FifoQueue) Last() QueueItem {
+    return fq.list[len(fq.list)-1]
+}
+
+func (fq *FifoQueue) Reset() {
+    fq.pos = 0
+}
+
+func newFifoQueue() *FifoQueue {
+    tmp := new(FifoQueue)
+    tmp.mutex = &sync.Mutex{}
+    tmp.pos = 0
+    return tmp
+}
+
 func Logf(format string, a ...interface{}) {
     fmt.Fprintf(GinkgoWriter, "INFO: "+format+"\n", a...)
 }
@@ -451,16 +505,16 @@ func (p PodDiff) Print(ignorePhases util.StringSet) {
 }

 // Diff computes a PodDiff given 2 lists of pods.
-func Diff(oldPods *api.PodList, curPods *api.PodList) PodDiff {
+func Diff(oldPods []api.Pod, curPods []api.Pod) PodDiff {
     podInfoMap := PodDiff{}

     // New pods will show up in the curPods list but not in oldPods. They have oldhostname/phase == nonexist.
-    for _, pod := range curPods.Items {
+    for _, pod := range curPods {
         podInfoMap[pod.Name] = &podInfo{hostname: pod.Spec.Host, phase: string(pod.Status.Phase), oldHostname: nonExist, oldPhase: nonExist}
     }

     // Deleted pods will show up in the oldPods list but not in curPods. They have a hostname/phase == nonexist.
-    for _, pod := range oldPods.Items {
+    for _, pod := range oldPods {
         if info, ok := podInfoMap[pod.Name]; ok {
             info.oldHostname, info.oldPhase = pod.Spec.Host, string(pod.Status.Phase)
         } else {
@@ -480,6 +534,8 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int, podSta
     maxContainerFailures := int(math.Max(1.0, float64(replicas)*.01))
     current := 0
     same := 0
+    label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
+    podLists := newFifoQueue()

     By(fmt.Sprintf("Creating replication controller %s", name))
     rc := &api.ReplicationController{
@@ -513,39 +569,51 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int, podSta
} }
Logf("Created replication controller with name: %v, namespace: %v, replica count: %v", rc.Name, ns, rc.Spec.Replicas) Logf("Created replication controller with name: %v, namespace: %v, replica count: %v", rc.Name, ns, rc.Spec.Replicas)
// Create a routine to query for the list of pods
stop := make(chan struct{})
go func(stop <-chan struct{}, n string, ns string, l labels.Selector) {
for {
select {
case <-stop:
return
default:
p, err := c.Pods(ns).List(l, fields.Everything())
if err != nil {
Logf("Warning: Failed to get pod list: %v", err)
} else {
podLists.Push(p.Items)
}
time.Sleep(1 * time.Second)
}
}
}(stop, name, ns, label)
By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns)) By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
pods, err := listPods(c, ns, label, fields.Everything())
if err != nil {
return fmt.Errorf("Error listing pods: %v", err)
}
current = len(pods.Items)
failCount := 5 failCount := 5
for same < failCount && current < replicas { for same < failCount && current < replicas {
msg := fmt.Sprintf("Controller %s: Found %d pods out of %d", name, current, replicas) time.Sleep(2 * time.Second)
Logf(msg) for item := podLists.Pop(); podLists.Len() > 0 && current < replicas; item = podLists.Pop() {
if podStatusFile != nil { pods := item.value.([]api.Pod)
fmt.Fprintf(podStatusFile, "%s: %s\n", time.Now().String(), msg) current = len(pods)
} msg := fmt.Sprintf("Controller %s: Found %d pods out of %d", name, current, replicas)
if last < current { Logf(msg)
same = 0 if podStatusFile != nil {
} else if last == current { fmt.Fprintf(podStatusFile, "%s: %s\n", item.createTime, msg)
same++ }
} else if current < last { if last < current {
return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current) same = 0
} } else if last == current {
same++
} else if current < last {
return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
}
if same >= failCount { if same >= failCount {
return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount) return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
} }
last = current last = current
time.Sleep(5 * time.Second)
pods, err = listPods(c, ns, label, fields.Everything())
if err != nil {
return fmt.Errorf("Error listing pods: %v", err)
} }
current = len(pods.Items)
} }
if current != replicas { if current != replicas {
return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas) return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas)
@@ -561,86 +629,89 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int, podSta
     last = 0
     failCount = 10
     current = 0
-    oldPods := &api.PodList{}
+    var oldPods []api.Pod
+    podLists.Reset()
+    foundAllPods := false
     for same < failCount && current < replicas {
-        current = 0
-        waiting := 0
-        pending := 0
-        unknown := 0
-        inactive := 0
-        failedContainers := 0
-        time.Sleep(10 * time.Second)
-        // TODO: Use a reflector both to put less strain on the cluster and
-        // for more clarity.
-        currentPods, err := listPods(c, ns, label, fields.Everything())
-        if err != nil {
-            return fmt.Errorf("Error listing pods: %v", err)
-        }
-        for _, p := range currentPods.Items {
-            if p.Status.Phase == api.PodRunning {
-                current++
-                for _, v := range FailedContainers(p) {
-                    failedContainers = failedContainers + v.restarts
-                }
-            } else if p.Status.Phase == api.PodPending {
-                if p.Spec.Host == "" {
-                    waiting++
-                } else {
-                    pending++
-                }
-            } else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
-                inactive++
-            } else if p.Status.Phase == api.PodUnknown {
-                unknown++
-            }
-        }
-        msg := fmt.Sprintf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
-        Logf(msg)
-        if podStatusFile != nil {
-            fmt.Fprintf(podStatusFile, "%s: %s\n", time.Now().String(), msg)
-        }
-        if len(currentPods.Items) != len(pods.Items) {
-            // This failure mode includes:
-            // kubelet is dead, so node controller deleted pods and rc creates more
-            // - diagnose by noting the pod diff below.
-            // pod is unhealthy, so replication controller creates another to take its place
-            // - diagnose by comparing the previous "2 Pod states" lines for inactive pods
-            errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods.Items), len(pods.Items))
-            Logf("%v, pods that changed since the last iteration:", errorStr)
-            Diff(oldPods, currentPods).Print(util.NewStringSet())
-            return fmt.Errorf(errorStr)
-        }
-        if last < current {
-            same = 0
-        } else if last == current {
-            same++
-        } else if current < last {
-            // The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
-            errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
-            Logf("%v, pods that changed since the last iteration:", errorStr)
-            Diff(oldPods, currentPods).Print(util.NewStringSet())
-            return fmt.Errorf(errorStr)
-        }
-        if same >= failCount {
-            // Most times this happens because a few nodes have kubelet problems, and their pods are
-            // stuck in pending.
-            errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
-            Logf("%v, pods currently in pending:", errorStr)
-            Diff(currentPods, &api.PodList{}).Print(util.NewStringSet(string(api.PodRunning)))
-            return fmt.Errorf(errorStr)
-        }
-        last = current
-        oldPods = currentPods
-        if failedContainers > maxContainerFailures {
-            return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
-        }
+        time.Sleep(2 * time.Second)
+        for item := podLists.Pop(); podLists.Len() > 0 && current < replicas; item = podLists.Pop() {
+            current = 0
+            waiting := 0
+            pending := 0
+            unknown := 0
+            inactive := 0
+            failedContainers := 0
+            currentPods := item.value.([]api.Pod)
+            for _, p := range currentPods {
+                if p.Status.Phase == api.PodRunning {
+                    current++
+                    for _, v := range FailedContainers(p) {
+                        failedContainers = failedContainers + v.restarts
+                    }
+                } else if p.Status.Phase == api.PodPending {
+                    if p.Spec.Host == "" {
+                        waiting++
+                    } else {
+                        pending++
+                    }
+                } else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
+                    inactive++
+                } else if p.Status.Phase == api.PodUnknown {
+                    unknown++
+                }
+            }
+            msg := fmt.Sprintf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
+            Logf(msg)
+            if podStatusFile != nil {
+                fmt.Fprintf(podStatusFile, "%s: %s\n", item.createTime, msg)
+            }
+            if foundAllPods && len(currentPods) != len(oldPods) {
+                // This failure mode includes:
+                // kubelet is dead, so node controller deleted pods and rc creates more
+                // - diagnose by noting the pod diff below.
+                // pod is unhealthy, so replication controller creates another to take its place
+                // - diagnose by comparing the previous "2 Pod states" lines for inactive pods
+                errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods), len(oldPods))
+                Logf("%v, pods that changed since the last iteration:", errorStr)
+                Diff(oldPods, currentPods).Print(util.NewStringSet())
+                return fmt.Errorf(errorStr)
+            }
+            if last < current {
+                same = 0
+            } else if last == current {
+                same++
+            } else if current < last {
+                // The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
+                errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
+                Logf("%v, pods that changed since the last iteration:", errorStr)
+                Diff(oldPods, currentPods).Print(util.NewStringSet())
+                return fmt.Errorf(errorStr)
+            }
+            if same >= failCount {
+                // Most times this happens because a few nodes have kubelet problems, and their pods are
+                // stuck in pending.
+                errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
+                Logf("%v, pods currently in pending:", errorStr)
+                Diff(currentPods, make([]api.Pod, 0)).Print(util.NewStringSet(string(api.PodRunning)))
+                return fmt.Errorf(errorStr)
+            }
+            if !foundAllPods {
+                foundAllPods = len(currentPods) == replicas
+            }
+            last = current
+            oldPods = currentPods
+            if failedContainers > maxContainerFailures {
+                return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
+            }
+        }
     }
+    close(stop)
     if current != replicas {
         return fmt.Errorf("Only %d pods started out of %d", current, replicas)
     }