Merge pull request #51224 from enisoc/sts-deflake-restart

Automatic merge from submit-queue (batch tested with PRs 51224, 51191, 51158, 50669, 51222)

StatefulSet: Deflake e2e "restart" phase.

This addresses another source of flakiness found while investigating #48031.

The test used to scale the StatefulSet down to 0, wait for ListPods to return 0 matching Pods, and then scale the StatefulSet back up.

This was prone to a race in which the StatefulSet controller was told to scale back up before it had observed its own deletion of the last Pod, as evidenced by logs showing the creation of Pod ss-1 prior to the creation of the replacement Pod ss-0.

Instead, we now wait for the controller to report that it has observed all the deletions before scaling the StatefulSet back up. This should fix flakes of the form:

```
Too many pods scheduled, expected 1 got 2
```
Merged by Kubernetes Submit Queue on 2017-08-24 22:59:28 -07:00, committed by GitHub
Commit ce3e2d9b10
2 changed files with 21 additions and 12 deletions
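
Condensed, the behavior change in the `Restart` helper looks like this (a sketch distilled from the diff below; `sst` is the test's `StatefulSetTester`, and `UpdateReplicas` stands in for the direct `s.update` call the real code makes):

```go
// Before (racy): Scale polls ListPods until no matching Pods remain,
// but that only proves the API server's state. The StatefulSet
// controller's own cache may still lag behind the last deletion.
ExpectNoError(sst.Scale(ss, 0))
sst.UpdateReplicas(ss, oldReplicas) // controller may not have caught up

// After (deflaked): also wait until the controller itself reports
// status.replicas == 0, which proves it has observed every deletion,
// before scaling back up.
ss, err := sst.Scale(ss, 0)
ExpectNoError(err)
sst.WaitForStatusReplicas(ss, 0)
sst.UpdateReplicas(ss, oldReplicas)
```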


```diff
@@ -757,7 +757,7 @@ var _ = SIGDescribe("StatefulSet", func() {
 			By("Scaling down stateful set " + ssName + " to 0 replicas and waiting until none of pods will run in namespace" + ns)
 			sst.RestoreHttpProbe(ss)
 			sst.Scale(ss, 0)
-			sst.WaitForStatusReadyReplicas(ss, 0)
+			sst.WaitForStatusReplicas(ss, 0)
 		})
 
 		It("Should recreate evicted statefulset", func() {
```


```diff
@@ -197,10 +197,12 @@ func getStatefulSetPodNameAtIndex(index int, ss *apps.StatefulSet) string {
 }
 
 // Scale scales ss to count replicas.
-func (s *StatefulSetTester) Scale(ss *apps.StatefulSet, count int32) error {
+func (s *StatefulSetTester) Scale(ss *apps.StatefulSet, count int32) (*apps.StatefulSet, error) {
 	name := ss.Name
 	ns := ss.Namespace
-	s.update(ns, name, func(ss *apps.StatefulSet) { *(ss.Spec.Replicas) = count })
+
+	Logf("Scaling statefulset %s to %d", name, count)
+	ss = s.update(ns, name, func(ss *apps.StatefulSet) { *(ss.Spec.Replicas) = count })
 
 	var statefulPodList *v1.PodList
 	pollErr := wait.PollImmediate(StatefulSetPoll, StatefulSetTimeout, func() (bool, error) {
@@ -218,9 +220,9 @@ func (s *StatefulSetTester) Scale(ss *apps.StatefulSet, count int32) error {
 				unhealthy = append(unhealthy, fmt.Sprintf("%v: deletion %v, phase %v, readiness %v", statefulPod.Name, delTs, phase, readiness))
 			}
 		}
-		return fmt.Errorf("Failed to scale statefulset to %d in %v. Remaining pods:\n%v", count, StatefulSetTimeout, unhealthy)
+		return ss, fmt.Errorf("Failed to scale statefulset to %d in %v. Remaining pods:\n%v", count, StatefulSetTimeout, unhealthy)
 	}
-	return nil
+	return ss, nil
 }
 
 // UpdateReplicas updates the replicas of ss to count.
```
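
Because `Scale` now returns the updated object along with the error, callers should rebind their variable; holding on to the pre-scale copy would leave them with a stale `spec` and `resourceVersion` for any follow-up call. Usage, as the diff's own call sites adopt it:

```go
// Capture both return values; the returned object reflects the
// post-update state of the StatefulSet.
ss, err := sst.Scale(ss, 0)
ExpectNoError(err)
```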
```diff
@@ -231,11 +233,16 @@ func (s *StatefulSetTester) UpdateReplicas(ss *apps.StatefulSet, count int32) {
 
 // Restart scales ss to 0 and then back to its previous number of replicas.
 func (s *StatefulSetTester) Restart(ss *apps.StatefulSet) {
 	oldReplicas := *(ss.Spec.Replicas)
-	ExpectNoError(s.Scale(ss, 0))
+	ss, err := s.Scale(ss, 0)
+	ExpectNoError(err)
+	// Wait for controller to report the desired number of Pods.
+	// This way we know the controller has observed all Pod deletions
+	// before we scale it back up.
+	s.WaitForStatusReplicas(ss, 0)
 	s.update(ss.Namespace, ss.Name, func(ss *apps.StatefulSet) { *(ss.Spec.Replicas) = oldReplicas })
 }
 
-func (s *StatefulSetTester) update(ns, name string, update func(ss *apps.StatefulSet)) {
+func (s *StatefulSetTester) update(ns, name string, update func(ss *apps.StatefulSet)) *apps.StatefulSet {
 	for i := 0; i < 3; i++ {
 		ss, err := s.c.AppsV1beta1().StatefulSets(ns).Get(name, metav1.GetOptions{})
 		if err != nil {
@@ -244,13 +251,14 @@ func (s *StatefulSetTester) update(ns, name string, update func(ss *apps.StatefulSet)) {
 		update(ss)
 		ss, err = s.c.AppsV1beta1().StatefulSets(ns).Update(ss)
 		if err == nil {
-			return
+			return ss
 		}
 		if !apierrs.IsConflict(err) && !apierrs.IsServerTimeout(err) {
 			Failf("failed to update statefulset %q: %v", name, err)
 		}
 	}
 	Failf("too many retries draining statefulset %q", name)
+	return nil
 }
 
 // GetPodList gets the current Pods in ss.
```
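
The `update` helper is a standard optimistic-concurrency loop: Get, mutate, Update, and retry only if the write failed with Conflict or ServerTimeout. For comparison (not part of this PR), the same pattern can be expressed with client-go's `retry.RetryOnConflict` helper from `k8s.io/client-go/util/retry`; `scaleWithRetry` is a hypothetical name:

```go
// scaleWithRetry is a sketch of the same Get-mutate-Update pattern,
// retrying only when the write loses an optimistic-concurrency race.
func scaleWithRetry(c clientset.Interface, ns, name string, count int32) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		ss, err := c.AppsV1beta1().StatefulSets(ns).Get(name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		*(ss.Spec.Replicas) = count
		_, err = c.AppsV1beta1().StatefulSets(ns).Update(ss)
		return err
	})
}
```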
```diff
@@ -669,12 +677,13 @@ func DeleteAllStatefulSets(c clientset.Interface, ns string) {
 	// Scale down each statefulset, then delete it completely.
 	// Deleting a pvc without doing this will leak volumes, #25101.
 	errList := []string{}
-	for _, ss := range ssList.Items {
-		Logf("Scaling statefulset %v to 0", ss.Name)
-		if err := sst.Scale(&ss, 0); err != nil {
+	for i := range ssList.Items {
+		ss := &ssList.Items[i]
+		var err error
+		if ss, err = sst.Scale(ss, 0); err != nil {
 			errList = append(errList, fmt.Sprintf("%v", err))
 		}
-		sst.WaitForStatusReplicas(&ss, 0)
+		sst.WaitForStatusReplicas(ss, 0)
 		Logf("Deleting statefulset %v", ss.Name)
 		// Use OrphanDependents=false so it's deleted synchronously.
 		// We already made sure the Pods are gone inside Scale().
```
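
The loop rewrite in `DeleteAllStatefulSets` also sidesteps a classic Go pitfall: `range` yields a copy in a single reused variable, so the old `&ss` aliased that one copy on every iteration and never pointed at the slice's real elements. Indexing fixes both problems (`use` below is a hypothetical stand-in for the loop body):

```go
// Pre-Go 1.22, the range variable is one reused copy of each element:
for _, ss := range ssList.Items {
	use(&ss) // same address every iteration, and only a copy
}

// Indexing yields a distinct, stable pointer to the element itself,
// which also lets the loop rebind ss from Scale's new return value:
for i := range ssList.Items {
	ss := &ssList.Items[i]
	use(ss)
}
```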