controller: support perma-failed deployments

This commit adds support for failing deployments based on a timeout parameter defined in the spec. If there is no progress for the amount of time defined as progressDeadlineSeconds then the deployment will be marked as failed by adding a condition with a ProgressDeadlineExceeded reason in it. Progress in the context of a deployment means the creation or adoption of a new replica set, scaling up new pods, and scaling down old pods.
2026-01-04 23:17:50 +00:00 · 2016-09-15 17:57:53 +02:00
parent c4ff44b66d
commit a5029bf373
10 changed files with 834 additions and 25 deletions
--- a/pkg/controller/deployment/util/deployment_util.go
+++ b/pkg/controller/deployment/util/deployment_util.go
@@ -70,8 +70,111 @@ const (
 	// TODO: Delete this annotation when we gracefully handle overlapping selectors.
 	// See https://github.com/kubernetes/kubernetes/issues/2210
 	SelectorUpdateAnnotation = "deployment.kubernetes.io/selector-updated-at"
+
+	// Reasons for deployment conditions
+	//
+	// Progressing:
+	//
+	// ReplicaSetUpdatedReason is added in a deployment when one of its replica sets is updated as part
+	// of the rollout process.
+	ReplicaSetUpdatedReason = "ReplicaSetUpdated"
+	// FailedRSCreateReason is added in a deployment when it cannot create a new replica set.
+	FailedRSCreateReason = "ReplicaSetCreateError"
+	// NewReplicaSetReason is added in a deployment when it creates a new replica set.
+	NewReplicaSetReason = "NewReplicaSetCreated"
+	// FoundNewRSReason is added in a deployment when it adopts an existing replica set.
+	FoundNewRSReason = "FoundNewReplicaSet"
+	// NewRSAvailableReason is added in a deployment when its newest replica set is made available,
+	// i.e., the number of new pods that have passed readiness checks and run for at least minReadySeconds
+	// is at least the minimum available pods that need to run for the deployment.
+	NewRSAvailableReason = "NewReplicaSetAvailable"
+	// TimedOutReason is added in a deployment when its newest replica set fails to show any progress
+	// within the given deadline (progressDeadlineSeconds).
+	TimedOutReason = "ProgressDeadlineExceeded"
+	// PausedDeployReason is added in a deployment when it is paused. Lack of progress shouldn't be
+	// estimated once a deployment is paused.
+	PausedDeployReason = "DeploymentPaused"
+	// ResumedDeployReason is added in a deployment when it is resumed. Useful for not accidentally failing
+	// deployments that were paused amidst a rollout and are bounded by a deadline.
+	ResumedDeployReason = "DeploymentResumed"
+	//
+	// Available:
+	//
+	// MinimumReplicasAvailable is added in a deployment when it has its minimum replicas required available.
+	MinimumReplicasAvailable = "MinimumReplicasAvailable"
+	// MinimumReplicasUnavailable is added in a deployment when it doesn't have the minimum required replicas
+	// available.
+	MinimumReplicasUnavailable = "MinimumReplicasUnavailable"
 )

+// NewDeploymentCondition creates a new deployment condition of the given type and status,
+// carrying the supplied reason and message. LastUpdateTime and LastTransitionTime are both
+// set to the same instant, taken at the time of the call.
+func NewDeploymentCondition(condType extensions.DeploymentConditionType, status api.ConditionStatus, reason, message string) *extensions.DeploymentCondition {
+	// Read the clock once so the two timestamps of a fresh condition cannot differ.
+	now := unversioned.Now()
+	return &extensions.DeploymentCondition{
+		Type:               condType,
+		Status:             status,
+		LastUpdateTime:     now,
+		LastTransitionTime: now,
+		Reason:             reason,
+		Message:            message,
+	}
+}
+
+// GetDeploymentCondition looks up the condition of type condType in status. It returns a
+// pointer to a copy of the first match, or nil when status has no condition of that type.
+func GetDeploymentCondition(status extensions.DeploymentStatus, condType extensions.DeploymentConditionType) *extensions.DeploymentCondition {
+	for _, cond := range status.Conditions {
+		if cond.Type != condType {
+			continue
+		}
+		// Hand back a private copy so the result never aliases the slice element.
+		found := cond
+		return &found
+	}
+	return nil
+}
+
+// SetDeploymentCondition stores condition in status, replacing any existing condition of the
+// same type. It does nothing when a condition of that type is already present with the same
+// status and reason.
+func SetDeploymentCondition(status *extensions.DeploymentStatus, condition extensions.DeploymentCondition) {
+	if existing := GetDeploymentCondition(*status, condition.Type); existing != nil && existing.Status == condition.Status {
+		if existing.Reason == condition.Reason {
+			// Same status and reason: leave the stored condition untouched.
+			return
+		}
+		// Same status but a different reason: the status did not transition, so keep the
+		// transition time that is already recorded.
+		condition.LastTransitionTime = existing.LastTransitionTime
+	}
+	status.Conditions = append(filterOutCondition(status.Conditions, condition.Type), condition)
+}
+
+// RemoveDeploymentCondition drops every condition of type condType from status.
+func RemoveDeploymentCondition(status *extensions.DeploymentStatus, condType extensions.DeploymentConditionType) {
+	remaining := filterOutCondition(status.Conditions, condType)
+	status.Conditions = remaining
+}
+
+// filterOutCondition copies conditions into a new slice, leaving out every entry whose type
+// is condType. The result is nil when no entry is kept.
+func filterOutCondition(conditions []extensions.DeploymentCondition, condType extensions.DeploymentConditionType) []extensions.DeploymentCondition {
+	var kept []extensions.DeploymentCondition
+	for i := range conditions {
+		if conditions[i].Type != condType {
+			kept = append(kept, conditions[i])
+		}
+	}
+	return kept
+}
+
+// ReplicaSetToDeploymentCondition builds a deployment condition from a replica set condition,
+// copying its type, status, reason and message. The replica set condition's
+// LastTransitionTime is used for both timestamps of the result. Useful for promoting replica
+// set failure conditions into deployments.
+func ReplicaSetToDeploymentCondition(cond extensions.ReplicaSetCondition) extensions.DeploymentCondition {
+	var converted extensions.DeploymentCondition
+	converted.Type = extensions.DeploymentConditionType(cond.Type)
+	converted.Status = cond.Status
+	converted.Reason = cond.Reason
+	converted.Message = cond.Message
+	converted.LastUpdateTime = cond.LastTransitionTime
+	converted.LastTransitionTime = cond.LastTransitionTime
+	return converted
+}
+
 // SetDeploymentRevision updates the revision for a deployment.
 func SetDeploymentRevision(deployment *extensions.Deployment, revision string) bool {
 	updated := false
@@ -696,6 +799,56 @@ func IsRollingUpdate(deployment *extensions.Deployment) bool {
 	return deployment.Spec.Strategy.Type == extensions.RollingUpdateDeploymentStrategyType
 }

+// DeploymentComplete reports whether the rollout is finished: the updated replicas in
+// newStatus equal the desired replicas, and the available replicas are not below the desired
+// count minus the maximum number allowed to be unavailable.
+func DeploymentComplete(deployment *extensions.Deployment, newStatus *extensions.DeploymentStatus) bool {
+	desired := deployment.Spec.Replicas
+	if newStatus.UpdatedReplicas != desired {
+		return false
+	}
+	minAvailable := desired - MaxUnavailable(*deployment)
+	return newStatus.AvailableReplicas >= minAvailable
+}
+
+// DeploymentProgressing reports whether newStatus shows progress relative to the status
+// currently recorded on the deployment: either there are more updated replicas, or there are
+// fewer old (not yet updated) replicas. The same algorithm is already used in the kubectl
+// rolling updater to report lack of progress.
+func DeploymentProgressing(deployment *extensions.Deployment, newStatus *extensions.DeploymentStatus) bool {
+	previous := deployment.Status
+
+	// Replicas that are not updated yet, i.e. old replicas that still need to be scaled down.
+	oldBefore := previous.Replicas - previous.UpdatedReplicas
+	oldAfter := newStatus.Replicas - newStatus.UpdatedReplicas
+
+	if newStatus.UpdatedReplicas > previous.UpdatedReplicas {
+		return true
+	}
+	return oldAfter < oldBefore
+}
+
+// nowFn returns the current time; it is a variable so that unit tests can substitute a fixed clock.
+var nowFn = func() time.Time { return time.Now() }
+
+// DeploymentTimedOut reports whether the deployment has exceeded its progress deadline. It
+// returns false when progressDeadlineSeconds is unset or when newStatus carries no
+// Progressing condition, and true when that condition already has the TimedOutReason reason.
+// Otherwise the deployment has timed out once the condition's LastTransitionTime plus
+// progressDeadlineSeconds lies before the current time.
+func DeploymentTimedOut(deployment *extensions.Deployment, newStatus *extensions.DeploymentStatus) bool {
+	deadline := deployment.Spec.ProgressDeadlineSeconds
+	if deadline == nil {
+		return false
+	}
+
+	progressing := GetDeploymentCondition(*newStatus, extensions.DeploymentProgressing)
+	switch {
+	case progressing == nil:
+		// Without a Progressing condition there is no base to estimate progress from.
+		return false
+	case progressing.Reason == TimedOutReason:
+		// Already marked as timed out, no need to check again.
+		return true
+	}
+
+	// Compare the time elapsed since the condition's last transition against the deadline.
+	timeout := time.Duration(*deadline) * time.Second
+	expiry := progressing.LastTransitionTime.Add(timeout)
+	return expiry.Before(nowFn())
+}
+
 // NewRSNewReplicas calculates the number of replicas a deployment's new RS should have.
 // When one of the followings is true, we're rolling out the deployment; otherwise, we're scaling it.
 // 1) The new RS is saturated: newRS's replicas == deployment's replicas
--- a/pkg/controller/deployment/util/deployment_util_test.go
+++ b/pkg/controller/deployment/util/deployment_util_test.go
@@ -688,7 +688,6 @@ func TestResolveFenceposts(t *testing.T) {
 }

 func TestNewRSNewReplicas(t *testing.T) {
-
 	tests := []struct {
 		test          string
 		strategyType  extensions.DeploymentStrategyType
@@ -703,12 +702,12 @@ func TestNewRSNewReplicas(t *testing.T) {
 			1, 5, 1, 5,
 		},
 		{
-			"scale up - to depDeplicas",
+			"scale up - to depReplicas",
 			extensions.RollingUpdateDeploymentStrategyType,
 			6, 2, 10, 6,
 		},
 		{
-			"recreate - to depDeplicas",
+			"recreate - to depReplicas",
 			extensions.RecreateDeploymentStrategyType,
 			3, 1, 1, 3,
 		},
@@ -735,3 +734,373 @@ func TestNewRSNewReplicas(t *testing.T) {
 		}
 	}
 }
+
+// condProgressing returns a Progressing condition with status False.
+var condProgressing = func() extensions.DeploymentCondition {
+	return extensions.DeploymentCondition{
+		Type:   extensions.DeploymentProgressing,
+		Status: api.ConditionFalse,
+		Reason: "ForSomeReason",
+	}
+}
+
+// condProgressing2 returns a Progressing condition with status True and a different reason
+// than condProgressing.
+var condProgressing2 = func() extensions.DeploymentCondition {
+	return extensions.DeploymentCondition{
+		Type:   extensions.DeploymentProgressing,
+		Status: api.ConditionTrue,
+		Reason: "BecauseItIs",
+	}
+}
+
+// condAvailable returns an Available condition with status True.
+var condAvailable = func() extensions.DeploymentCondition {
+	return extensions.DeploymentCondition{
+		Type:   extensions.DeploymentAvailable,
+		Status: api.ConditionTrue,
+		Reason: "AwesomeController",
+	}
+}
+
+// status returns a fresh status holding the condProgressing and condAvailable fixtures, in
+// that order.
+var status = func() *extensions.DeploymentStatus {
+	conditions := []extensions.DeploymentCondition{condProgressing(), condAvailable()}
+	return &extensions.DeploymentStatus{Conditions: conditions}
+}
+
+// TestGetCondition checks that GetDeploymentCondition returns a condition for a type that is
+// present in the status and nil for a type that is absent.
+func TestGetCondition(t *testing.T) {
+	exampleStatus := status()
+
+	tests := []struct {
+		name string
+
+		status   extensions.DeploymentStatus
+		condType extensions.DeploymentConditionType
+
+		expected bool
+	}{
+		{
+			name: "condition exists",
+
+			status:   *exampleStatus,
+			condType: extensions.DeploymentAvailable,
+
+			expected: true,
+		},
+		{
+			name: "condition does not exist",
+
+			status:   *exampleStatus,
+			condType: extensions.DeploymentReplicaFailure,
+
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		cond := GetDeploymentCondition(test.status, test.condType)
+		exists := cond != nil
+		if exists != test.expected {
+			t.Errorf("%s: expected condition to exist: %t, got: %t", test.name, test.expected, exists)
+		}
+	}
+}
+
+// TestSetCondition verifies SetDeploymentCondition when adding a condition to an empty
+// status, adding a second condition type, and overwriting an existing condition of the same
+// type.
+func TestSetCondition(t *testing.T) {
+	type testCase struct {
+		name     string
+		status   *extensions.DeploymentStatus
+		cond     extensions.DeploymentCondition
+		expected *extensions.DeploymentStatus
+	}
+
+	// progressingOnly returns a fresh status that holds just the condProgressing fixture.
+	progressingOnly := func() *extensions.DeploymentStatus {
+		return &extensions.DeploymentStatus{Conditions: []extensions.DeploymentCondition{condProgressing()}}
+	}
+
+	cases := []testCase{
+		{
+			name:     "set for the first time",
+			status:   &extensions.DeploymentStatus{},
+			cond:     condAvailable(),
+			expected: &extensions.DeploymentStatus{Conditions: []extensions.DeploymentCondition{condAvailable()}},
+		},
+		{
+			name:     "simple set",
+			status:   progressingOnly(),
+			cond:     condAvailable(),
+			expected: status(),
+		},
+		{
+			name:     "overwrite",
+			status:   progressingOnly(),
+			cond:     condProgressing2(),
+			expected: &extensions.DeploymentStatus{Conditions: []extensions.DeploymentCondition{condProgressing2()}},
+		},
+	}
+
+	for _, tc := range cases {
+		SetDeploymentCondition(tc.status, tc.cond)
+		if reflect.DeepEqual(tc.status, tc.expected) {
+			continue
+		}
+		t.Errorf("%s: expected status: %v, got: %v", tc.name, tc.expected, tc.status)
+	}
+}
+
+// TestRemoveCondition verifies RemoveDeploymentCondition on an empty status, on a status
+// whose only condition is removed, and on a status that lacks the requested condition type.
+func TestRemoveCondition(t *testing.T) {
+	type testCase struct {
+		name     string
+		status   *extensions.DeploymentStatus
+		condType extensions.DeploymentConditionType
+		expected *extensions.DeploymentStatus
+	}
+
+	cases := []testCase{
+		{
+			name:     "remove from empty status",
+			status:   &extensions.DeploymentStatus{},
+			condType: extensions.DeploymentProgressing,
+			expected: &extensions.DeploymentStatus{},
+		},
+		{
+			name:     "simple remove",
+			status:   &extensions.DeploymentStatus{Conditions: []extensions.DeploymentCondition{condProgressing()}},
+			condType: extensions.DeploymentProgressing,
+			expected: &extensions.DeploymentStatus{},
+		},
+		{
+			name:     "doesn't remove anything",
+			status:   status(),
+			condType: extensions.DeploymentReplicaFailure,
+			expected: status(),
+		},
+	}
+
+	for _, tc := range cases {
+		RemoveDeploymentCondition(tc.status, tc.condType)
+		if reflect.DeepEqual(tc.status, tc.expected) {
+			continue
+		}
+		t.Errorf("%s: expected status: %v, got: %v", tc.name, tc.expected, tc.status)
+	}
+}
+
+// TestDeploymentComplete exercises DeploymentComplete with rolling-update deployments that
+// differ in updated replicas, available replicas and maxUnavailable.
+func TestDeploymentComplete(t *testing.T) {
+	// newDeployment builds a rolling-update deployment with the given spec replicas, status
+	// counters and integer maxUnavailable.
+	newDeployment := func(desired, current, updated, available, maxUnavailable int32) *extensions.Deployment {
+		d := &extensions.Deployment{}
+		d.Spec.Replicas = desired
+		d.Spec.Strategy.Type = extensions.RollingUpdateDeploymentStrategyType
+		d.Spec.Strategy.RollingUpdate = &extensions.RollingUpdateDeployment{
+			MaxUnavailable: intstr.FromInt(int(maxUnavailable)),
+		}
+		d.Status.Replicas = current
+		d.Status.UpdatedReplicas = updated
+		d.Status.AvailableReplicas = available
+		return d
+	}
+
+	type testCase struct {
+		name     string
+		d        *extensions.Deployment
+		expected bool
+	}
+
+	cases := []testCase{
+		{name: "complete", d: newDeployment(5, 5, 5, 4, 1), expected: true},
+		{name: "not complete", d: newDeployment(5, 5, 5, 3, 1), expected: false},
+		{name: "complete #2", d: newDeployment(5, 5, 5, 5, 0), expected: true},
+		{name: "not complete #2", d: newDeployment(5, 5, 4, 5, 0), expected: false},
+	}
+
+	for _, tc := range cases {
+		t.Log(tc.name)
+
+		got := DeploymentComplete(tc.d, &tc.d.Status)
+		if got != tc.expected {
+			t.Errorf("expected complete: %t, got: %t", tc.expected, got)
+		}
+	}
+}
+
+// TestDeploymentProgressing exercises DeploymentProgressing by comparing the status recorded
+// on a deployment with a newly observed status. Each case is logged by name before it runs,
+// so the names are kept unique to make a failure attributable to a single case.
+func TestDeploymentProgressing(t *testing.T) {
+	// deployment builds a deployment whose recorded (old) status has the given counters.
+	deployment := func(current, updated int32) *extensions.Deployment {
+		return &extensions.Deployment{
+			Status: extensions.DeploymentStatus{
+				Replicas:        current,
+				UpdatedReplicas: updated,
+			},
+		}
+	}
+	// newStatus builds the newly observed status with the given counters.
+	newStatus := func(current, updated int32) extensions.DeploymentStatus {
+		return extensions.DeploymentStatus{
+			Replicas:        current,
+			UpdatedReplicas: updated,
+		}
+	}
+
+	tests := []struct {
+		name string
+
+		d         *extensions.Deployment
+		newStatus extensions.DeploymentStatus
+
+		expected bool
+	}{
+		{
+			name: "progressing",
+
+			d:         deployment(10, 4),
+			newStatus: newStatus(10, 6),
+
+			expected: true,
+		},
+		{
+			name: "not progressing",
+
+			d:         deployment(10, 4),
+			newStatus: newStatus(10, 4),
+
+			expected: false,
+		},
+		{
+			name: "progressing #2",
+
+			d:         deployment(10, 4),
+			newStatus: newStatus(8, 4),
+
+			expected: true,
+		},
+		{
+			name: "not progressing #2",
+
+			d:         deployment(10, 7),
+			newStatus: newStatus(10, 6),
+
+			expected: false,
+		},
+		{
+			name: "progressing #3",
+
+			d:         deployment(10, 4),
+			newStatus: newStatus(8, 8),
+
+			expected: true,
+		},
+		{
+			name: "not progressing #3",
+
+			d:         deployment(10, 7),
+			newStatus: newStatus(10, 7),
+
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Log(test.name)
+
+		if got, exp := DeploymentProgressing(test.d, &test.newStatus), test.expected; got != exp {
+			t.Errorf("expected progressing: %t, got: %t", exp, got)
+		}
+	}
+}
+
+// TestDeploymentTimedOut exercises DeploymentTimedOut with a fixed clock: a deployment without
+// progressDeadlineSeconds never times out, and one with a 10s deadline times out only once
+// more than 10s have passed since the Progressing condition's last transition.
+func TestDeploymentTimedOut(t *testing.T) {
+	var (
+		null *int32
+		ten  = int32(10)
+	)
+
+	// timeFn returns a fixed instant on 2016-01-01 UTC at the given minute and second.
+	timeFn := func(min, sec int) time.Time {
+		return time.Date(2016, 1, 1, 0, min, sec, 0, time.UTC)
+	}
+	// deployment builds a deployment with a single condition whose last transition is from.
+	deployment := func(condType extensions.DeploymentConditionType, status api.ConditionStatus, pds *int32, from time.Time) extensions.Deployment {
+		return extensions.Deployment{
+			Spec: extensions.DeploymentSpec{
+				ProgressDeadlineSeconds: pds,
+			},
+			Status: extensions.DeploymentStatus{
+				Conditions: []extensions.DeploymentCondition{
+					{
+						Type:               condType,
+						Status:             status,
+						LastTransitionTime: unversioned.Time{Time: from},
+					},
+				},
+			},
+		}
+	}
+
+	tests := []struct {
+		name string
+
+		d     extensions.Deployment
+		nowFn func() time.Time
+
+		expected bool
+	}{
+		{
+			name: "no progressDeadlineSeconds specified - no timeout",
+
+			d:        deployment(extensions.DeploymentProgressing, api.ConditionTrue, null, timeFn(1, 9)),
+			nowFn:    func() time.Time { return timeFn(1, 20) },
+			expected: false,
+		},
+		{
+			name: "progressDeadlineSeconds: 10s, now - started => 00:01:20 - 00:01:09 => 11s",
+
+			d:        deployment(extensions.DeploymentProgressing, api.ConditionTrue, &ten, timeFn(1, 9)),
+			nowFn:    func() time.Time { return timeFn(1, 20) },
+			expected: true,
+		},
+		{
+			name: "progressDeadlineSeconds: 10s, now - started => 00:01:20 - 00:01:11 => 9s",
+
+			d:        deployment(extensions.DeploymentProgressing, api.ConditionTrue, &ten, timeFn(1, 11)),
+			nowFn:    func() time.Time { return timeFn(1, 20) },
+			expected: false,
+		},
+	}
+
+	// nowFn is package-level state; put the original clock back when this test finishes so
+	// later tests in the package are not left with a frozen time.
+	defer func(orig func() time.Time) { nowFn = orig }(nowFn)
+
+	for _, test := range tests {
+		t.Log(test.name)
+
+		nowFn = test.nowFn
+		if got, exp := DeploymentTimedOut(&test.d, &test.d.Status), test.expected; got != exp {
+			t.Errorf("expected timeout: %t, got: %t", exp, got)
+		}
+	}
+}