ReplicaSet controller can set/remove ControllerRef

This commit is contained in:
Wojciech Tyczynski
2016-07-19 11:46:11 +02:00
parent 68687013e8
commit 85de930a66
4 changed files with 395 additions and 112 deletions

View File

@@ -27,8 +27,10 @@ import (
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
"k8s.io/kubernetes/pkg/client/cache"
clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
unversionedcore "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/core/unversioned"
@@ -38,6 +40,7 @@ import (
"k8s.io/kubernetes/pkg/controller/framework/informers"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/runtime"
utilerrors "k8s.io/kubernetes/pkg/util/errors"
"k8s.io/kubernetes/pkg/util/metrics"
utilruntime "k8s.io/kubernetes/pkg/util/runtime"
"k8s.io/kubernetes/pkg/util/wait"
@@ -63,6 +66,10 @@ const (
statusUpdateRetries = 1
)
func getRSKind() unversioned.GroupVersionKind {
return v1beta1.SchemeGroupVersion.WithKind("ReplicaSet")
}
// ReplicaSetController is responsible for synchronizing ReplicaSet objects stored
// in the system with actual running pods.
type ReplicaSetController struct {
@@ -101,21 +108,25 @@ type ReplicaSetController struct {
// Controllers that need to be synced
queue *workqueue.Type
// garbageCollectorEnabled denotes if the garbage collector is enabled. RC
// manager behaves differently if GC is enabled.
garbageCollectorEnabled bool
}
// NewReplicaSetController creates a new ReplicaSetController.
func NewReplicaSetController(podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int) *ReplicaSetController {
func NewReplicaSetController(podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int, garbageCollectorEnabled bool) *ReplicaSetController {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(glog.Infof)
eventBroadcaster.StartRecordingToSink(&unversionedcore.EventSinkImpl{Interface: kubeClient.Core().Events("")})
return newReplicaSetController(
eventBroadcaster.NewRecorder(api.EventSource{Component: "replicaset-controller"}),
podInformer, kubeClient, resyncPeriod, burstReplicas, lookupCacheSize)
podInformer, kubeClient, resyncPeriod, burstReplicas, lookupCacheSize, garbageCollectorEnabled)
}
// newReplicaSetController configures a replica set controller with the specified event recorder
func newReplicaSetController(eventRecorder record.EventRecorder, podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int) *ReplicaSetController {
func newReplicaSetController(eventRecorder record.EventRecorder, podInformer framework.SharedIndexInformer, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int, garbageCollectorEnabled bool) *ReplicaSetController {
if kubeClient != nil && kubeClient.Core().GetRESTClient().GetRateLimiter() != nil {
metrics.RegisterMetricAndTrackRateLimiterUsage("replicaset_controller", kubeClient.Core().GetRESTClient().GetRateLimiter())
}
@@ -129,6 +140,7 @@ func newReplicaSetController(eventRecorder record.EventRecorder, podInformer fra
burstReplicas: burstReplicas,
expectations: controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
queue: workqueue.New(),
garbageCollectorEnabled: garbageCollectorEnabled,
}
rsc.rsStore.Store, rsc.rsController = framework.NewInformer(
@@ -144,43 +156,8 @@ func newReplicaSetController(eventRecorder record.EventRecorder, podInformer fra
// TODO: Can we have much longer period here?
FullControllerResyncPeriod,
framework.ResourceEventHandlerFuncs{
AddFunc: rsc.enqueueReplicaSet,
UpdateFunc: func(old, cur interface{}) {
oldRS := old.(*extensions.ReplicaSet)
curRS := cur.(*extensions.ReplicaSet)
// We should invalidate the whole lookup cache if a RS's selector has been updated.
//
// Imagine that you have two RSs:
// * old RS1
// * new RS2
// You also have a pod that is attached to RS2 (because it doesn't match RS1 selector).
// Now imagine that you are changing RS1 selector so that it is now matching that pod,
// in such case we must invalidate the whole cache so that pod could be adopted by RS1
//
// This makes the lookup cache less helpful, but selector update does not happen often,
// so it's not a big problem
if !reflect.DeepEqual(oldRS.Spec.Selector, curRS.Spec.Selector) {
rsc.lookupCache.InvalidateAll()
}
// You might imagine that we only really need to enqueue the
// replica set when Spec changes, but it is safer to sync any
// time this function is triggered. That way a full informer
// resync can requeue any replica set that don't yet have pods
// but whose last attempts at creating a pod have failed (since
// we don't block on creation of pods) instead of those
// replica sets stalling indefinitely. Enqueueing every time
// does result in some spurious syncs (like when Status.Replica
// is updated and the watch notification from it retriggers
// this function), but in general extra resyncs shouldn't be
// that bad as ReplicaSets that haven't met expectations yet won't
// sync, and all the listing is done using local stores.
if oldRS.Status.Replicas != curRS.Status.Replicas {
glog.V(4).Infof("Observed updated replica count for ReplicaSet: %v, %d->%d", curRS.Name, oldRS.Status.Replicas, curRS.Status.Replicas)
}
rsc.enqueueReplicaSet(cur)
},
AddFunc: rsc.enqueueReplicaSet,
UpdateFunc: rsc.updateRS,
// This will enter the sync loop and no-op, because the replica set has been deleted from the store.
// Note that deleting a replica set immediately after scaling it to 0 will not work. The recommended
// way of achieving this is by performing a `stop` operation on the replica set.
@@ -208,7 +185,8 @@ func newReplicaSetController(eventRecorder record.EventRecorder, podInformer fra
// NewReplicationManagerFromClient creates a new ReplicationManager that runs its own informer.
func NewReplicaSetControllerFromClient(kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, burstReplicas int, lookupCacheSize int) *ReplicaSetController {
podInformer := informers.CreateSharedPodIndexInformer(kubeClient, resyncPeriod())
rsc := NewReplicaSetController(podInformer, kubeClient, resyncPeriod, burstReplicas, lookupCacheSize)
garbageCollectorEnabled := false
rsc := NewReplicaSetController(podInformer, kubeClient, resyncPeriod, burstReplicas, lookupCacheSize, garbageCollectorEnabled)
rsc.internalPodInformer = podInformer
return rsc
}
@@ -239,13 +217,14 @@ func (rsc *ReplicaSetController) Run(workers int, stopCh <-chan struct{}) {
// getPodReplicaSet returns the replica set managing the given pod.
// TODO: Surface that we are ignoring multiple replica sets for a single pod.
// TODO: use ownerReference.Controller to determine if the rs controls the pod.
func (rsc *ReplicaSetController) getPodReplicaSet(pod *api.Pod) *extensions.ReplicaSet {
// look up in the cache, if cached and the cache is valid, just return cached value
if obj, cached := rsc.lookupCache.GetMatchingObject(pod); cached {
rs, ok := obj.(*extensions.ReplicaSet)
if !ok {
// This should not happen
glog.Errorf("lookup cache does not retuen a ReplicaSet object")
glog.Errorf("lookup cache does not return a ReplicaSet object")
return nil
}
if cached && rsc.isCacheValid(pod, rs) {
@@ -278,6 +257,44 @@ func (rsc *ReplicaSetController) getPodReplicaSet(pod *api.Pod) *extensions.Repl
return &rss[0]
}
// callback when RS is updated
func (rsc *ReplicaSetController) updateRS(old, cur interface{}) {
oldRS := old.(*extensions.ReplicaSet)
curRS := cur.(*extensions.ReplicaSet)
// We should invalidate the whole lookup cache if a RS's selector has been updated.
//
// Imagine that you have two RSs:
// * old RS1
// * new RS2
// You also have a pod that is attached to RS2 (because it doesn't match RS1 selector).
// Now imagine that you are changing RS1 selector so that it is now matching that pod,
// in such case we must invalidate the whole cache so that pod could be adopted by RS1
//
// This makes the lookup cache less helpful, but selector update does not happen often,
// so it's not a big problem
if !reflect.DeepEqual(oldRS.Spec.Selector, curRS.Spec.Selector) {
rsc.lookupCache.InvalidateAll()
}
// You might imagine that we only really need to enqueue the
// replica set when Spec changes, but it is safer to sync any
// time this function is triggered. That way a full informer
// resync can requeue any replica set that don't yet have pods
// but whose last attempts at creating a pod have failed (since
// we don't block on creation of pods) instead of those
// replica sets stalling indefinitely. Enqueueing every time
// does result in some spurious syncs (like when Status.Replica
// is updated and the watch notification from it retriggers
// this function), but in general extra resyncs shouldn't be
// that bad as ReplicaSets that haven't met expectations yet won't
// sync, and all the listing is done using local stores.
if oldRS.Status.Replicas != curRS.Status.Replicas {
glog.V(4).Infof("Observed updated replica count for ReplicaSet: %v, %d->%d", curRS.Name, oldRS.Status.Replicas, curRS.Status.Replicas)
}
rsc.enqueueReplicaSet(cur)
}
// isCacheValid check if the cache is valid
func (rsc *ReplicaSetController) isCacheValid(pod *api.Pod, cachedRS *extensions.ReplicaSet) bool {
_, exists, err := rsc.rsStore.Get(cachedRS)
@@ -357,9 +374,7 @@ func (rsc *ReplicaSetController) updatePod(old, cur interface{}) {
return
}
if rs := rsc.getPodReplicaSet(curPod); rs != nil {
rsc.enqueueReplicaSet(rs)
}
// Enqueue the oldRC before the curRC to give curRC a chance to adopt the oldPod.
if labelChanged {
// If the old and new ReplicaSet are the same, the first one that syncs
// will set expectations preventing any damage from the second.
@@ -367,6 +382,10 @@ func (rsc *ReplicaSetController) updatePod(old, cur interface{}) {
rsc.enqueueReplicaSet(oldRS)
}
}
if curRS := rsc.getPodReplicaSet(curPod); curRS != nil {
rsc.enqueueReplicaSet(curRS)
}
}
// When a pod is deleted, enqueue the replica set that manages the pod and update its expectations.
@@ -456,13 +475,28 @@ func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *ext
// into a performance bottleneck. We should generate a UID for the pod
// beforehand and store it via ExpectCreations.
rsc.expectations.ExpectCreations(rsKey, diff)
wait := sync.WaitGroup{}
wait.Add(diff)
var wg sync.WaitGroup
wg.Add(diff)
glog.V(2).Infof("Too few %q/%q replicas, need %d, creating %d", rs.Namespace, rs.Name, rs.Spec.Replicas, diff)
for i := 0; i < diff; i++ {
go func() {
defer wait.Done()
if err := rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs); err != nil {
defer wg.Done()
var err error
if rsc.garbageCollectorEnabled {
var trueVar = true
controllerRef := &api.OwnerReference{
APIVersion: getRSKind().GroupVersion().String(),
Kind: getRSKind().Kind,
Name: rs.Name,
UID: rs.UID,
Controller: &trueVar,
}
err = rsc.podControl.CreatePodsWithControllerRef(rs.Namespace, &rs.Spec.Template, rs, controllerRef)
} else {
err = rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs)
}
if err != nil {
// Decrement the expected number of creates because the informer won't observe this pod
glog.V(2).Infof("Failed creation, decrementing expectations for replica set %q/%q", rs.Namespace, rs.Name)
rsc.expectations.CreationObserved(rsKey)
@@ -470,7 +504,7 @@ func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *ext
}
}()
}
wait.Wait()
wg.Wait()
} else if diff > 0 {
if diff > rsc.burstReplicas {
diff = rsc.burstReplicas
@@ -494,11 +528,11 @@ func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *ext
deletedPodKeys = append(deletedPodKeys, controller.PodKey(filteredPods[i]))
}
rsc.expectations.ExpectDeletions(rsKey, deletedPodKeys)
wait := sync.WaitGroup{}
wait.Add(diff)
var wg sync.WaitGroup
wg.Add(diff)
for i := 0; i < diff; i++ {
go func(ix int) {
defer wait.Done()
defer wg.Done()
if err := rsc.podControl.DeletePod(rs.Namespace, filteredPods[ix].Name, rs); err != nil {
// Decrement the expected number of deletes because the informer won't observe this deletion
podKey := controller.PodKey(filteredPods[ix])
@@ -508,7 +542,7 @@ func (rsc *ReplicaSetController) manageReplicas(filteredPods []*api.Pod, rs *ext
}
}(i)
}
wait.Wait()
wg.Wait()
}
}
@@ -557,16 +591,60 @@ func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
glog.Errorf("Error converting pod selector to selector: %v", err)
return err
}
podList, err := rsc.podStore.Pods(rs.Namespace).List(selector)
if err != nil {
glog.Errorf("Error getting pods for ReplicaSet %q: %v", key, err)
rsc.queue.Add(key)
return err
// TODO: Do the List and Filter in a single pass, or use an index.
var filteredPods []*api.Pod
if rsc.garbageCollectorEnabled {
// list all pods to include the pods that don't match the rs`s selector
// anymore but has the stale controller ref.
podList, err := rsc.podStore.Pods(rs.Namespace).List(labels.Everything())
if err != nil {
glog.Errorf("Error getting pods for rs %q: %v", key, err)
rsc.queue.Add(key)
return err
}
cm := controller.NewPodControllerRefManager(rsc.podControl, rs.ObjectMeta, selector, getRSKind())
matchesAndControlled, matchesNeedsController, controlledDoesNotMatch := cm.Classify(podList.Items)
for _, pod := range matchesNeedsController {
err := cm.AdoptPod(pod)
// continue to next pod if adoption fails.
if err != nil {
// If the pod no longer exists, don't even log the error.
if !errors.IsNotFound(err) {
utilruntime.HandleError(err)
}
} else {
matchesAndControlled = append(matchesAndControlled, pod)
}
}
filteredPods = matchesAndControlled
// remove the controllerRef for the pods that no longer have matching labels
var errlist []error
for _, pod := range controlledDoesNotMatch {
err := cm.ReleasePod(pod)
if err != nil {
errlist = append(errlist, err)
}
}
if len(errlist) != 0 {
aggregate := utilerrors.NewAggregate(errlist)
// push the RS into work queue again. We need to try to free the
// pods again otherwise they will stuck with the stale
// controllerRef.
rsc.queue.Add(key)
return aggregate
}
} else {
podList, err := rsc.podStore.Pods(rs.Namespace).List(selector)
if err != nil {
glog.Errorf("Error getting pods for rs %q: %v", key, err)
rsc.queue.Add(key)
return err
}
filteredPods = controller.FilterActivePods(podList.Items)
}
// TODO: Do this in a single pass, or use an index.
filteredPods := controller.FilterActivePods(podList.Items)
if rsNeedsSync {
if rsNeedsSync && rs.DeletionTimestamp == nil {
rsc.manageReplicas(filteredPods, &rs)
}