Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #125796 from haorenfsa/fix-gc-sync-blocked
garbagecollector: controller should not be blocking on failed cache sync
Commit: 5dd244ff00
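
In short, this change stops the garbage collector from blocking forever when some informer caches never sync. Run() gains an initialSyncTimeout: it waits for the dependency graph builder only up to that deadline, logs the outcome, and starts its workers either way. Sync() no longer pauses the workers behind a lock or retries in a poll loop; it resets the RESTMapper, resyncs the monitors, and keeps the periodic cache-sync check only for logs and metrics. The sketch below is illustrative only (plain Go, standard library, invented names), showing the bounded-wait-then-proceed pattern the controller adopts:

package main

import (
    "context"
    "fmt"
    "time"
)

// boundedWait reports whether isSynced became true before the timeout elapsed
// or the context was cancelled. It never blocks longer than the timeout.
func boundedWait(ctx context.Context, timeout time.Duration, isSynced func() bool) bool {
    deadline := time.After(timeout)
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()
    for {
        if isSynced() {
            return true
        }
        select {
        case <-ctx.Done():
            return false
        case <-deadline:
            return false
        case <-ticker.C:
        }
    }
}

func main() {
    // Pretend one informer cache never syncs (e.g. a CRD with a broken conversion webhook).
    neverSynced := func() bool { return false }
    if !boundedWait(context.Background(), 2*time.Second, neverSynced) {
        fmt.Println("not all resource monitors could be synced, proceeding anyways")
    }
    // Workers would be started here regardless of the sync result.
}
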
@@ -698,11 +698,12 @@ func startGarbageCollectorController(ctx context.Context, controllerContext Cont
 
     // Start the garbage collector.
     workers := int(controllerContext.ComponentConfig.GarbageCollectorController.ConcurrentGCSyncs)
-    go garbageCollector.Run(ctx, workers)
+    const syncPeriod = 30 * time.Second
+    go garbageCollector.Run(ctx, workers, syncPeriod)
 
     // Periodically refresh the RESTMapper with new discovery information and sync
     // the garbage collector.
-    go garbageCollector.Sync(ctx, discoveryClient, 30*time.Second)
+    go garbageCollector.Sync(ctx, discoveryClient, syncPeriod)
 
     return garbageCollector, true, nil
 }
@@ -74,8 +74,6 @@ type GarbageCollector struct {
 
     kubeClient       clientset.Interface
     eventBroadcaster record.EventBroadcaster
-
-    workerLock sync.RWMutex
 }
 
 var _ controller.Interface = (*GarbageCollector)(nil)
@@ -131,7 +129,7 @@ func (gc *GarbageCollector) resyncMonitors(logger klog.Logger, deletableResource
 }
 
 // Run starts garbage collector workers.
-func (gc *GarbageCollector) Run(ctx context.Context, workers int) {
+func (gc *GarbageCollector) Run(ctx context.Context, workers int, initialSyncTimeout time.Duration) {
     defer utilruntime.HandleCrash()
     defer gc.attemptToDelete.ShutDown()
     defer gc.attemptToOrphan.ShutDown()
@@ -148,13 +146,15 @@ func (gc *GarbageCollector) Run(ctx context.Context, workers int) {
 
     go gc.dependencyGraphBuilder.Run(ctx)
 
-    if !cache.WaitForNamedCacheSync("garbage collector", ctx.Done(), func() bool {
+    if !cache.WaitForNamedCacheSync("garbage collector", waitForStopOrTimeout(ctx.Done(), initialSyncTimeout), func() bool {
         return gc.dependencyGraphBuilder.IsSynced(logger)
     }) {
-        return
+        logger.Info("Garbage collector: not all resource monitors could be synced, proceeding anyways")
+    } else {
+        logger.Info("Garbage collector: all resource monitors have synced")
     }
 
-    logger.Info("All resource monitors have synced. Proceeding to collect garbage")
+    logger.Info("Proceeding to collect garbage")
 
     // gc workers
     for i := 0; i < workers; i++ {
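
The new code above bounds the initial wait with waitForStopOrTimeout(ctx.Done(), initialSyncTimeout). That helper is not part of this diff (it already exists in the garbagecollector package); the sketch below shows the shape such a helper is assumed to have — a channel that closes when either the stop channel closes or the timeout elapses — and may differ from the upstream implementation in detail:

package gcsketch

import "time"

// waitForStopOrTimeout (assumed shape) returns a channel that is closed as soon
// as stopCh is closed or the timeout expires, whichever comes first. Passing it
// to cache.WaitForNamedCacheSync turns an unbounded wait into a bounded one.
func waitForStopOrTimeout(stopCh <-chan struct{}, timeout time.Duration) <-chan struct{} {
    stopChWithTimeout := make(chan struct{})
    go func() {
        defer close(stopChWithTimeout)
        select {
        case <-stopCh:
        case <-time.After(timeout):
        }
    }()
    return stopChWithTimeout
}
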
@@ -166,8 +166,8 @@ func (gc *GarbageCollector) Run(ctx context.Context, workers int) {
 }
 
 // Sync periodically resyncs the garbage collector when new resources are
-// observed from discovery. When new resources are detected, Sync will stop all
-// GC workers, reset gc.restMapper, and resync the monitors.
+// observed from discovery. When new resources are detected, it will reset
+// gc.restMapper, and resync the monitors.
 //
 // Note that discoveryClient should NOT be shared with gc.restMapper, otherwise
 // the mapper's underlying discovery client will be unnecessarily reset during
@@ -200,85 +200,48 @@ func (gc *GarbageCollector) Sync(ctx context.Context, discoveryClient discovery.
             return
         }
 
-        // Ensure workers are paused to avoid processing events before informers
-        // have resynced.
-        gc.workerLock.Lock()
-        defer gc.workerLock.Unlock()
+        logger.V(2).Info(
+            "syncing garbage collector with updated resources from discovery",
+            "diff", printDiff(oldResources, newResources),
+        )
 
-        // Once we get here, we should not unpause workers until we've successfully synced
-        attempt := 0
-        wait.PollImmediateUntilWithContext(ctx, 100*time.Millisecond, func(ctx context.Context) (bool, error) {
-            attempt++
-
-            // On a reattempt, check if available resources have changed
-            if attempt > 1 {
-                newResources, err = GetDeletableResources(logger, discoveryClient)
-
-                if len(newResources) == 0 {
-                    logger.V(2).Info("no resources reported by discovery", "attempt", attempt)
-                    metrics.GarbageCollectorResourcesSyncError.Inc()
-                    return false, nil
-                }
-                if groupLookupFailures, isLookupFailure := discovery.GroupDiscoveryFailedErrorGroups(err); isLookupFailure {
-                    // In partial discovery cases, preserve existing synced informers for resources in the failed groups, so resyncMonitors will only add informers for newly seen resources
-                    for k, v := range oldResources {
-                        if _, failed := groupLookupFailures[k.GroupVersion()]; failed && gc.dependencyGraphBuilder.IsResourceSynced(k) {
-                            newResources[k] = v
-                        }
-                    }
-                }
-            }
-
-            logger.V(2).Info(
-                "syncing garbage collector with updated resources from discovery",
-                "attempt", attempt,
-                "diff", printDiff(oldResources, newResources),
-            )
-
-            // Resetting the REST mapper will also invalidate the underlying discovery
-            // client. This is a leaky abstraction and assumes behavior about the REST
-            // mapper, but we'll deal with it for now.
-            gc.restMapper.Reset()
-            logger.V(4).Info("reset restmapper")
-
-            // Perform the monitor resync and wait for controllers to report cache sync.
-            //
-            // NOTE: It's possible that newResources will diverge from the resources
-            // discovered by restMapper during the call to Reset, since they are
-            // distinct discovery clients invalidated at different times. For example,
-            // newResources may contain resources not returned in the restMapper's
-            // discovery call if the resources appeared in-between the calls. In that
-            // case, the restMapper will fail to map some of newResources until the next
-            // attempt.
-            if err := gc.resyncMonitors(logger, newResources); err != nil {
-                utilruntime.HandleError(fmt.Errorf("failed to sync resource monitors (attempt %d): %v", attempt, err))
-                metrics.GarbageCollectorResourcesSyncError.Inc()
-                return false, nil
-            }
-            logger.V(4).Info("resynced monitors")
-
-            // wait for caches to fill for a while (our sync period) before attempting to rediscover resources and retry syncing.
-            // this protects us from deadlocks where available resources changed and one of our informer caches will never fill.
-            // informers keep attempting to sync in the background, so retrying doesn't interrupt them.
-            // the call to resyncMonitors on the reattempt will no-op for resources that still exist.
-            // note that workers stay paused until we successfully resync.
-            if !cache.WaitForNamedCacheSync("garbage collector", waitForStopOrTimeout(ctx.Done(), period), func() bool {
-                return gc.dependencyGraphBuilder.IsSynced(logger)
-            }) {
-                utilruntime.HandleError(fmt.Errorf("timed out waiting for dependency graph builder sync during GC sync (attempt %d)", attempt))
-                metrics.GarbageCollectorResourcesSyncError.Inc()
-                return false, nil
-            }
-
-            // success, break out of the loop
-            return true, nil
-        })
+        // Resetting the REST mapper will also invalidate the underlying discovery
+        // client. This is a leaky abstraction and assumes behavior about the REST
+        // mapper, but we'll deal with it for now.
+        gc.restMapper.Reset()
+        logger.V(4).Info("reset restmapper")
+
+        // Perform the monitor resync and wait for controllers to report cache sync.
+        //
+        // NOTE: It's possible that newResources will diverge from the resources
+        // discovered by restMapper during the call to Reset, since they are
+        // distinct discovery clients invalidated at different times. For example,
+        // newResources may contain resources not returned in the restMapper's
+        // discovery call if the resources appeared in-between the calls. In that
+        // case, the restMapper will fail to map some of newResources until the next
+        // attempt.
+        if err := gc.resyncMonitors(logger, newResources); err != nil {
+            utilruntime.HandleError(fmt.Errorf("failed to sync resource monitors: %w", err))
+            metrics.GarbageCollectorResourcesSyncError.Inc()
+            return
+        }
+        logger.V(4).Info("resynced monitors")
+
+        // gc worker no longer waits for cache to be synced, but we will keep the periodical check to provide logs & metrics
+        cacheSynced := cache.WaitForNamedCacheSync("garbage collector", waitForStopOrTimeout(ctx.Done(), period), func() bool {
+            return gc.dependencyGraphBuilder.IsSynced(logger)
+        })
+        if cacheSynced {
+            logger.V(2).Info("synced garbage collector")
+        } else {
+            utilruntime.HandleError(fmt.Errorf("timed out waiting for dependency graph builder sync during GC sync"))
+            metrics.GarbageCollectorResourcesSyncError.Inc()
+        }
 
-        // Finally, keep track of our new state. Do this after all preceding steps
-        // have succeeded to ensure we'll retry on subsequent syncs if an error
-        // occurred.
+        // Finally, keep track of our new resource monitor state.
+        // Monitors where the cache sync times out are still tracked here as
+        // subsequent runs should stop them if their resources were removed.
         oldResources = newResources
-        logger.V(2).Info("synced garbage collector")
     }, period)
 }
 
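
With the retry loop and the worker lock gone, Sync() no longer guarantees that all caches are synced; a caller that cares about sync state now asks the collector directly, as the updated tests below do with gc.IsSynced and the GarbageCollectorResourcesSyncError counter. A hedged sketch of that pattern follows; it assumes a *garbagecollector.GarbageCollector named gc that has been wired up elsewhere and that these packages are importable from the caller's module:

package gcsketch

import (
    "context"
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
    metricsutil "k8s.io/component-base/metrics/testutil"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/controller/garbagecollector"
    "k8s.io/kubernetes/pkg/controller/garbagecollector/metrics"
)

// waitForGCSync polls the collector instead of assuming Run/Sync block until
// every monitor is synced. On timeout it reports the sync-error counter that
// Sync increments whenever a monitor resync or cache sync fails.
func waitForGCSync(ctx context.Context, gc *garbagecollector.GarbageCollector, logger klog.Logger) {
    if err := wait.PollUntilContextTimeout(ctx, time.Second, time.Minute, true, func(ctx context.Context) (bool, error) {
        return gc.IsSynced(logger), nil
    }); err != nil {
        // Some monitors still have not synced; GC keeps running for the rest.
        val, _ := metricsutil.GetCounterMetricValue(metrics.GarbageCollectorResourcesSyncError)
        fmt.Printf("garbage collector not fully synced yet, sync error count: %v\n", val)
    }
}
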
@@ -328,8 +291,6 @@ var namespacedOwnerOfClusterScopedObjectErr = goerrors.New("cluster-scoped objec
 
 func (gc *GarbageCollector) processAttemptToDeleteWorker(ctx context.Context) bool {
     item, quit := gc.attemptToDelete.Get()
-    gc.workerLock.RLock()
-    defer gc.workerLock.RUnlock()
     if quit {
         return false
     }
@@ -754,8 +715,6 @@ func (gc *GarbageCollector) runAttemptToOrphanWorker(logger klog.Logger) {
 // these steps fail.
 func (gc *GarbageCollector) processAttemptToOrphanWorker(logger klog.Logger) bool {
     item, quit := gc.attemptToOrphan.Get()
-    gc.workerLock.RLock()
-    defer gc.workerLock.RUnlock()
     if quit {
         return false
     }
@@ -24,6 +24,7 @@ import (
     "reflect"
     "strings"
     "sync"
+    "sync/atomic"
     "testing"
     "time"
 
@@ -49,6 +50,7 @@ import (
     "k8s.io/apimachinery/pkg/util/json"
     "k8s.io/apimachinery/pkg/util/sets"
     "k8s.io/apimachinery/pkg/util/strategicpatch"
+    "k8s.io/apimachinery/pkg/util/wait"
     "k8s.io/client-go/discovery"
     "k8s.io/client-go/informers"
     "k8s.io/client-go/kubernetes"
@@ -60,9 +62,11 @@ import (
     clientgotesting "k8s.io/client-go/testing"
     "k8s.io/client-go/tools/record"
     "k8s.io/client-go/util/workqueue"
+    metricsutil "k8s.io/component-base/metrics/testutil"
     "k8s.io/controller-manager/pkg/informerfactory"
     "k8s.io/kubernetes/pkg/api/legacyscheme"
     c "k8s.io/kubernetes/pkg/controller"
+    "k8s.io/kubernetes/pkg/controller/garbagecollector/metrics"
     "k8s.io/kubernetes/test/utils/ktesting"
 )
 
@@ -120,7 +124,7 @@ func TestGarbageCollectorConstruction(t *testing.T) {
     }
     assert.Len(t, gc.dependencyGraphBuilder.monitors, 1)
 
-    go gc.Run(tCtx, 1)
+    go gc.Run(tCtx, 1, 5*time.Second)
 
     err = gc.resyncMonitors(logger, twoResources)
     if err != nil {
@@ -814,7 +818,8 @@ func TestGetDeletableResources(t *testing.T) {
 }
 
 // TestGarbageCollectorSync ensures that a discovery client error
-// will not cause the garbage collector to block infinitely.
+// or an informer sync error will not cause the garbage collector
+// to block infinitely.
 func TestGarbageCollectorSync(t *testing.T) {
     serverResources := []*metav1.APIResourceList{
         {
@@ -845,7 +850,6 @@ func TestGarbageCollectorSync(t *testing.T) {
         PreferredResources: serverResources,
         Error:              nil,
         Lock:               sync.Mutex{},
-        InterfaceUsedCount: 0,
     }
 
     testHandler := &fakeActionHandler{
@@ -864,7 +868,24 @@ func TestGarbageCollectorSync(t *testing.T) {
             },
         },
     }
-    srv, clientConfig := testServerAndClientConfig(testHandler.ServeHTTP)
+
+    testHandler2 := &fakeActionHandler{
+        response: map[string]FakeResponse{
+            "GET" + "/api/v1/secrets": {
+                200,
+                []byte("{}"),
+            },
+        },
+    }
+    var secretSyncOK atomic.Bool
+    var alternativeTestHandler = func(response http.ResponseWriter, request *http.Request) {
+        if request.URL.Path == "/api/v1/secrets" && secretSyncOK.Load() {
+            testHandler2.ServeHTTP(response, request)
+            return
+        }
+        testHandler.ServeHTTP(response, request)
+    }
+    srv, clientConfig := testServerAndClientConfig(alternativeTestHandler)
     defer srv.Close()
     clientConfig.ContentConfig.NegotiatedSerializer = nil
     client, err := kubernetes.NewForConfig(clientConfig)
@@ -884,7 +905,7 @@ func TestGarbageCollectorSync(t *testing.T) {
 
     sharedInformers := informers.NewSharedInformerFactory(client, 0)
 
-    tCtx := ktesting.Init(t)
+    logger, tCtx := ktesting.NewTestContext(t)
     defer tCtx.Cancel("test has completed")
     alwaysStarted := make(chan struct{})
     close(alwaysStarted)
@@ -893,7 +914,8 @@ func TestGarbageCollectorSync(t *testing.T) {
         t.Fatal(err)
     }
 
-    go gc.Run(tCtx, 1)
+    syncPeriod := 200 * time.Millisecond
+    go gc.Run(tCtx, 1, syncPeriod)
     // The pseudo-code of GarbageCollector.Sync():
     // GarbageCollector.Sync(client, period, stopCh):
     //    wait.Until() loops with `period` until the `stopCh` is closed :
@@ -908,14 +930,14 @@ func TestGarbageCollectorSync(t *testing.T) {
     // The 1s sleep in the test allows GetDeletableResources and
     // gc.resyncMonitors to run ~5 times to ensure the changes to the
     // fakeDiscoveryClient are picked up.
-    go gc.Sync(tCtx, fakeDiscoveryClient, 200*time.Millisecond)
+    go gc.Sync(tCtx, fakeDiscoveryClient, syncPeriod)
 
     // Wait until the sync discovers the initial resources
     time.Sleep(1 * time.Second)
 
-    err = expectSyncNotBlocked(fakeDiscoveryClient, &gc.workerLock)
+    err = expectSyncNotBlocked(fakeDiscoveryClient)
     if err != nil {
-        t.Fatalf("Expected garbagecollector.Sync to be running but it is blocked: %v", err)
+        t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
     }
     assertMonitors(t, gc, "pods", "deployments")
 
@@ -930,7 +952,7 @@ func TestGarbageCollectorSync(t *testing.T) {
     // Remove the error from being returned and see if the garbage collector sync is still working
     fakeDiscoveryClient.setPreferredResources(serverResources, nil)
 
-    err = expectSyncNotBlocked(fakeDiscoveryClient, &gc.workerLock)
+    err = expectSyncNotBlocked(fakeDiscoveryClient)
     if err != nil {
         t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
     }
@@ -946,7 +968,7 @@ func TestGarbageCollectorSync(t *testing.T) {
     // Put the resources back to normal and ensure garbage collector sync recovers
     fakeDiscoveryClient.setPreferredResources(serverResources, nil)
 
-    err = expectSyncNotBlocked(fakeDiscoveryClient, &gc.workerLock)
+    err = expectSyncNotBlocked(fakeDiscoveryClient)
     if err != nil {
         t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
     }
@@ -963,12 +985,33 @@ func TestGarbageCollectorSync(t *testing.T) {
     fakeDiscoveryClient.setPreferredResources(serverResources, nil)
     // Wait until sync discovers the change
     time.Sleep(1 * time.Second)
-    err = expectSyncNotBlocked(fakeDiscoveryClient, &gc.workerLock)
+    err = expectSyncNotBlocked(fakeDiscoveryClient)
     if err != nil {
         t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
     }
     // Unsyncable monitor removed
     assertMonitors(t, gc, "pods", "deployments")
+
+    // Simulate initial not-synced informer which will be synced at the end.
+    metrics.GarbageCollectorResourcesSyncError.Reset()
+    fakeDiscoveryClient.setPreferredResources(unsyncableServerResources, nil)
+    time.Sleep(1 * time.Second)
+    assertMonitors(t, gc, "pods", "secrets")
+    if gc.IsSynced(logger) {
+        t.Fatal("cache from garbage collector should not be synced")
+    }
+    val, _ := metricsutil.GetCounterMetricValue(metrics.GarbageCollectorResourcesSyncError)
+    if val < 1 {
+        t.Fatalf("expect sync error metric > 0")
+    }
+
+    // The informer is synced now.
+    secretSyncOK.Store(true)
+    if err := wait.PollUntilContextTimeout(tCtx, time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
+        return gc.IsSynced(logger), nil
+    }); err != nil {
+        t.Fatal(err)
+    }
 }
 
 func assertMonitors(t *testing.T, gc *GarbageCollector, resources ...string) {
@@ -983,27 +1026,15 @@ func assertMonitors(t *testing.T, gc *GarbageCollector, resources ...string) {
     }
 }
 
-func expectSyncNotBlocked(fakeDiscoveryClient *fakeServerResources, workerLock *sync.RWMutex) error {
+func expectSyncNotBlocked(fakeDiscoveryClient *fakeServerResources) error {
     before := fakeDiscoveryClient.getInterfaceUsedCount()
     t := 1 * time.Second
     time.Sleep(t)
     after := fakeDiscoveryClient.getInterfaceUsedCount()
     if before == after {
-        return fmt.Errorf("discoveryClient.ServerPreferredResources() called %d times over %v", after-before, t)
-    }
-
-    workerLockAcquired := make(chan struct{})
-    go func() {
-        workerLock.Lock()
-        defer workerLock.Unlock()
-        close(workerLockAcquired)
-    }()
-    select {
-    case <-workerLockAcquired:
-        return nil
-    case <-time.After(t):
-        return fmt.Errorf("workerLock blocked for at least %v", t)
+        return fmt.Errorf("discoveryClient.ServerPreferredResources() not called over %v", t)
     }
+    return nil
 }
 
 type fakeServerResources struct {
@@ -81,6 +81,29 @@ const oneValidOwnerPodName = "test.pod.3"
 const toBeDeletedRCName = "test.rc.1"
 const remainingRCName = "test.rc.2"
 
+// testCert was generated from crypto/tls/generate_cert.go with the following command:
+//
+// go run generate_cert.go --rsa-bits 2048 --host 127.0.0.1,::1,example.com --ca --start-date "Jan 1 00:00:00 1970" --duration=1000000h
+var testCert = []byte(`-----BEGIN CERTIFICATE-----
+MIIDGDCCAgCgAwIBAgIQTKCKn99d5HhQVCLln2Q+eTANBgkqhkiG9w0BAQsFADAS
+MRAwDgYDVQQKEwdBY21lIENvMCAXDTcwMDEwMTAwMDAwMFoYDzIwODQwMTI5MTYw
+MDAwWjASMRAwDgYDVQQKEwdBY21lIENvMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8A
+MIIBCgKCAQEA1Z5/aTwqY706M34tn60l8ZHkanWDl8mM1pYf4Q7qg3zA9XqWLX6S
+4rTYDYCb4stEasC72lQnbEWHbthiQE76zubP8WOFHdvGR3mjAvHWz4FxvLOTheZ+
+3iDUrl6Aj9UIsYqzmpBJAoY4+vGGf+xHvuukHrVcFqR9ZuBdZuJ/HbbjUyuNr3X9
+erNIr5Ha17gVzf17SNbYgNrX9gbCeEB8Z9Ox7dVuJhLDkpF0T/B5Zld3BjyUVY/T
+cukU4dTVp6isbWPvCMRCZCCOpb+qIhxEjJ0n6tnPt8nf9lvDl4SWMl6X1bH+2EFa
+a8R06G0QI+XhwPyjXUyCR8QEOZPCR5wyqQIDAQABo2gwZjAOBgNVHQ8BAf8EBAMC
+AqQwEwYDVR0lBAwwCgYIKwYBBQUHAwEwDwYDVR0TAQH/BAUwAwEB/zAuBgNVHREE
+JzAlggtleGFtcGxlLmNvbYcEfwAAAYcQAAAAAAAAAAAAAAAAAAAAATANBgkqhkiG
+9w0BAQsFAAOCAQEAThqgJ/AFqaANsOp48lojDZfZBFxJQ3A4zfR/MgggUoQ9cP3V
+rxuKAFWQjze1EZc7J9iO1WvH98lOGVNRY/t2VIrVoSsBiALP86Eew9WucP60tbv2
+8/zsBDSfEo9Wl+Q/gwdEh8dgciUKROvCm76EgAwPGicMAgRsxXgwXHhS5e8nnbIE
+Ewaqvb5dY++6kh0Oz+adtNT5OqOwXTIRI67WuEe6/B3Z4LNVPQDIj7ZUJGNw8e6L
+F4nkUthwlKx4yEJHZBRuFPnO7Z81jNKuwL276+mczRH7piI6z9uyMV/JbEsOIxyL
+W6CzB7pZ9Nj1YLpgzc1r6oONHLokMJJIz/IvkQ==
+-----END CERTIFICATE-----`)
+
 func newPod(podName, podNamespace string, ownerReferences []metav1.OwnerReference) *v1.Pod {
     for i := 0; i < len(ownerReferences); i++ {
         if len(ownerReferences[i].Kind) == 0 {
@@ -252,6 +275,7 @@ func setupWithServer(t *testing.T, result *kubeapiservertesting.TestServer, work
     logger := tCtx.Logger()
     alwaysStarted := make(chan struct{})
     close(alwaysStarted)
+
     gc, err := garbagecollector.NewGarbageCollector(
         tCtx,
         clientSet,
@@ -277,7 +301,7 @@ func setupWithServer(t *testing.T, result *kubeapiservertesting.TestServer, work
         // mapper, but we'll deal with it for now.
         restMapper.Reset()
     }, syncPeriod, tCtx.Done())
-    go gc.Run(tCtx, workers)
+    go gc.Run(tCtx, workers, syncPeriod)
     go gc.Sync(tCtx, clientSet.Discovery(), syncPeriod)
 }
 
@@ -1285,3 +1309,121 @@ func testCRDDeletion(t *testing.T, ctx *testContext, ns *v1.Namespace, definitio
         t.Fatalf("failed waiting for dependent %q (owned by %q) to be deleted", dependent.GetName(), owner.GetName())
     }
 }
+
+// TestCascadingDeleteOnCRDConversionFailure tests that a bad conversion webhook cannot block the entire GC controller.
+// Historically, a cache sync failure from a single resource prevented GC controller from running. This test creates
+// a CRD, updates the storage version with a bad conversion webhook and then runs a simple cascading delete test.
+func TestCascadingDeleteOnCRDConversionFailure(t *testing.T) {
+    ctx := setup(t, 0)
+    defer ctx.tearDown()
+    gc, apiExtensionClient, dynamicClient, clientSet := ctx.gc, ctx.apiExtensionClient, ctx.dynamicClient, ctx.clientSet
+
+    ns := createNamespaceOrDie("gc-cache-sync-fail", clientSet, t)
+    defer deleteNamespaceOrDie(ns.Name, clientSet, t)
+
+    // Create a CRD with storage/serving version v1beta2. Then update the CRD with v1 as the storage version
+    // and an invalid conversion webhook. This should result in cache sync failures for the CRD from the GC controller.
+    def, dc := createRandomCustomResourceDefinition(t, apiExtensionClient, dynamicClient, ns.Name)
+    _, err := dc.Create(context.TODO(), newCRDInstance(def, ns.Name, names.SimpleNameGenerator.GenerateName("test")), metav1.CreateOptions{})
+    if err != nil {
+        t.Fatalf("Failed to create custom resource: %v", err)
+    }
+
+    def, err = apiExtensionClient.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), def.Name, metav1.GetOptions{})
+    if err != nil {
+        t.Fatalf("Failed to get custom resource: %v", err)
+    }
+
+    newDefinition := def.DeepCopy()
+    newDefinition.Spec.Conversion = &apiextensionsv1.CustomResourceConversion{
+        Strategy: apiextensionsv1.WebhookConverter,
+        Webhook: &apiextensionsv1.WebhookConversion{
+            ClientConfig: &apiextensionsv1.WebhookClientConfig{
+                Service: &apiextensionsv1.ServiceReference{
+                    Name:      "foobar",
+                    Namespace: ns.Name,
+                },
+                CABundle: testCert,
+            },
+            ConversionReviewVersions: []string{
+                "v1", "v1beta1",
+            },
+        },
+    }
+    newDefinition.Spec.Versions = []apiextensionsv1.CustomResourceDefinitionVersion{
+        {
+            Name:    "v1",
+            Served:  true,
+            Storage: true,
+            Schema:  apiextensionstestserver.AllowAllSchema(),
+        },
+        {
+            Name:    "v1beta1",
+            Served:  true,
+            Storage: false,
+            Schema:  apiextensionstestserver.AllowAllSchema(),
+        },
+    }
+
+    _, err = apiExtensionClient.ApiextensionsV1().CustomResourceDefinitions().Update(context.TODO(), newDefinition, metav1.UpdateOptions{})
+    if err != nil {
+        t.Fatalf("Error updating CRD with conversion webhook: %v", err)
+    }
+
+    ctx.startGC(5)
+    // make sure gc.Sync finds the new CRD and starts monitoring it
+    time.Sleep(ctx.syncPeriod + 1*time.Second)
+
+    rcClient := clientSet.CoreV1().ReplicationControllers(ns.Name)
+    podClient := clientSet.CoreV1().Pods(ns.Name)
+
+    toBeDeletedRC, err := rcClient.Create(context.TODO(), newOwnerRC(toBeDeletedRCName, ns.Name), metav1.CreateOptions{})
+    if err != nil {
+        t.Fatalf("Failed to create replication controller: %v", err)
+    }
+
+    rcs, err := rcClient.List(context.TODO(), metav1.ListOptions{})
+    if err != nil {
+        t.Fatalf("Failed to list replication controllers: %v", err)
+    }
+    if len(rcs.Items) != 1 {
+        t.Fatalf("Expect only 1 replication controller")
+    }
+
+    pod := newPod(garbageCollectedPodName, ns.Name, []metav1.OwnerReference{{UID: toBeDeletedRC.ObjectMeta.UID, Name: toBeDeletedRCName}})
+    _, err = podClient.Create(context.TODO(), pod, metav1.CreateOptions{})
+    if err != nil {
+        t.Fatalf("Failed to create Pod: %v", err)
+    }
+
+    pods, err := podClient.List(context.TODO(), metav1.ListOptions{})
+    if err != nil {
+        t.Fatalf("Failed to list pods: %v", err)
+    }
+    if len(pods.Items) != 1 {
+        t.Fatalf("Expect only 1 pods")
+    }
+
+    if err := rcClient.Delete(context.TODO(), toBeDeletedRCName, getNonOrphanOptions()); err != nil {
+        t.Fatalf("failed to delete replication controller: %v", err)
+    }
+
+    // sometimes the deletion of the RC takes long time to be observed by
+    // the gc, so wait for the garbage collector to observe the deletion of
+    // the toBeDeletedRC
+    if err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
+        return !gc.GraphHasUID(toBeDeletedRC.ObjectMeta.UID), nil
+    }); err != nil {
+        t.Fatal(err)
+    }
+    if err := integration.WaitForPodToDisappear(podClient, garbageCollectedPodName, 1*time.Second, 30*time.Second); err != nil {
+        t.Fatalf("expect pod %s to be garbage collected, got err= %v", garbageCollectedPodName, err)
+    }
+
+    // Check that the cache is still not synced after cascading delete succeeded
+    // If this check passes, check that the conversion webhook is correctly misconfigured
+    // to prevent watch cache from listing the CRD.
+    if ctx.gc.IsSynced(ctx.logger) {
+        t.Fatal("cache is not expected to be synced due to bad conversion webhook")
+    }
+}
@@ -219,7 +219,7 @@ func CreateGCController(ctx context.Context, tb ktesting.TB, restConfig restclie
         go wait.Until(func() {
             restMapper.Reset()
         }, syncPeriod, ctx.Done())
-        go gc.Run(ctx, 1)
+        go gc.Run(ctx, 1, syncPeriod)
         go gc.Sync(ctx, clientSet.Discovery(), syncPeriod)
     }
     return startGC