From ada5b33d98607e33e1620f8c091077e219f280ab Mon Sep 17 00:00:00 2001 From: Colleen Murphy Date: Thu, 19 May 2022 12:32:23 -0700 Subject: [PATCH 1/2] Return websocket error and add logging for watches Add debug logs and send websocket messages when the watch is closed unexpectedly. In addition to being helpful for debugging, the dashboard specifically looks for a `resource.error` event containing the string "too old" in order to trigger the watch to be resynced with a refreshed revision number. Without this error returned, the dashboard will only see `resource.stop` events and never change its behavior, continuing to try to restart the watch with an incorrect resource version. --- pkg/stores/proxy/proxy_store.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/stores/proxy/proxy_store.go b/pkg/stores/proxy/proxy_store.go index b707923..3bb521e 100644 --- a/pkg/stores/proxy/proxy_store.go +++ b/pkg/stores/proxy/proxy_store.go @@ -318,6 +318,9 @@ func (s *Store) listAndWatch(apiOp *types.APIRequest, client dynamic.ResourceInt obj, err := s.byID(apiOp, schema, rel.Namespace, rel.Name) if err == nil { result <- s.toAPIEvent(apiOp, schema, watch.Modified, obj) + } else { + logrus.Debugf("notifier watch error: %v", err) + returnErr(errors.Wrapf(err, "notifier watch error: %v", err), result) } } return fmt.Errorf("closed") @@ -327,6 +330,12 @@ func (s *Store) listAndWatch(apiOp *types.APIRequest, client dynamic.ResourceInt eg.Go(func() error { for event := range watcher.ResultChan() { if event.Type == watch.Error { + if status, ok := event.Object.(*metav1.Status); ok { + logrus.Debugf("event watch error: %s", status.Message) + returnErr(fmt.Errorf("event watch error: %s", status.Message), result) + } else { + logrus.Debugf("event watch error: could not decode event object %T", event.Object) + } continue } result <- s.toAPIEvent(apiOp, schema, event.Type, event.Object) From 11fe86ab7ed130f6af59f5f335ffe93d0d2a2fb2 Mon Sep 17 00:00:00 2001 From: Colleen Murphy Date: Thu, 19 May 2022 14:25:13 -0700 Subject: [PATCH 2/2] Make watch timeout configurable By default, a watch times out after 30 minutes. For debugging purposes, it's convenient if this can be decreased. Add an environment variable CATTLE_WATCH_TIMEOUT_SECONDS to enable setting the timeout in seconds. --- pkg/stores/proxy/proxy_store.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/stores/proxy/proxy_store.go b/pkg/stores/proxy/proxy_store.go index 3bb521e..ce9fd99 100644 --- a/pkg/stores/proxy/proxy_store.go +++ b/pkg/stores/proxy/proxy_store.go @@ -7,8 +7,10 @@ import ( "io" "io/ioutil" "net/http" + "os" "reflect" "regexp" + "strconv" "github.com/pkg/errors" "github.com/rancher/apiserver/pkg/types" @@ -32,6 +34,8 @@ import ( "k8s.io/client-go/kubernetes" ) +const watchTimeoutEnv = "CATTLE_WATCH_TIMEOUT_SECONDS" + var ( lowerChars = regexp.MustCompile("[a-z]+") paramScheme = runtime.NewScheme() @@ -291,6 +295,15 @@ func (s *Store) listAndWatch(apiOp *types.APIRequest, client dynamic.ResourceInt } timeout := int64(60 * 30) + timeoutSetting := os.Getenv(watchTimeoutEnv) + if timeoutSetting != "" { + userSetTimeout, err := strconv.Atoi(timeoutSetting) + if err != nil { + logrus.Debugf("could not parse %s environment variable, error: %v", watchTimeoutEnv, err) + } else { + timeout = int64(userSetTimeout) + } + } k8sClient, _ := metricsStore.Wrap(client, nil) watcher, err := k8sClient.Watch(apiOp, metav1.ListOptions{ Watch: true,