From 04b864a91f6d95064e7b6dbbefaade6d0d2fb795 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 26 Jan 2015 20:23:46 -0500 Subject: [PATCH 1/2] Allow panics and unhandled errors to be reported to external targets It would be better if Kubernetes could be instrumented to report panics and handled but unreturned errors in controller loops to a remote service. This commit introduces settable handlers for HandleCrash and a new HandleError that can be overriden to report errors to a remote service for analysis. HandleError() in particular is for control loops that do not return an error, and so there is no ability to report those errors other than in log files. ReplicationController is provided as an example. --- pkg/util/util.go | 49 +++++++++++++++++++++++++++++++++---------- pkg/util/util_test.go | 35 +++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/pkg/util/util.go b/pkg/util/util.go index a18dc874db2..d7e6a37dea2 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -34,26 +34,53 @@ import ( // For testing, bypass HandleCrash. var ReallyCrash bool +// PanicHandlers is a list of functions which will be invoked when a panic happens. +var PanicHandlers = []func(interface{}){logPanic} + // HandleCrash simply catches a crash and logs an error. Meant to be called via defer. func HandleCrash() { if ReallyCrash { return } - - r := recover() - if r != nil { - callers := "" - for i := 0; true; i++ { - _, file, line, ok := runtime.Caller(i) - if !ok { - break - } - callers = callers + fmt.Sprintf("%v:%v\n", file, line) + if r := recover(); r != nil { + for _, fn := range PanicHandlers { + fn(r) } - glog.Infof("Recovered from panic: %#v (%v)\n%v", r, r, callers) } } +// logPanic logs the caller tree when a panic occurs. +func logPanic(r interface{}) { + callers := "" + for i := 0; true; i++ { + _, file, line, ok := runtime.Caller(i) + if !ok { + break + } + callers = callers + fmt.Sprintf("%v:%v\n", file, line) + } + glog.Infof("Recovered from panic: %#v (%v)\n%v", r, r, callers) +} + +// ErrorHandlers is a list of functions which will be invoked when an unreturnable +// error occurs. +var ErrorHandlers = []func(error){logError} + +// HandlerError is a method to invoke when a non-user facing piece of code cannot +// return an error and needs to indicate it has been ignored. Invoking this method +// is preferable to logging the error - the default behavior is to log but the +// errors may be sent to a remote server for analysis. +func HandleError(err error) { + for _, fn := range ErrorHandlers { + fn(err) + } +} + +// logError prints an error with the call stack of the location it was reported +func logError(err error) { + glog.ErrorDepth(2, err) +} + // Forever loops forever running f every period. Catches any panics, and keeps going. func Forever(f func(), period time.Duration) { Until(f, period, nil) diff --git a/pkg/util/util_test.go b/pkg/util/util_test.go index 73790c733ae..57ee2c91384 100644 --- a/pkg/util/util_test.go +++ b/pkg/util/util_test.go @@ -18,6 +18,7 @@ package util import ( "encoding/json" + "fmt" "reflect" "testing" @@ -59,6 +60,40 @@ func TestHandleCrash(t *testing.T) { } } +func TestCustomHandleCrash(t *testing.T) { + old := PanicHandlers + defer func() { PanicHandlers = old }() + var result interface{} + PanicHandlers = []func(interface{}){ + func(r interface{}) { + result = r + }, + } + func() { + defer HandleCrash() + panic("test") + }() + if result != "test" { + t.Errorf("did not receive custom handler") + } +} + +func TestCustomHandleError(t *testing.T) { + old := ErrorHandlers + defer func() { ErrorHandlers = old }() + var result error + ErrorHandlers = []func(error){ + func(err error) { + result = err + }, + } + err := fmt.Errorf("test") + HandleError(err) + if result != err { + t.Errorf("did not receive custom handler") + } +} + func TestNewIntOrStringFromInt(t *testing.T) { i := NewIntOrStringFromInt(93) if i.Kind != IntstrInt || i.IntVal != 93 { From 19987612bf429fcfd753c44bc4d8363d2b9a9dd3 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 26 Jan 2015 20:54:29 -0500 Subject: [PATCH 2/2] Replace code that eats errors with a more consistent reporting In order to allow certain programmer error / unexpected error states to be captured, replace sections that log and eat errors with a method that can be centrally replaced. This captures two common error sources, replication errors, and apiserver code that returns invalid errors. --- pkg/apiserver/apiserver.go | 3 ++- pkg/apiserver/errors.go | 4 ++-- pkg/controller/replication_controller.go | 19 ++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pkg/apiserver/apiserver.go b/pkg/apiserver/apiserver.go index 70eec069e37..6eb64c32129 100644 --- a/pkg/apiserver/apiserver.go +++ b/pkg/apiserver/apiserver.go @@ -31,6 +31,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/healthz" "github.com/GoogleCloudPlatform/kubernetes/pkg/runtime" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/version" "github.com/emicklei/go-restful" @@ -373,6 +374,7 @@ func errorJSON(err error, codec runtime.Codec, w http.ResponseWriter) { // errorJSONFatal renders an error to the response, and if codec fails will render plaintext func errorJSONFatal(err error, codec runtime.Codec, w http.ResponseWriter) { + util.HandleError(fmt.Errorf("apiserver was unable to write a JSON response: %v", err)) status := errToAPIStatus(err) output, err := codec.Encode(status) if err != nil { @@ -387,7 +389,6 @@ func errorJSONFatal(err error, codec runtime.Codec, w http.ResponseWriter) { // writeRawJSON writes a non-API object in JSON. func writeRawJSON(statusCode int, object interface{}, w http.ResponseWriter) { - // PR #2243: Pretty-print JSON by default. output, err := json.MarshalIndent(object, "", " ") if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) diff --git a/pkg/apiserver/errors.go b/pkg/apiserver/errors.go index bb859409523..91b7e564147 100644 --- a/pkg/apiserver/errors.go +++ b/pkg/apiserver/errors.go @@ -22,7 +22,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/tools" - "github.com/golang/glog" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" ) // statusError is an object that can be converted into an api.Status @@ -49,7 +49,7 @@ func errToAPIStatus(err error) *api.Status { // by REST storage - these typically indicate programmer // error by not using pkg/api/errors, or unexpected failure // cases. - glog.V(1).Infof("An unchecked error was received: %v", err) + util.HandleError(fmt.Errorf("apiserver received an error that is not an api.Status: %v", err)) return &api.Status{ Status: api.StatusFailure, Code: status, diff --git a/pkg/controller/replication_controller.go b/pkg/controller/replication_controller.go index b347d94c72c..704384edd1b 100644 --- a/pkg/controller/replication_controller.go +++ b/pkg/controller/replication_controller.go @@ -17,6 +17,7 @@ limitations under the License. package controller import ( + "fmt" "sync" "time" @@ -65,15 +66,15 @@ func (r RealPodControl) createReplica(namespace string, controller api.Replicati }, } if err := api.Scheme.Convert(&controller.Spec.Template.Spec, &pod.Spec); err != nil { - glog.Errorf("Unable to convert pod template: %v", err) + util.HandleError(fmt.Errorf("unable to convert pod template: %v", err)) return } if labels.Set(pod.Labels).AsSelector().Empty() { - glog.Errorf("Unable to create pod replica, no labels") + util.HandleError(fmt.Errorf("unable to create pod replica, no labels")) return } if _, err := r.kubeClient.Pods(namespace).Create(pod); err != nil { - glog.Errorf("Unable to create pod replica: %v", err) + util.HandleError(fmt.Errorf("unable to create pod replica: %v", err)) } } @@ -108,7 +109,7 @@ func (rm *ReplicationManager) watchControllers(resourceVersion *string) { *resourceVersion, ) if err != nil { - glog.Errorf("Unexpected failure to watch: %v", err) + util.HandleError(fmt.Errorf("unable to watch: %v", err)) time.Sleep(5 * time.Second) return } @@ -125,13 +126,13 @@ func (rm *ReplicationManager) watchControllers(resourceVersion *string) { return } if event.Type == watch.Error { - glog.Errorf("error from watch during sync: %v", errors.FromObject(event.Object)) + util.HandleError(fmt.Errorf("error from watch during sync: %v", errors.FromObject(event.Object))) continue } glog.V(4).Infof("Got watch: %#v", event) rc, ok := event.Object.(*api.ReplicationController) if !ok { - glog.Errorf("unexpected object: %#v", event.Object) + util.HandleError(fmt.Errorf("unexpected object: %#v", event.Object)) continue } // If we get disconnected, start where we left off. @@ -140,7 +141,7 @@ func (rm *ReplicationManager) watchControllers(resourceVersion *string) { // it in the desired state. glog.V(4).Infof("About to sync from watch: %v", rc.Name) if err := rm.syncHandler(*rc); err != nil { - glog.Errorf("unexpected sync. error: %v", err) + util.HandleError(fmt.Errorf("unexpected sync error: %v", err)) } } } @@ -199,7 +200,7 @@ func (rm *ReplicationManager) synchronize() { var controllers []api.ReplicationController list, err := rm.kubeClient.ReplicationControllers(api.NamespaceAll).List(labels.Everything()) if err != nil { - glog.Errorf("Synchronization error: %v (%#v)", err, err) + util.HandleError(fmt.Errorf("synchronization error: %v", err)) return } controllers = list.Items @@ -211,7 +212,7 @@ func (rm *ReplicationManager) synchronize() { glog.V(4).Infof("periodic sync of %v", controllers[ix].Name) err := rm.syncHandler(controllers[ix]) if err != nil { - glog.Errorf("Error synchronizing: %v", err) + util.HandleError(fmt.Errorf("error synchronizing: %v", err)) } }(ix) }