Merge pull request #66389 from bertinatto/metrics_pv_controller

Automatic merge from submit-queue (batch tested with PRs 66098, 66389, 66400, 66413, 66378). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Add metrics in PV Controller

**What this PR does / why we need it**:

This PR adds a few of the metrics described in the [Metrics Spec](https://docs.google.com/document/d/1Fh0T60T_y888LsRwC51CQHO75b2IZ3A34ZQS71s_F0g/edit#heading=h.ys6pjpbasqdu), covering the PV controller only (a standalone sketch of the metric shapes follows the list):

* Total provision and deletion time, exposed as the `volume_operation_total_seconds` histogram
* Number of times PV provisioning and deletion failed, exposed as the `volume_operation_total_errors` counter
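
Since the PR body only names the metrics, here is a minimal, self-contained sketch of their shape. The metric names, labels, and help strings come from the diff below; everything else (the `main` scaffolding, the example plugin name, the port) is made up for illustration and is not how the controller wires things up.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	// Latency histogram, labelled by volume plugin and operation ("provision"/"delete").
	volumeOperationMetric = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "volume_operation_total_seconds",
			Help: "Total volume operation time",
		},
		[]string{"plugin_name", "operation_name"})
	// Error counter with the same label set.
	volumeOperationErrorsMetric = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "volume_operation_total_errors",
			Help: "Total volume operation errors",
		},
		[]string{"plugin_name", "operation_name"})
)

func main() {
	prometheus.MustRegister(volumeOperationMetric, volumeOperationErrorsMetric)

	// A successful provision that took 2.5s feeds the histogram ...
	volumeOperationMetric.WithLabelValues("kubernetes.io/aws-ebs", "provision").Observe(2.5)
	// ... while a failed delete only bumps the error counter.
	volumeOperationErrorsMetric.WithLabelValues("kubernetes.io/aws-ebs", "delete").Inc()

	// Scraping /metrics then shows, among others:
	//   volume_operation_total_seconds_count{operation_name="provision",plugin_name="kubernetes.io/aws-ebs"} 1
	//   volume_operation_total_errors{operation_name="delete",plugin_name="kubernetes.io/aws-ebs"} 1
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

In the controller itself, registration happens once through the `Register` function of the metrics package, as the first file in the diff below shows.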

**Release note**:

```release-note
NONE
```
Kubernetes Submit Queue 2018-07-20 05:30:09 -07:00 committed by GitHub
commit e74a68e4c5
3 changed files with 129 additions and 36 deletions

View File

@@ -56,6 +56,8 @@ type PVCLister interface {
func Register(pvLister PVLister, pvcLister PVCLister) {
registerMetrics.Do(func() {
prometheus.MustRegister(newPVAndPVCCountCollector(pvLister, pvcLister))
prometheus.MustRegister(volumeOperationMetric)
prometheus.MustRegister(volumeOperationErrorsMetric)
})
}
@@ -89,6 +91,19 @@ var (
prometheus.BuildFQName("", pvControllerSubsystem, unboundPVCKey),
"Gauge measuring number of persistent volume claim currently unbound",
[]string{namespaceLabel}, nil)
volumeOperationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "volume_operation_total_seconds",
Help: "Total volume operation time",
},
[]string{"plugin_name", "operation_name"})
volumeOperationErrorsMetric = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "volume_operation_total_errors",
Help: "Total volume operation erros",
},
[]string{"plugin_name", "operation_name"})
)
func (collector *pvAndPVCCountCollector) Describe(ch chan<- *prometheus.Desc) {
@@ -182,3 +197,15 @@ func (collector *pvAndPVCCountCollector) pvcCollect(ch chan<- prometheus.Metric)
ch <- metric
}
}
// RecordVolumeOperationMetric records the latency and errors of volume operations.
func RecordVolumeOperationMetric(pluginName, opName string, timeTaken float64, err error) {
if pluginName == "" {
pluginName = "N/A"
}
if err != nil {
volumeOperationErrorsMetric.WithLabelValues(pluginName, opName).Inc()
return
}
volumeOperationMetric.WithLabelValues(pluginName, opName).Observe(timeTaken)
}
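
`RecordVolumeOperationMetric` records either a latency or an error, never both: on failure only `volume_operation_total_errors` is incremented and the supplied duration is dropped, and an empty plugin name is normalised to `"N/A"`. Below is a sketch of a white-box test that could sit next to this file (same package); it is not part of the PR and assumes the vendored client_golang provides the `prometheus/testutil` helpers.

```go
package metrics

import (
	"errors"
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

// Hypothetical test sketch: a failure should only bump the error counter,
// while a success should only feed the latency histogram.
func TestRecordVolumeOperationMetric(t *testing.T) {
	// Failure path: the error counter for (plugin, operation) goes to 1,
	// and the supplied duration is discarded.
	RecordVolumeOperationMetric("kubernetes.io/fake-plugin", "delete", 1.2, errors.New("boom"))
	if got := testutil.ToFloat64(volumeOperationErrorsMetric.WithLabelValues("kubernetes.io/fake-plugin", "delete")); got != 1 {
		t.Fatalf("expected 1 recorded error, got %v", got)
	}

	// Success path: the duration lands in the histogram; an empty plugin name
	// becomes the "N/A" label value.
	RecordVolumeOperationMetric("", "provision", 2.5, nil)
	if got := testutil.CollectAndCount(volumeOperationMetric); got != 1 {
		t.Fatalf("expected 1 histogram series, got %d", got)
	}
}
```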

View File

@@ -40,6 +40,7 @@ import (
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/controller/volume/events"
"k8s.io/kubernetes/pkg/controller/volume/persistentvolume/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/util/goroutinemap"
"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
@@ -1058,8 +1059,12 @@ func (ctrl *PersistentVolumeController) reclaimVolume(volume *v1.PersistentVolum
case v1.PersistentVolumeReclaimDelete:
glog.V(4).Infof("reclaimVolume[%s]: policy is Delete", volume.Name)
opName := fmt.Sprintf("delete-%s[%s]", volume.Name, string(volume.UID))
startTime := time.Now()
ctrl.scheduleOperation(opName, func() error {
return ctrl.deleteVolumeOperation(volume)
pluginName, err := ctrl.deleteVolumeOperation(volume)
timeTaken := time.Since(startTime).Seconds()
metrics.RecordVolumeOperationMetric(pluginName, "delete", timeTaken, err)
return err
})
default:
@@ -1157,7 +1162,7 @@ func (ctrl *PersistentVolumeController) recycleVolumeOperation(volume *v1.Persis
// deleteVolumeOperation deletes a volume. This method is running in standalone
// goroutine and already has all necessary locks.
func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.PersistentVolume) error {
func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.PersistentVolume) (string, error) {
glog.V(4).Infof("deleteVolumeOperation [%s] started", volume.Name)
// This method may have been waiting for a volume lock for some time.
@@ -1166,19 +1171,19 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
newVolume, err := ctrl.kubeClient.CoreV1().PersistentVolumes().Get(volume.Name, metav1.GetOptions{})
if err != nil {
glog.V(3).Infof("error reading persistent volume %q: %v", volume.Name, err)
return nil
return "", nil
}
needsReclaim, err := ctrl.isVolumeReleased(newVolume)
if err != nil {
glog.V(3).Infof("error reading claim for volume %q: %v", volume.Name, err)
return nil
return "", nil
}
if !needsReclaim {
glog.V(3).Infof("volume %q no longer needs deletion, skipping", volume.Name)
return nil
return "", nil
}
deleted, err := ctrl.doDeleteVolume(volume)
pluginName, deleted, err := ctrl.doDeleteVolume(volume)
if err != nil {
// Delete failed, update the volume and emit an event.
glog.V(3).Infof("deletion of volume %q failed: %v", volume.Name, err)
@@ -1192,17 +1197,17 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
if _, err := ctrl.updateVolumePhaseWithEvent(volume, v1.VolumeFailed, v1.EventTypeWarning, events.VolumeFailedDelete, err.Error()); err != nil {
glog.V(4).Infof("deleteVolumeOperation [%s]: failed to mark volume as failed: %v", volume.Name, err)
// Save failed, retry on the next deletion attempt
return err
return pluginName, err
}
}
// Despite the volume being Failed, the controller will retry deleting
// the volume in every syncVolume() call.
return err
return pluginName, err
}
if !deleted {
// The volume waits for deletion by an external plugin. Do nothing.
return nil
return pluginName, nil
}
glog.V(4).Infof("deleteVolumeOperation [%s]: success", volume.Name)
@@ -1213,9 +1218,9 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
// cache of "recently deleted volumes" and avoid unnecessary deletion,
// this is left out as future optimization.
glog.V(3).Infof("failed to delete volume %q from database: %v", volume.Name, err)
return nil
return pluginName, nil
}
return nil
return pluginName, nil
}
// isVolumeReleased returns true if given volume is released and can be recycled
@@ -1292,43 +1297,44 @@ func (ctrl *PersistentVolumeController) isVolumeUsed(pv *v1.PersistentVolume) ([
return podNames.List(), podNames.Len() != 0, nil
}
// doDeleteVolume finds appropriate delete plugin and deletes given volume. It
// returns 'true', when the volume was deleted and 'false' when the volume
// cannot be deleted because of the deleter is external. No error should be
// reported in this case.
func (ctrl *PersistentVolumeController) doDeleteVolume(volume *v1.PersistentVolume) (bool, error) {
// doDeleteVolume finds appropriate delete plugin and deletes given volume, returning
// the volume plugin name. Also, it returns 'true' when the volume was deleted and
// 'false' when the volume cannot be deleted because the deleter is external. No
// error should be reported in this case.
func (ctrl *PersistentVolumeController) doDeleteVolume(volume *v1.PersistentVolume) (string, bool, error) {
glog.V(4).Infof("doDeleteVolume [%s]", volume.Name)
var err error
plugin, err := ctrl.findDeletablePlugin(volume)
if err != nil {
return false, err
return "", false, err
}
if plugin == nil {
// External deleter is requested, do nothing
glog.V(3).Infof("external deleter for volume %q requested, ignoring", volume.Name)
return false, nil
return "", false, nil
}
// Plugin found
glog.V(5).Infof("found a deleter plugin %q for volume %q", plugin.GetPluginName(), volume.Name)
pluginName := plugin.GetPluginName()
glog.V(5).Infof("found a deleter plugin %q for volume %q", pluginName, volume.Name)
spec := vol.NewSpecFromPersistentVolume(volume, false)
deleter, err := plugin.NewDeleter(spec)
if err != nil {
// Cannot create deleter
return false, fmt.Errorf("Failed to create deleter for volume %q: %v", volume.Name, err)
return pluginName, false, fmt.Errorf("Failed to create deleter for volume %q: %v", volume.Name, err)
}
opComplete := util.OperationCompleteHook(plugin.GetPluginName(), "volume_delete")
opComplete := util.OperationCompleteHook(pluginName, "volume_delete")
err = deleter.Delete()
opComplete(&err)
if err != nil {
// Deleter failed
return false, err
return pluginName, false, err
}
glog.V(2).Infof("volume %q deleted", volume.Name)
return true, nil
return pluginName, true, nil
}
// provisionClaim starts new asynchronous operation to provision a claim if
@@ -1339,16 +1345,19 @@ func (ctrl *PersistentVolumeController) provisionClaim(claim *v1.PersistentVolum
}
glog.V(4).Infof("provisionClaim[%s]: started", claimToClaimKey(claim))
opName := fmt.Sprintf("provision-%s[%s]", claimToClaimKey(claim), string(claim.UID))
startTime := time.Now()
ctrl.scheduleOperation(opName, func() error {
ctrl.provisionClaimOperation(claim)
return nil
pluginName, err := ctrl.provisionClaimOperation(claim)
timeTaken := time.Since(startTime).Seconds()
metrics.RecordVolumeOperationMetric(pluginName, "provision", timeTaken, err)
return err
})
return nil
}
// provisionClaimOperation provisions a volume. This method is running in
// standalone goroutine and already has all necessary locks.
func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.PersistentVolumeClaim) {
func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.PersistentVolumeClaim) (string, error) {
claimClass := v1helper.GetPersistentVolumeClaimClass(claim)
glog.V(4).Infof("provisionClaimOperation [%s] started, class: %q", claimToClaimKey(claim), claimClass)
@@ -1358,7 +1367,12 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
glog.V(2).Infof("error finding provisioning plugin for claim %s: %v", claimToClaimKey(claim), err)
// The controller will retry provisioning the volume in every
// syncVolume() call.
return
return "", err
}
var pluginName string
if plugin != nil {
pluginName = plugin.GetPluginName()
}
// Add provisioner annotation so external provisioners know when to start
@@ -1366,7 +1380,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
if err != nil {
// Save failed, the controller will retry in the next sync
glog.V(2).Infof("error saving claim %s: %v", claimToClaimKey(claim), err)
return
return pluginName, err
}
claim = newClaim
@@ -1377,7 +1391,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
msg := fmt.Sprintf("waiting for a volume to be created, either by external provisioner %q or manually created by system administrator", storageClass.Provisioner)
ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ExternalProvisioning, msg)
glog.V(3).Infof("provisioning claim %q: %s", claimToClaimKey(claim), msg)
return
return pluginName, nil
}
// internal provisioning
@@ -1391,7 +1405,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
if err == nil && volume != nil {
// Volume has been already provisioned, nothing to do.
glog.V(4).Infof("provisionClaimOperation [%s]: volume already exists, skipping", claimToClaimKey(claim))
return
return pluginName, err
}
// Prepare a claimRef to the claim early (to fail before a volume is
@@ -1399,7 +1413,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
claimRef, err := ref.GetReference(scheme.Scheme, claim)
if err != nil {
glog.V(3).Infof("unexpected error getting claim reference: %v", err)
return
return pluginName, err
}
// Gather provisioning options
@@ -1424,7 +1438,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
strerr := fmt.Sprintf("Mount options are not supported by the provisioner but StorageClass %q has mount options %v", storageClass.Name, options.MountOptions)
glog.V(2).Infof("Mount options are not supported by the provisioner but claim %q's StorageClass %q has mount options %v", claimToClaimKey(claim), storageClass.Name, options.MountOptions)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
return
return pluginName, fmt.Errorf("provisioner %q doesn't support mount options", plugin.GetPluginName())
}
// Provision the volume
@@ -1433,7 +1447,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
strerr := fmt.Sprintf("Failed to create provisioner: %v", err)
glog.V(2).Infof("failed to create provisioner for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
return
return pluginName, err
}
var selectedNode *v1.Node = nil
@@ -1445,7 +1459,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
strerr := fmt.Sprintf("Failed to get target node: %v", err)
glog.V(3).Infof("unexpected error getting target node %q for claim %q: %v", nodeName, claimToClaimKey(claim), err)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
return
return pluginName, err
}
}
allowedTopologies = storageClass.AllowedTopologies
@@ -1463,7 +1477,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
strerr := fmt.Sprintf("Failed to provision volume with StorageClass %q: %v", storageClass.Name, err)
glog.V(2).Infof("failed to provision volume for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
return
return pluginName, err
}
glog.V(3).Infof("volume %q for claim %q created", volume.Name, claimToClaimKey(claim))
@@ -1518,7 +1532,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
var deleteErr error
var deleted bool
for i := 0; i < ctrl.createProvisionedPVRetryCount; i++ {
deleted, deleteErr = ctrl.doDeleteVolume(volume)
_, deleted, deleteErr = ctrl.doDeleteVolume(volume)
if deleteErr == nil && deleted {
// Delete succeeded
glog.V(4).Infof("provisionClaimOperation [%s]: cleaning volume %s succeeded", claimToClaimKey(claim), volume.Name)
@@ -1548,6 +1562,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
msg := fmt.Sprintf("Successfully provisioned volume %s using %s", volume.Name, plugin.GetPluginName())
ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ProvisioningSucceeded, msg)
}
return pluginName, nil
}
// rescheduleProvisioning signal back to the scheduler to retry dynamic provisioning
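
Both instrumented call sites above follow the same shape: take a start time before scheduling, run the operation inside the closure, then hand the plugin name, elapsed seconds, and error to `metrics.RecordVolumeOperationMetric`. A hypothetical helper (not part of the PR) that names that pattern:

```go
package persistentvolume

import (
	"time"

	"k8s.io/kubernetes/pkg/controller/volume/persistentvolume/metrics"
)

// instrument wraps a volume operation so that its outcome is recorded exactly
// once: the duration on success, or an error count on failure. The clock starts
// when the closure is built, mirroring the startTime placement above.
func instrument(opName string, op func() (pluginName string, err error)) func() error {
	start := time.Now()
	return func() error {
		pluginName, err := op()
		metrics.RecordVolumeOperationMetric(pluginName, opName, time.Since(start).Seconds(), err)
		return err
	}
}
```

With such a helper the delete path could read `ctrl.scheduleOperation(opName, instrument("delete", func() (string, error) { return ctrl.deleteVolumeOperation(volume) }))`; the PR keeps the two call sites inline instead.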

View File

@@ -167,6 +167,37 @@ var _ = utils.SIGDescribe("[Serial] Volume metrics", func() {
framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
})
It("should create metrics for total time taken in volume operations in P/V Controller", func() {
var err error
pvc, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Create(pvc)
Expect(err).NotTo(HaveOccurred())
Expect(pvc).ToNot(Equal(nil))
claims := []*v1.PersistentVolumeClaim{pvc}
pod := framework.MakePod(ns, nil, claims, false, "")
pod, err = c.CoreV1().Pods(ns).Create(pod)
Expect(err).NotTo(HaveOccurred())
framework.ExpectNoError(framework.WaitForPodRunningInNamespace(c, pod), "Error starting pod ", pod.Name)
pod, err = c.CoreV1().Pods(ns).Get(pod.Name, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
controllerMetrics, err := metricsGrabber.GrabFromControllerManager()
if err != nil {
framework.Skipf("Could not get controller-manager metrics - skipping")
}
metricKey := "volume_operation_total_seconds_count"
dimensions := []string{"operation_name", "plugin_name"}
valid := hasValidMetrics(metrics.Metrics(controllerMetrics), metricKey, dimensions...)
Expect(valid).To(BeTrue(), "Invalid metric in P/V Controller metrics: %q", metricKey)
framework.Logf("Deleting pod %q/%q", pod.Namespace, pod.Name)
framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
})
// Test for pv controller metrics, concretely: bound/unbound pv/pvc count.
Describe("PVController", func() {
const (
@@ -454,3 +485,23 @@ func calculateRelativeValues(originValues, updatedValues map[string]int64) map[s
}
return relativeValues
}
func hasValidMetrics(metrics metrics.Metrics, metricKey string, dimensions ...string) bool {
var errCount int
framework.Logf("Looking for sample in metric %q", metricKey)
samples, ok := metrics[metricKey]
if !ok {
framework.Logf("Key %q was not found in metrics", metricKey)
return false
}
for _, sample := range samples {
framework.Logf("Found sample %q", sample.String())
for _, d := range dimensions {
if _, ok := sample.Metric[model.LabelName(d)]; !ok {
framework.Logf("Error getting dimension %q for metric %q, sample %q", d, metricKey, sample.String())
errCount++
}
}
}
return errCount == 0
}
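
The new It block grabs controller-manager metrics and checks `volume_operation_total_seconds_count`; the `_count` suffix exists because Prometheus exposes a histogram as separate `_bucket`, `_sum`, and `_count` series. `hasValidMetrics` then only verifies that every sample of that series carries the expected label dimensions. Below is a hypothetical, self-contained illustration of that check, assuming `metrics.Metrics` maps a metric name to `model.Samples` (as the `metrics.Metrics(controllerMetrics)` cast above suggests); the package name and the fake sample values are invented.

```go
package storage

import (
	"k8s.io/kubernetes/test/e2e/framework/metrics"

	"github.com/prometheus/common/model"
)

// exampleHasValidMetrics is not part of the test; it just shows what
// hasValidMetrics accepts and rejects on a hand-built sample.
func exampleHasValidMetrics() (bool, bool) {
	fake := metrics.Metrics{
		"volume_operation_total_seconds_count": model.Samples{
			&model.Sample{
				Metric: model.Metric{
					"operation_name": "provision",
					"plugin_name":    "kubernetes.io/gce-pd",
				},
				Value: 1,
			},
		},
	}

	ok := hasValidMetrics(fake, "volume_operation_total_seconds_count", "operation_name", "plugin_name") // true: both labels present
	missing := hasValidMetrics(fake, "volume_operation_total_seconds_count", "namespace")                // false: label not present
	return ok, missing
}
```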