mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 19:31:44 +00:00
Merge pull request #66389 from bertinatto/metrics_pv_controller
Automatic merge from submit-queue (batch tested with PRs 66098, 66389, 66400, 66413, 66378). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Add metrics in PV Controller **What this PR does / why we need it**: This PR adds a few metrics described in the [Metrics Spec](https://docs.google.com/document/d/1Fh0T60T_y888LsRwC51CQHO75b2IZ3A34ZQS71s_F0g/edit#heading=h.ys6pjpbasqdu) (PV Controller only): Additional metrics for PV Controller: * Total provision and deletion time * Number of times PV provisioning and deletion failed **Release note**: ```release-note NONE ```
This commit is contained in:
commit
e74a68e4c5
@ -56,6 +56,8 @@ type PVCLister interface {
|
||||
func Register(pvLister PVLister, pvcLister PVCLister) {
|
||||
registerMetrics.Do(func() {
|
||||
prometheus.MustRegister(newPVAndPVCCountCollector(pvLister, pvcLister))
|
||||
prometheus.MustRegister(volumeOperationMetric)
|
||||
prometheus.MustRegister(volumeOperationErrorsMetric)
|
||||
})
|
||||
}
|
||||
|
||||
@ -89,6 +91,19 @@ var (
|
||||
prometheus.BuildFQName("", pvControllerSubsystem, unboundPVCKey),
|
||||
"Gauge measuring number of persistent volume claim currently unbound",
|
||||
[]string{namespaceLabel}, nil)
|
||||
|
||||
volumeOperationMetric = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "volume_operation_total_seconds",
|
||||
Help: "Total volume operation time",
|
||||
},
|
||||
[]string{"plugin_name", "operation_name"})
|
||||
volumeOperationErrorsMetric = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "volume_operation_total_errors",
|
||||
Help: "Total volume operation erros",
|
||||
},
|
||||
[]string{"plugin_name", "operation_name"})
|
||||
)
|
||||
|
||||
func (collector *pvAndPVCCountCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
@ -182,3 +197,15 @@ func (collector *pvAndPVCCountCollector) pvcCollect(ch chan<- prometheus.Metric)
|
||||
ch <- metric
|
||||
}
|
||||
}
|
||||
|
||||
// RecordVolumeOperationMetric records the latency and errors of volume operations.
|
||||
func RecordVolumeOperationMetric(pluginName, opName string, timeTaken float64, err error) {
|
||||
if pluginName == "" {
|
||||
pluginName = "N/A"
|
||||
}
|
||||
if err != nil {
|
||||
volumeOperationErrorsMetric.WithLabelValues(pluginName, opName).Inc()
|
||||
return
|
||||
}
|
||||
volumeOperationMetric.WithLabelValues(pluginName, opName).Observe(timeTaken)
|
||||
}
|
||||
|
@ -40,6 +40,7 @@ import (
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
"k8s.io/kubernetes/pkg/cloudprovider"
|
||||
"k8s.io/kubernetes/pkg/controller/volume/events"
|
||||
"k8s.io/kubernetes/pkg/controller/volume/persistentvolume/metrics"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/util/goroutinemap"
|
||||
"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
|
||||
@ -1058,8 +1059,12 @@ func (ctrl *PersistentVolumeController) reclaimVolume(volume *v1.PersistentVolum
|
||||
case v1.PersistentVolumeReclaimDelete:
|
||||
glog.V(4).Infof("reclaimVolume[%s]: policy is Delete", volume.Name)
|
||||
opName := fmt.Sprintf("delete-%s[%s]", volume.Name, string(volume.UID))
|
||||
startTime := time.Now()
|
||||
ctrl.scheduleOperation(opName, func() error {
|
||||
return ctrl.deleteVolumeOperation(volume)
|
||||
pluginName, err := ctrl.deleteVolumeOperation(volume)
|
||||
timeTaken := time.Since(startTime).Seconds()
|
||||
metrics.RecordVolumeOperationMetric(pluginName, "delete", timeTaken, err)
|
||||
return err
|
||||
})
|
||||
|
||||
default:
|
||||
@ -1157,7 +1162,7 @@ func (ctrl *PersistentVolumeController) recycleVolumeOperation(volume *v1.Persis
|
||||
|
||||
// deleteVolumeOperation deletes a volume. This method is running in standalone
|
||||
// goroutine and already has all necessary locks.
|
||||
func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.PersistentVolume) error {
|
||||
func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.PersistentVolume) (string, error) {
|
||||
glog.V(4).Infof("deleteVolumeOperation [%s] started", volume.Name)
|
||||
|
||||
// This method may have been waiting for a volume lock for some time.
|
||||
@ -1166,19 +1171,19 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
|
||||
newVolume, err := ctrl.kubeClient.CoreV1().PersistentVolumes().Get(volume.Name, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
glog.V(3).Infof("error reading persistent volume %q: %v", volume.Name, err)
|
||||
return nil
|
||||
return "", nil
|
||||
}
|
||||
needsReclaim, err := ctrl.isVolumeReleased(newVolume)
|
||||
if err != nil {
|
||||
glog.V(3).Infof("error reading claim for volume %q: %v", volume.Name, err)
|
||||
return nil
|
||||
return "", nil
|
||||
}
|
||||
if !needsReclaim {
|
||||
glog.V(3).Infof("volume %q no longer needs deletion, skipping", volume.Name)
|
||||
return nil
|
||||
return "", nil
|
||||
}
|
||||
|
||||
deleted, err := ctrl.doDeleteVolume(volume)
|
||||
pluginName, deleted, err := ctrl.doDeleteVolume(volume)
|
||||
if err != nil {
|
||||
// Delete failed, update the volume and emit an event.
|
||||
glog.V(3).Infof("deletion of volume %q failed: %v", volume.Name, err)
|
||||
@ -1192,17 +1197,17 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
|
||||
if _, err := ctrl.updateVolumePhaseWithEvent(volume, v1.VolumeFailed, v1.EventTypeWarning, events.VolumeFailedDelete, err.Error()); err != nil {
|
||||
glog.V(4).Infof("deleteVolumeOperation [%s]: failed to mark volume as failed: %v", volume.Name, err)
|
||||
// Save failed, retry on the next deletion attempt
|
||||
return err
|
||||
return pluginName, err
|
||||
}
|
||||
}
|
||||
|
||||
// Despite the volume being Failed, the controller will retry deleting
|
||||
// the volume in every syncVolume() call.
|
||||
return err
|
||||
return pluginName, err
|
||||
}
|
||||
if !deleted {
|
||||
// The volume waits for deletion by an external plugin. Do nothing.
|
||||
return nil
|
||||
return pluginName, nil
|
||||
}
|
||||
|
||||
glog.V(4).Infof("deleteVolumeOperation [%s]: success", volume.Name)
|
||||
@ -1213,9 +1218,9 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(volume *v1.Persist
|
||||
// cache of "recently deleted volumes" and avoid unnecessary deletion,
|
||||
// this is left out as future optimization.
|
||||
glog.V(3).Infof("failed to delete volume %q from database: %v", volume.Name, err)
|
||||
return nil
|
||||
return pluginName, nil
|
||||
}
|
||||
return nil
|
||||
return pluginName, nil
|
||||
}
|
||||
|
||||
// isVolumeReleased returns true if given volume is released and can be recycled
|
||||
@ -1292,43 +1297,44 @@ func (ctrl *PersistentVolumeController) isVolumeUsed(pv *v1.PersistentVolume) ([
|
||||
return podNames.List(), podNames.Len() != 0, nil
|
||||
}
|
||||
|
||||
// doDeleteVolume finds appropriate delete plugin and deletes given volume. It
|
||||
// returns 'true', when the volume was deleted and 'false' when the volume
|
||||
// cannot be deleted because of the deleter is external. No error should be
|
||||
// reported in this case.
|
||||
func (ctrl *PersistentVolumeController) doDeleteVolume(volume *v1.PersistentVolume) (bool, error) {
|
||||
// doDeleteVolume finds appropriate delete plugin and deletes given volume, returning
|
||||
// the volume plugin name. Also, it returns 'true', when the volume was deleted and
|
||||
// 'false' when the volume cannot be deleted because of the deleter is external. No
|
||||
// error should be reported in this case.
|
||||
func (ctrl *PersistentVolumeController) doDeleteVolume(volume *v1.PersistentVolume) (string, bool, error) {
|
||||
glog.V(4).Infof("doDeleteVolume [%s]", volume.Name)
|
||||
var err error
|
||||
|
||||
plugin, err := ctrl.findDeletablePlugin(volume)
|
||||
if err != nil {
|
||||
return false, err
|
||||
return "", false, err
|
||||
}
|
||||
if plugin == nil {
|
||||
// External deleter is requested, do nothing
|
||||
glog.V(3).Infof("external deleter for volume %q requested, ignoring", volume.Name)
|
||||
return false, nil
|
||||
return "", false, nil
|
||||
}
|
||||
|
||||
// Plugin found
|
||||
glog.V(5).Infof("found a deleter plugin %q for volume %q", plugin.GetPluginName(), volume.Name)
|
||||
pluginName := plugin.GetPluginName()
|
||||
glog.V(5).Infof("found a deleter plugin %q for volume %q", pluginName, volume.Name)
|
||||
spec := vol.NewSpecFromPersistentVolume(volume, false)
|
||||
deleter, err := plugin.NewDeleter(spec)
|
||||
if err != nil {
|
||||
// Cannot create deleter
|
||||
return false, fmt.Errorf("Failed to create deleter for volume %q: %v", volume.Name, err)
|
||||
return pluginName, false, fmt.Errorf("Failed to create deleter for volume %q: %v", volume.Name, err)
|
||||
}
|
||||
|
||||
opComplete := util.OperationCompleteHook(plugin.GetPluginName(), "volume_delete")
|
||||
opComplete := util.OperationCompleteHook(pluginName, "volume_delete")
|
||||
err = deleter.Delete()
|
||||
opComplete(&err)
|
||||
if err != nil {
|
||||
// Deleter failed
|
||||
return false, err
|
||||
return pluginName, false, err
|
||||
}
|
||||
|
||||
glog.V(2).Infof("volume %q deleted", volume.Name)
|
||||
return true, nil
|
||||
return pluginName, true, nil
|
||||
}
|
||||
|
||||
// provisionClaim starts new asynchronous operation to provision a claim if
|
||||
@ -1339,16 +1345,19 @@ func (ctrl *PersistentVolumeController) provisionClaim(claim *v1.PersistentVolum
|
||||
}
|
||||
glog.V(4).Infof("provisionClaim[%s]: started", claimToClaimKey(claim))
|
||||
opName := fmt.Sprintf("provision-%s[%s]", claimToClaimKey(claim), string(claim.UID))
|
||||
startTime := time.Now()
|
||||
ctrl.scheduleOperation(opName, func() error {
|
||||
ctrl.provisionClaimOperation(claim)
|
||||
return nil
|
||||
pluginName, err := ctrl.provisionClaimOperation(claim)
|
||||
timeTaken := time.Since(startTime).Seconds()
|
||||
metrics.RecordVolumeOperationMetric(pluginName, "provision", timeTaken, err)
|
||||
return err
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
// provisionClaimOperation provisions a volume. This method is running in
|
||||
// standalone goroutine and already has all necessary locks.
|
||||
func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.PersistentVolumeClaim) {
|
||||
func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.PersistentVolumeClaim) (string, error) {
|
||||
claimClass := v1helper.GetPersistentVolumeClaimClass(claim)
|
||||
glog.V(4).Infof("provisionClaimOperation [%s] started, class: %q", claimToClaimKey(claim), claimClass)
|
||||
|
||||
@ -1358,7 +1367,12 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
glog.V(2).Infof("error finding provisioning plugin for claim %s: %v", claimToClaimKey(claim), err)
|
||||
// The controller will retry provisioning the volume in every
|
||||
// syncVolume() call.
|
||||
return
|
||||
return "", err
|
||||
}
|
||||
|
||||
var pluginName string
|
||||
if plugin != nil {
|
||||
pluginName = plugin.GetPluginName()
|
||||
}
|
||||
|
||||
// Add provisioner annotation so external provisioners know when to start
|
||||
@ -1366,7 +1380,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
if err != nil {
|
||||
// Save failed, the controller will retry in the next sync
|
||||
glog.V(2).Infof("error saving claim %s: %v", claimToClaimKey(claim), err)
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
claim = newClaim
|
||||
|
||||
@ -1377,7 +1391,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
msg := fmt.Sprintf("waiting for a volume to be created, either by external provisioner %q or manually created by system administrator", storageClass.Provisioner)
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ExternalProvisioning, msg)
|
||||
glog.V(3).Infof("provisioning claim %q: %s", claimToClaimKey(claim), msg)
|
||||
return
|
||||
return pluginName, nil
|
||||
}
|
||||
|
||||
// internal provisioning
|
||||
@ -1391,7 +1405,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
if err == nil && volume != nil {
|
||||
// Volume has been already provisioned, nothing to do.
|
||||
glog.V(4).Infof("provisionClaimOperation [%s]: volume already exists, skipping", claimToClaimKey(claim))
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
|
||||
// Prepare a claimRef to the claim early (to fail before a volume is
|
||||
@ -1399,7 +1413,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
claimRef, err := ref.GetReference(scheme.Scheme, claim)
|
||||
if err != nil {
|
||||
glog.V(3).Infof("unexpected error getting claim reference: %v", err)
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
|
||||
// Gather provisioning options
|
||||
@ -1424,7 +1438,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
strerr := fmt.Sprintf("Mount options are not supported by the provisioner but StorageClass %q has mount options %v", storageClass.Name, options.MountOptions)
|
||||
glog.V(2).Infof("Mount options are not supported by the provisioner but claim %q's StorageClass %q has mount options %v", claimToClaimKey(claim), storageClass.Name, options.MountOptions)
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
|
||||
return
|
||||
return pluginName, fmt.Errorf("provisioner %q doesn't support mount options", plugin.GetPluginName())
|
||||
}
|
||||
|
||||
// Provision the volume
|
||||
@ -1433,7 +1447,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
strerr := fmt.Sprintf("Failed to create provisioner: %v", err)
|
||||
glog.V(2).Infof("failed to create provisioner for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
|
||||
var selectedNode *v1.Node = nil
|
||||
@ -1445,7 +1459,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
strerr := fmt.Sprintf("Failed to get target node: %v", err)
|
||||
glog.V(3).Infof("unexpected error getting target node %q for claim %q: %v", nodeName, claimToClaimKey(claim), err)
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
}
|
||||
allowedTopologies = storageClass.AllowedTopologies
|
||||
@ -1463,7 +1477,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
strerr := fmt.Sprintf("Failed to provision volume with StorageClass %q: %v", storageClass.Name, err)
|
||||
glog.V(2).Infof("failed to provision volume for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
|
||||
return
|
||||
return pluginName, err
|
||||
}
|
||||
|
||||
glog.V(3).Infof("volume %q for claim %q created", volume.Name, claimToClaimKey(claim))
|
||||
@ -1518,7 +1532,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
var deleteErr error
|
||||
var deleted bool
|
||||
for i := 0; i < ctrl.createProvisionedPVRetryCount; i++ {
|
||||
deleted, deleteErr = ctrl.doDeleteVolume(volume)
|
||||
_, deleted, deleteErr = ctrl.doDeleteVolume(volume)
|
||||
if deleteErr == nil && deleted {
|
||||
// Delete succeeded
|
||||
glog.V(4).Infof("provisionClaimOperation [%s]: cleaning volume %s succeeded", claimToClaimKey(claim), volume.Name)
|
||||
@ -1548,6 +1562,7 @@ func (ctrl *PersistentVolumeController) provisionClaimOperation(claim *v1.Persis
|
||||
msg := fmt.Sprintf("Successfully provisioned volume %s using %s", volume.Name, plugin.GetPluginName())
|
||||
ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ProvisioningSucceeded, msg)
|
||||
}
|
||||
return pluginName, nil
|
||||
}
|
||||
|
||||
// rescheduleProvisioning signal back to the scheduler to retry dynamic provisioning
|
||||
|
@ -167,6 +167,37 @@ var _ = utils.SIGDescribe("[Serial] Volume metrics", func() {
|
||||
framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
|
||||
})
|
||||
|
||||
It("should create metrics for total time taken in volume operations in P/V Controller", func() {
|
||||
var err error
|
||||
pvc, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Create(pvc)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(pvc).ToNot(Equal(nil))
|
||||
|
||||
claims := []*v1.PersistentVolumeClaim{pvc}
|
||||
pod := framework.MakePod(ns, nil, claims, false, "")
|
||||
pod, err = c.CoreV1().Pods(ns).Create(pod)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
err = framework.WaitForPodRunningInNamespace(c, pod)
|
||||
framework.ExpectNoError(framework.WaitForPodRunningInNamespace(c, pod), "Error starting pod ", pod.Name)
|
||||
|
||||
pod, err = c.CoreV1().Pods(ns).Get(pod.Name, metav1.GetOptions{})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
controllerMetrics, err := metricsGrabber.GrabFromControllerManager()
|
||||
if err != nil {
|
||||
framework.Skipf("Could not get controller-manager metrics - skipping")
|
||||
}
|
||||
|
||||
metricKey := "volume_operation_total_seconds_count"
|
||||
dimensions := []string{"operation_name", "plugin_name"}
|
||||
valid := hasValidMetrics(metrics.Metrics(controllerMetrics), metricKey, dimensions...)
|
||||
Expect(valid).To(BeTrue(), "Invalid metric in P/V Controller metrics: %q", metricKey)
|
||||
|
||||
framework.Logf("Deleting pod %q/%q", pod.Namespace, pod.Name)
|
||||
framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
|
||||
})
|
||||
|
||||
// Test for pv controller metrics, concretely: bound/unbound pv/pvc count.
|
||||
Describe("PVController", func() {
|
||||
const (
|
||||
@ -454,3 +485,23 @@ func calculateRelativeValues(originValues, updatedValues map[string]int64) map[s
|
||||
}
|
||||
return relativeValues
|
||||
}
|
||||
|
||||
func hasValidMetrics(metrics metrics.Metrics, metricKey string, dimensions ...string) bool {
|
||||
var errCount int
|
||||
framework.Logf("Looking for sample in metric %q", metricKey)
|
||||
samples, ok := metrics[metricKey]
|
||||
if !ok {
|
||||
framework.Logf("Key %q was not found in metrics", metricKey)
|
||||
return false
|
||||
}
|
||||
for _, sample := range samples {
|
||||
framework.Logf("Found sample %q", sample.String())
|
||||
for _, d := range dimensions {
|
||||
if _, ok := sample.Metric[model.LabelName(d)]; !ok {
|
||||
framework.Logf("Error getting dimension %q for metric %q, sample %q", d, metricKey, sample.String())
|
||||
errCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
return errCount == 0
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user