mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-20 02:11:09 +00:00
All code must use the context from Ginkgo when doing API calls or polling for a change, otherwise the code would not return immediately when the test gets aborted.
139 lines
4.3 KiB
Go
139 lines
4.3 KiB
Go
/*
|
|
Copyright 2017 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"time"
|
|
|
|
v1 "k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/util/wait"
|
|
"k8s.io/kubernetes/test/e2e/framework"
|
|
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
|
|
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
|
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
|
instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common"
|
|
"k8s.io/kubernetes/test/e2e/scheduling"
|
|
"k8s.io/kubernetes/test/utils/image"
|
|
admissionapi "k8s.io/pod-security-admission/api"
|
|
|
|
"github.com/onsi/ginkgo/v2"
|
|
"golang.org/x/oauth2/google"
|
|
gcm "google.golang.org/api/monitoring/v3"
|
|
"google.golang.org/api/option"
|
|
)
|
|
|
|
// Stackdriver container accelerator metrics, as described here:
|
|
// https://cloud.google.com/monitoring/api/metrics_gcp#gcp-container
|
|
var acceleratorMetrics = []string{
|
|
"accelerator/duty_cycle",
|
|
"accelerator/memory_total",
|
|
"accelerator/memory_used",
|
|
}
|
|
|
|
var _ = instrumentation.SIGDescribe("Stackdriver Monitoring", func() {
|
|
ginkgo.BeforeEach(func() {
|
|
e2eskipper.SkipUnlessProviderIs("gce", "gke")
|
|
})
|
|
|
|
f := framework.NewDefaultFramework("stackdriver-monitoring")
|
|
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
|
|
|
|
ginkgo.It("should have accelerator metrics [Feature:StackdriverAcceleratorMonitoring]", func(ctx context.Context) {
|
|
testStackdriverAcceleratorMonitoring(ctx, f)
|
|
})
|
|
|
|
})
|
|
|
|
func testStackdriverAcceleratorMonitoring(ctx context.Context, f *framework.Framework) {
|
|
projectID := framework.TestContext.CloudConfig.ProjectID
|
|
|
|
client, err := google.DefaultClient(ctx, gcm.CloudPlatformScope)
|
|
framework.ExpectNoError(err)
|
|
|
|
gcmService, err := gcm.NewService(ctx, option.WithHTTPClient(client))
|
|
|
|
framework.ExpectNoError(err)
|
|
|
|
// set this env var if accessing Stackdriver test endpoint (default is prod):
|
|
// $ export STACKDRIVER_API_ENDPOINT_OVERRIDE=https://test-monitoring.sandbox.googleapis.com/
|
|
basePathOverride := os.Getenv("STACKDRIVER_API_ENDPOINT_OVERRIDE")
|
|
if basePathOverride != "" {
|
|
gcmService.BasePath = basePathOverride
|
|
}
|
|
|
|
scheduling.SetupNVIDIAGPUNode(ctx, f, false)
|
|
|
|
e2epod.NewPodClient(f).Create(ctx, &v1.Pod{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: rcName,
|
|
},
|
|
Spec: v1.PodSpec{
|
|
RestartPolicy: v1.RestartPolicyNever,
|
|
Containers: []v1.Container{
|
|
{
|
|
Name: rcName,
|
|
Image: image.GetE2EImage(image.CudaVectorAdd),
|
|
Command: []string{"/bin/sh", "-c"},
|
|
Args: []string{"nvidia-smi && sleep infinity"},
|
|
Resources: v1.ResourceRequirements{
|
|
Limits: v1.ResourceList{
|
|
e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
|
|
metricsMap := map[string]bool{}
|
|
pollingFunction := checkForAcceleratorMetrics(projectID, gcmService, time.Now(), metricsMap)
|
|
err = wait.Poll(pollFrequency, pollTimeout, pollingFunction)
|
|
if err != nil {
|
|
framework.Logf("Missing metrics: %+v", metricsMap)
|
|
}
|
|
framework.ExpectNoError(err)
|
|
}
|
|
|
|
func checkForAcceleratorMetrics(projectID string, gcmService *gcm.Service, start time.Time, metricsMap map[string]bool) func() (bool, error) {
|
|
return func() (bool, error) {
|
|
counter := 0
|
|
for _, metric := range acceleratorMetrics {
|
|
metricsMap[metric] = false
|
|
}
|
|
for _, metric := range acceleratorMetrics {
|
|
// TODO: check only for metrics from this cluster
|
|
ts, err := fetchTimeSeries(projectID, gcmService, metric, start, time.Now())
|
|
framework.ExpectNoError(err)
|
|
if len(ts) > 0 {
|
|
counter = counter + 1
|
|
metricsMap[metric] = true
|
|
framework.Logf("Received %v timeseries for metric %v", len(ts), metric)
|
|
} else {
|
|
framework.Logf("No timeseries for metric %v", metric)
|
|
}
|
|
}
|
|
if counter < 3 {
|
|
return false, nil
|
|
}
|
|
return true, nil
|
|
}
|
|
}
|