mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 03:11:40 +00:00
Adding kubelet metrics for started and failed to start HostProcess containers
Signed-off-by: Mark Rossetti <marosset@microsoft.com>
This commit is contained in:
parent
6d30c96d4a
commit
ef324d6bbd
@ -55,6 +55,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/types"
|
||||
"k8s.io/kubernetes/pkg/kubelet/util/cache"
|
||||
"k8s.io/kubernetes/pkg/kubelet/util/format"
|
||||
sc "k8s.io/kubernetes/pkg/securitycontext"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -883,12 +884,18 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
|
||||
}
|
||||
|
||||
metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc()
|
||||
if sc.HasWindowsHostProcessRequest(pod, spec.container) {
|
||||
metrics.StartedHostProcessContainersTotal.WithLabelValues(metricLabel).Inc()
|
||||
}
|
||||
klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod))
|
||||
// NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs.
|
||||
if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil {
|
||||
// startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are
|
||||
// useful to cluster administrators to distinguish "server errors" from "user errors".
|
||||
metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
|
||||
if sc.HasWindowsHostProcessRequest(pod, spec.container) {
|
||||
metrics.StartedHostProcessContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
|
||||
}
|
||||
startContainerResult.Fail(err, msg)
|
||||
// known errors that are logged in other places are logged at higher levels here to avoid
|
||||
// repetitive log spam
|
||||
|
@ -90,6 +90,10 @@ const (
|
||||
StartedContainersTotalKey = "started_containers_total"
|
||||
StartedContainersErrorsTotalKey = "started_containers_errors_total"
|
||||
|
||||
// Metrics to track HostProcess container usage by this kubelet
|
||||
StartedHostProcessContainersTotalKey = "started_host_process_containers_total"
|
||||
StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total"
|
||||
|
||||
// Metrics to track ephemeral container usage by this kubelet
|
||||
ManagedEphemeralContainersKey = "managed_ephemeral_containers"
|
||||
|
||||
@ -488,6 +492,26 @@ var (
|
||||
},
|
||||
[]string{"container_type", "code"},
|
||||
)
|
||||
// StartedHostProcessContainersTotal is a counter that tracks the number of hostprocess container creation operations
|
||||
StartedHostProcessContainersTotal = metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: StartedHostProcessContainersTotalKey,
|
||||
Help: "Cumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"container_type"},
|
||||
)
|
||||
// StartedHostProcessContainersErrorsTotal is a counter that tracks the number of errors creating hostprocess containers
|
||||
StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: StartedHostProcessContainersErrorsTotalKey,
|
||||
Help: "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"container_type", "code"},
|
||||
)
|
||||
// ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet.
|
||||
ManagedEphemeralContainers = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
@ -530,6 +554,10 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(StartedPodsErrorsTotal)
|
||||
legacyregistry.MustRegister(StartedContainersTotal)
|
||||
legacyregistry.MustRegister(StartedContainersErrorsTotal)
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.WindowsHostProcessContainers) {
|
||||
legacyregistry.MustRegister(StartedHostProcessContainersTotal)
|
||||
legacyregistry.MustRegister(StartedHostProcessContainersErrorsTotal)
|
||||
}
|
||||
legacyregistry.MustRegister(RunPodSandboxDuration)
|
||||
legacyregistry.MustRegister(RunPodSandboxErrors)
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
|
||||
|
@ -29,6 +29,7 @@ import (
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
|
||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
||||
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||
@ -69,6 +70,13 @@ const (
|
||||
Write-Output "SUCCESS"`
|
||||
)
|
||||
|
||||
var (
|
||||
trueVar = true
|
||||
|
||||
User_NTAuthorityLocalService = "NT AUTHORITY\\Local Service"
|
||||
User_NTAuthoritySystem = "NT AUTHORITY\\SYSTEM"
|
||||
)
|
||||
|
||||
var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDocker] [MinimumKubeletVersion:1.22] HostProcess containers", func() {
|
||||
ginkgo.BeforeEach(func() {
|
||||
e2eskipper.SkipUnlessNodeOSDistroIs("windows")
|
||||
@ -86,10 +94,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
|
||||
ginkgo.By("scheduling a pod with a container that verifies %COMPUTERNAME% matches selected node name")
|
||||
image := imageutils.GetConfig(imageutils.BusyBox)
|
||||
|
||||
trueVar := true
|
||||
podName := "host-process-test-pod"
|
||||
user := "NT AUTHORITY\\Local service"
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: podName,
|
||||
@ -98,7 +103,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &user,
|
||||
RunAsUserName: &User_NTAuthoritySystem,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
@ -131,9 +136,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
|
||||
ginkgo.It("should support init containers", func() {
|
||||
ginkgo.By("scheduling a pod with a container that verifies init container can configure the node")
|
||||
trueVar := true
|
||||
podName := "host-process-init-pods"
|
||||
user := "NT AUTHORITY\\SYSTEM"
|
||||
filename := fmt.Sprintf("/testfile%s.txt", string(uuid.NewUUID()))
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
@ -143,7 +146,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &user,
|
||||
RunAsUserName: &User_NTAuthoritySystem,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
@ -364,8 +367,6 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
containers = append(containers, container)
|
||||
}
|
||||
|
||||
trueVar := true
|
||||
user := "NT AUTHORITY\\Local Service"
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: podName,
|
||||
@ -374,7 +375,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &user,
|
||||
RunAsUserName: &User_NTAuthorityLocalService,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
@ -479,11 +480,103 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
|
||||
framework.ExpectEqual(p.Status.Phase, v1.PodSucceeded)
|
||||
})
|
||||
|
||||
ginkgo.It("metrics should report count of started and failed to start HostProcess containers", func() {
|
||||
ginkgo.By("Selecting a Windows node")
|
||||
targetNode, err := findWindowsNode(f)
|
||||
framework.ExpectNoError(err, "Error finding Windows node")
|
||||
framework.Logf("Using node: %v", targetNode.Name)
|
||||
|
||||
ginkgo.By("Getting initial kubelet metrics values")
|
||||
beforeMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
|
||||
framework.ExpectNoError(err, "Error getting initial kubelet metrics for node")
|
||||
framework.Logf("Initial HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
|
||||
beforeMetrics.StartedContainersCount, beforeMetrics.StartedContainersErrorCount, beforeMetrics.StartedInitContainersCount, beforeMetrics.StartedInitContainersErrorCount)
|
||||
|
||||
ginkgo.By("Scheduling a pod with a HostProcess init container that will fail")
|
||||
|
||||
podName := "host-process-metrics-pod-failing-init-container"
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: podName,
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &User_NTAuthoritySystem,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
InitContainers: []v1.Container{
|
||||
{
|
||||
Image: imageutils.GetE2EImage(imageutils.BusyBox),
|
||||
Name: "failing-init-container",
|
||||
Command: []string{"foobar.exe"},
|
||||
},
|
||||
},
|
||||
Containers: []v1.Container{
|
||||
{
|
||||
Image: imageutils.GetE2EImage(imageutils.BusyBox),
|
||||
Name: "container",
|
||||
Command: []string{"cmd.exe", "/c", "exit", "/b", "0"},
|
||||
},
|
||||
},
|
||||
RestartPolicy: v1.RestartPolicyNever,
|
||||
NodeName: targetNode.Name,
|
||||
},
|
||||
}
|
||||
|
||||
f.PodClient().Create(pod)
|
||||
f.PodClient().WaitForFinish(podName, 3*time.Minute)
|
||||
|
||||
ginkgo.By("Scheduling a pod with a HostProcess container that will fail")
|
||||
podName = "host-process-metrics-pod-failing-container"
|
||||
pod = &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: podName,
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &User_NTAuthoritySystem,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
Containers: []v1.Container{
|
||||
{
|
||||
Image: imageutils.GetE2EImage(imageutils.BusyBox),
|
||||
Name: "failing-container",
|
||||
Command: []string{"foobar.exe"},
|
||||
},
|
||||
},
|
||||
RestartPolicy: v1.RestartPolicyNever,
|
||||
NodeName: targetNode.Name,
|
||||
},
|
||||
}
|
||||
|
||||
f.PodClient().Create(pod)
|
||||
f.PodClient().WaitForFinish(podName, 3*time.Minute)
|
||||
|
||||
ginkgo.By("Getting subsequent kubelet metrics values")
|
||||
|
||||
afterMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
|
||||
framework.ExpectNoError(err, "Error getting subsequent kubelet metrics for node")
|
||||
framework.Logf("Subsequent HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
|
||||
afterMetrics.StartedContainersCount, afterMetrics.StartedContainersErrorCount, afterMetrics.StartedInitContainersCount, afterMetrics.StartedInitContainersErrorCount)
|
||||
|
||||
// Note: This test performs relative comparisons to ensure metrics values were logged and does not validate specific values.
|
||||
// This done so the test can be run in parallel with other tests which may start HostProcess containers on the same node.
|
||||
ginkgo.By("Ensuring metrics were updated")
|
||||
framework.ExpectEqual(beforeMetrics.StartedContainersCount < afterMetrics.StartedContainersCount, true, "Count of started HostProcess containers should increase")
|
||||
framework.ExpectEqual(beforeMetrics.StartedContainersErrorCount < afterMetrics.StartedContainersErrorCount, true, "Count of started HostProcess errors containers should increase")
|
||||
framework.ExpectEqual(beforeMetrics.StartedInitContainersCount < afterMetrics.StartedInitContainersCount, true, "Count of started HostProcess init containers should increase")
|
||||
framework.ExpectEqual(beforeMetrics.StartedInitContainersErrorCount < afterMetrics.StartedInitContainersErrorCount, true, "Count of started HostProcess errors init containers should increase")
|
||||
})
|
||||
|
||||
})
|
||||
|
||||
func makeTestPodWithVolumeMounts(name string) *v1.Pod {
|
||||
trueVar := true
|
||||
username := "NT AUTHORITY\\SYSTEM"
|
||||
hostPathDirectoryOrCreate := v1.HostPathDirectoryOrCreate
|
||||
return &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
@ -493,7 +586,7 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
|
||||
SecurityContext: &v1.PodSecurityContext{
|
||||
WindowsOptions: &v1.WindowsSecurityContextOptions{
|
||||
HostProcess: &trueVar,
|
||||
RunAsUserName: &username,
|
||||
RunAsUserName: &User_NTAuthoritySystem,
|
||||
},
|
||||
},
|
||||
HostNetwork: true,
|
||||
@ -612,6 +705,46 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
|
||||
}
|
||||
}
|
||||
|
||||
type HostProcessContainersMetrics struct {
|
||||
StartedContainersCount int64
|
||||
StartedContainersErrorCount int64
|
||||
StartedInitContainersCount int64
|
||||
StartedInitContainersErrorCount int64
|
||||
}
|
||||
|
||||
// getCurrentHostProcessMetrics returns a HostPRocessContainersMetrics object. Any metrics that do not have any
|
||||
// values reported will be set to 0.
|
||||
func getCurrentHostProcessMetrics(f *framework.Framework, nodeName string) (HostProcessContainersMetrics, error) {
|
||||
var result HostProcessContainersMetrics
|
||||
|
||||
metrics, err := e2emetrics.GetKubeletMetrics(f.ClientSet, nodeName)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
for _, sample := range metrics["started_host_process_containers_total"] {
|
||||
switch sample.Metric["container_type"] {
|
||||
case "container":
|
||||
result.StartedContainersCount = int64(sample.Value)
|
||||
case "init_container":
|
||||
result.StartedInitContainersCount = int64(sample.Value)
|
||||
}
|
||||
}
|
||||
|
||||
// note: accumulate failures of all types (ErrImagePull, RunContainerError, etc)
|
||||
// for each container type here.
|
||||
for _, sample := range metrics["started_host_process_containers_errors_total"] {
|
||||
switch sample.Metric["container_type"] {
|
||||
case "container":
|
||||
result.StartedContainersErrorCount += int64(sample.Value)
|
||||
case "init_container":
|
||||
result.StartedInitContainersErrorCount += int64(sample.Value)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func SkipUnlessWindowsHostProcessContainersEnabled() {
|
||||
if !framework.TestContext.FeatureGates[string(features.WindowsHostProcessContainers)] {
|
||||
e2eskipper.Skipf("Skipping test because feature 'WindowsHostProcessContainers' is not enabled")
|
||||
|
Loading…
Reference in New Issue
Block a user