Adding kubelet metrics for started and failed to start HostProcess containers

Signed-off-by: Mark Rossetti <marosset@microsoft.com>
This commit is contained in:
Mark Rossetti 2021-09-30 10:33:15 -07:00
parent 6d30c96d4a
commit ef324d6bbd
3 changed files with 181 additions and 13 deletions

View File

@ -55,6 +55,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/cache" "k8s.io/kubernetes/pkg/kubelet/util/cache"
"k8s.io/kubernetes/pkg/kubelet/util/format" "k8s.io/kubernetes/pkg/kubelet/util/format"
sc "k8s.io/kubernetes/pkg/securitycontext"
) )
const ( const (
@ -883,12 +884,18 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
} }
metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc() metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc()
if sc.HasWindowsHostProcessRequest(pod, spec.container) {
metrics.StartedHostProcessContainersTotal.WithLabelValues(metricLabel).Inc()
}
klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod)) klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod))
// NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs. // NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs.
if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil { if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil {
// startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are // startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are
// useful to cluster administrators to distinguish "server errors" from "user errors". // useful to cluster administrators to distinguish "server errors" from "user errors".
metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc() metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
if sc.HasWindowsHostProcessRequest(pod, spec.container) {
metrics.StartedHostProcessContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
}
startContainerResult.Fail(err, msg) startContainerResult.Fail(err, msg)
// known errors that are logged in other places are logged at higher levels here to avoid // known errors that are logged in other places are logged at higher levels here to avoid
// repetitive log spam // repetitive log spam

View File

@ -90,6 +90,10 @@ const (
StartedContainersTotalKey = "started_containers_total" StartedContainersTotalKey = "started_containers_total"
StartedContainersErrorsTotalKey = "started_containers_errors_total" StartedContainersErrorsTotalKey = "started_containers_errors_total"
// Metrics to track HostProcess container usage by this kubelet
StartedHostProcessContainersTotalKey = "started_host_process_containers_total"
StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total"
// Metrics to track ephemeral container usage by this kubelet // Metrics to track ephemeral container usage by this kubelet
ManagedEphemeralContainersKey = "managed_ephemeral_containers" ManagedEphemeralContainersKey = "managed_ephemeral_containers"
@ -488,6 +492,26 @@ var (
}, },
[]string{"container_type", "code"}, []string{"container_type", "code"},
) )
// StartedHostProcessContainersTotal is a counter that tracks the number of hostprocess container creation operations
StartedHostProcessContainersTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: StartedHostProcessContainersTotalKey,
Help: "Cumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
StabilityLevel: metrics.ALPHA,
},
[]string{"container_type"},
)
// StartedHostProcessContainersErrorsTotal is a counter that tracks the number of errors creating hostprocess containers
StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: StartedHostProcessContainersErrorsTotalKey,
Help: "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
StabilityLevel: metrics.ALPHA,
},
[]string{"container_type", "code"},
)
// ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet. // ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet.
ManagedEphemeralContainers = metrics.NewGauge( ManagedEphemeralContainers = metrics.NewGauge(
&metrics.GaugeOpts{ &metrics.GaugeOpts{
@ -530,6 +554,10 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(StartedPodsErrorsTotal) legacyregistry.MustRegister(StartedPodsErrorsTotal)
legacyregistry.MustRegister(StartedContainersTotal) legacyregistry.MustRegister(StartedContainersTotal)
legacyregistry.MustRegister(StartedContainersErrorsTotal) legacyregistry.MustRegister(StartedContainersErrorsTotal)
if utilfeature.DefaultFeatureGate.Enabled(features.WindowsHostProcessContainers) {
legacyregistry.MustRegister(StartedHostProcessContainersTotal)
legacyregistry.MustRegister(StartedHostProcessContainersErrorsTotal)
}
legacyregistry.MustRegister(RunPodSandboxDuration) legacyregistry.MustRegister(RunPodSandboxDuration)
legacyregistry.MustRegister(RunPodSandboxErrors) legacyregistry.MustRegister(RunPodSandboxErrors)
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {

View File

@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod" e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
imageutils "k8s.io/kubernetes/test/utils/image" imageutils "k8s.io/kubernetes/test/utils/image"
@ -69,6 +70,13 @@ const (
Write-Output "SUCCESS"` Write-Output "SUCCESS"`
) )
var (
trueVar = true
User_NTAuthorityLocalService = "NT AUTHORITY\\Local Service"
User_NTAuthoritySystem = "NT AUTHORITY\\SYSTEM"
)
var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDocker] [MinimumKubeletVersion:1.22] HostProcess containers", func() { var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDocker] [MinimumKubeletVersion:1.22] HostProcess containers", func() {
ginkgo.BeforeEach(func() { ginkgo.BeforeEach(func() {
e2eskipper.SkipUnlessNodeOSDistroIs("windows") e2eskipper.SkipUnlessNodeOSDistroIs("windows")
@ -86,10 +94,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
ginkgo.By("scheduling a pod with a container that verifies %COMPUTERNAME% matches selected node name") ginkgo.By("scheduling a pod with a container that verifies %COMPUTERNAME% matches selected node name")
image := imageutils.GetConfig(imageutils.BusyBox) image := imageutils.GetConfig(imageutils.BusyBox)
trueVar := true
podName := "host-process-test-pod" podName := "host-process-test-pod"
user := "NT AUTHORITY\\Local service"
pod := &v1.Pod{ pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: podName, Name: podName,
@ -98,7 +103,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
SecurityContext: &v1.PodSecurityContext{ SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{ WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar, HostProcess: &trueVar,
RunAsUserName: &user, RunAsUserName: &User_NTAuthoritySystem,
}, },
}, },
HostNetwork: true, HostNetwork: true,
@ -131,9 +136,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
ginkgo.It("should support init containers", func() { ginkgo.It("should support init containers", func() {
ginkgo.By("scheduling a pod with a container that verifies init container can configure the node") ginkgo.By("scheduling a pod with a container that verifies init container can configure the node")
trueVar := true
podName := "host-process-init-pods" podName := "host-process-init-pods"
user := "NT AUTHORITY\\SYSTEM"
filename := fmt.Sprintf("/testfile%s.txt", string(uuid.NewUUID())) filename := fmt.Sprintf("/testfile%s.txt", string(uuid.NewUUID()))
pod := &v1.Pod{ pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@ -143,7 +146,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
SecurityContext: &v1.PodSecurityContext{ SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{ WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar, HostProcess: &trueVar,
RunAsUserName: &user, RunAsUserName: &User_NTAuthoritySystem,
}, },
}, },
HostNetwork: true, HostNetwork: true,
@ -364,8 +367,6 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
containers = append(containers, container) containers = append(containers, container)
} }
trueVar := true
user := "NT AUTHORITY\\Local Service"
pod := &v1.Pod{ pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: podName, Name: podName,
@ -374,7 +375,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
SecurityContext: &v1.PodSecurityContext{ SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{ WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar, HostProcess: &trueVar,
RunAsUserName: &user, RunAsUserName: &User_NTAuthorityLocalService,
}, },
}, },
HostNetwork: true, HostNetwork: true,
@ -479,11 +480,103 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
framework.ExpectEqual(p.Status.Phase, v1.PodSucceeded) framework.ExpectEqual(p.Status.Phase, v1.PodSucceeded)
}) })
ginkgo.It("metrics should report count of started and failed to start HostProcess containers", func() {
ginkgo.By("Selecting a Windows node")
targetNode, err := findWindowsNode(f)
framework.ExpectNoError(err, "Error finding Windows node")
framework.Logf("Using node: %v", targetNode.Name)
ginkgo.By("Getting initial kubelet metrics values")
beforeMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
framework.ExpectNoError(err, "Error getting initial kubelet metrics for node")
framework.Logf("Initial HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
beforeMetrics.StartedContainersCount, beforeMetrics.StartedContainersErrorCount, beforeMetrics.StartedInitContainersCount, beforeMetrics.StartedInitContainersErrorCount)
ginkgo.By("Scheduling a pod with a HostProcess init container that will fail")
podName := "host-process-metrics-pod-failing-init-container"
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar,
RunAsUserName: &User_NTAuthoritySystem,
},
},
HostNetwork: true,
InitContainers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.BusyBox),
Name: "failing-init-container",
Command: []string{"foobar.exe"},
},
},
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.BusyBox),
Name: "container",
Command: []string{"cmd.exe", "/c", "exit", "/b", "0"},
},
},
RestartPolicy: v1.RestartPolicyNever,
NodeName: targetNode.Name,
},
}
f.PodClient().Create(pod)
f.PodClient().WaitForFinish(podName, 3*time.Minute)
ginkgo.By("Scheduling a pod with a HostProcess container that will fail")
podName = "host-process-metrics-pod-failing-container"
pod = &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar,
RunAsUserName: &User_NTAuthoritySystem,
},
},
HostNetwork: true,
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.BusyBox),
Name: "failing-container",
Command: []string{"foobar.exe"},
},
},
RestartPolicy: v1.RestartPolicyNever,
NodeName: targetNode.Name,
},
}
f.PodClient().Create(pod)
f.PodClient().WaitForFinish(podName, 3*time.Minute)
ginkgo.By("Getting subsequent kubelet metrics values")
afterMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
framework.ExpectNoError(err, "Error getting subsequent kubelet metrics for node")
framework.Logf("Subsequent HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
afterMetrics.StartedContainersCount, afterMetrics.StartedContainersErrorCount, afterMetrics.StartedInitContainersCount, afterMetrics.StartedInitContainersErrorCount)
// Note: This test performs relative comparisons to ensure metrics values were logged and does not validate specific values.
// This done so the test can be run in parallel with other tests which may start HostProcess containers on the same node.
ginkgo.By("Ensuring metrics were updated")
framework.ExpectEqual(beforeMetrics.StartedContainersCount < afterMetrics.StartedContainersCount, true, "Count of started HostProcess containers should increase")
framework.ExpectEqual(beforeMetrics.StartedContainersErrorCount < afterMetrics.StartedContainersErrorCount, true, "Count of started HostProcess errors containers should increase")
framework.ExpectEqual(beforeMetrics.StartedInitContainersCount < afterMetrics.StartedInitContainersCount, true, "Count of started HostProcess init containers should increase")
framework.ExpectEqual(beforeMetrics.StartedInitContainersErrorCount < afterMetrics.StartedInitContainersErrorCount, true, "Count of started HostProcess errors init containers should increase")
})
}) })
func makeTestPodWithVolumeMounts(name string) *v1.Pod { func makeTestPodWithVolumeMounts(name string) *v1.Pod {
trueVar := true
username := "NT AUTHORITY\\SYSTEM"
hostPathDirectoryOrCreate := v1.HostPathDirectoryOrCreate hostPathDirectoryOrCreate := v1.HostPathDirectoryOrCreate
return &v1.Pod{ return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@ -493,7 +586,7 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
SecurityContext: &v1.PodSecurityContext{ SecurityContext: &v1.PodSecurityContext{
WindowsOptions: &v1.WindowsSecurityContextOptions{ WindowsOptions: &v1.WindowsSecurityContextOptions{
HostProcess: &trueVar, HostProcess: &trueVar,
RunAsUserName: &username, RunAsUserName: &User_NTAuthoritySystem,
}, },
}, },
HostNetwork: true, HostNetwork: true,
@ -612,6 +705,46 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
} }
} }
type HostProcessContainersMetrics struct {
StartedContainersCount int64
StartedContainersErrorCount int64
StartedInitContainersCount int64
StartedInitContainersErrorCount int64
}
// getCurrentHostProcessMetrics returns a HostPRocessContainersMetrics object. Any metrics that do not have any
// values reported will be set to 0.
func getCurrentHostProcessMetrics(f *framework.Framework, nodeName string) (HostProcessContainersMetrics, error) {
var result HostProcessContainersMetrics
metrics, err := e2emetrics.GetKubeletMetrics(f.ClientSet, nodeName)
if err != nil {
return result, err
}
for _, sample := range metrics["started_host_process_containers_total"] {
switch sample.Metric["container_type"] {
case "container":
result.StartedContainersCount = int64(sample.Value)
case "init_container":
result.StartedInitContainersCount = int64(sample.Value)
}
}
// note: accumulate failures of all types (ErrImagePull, RunContainerError, etc)
// for each container type here.
for _, sample := range metrics["started_host_process_containers_errors_total"] {
switch sample.Metric["container_type"] {
case "container":
result.StartedContainersErrorCount += int64(sample.Value)
case "init_container":
result.StartedInitContainersErrorCount += int64(sample.Value)
}
}
return result, nil
}
func SkipUnlessWindowsHostProcessContainersEnabled() { func SkipUnlessWindowsHostProcessContainersEnabled() {
if !framework.TestContext.FeatureGates[string(features.WindowsHostProcessContainers)] { if !framework.TestContext.FeatureGates[string(features.WindowsHostProcessContainers)] {
e2eskipper.Skipf("Skipping test because feature 'WindowsHostProcessContainers' is not enabled") e2eskipper.Skipf("Skipping test because feature 'WindowsHostProcessContainers' is not enabled")