Merge pull request #93285 from aarnaud/windows-devicemanager

Port deviceManager to windows container manager to enable GPU access
This commit is contained in:
Kubernetes Prow Robot 2020-12-23 12:00:26 -08:00 committed by GitHub
commit e20300b1a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 231 additions and 11 deletions

View File

@ -169,6 +169,7 @@ go_library(
],
"@io_bazel_rules_go//go/platform:windows": [
"//pkg/kubelet/cadvisor:go_default_library",
"//pkg/kubelet/cm/devicemanager:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//staging/src/k8s.io/mount-utils:go_default_library",
],

View File

@ -36,6 +36,7 @@ import (
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@ -52,6 +53,10 @@ type containerManagerImpl struct {
cadvisorInterface cadvisor.Interface
// Config of this node.
nodeConfig NodeConfig
// Interface for exporting and allocating devices reported by device plugins.
deviceManager devicemanager.Manager
// Interface for Topology resource co-ordination
topologyManager topologymanager.Manager
}
type noopWindowsResourceAllocator struct{}
@ -79,6 +84,11 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
}
}
// Starts device manager.
if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady); err != nil {
return err
}
return nil
}
@ -93,11 +103,26 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
}
capacity := cadvisor.CapacityFromMachineInfo(machineInfo)
return &containerManagerImpl{
cm := &containerManagerImpl{
capacity: capacity,
nodeConfig: nodeConfig,
cadvisorInterface: cadvisorInterface,
}, nil
}
cm.topologyManager = topologymanager.NewFakeManager()
klog.Infof("Creating device plugin manager: %t", devicePluginEnabled)
if devicePluginEnabled {
cm.deviceManager, err = devicemanager.NewManagerImpl(nil, cm.topologyManager)
cm.topologyManager.AddHintProvider(cm.deviceManager)
} else {
cm.deviceManager, err = devicemanager.NewManagerStub()
}
if err != nil {
return nil, err
}
return cm, nil
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
@ -150,11 +175,11 @@ func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
}
func (cm *containerManagerImpl) GetPluginRegistrationHandler() cache.PluginHandler {
return nil
return cm.deviceManager.GetWatcherHandler()
}
func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
return nil, nil, []string{}
return cm.deviceManager.GetCapacity()
}
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
@ -162,11 +187,24 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
}
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
opts := &kubecontainer.RunContainerOptions{}
// Allocate should already be called during predicateAdmitHandler.Admit(),
// just try to fetch device runtime information from cached state here
devOpts, err := cm.deviceManager.GetDeviceRunContainerOptions(pod, container)
if err != nil {
return nil, err
} else if devOpts == nil {
return opts, nil
}
opts.Devices = append(opts.Devices, devOpts.Devices...)
opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
opts.Envs = append(opts.Envs, devOpts.Envs...)
opts.Annotations = append(opts.Annotations, devOpts.Annotations...)
return opts, nil
}
func (cm *containerManagerImpl) UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.deviceManager.UpdatePluginResources(node, attrs)
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
@ -177,12 +215,12 @@ func (cm *containerManagerImpl) GetPodCgroupRoot() string {
return ""
}
func (cm *containerManagerImpl) GetDevices(_, _ string) []*podresourcesapi.ContainerDevices {
return nil
func (cm *containerManagerImpl) GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices {
return cm.deviceManager.GetDevices(podUID, containerName)
}
func (cm *containerManagerImpl) ShouldResetExtendedResourceCapacity() bool {
return false
return cm.deviceManager.ShouldResetExtendedResourceCapacity()
}
func (cm *containerManagerImpl) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {

View File

@ -22,6 +22,7 @@ import (
"net"
"os"
"path/filepath"
"runtime"
"sort"
"sync"
"time"
@ -126,7 +127,11 @@ func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManagerImpl creates a new manager.
func NewManagerImpl(topology []cadvisorapi.Node, topologyAffinityStore topologymanager.Store) (*ManagerImpl, error) {
return newManagerImpl(pluginapi.KubeletSocket, topology, topologyAffinityStore)
socketPath := pluginapi.KubeletSocket
if runtime.GOOS == "windows" {
socketPath = os.Getenv("SYSTEMDRIVE") + pluginapi.KubeletSocketWindows
}
return newManagerImpl(socketPath, topology, topologyAffinityStore)
}
func newManagerImpl(socketPath string, topology []cadvisorapi.Node, topologyAffinityStore topologymanager.Store) (*ManagerImpl, error) {

View File

@ -30,7 +30,17 @@ const (
DevicePluginPath = "/var/lib/kubelet/device-plugins/"
// KubeletSocket is the path of the Kubelet registry socket
KubeletSocket = DevicePluginPath + "kubelet.sock"
// DevicePluginPathWindows Avoid failed to run Kubelet: bad socketPath,
// must be an absolute path: /var/lib/kubelet/device-plugins/kubelet.sock
// https://github.com/kubernetes/kubernetes/issues/93262
// https://github.com/kubernetes/kubernetes/pull/93285#discussion_r458140701
DevicePluginPathWindows = "\\var\\lib\\kubelet\\device-plugins\\"
// KubeletSocketWindows is the path of the Kubelet registry socket on windows
KubeletSocketWindows = DevicePluginPathWindows + "kubelet.sock"
// KubeletPreStartContainerRPCTimeoutInSecs is the timeout duration in secs for PreStartContainer RPC
// Timeout duration in secs for PreStartContainer RPC
KubeletPreStartContainerRPCTimeoutInSecs = 30
)

View File

@ -7,6 +7,7 @@ go_library(
srcs = [
"cpu_limits.go",
"density.go",
"device_plugin.go",
"dns.go",
"framework.go",
"gmsa_full.go",
@ -22,6 +23,7 @@ go_library(
importpath = "k8s.io/kubernetes/test/e2e/windows",
visibility = ["//visibility:public"],
deps = [
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/rbac/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",

View File

@ -0,0 +1,157 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package windows
import (
"context"
"time"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
imageutils "k8s.io/kubernetes/test/utils/image"
"github.com/onsi/ginkgo"
)
const (
testSlowMultiplier = 60
)
var _ = SIGDescribe("Device Plugin", func() {
f := framework.NewDefaultFramework("device-plugin")
var cs clientset.Interface
ginkgo.BeforeEach(func() {
//Only for Windows containers
e2eskipper.SkipUnlessNodeOSDistroIs("windows")
cs = f.ClientSet
})
ginkgo.It("should be able to create a functioning device plugin for Windows", func() {
ginkgo.By("creating Windows device plugin daemonset")
dsName := "directx-device-plugin"
daemonsetNameLabel := "daemonset-name"
image := "e2eteam/k8s-directx-device-plugin:0.9.0-1809"
mountName := "device-plugin"
mountPath := "/var/lib/kubelet/device-plugins"
labels := map[string]string{
daemonsetNameLabel: dsName,
}
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: dsName,
Namespace: "kube-system",
},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: labels,
},
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
"scheduler.alpha.kubernetes.io/critical-pod": "",
},
Labels: labels,
},
Spec: v1.PodSpec{
Tolerations: []v1.Toleration{
{
Key: "CriticalAddonsOnly",
Operator: "Exists",
},
},
Containers: []v1.Container{
{
Name: "hostdev",
Image: image,
VolumeMounts: []v1.VolumeMount{
{
Name: mountName,
MountPath: mountPath,
},
},
Env: []v1.EnvVar{
{
Name: "DIRECTX_GPU_MATCH_NAME",
Value: " ",
},
},
},
},
Volumes: []v1.Volume{
{
Name: mountName,
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: mountPath,
},
},
},
},
NodeSelector: map[string]string{
"kubernetes.io/os": "windows",
},
},
},
},
}
sysNs := "kube-system"
_, err := cs.AppsV1().DaemonSets(sysNs).Create(context.TODO(), ds, metav1.CreateOptions{})
framework.ExpectNoError(err)
ginkgo.By("creating Windows testing Pod")
windowsPod := createTestPod(f, imageutils.GetE2EImage(imageutils.WindowsServer), windowsOS)
windowsPod.Spec.Containers[0].Args = []string{"powershell.exe", "Start-Sleep", "3600"}
windowsPod.Spec.Containers[0].Resources.Limits = v1.ResourceList{
"microsoft.com/directx": resource.MustParse("1"),
}
windowsPod, err = cs.CoreV1().Pods(f.Namespace.Name).Create(context.TODO(), windowsPod, metav1.CreateOptions{})
framework.ExpectNoError(err)
ginkgo.By("Waiting for the pod Running")
err = e2epod.WaitTimeoutForPodRunningInNamespace(cs, windowsPod.Name, f.Namespace.Name, testSlowMultiplier*framework.PodStartTimeout)
framework.ExpectNoError(err)
ginkgo.By("verifying device access in Windows testing Pod")
dxdiagCommand := []string{"cmd.exe", "/c", "dxdiag", "/t", "dxdiag_output.txt", "&", "type", "dxdiag_output.txt"}
//If DirectX version issues caused by supsequent windows releases occur, these tests need to do version checks
//based on the windows version running the test.
dxdiagDirectxVersion := "DirectX Version: DirectX 12"
defaultNs := f.Namespace.Name
_, dxdiagDirectxVersionErr := framework.LookForStringInPodExec(defaultNs, windowsPod.Name, dxdiagCommand, dxdiagDirectxVersion, time.Minute)
framework.ExpectNoError(dxdiagDirectxVersionErr, "failed: didn't find directX version dxdiag output.")
dxdiagDdiVersion := "DDI Version: 12"
_, dxdiagDdiVersionErr := framework.LookForStringInPodExec(defaultNs, windowsPod.Name, dxdiagCommand, dxdiagDdiVersion, time.Minute)
framework.ExpectNoError(dxdiagDdiVersionErr, "failed: didn't find DDI version in dxdiag output.")
dxdiagVendorID := "Vendor ID: 0x"
_, dxdiagVendorIDErr := framework.LookForStringInPodExec(defaultNs, windowsPod.Name, dxdiagCommand, dxdiagVendorID, time.Minute)
framework.ExpectNoError(dxdiagVendorIDErr, "failed: didn't find vendorID in dxdiag output.")
envVarCommand := []string{"cmd.exe", "/c", "set", "DIRECTX_GPU_Name"}
envVarDirectxGpuName := "DIRECTX_GPU_Name="
_, envVarDirectxGpuNameErr := framework.LookForStringInPodExec(defaultNs, windowsPod.Name, envVarCommand, envVarDirectxGpuName, time.Minute)
framework.ExpectNoError(envVarDirectxGpuNameErr, "failed: didn't find expected environment variable.")
})
})

View File

@ -40,6 +40,7 @@ type RegistryList struct {
GcrReleaseRegistry string `yaml:"gcrReleaseRegistry"`
PrivateRegistry string `yaml:"privateRegistry"`
SampleRegistry string `yaml:"sampleRegistry"`
MicrosoftRegistry string `yaml:"microsoftRegistry"`
}
// Config holds an images registry, name, and version
@ -79,6 +80,7 @@ func initReg() RegistryList {
GcrReleaseRegistry: "gcr.io/gke-release",
PrivateRegistry: "gcr.io/k8s-authenticated-test",
SampleRegistry: "gcr.io/google-samples",
MicrosoftRegistry: "mcr.microsoft.com",
}
repoList := os.Getenv("KUBE_TEST_REPO_LIST")
if repoList == "" {
@ -116,6 +118,8 @@ var (
// Preconfigured image configs
imageConfigs = initImageConfigs()
microsoftRegistry = registry.MicrosoftRegistry
)
const (
@ -198,6 +202,8 @@ const (
VolumeGlusterServer
// VolumeRBDServer image
VolumeRBDServer
// WindowsServer image
WindowsServer
)
func initImageConfigs() map[int]Config {
@ -241,6 +247,7 @@ func initImageConfigs() map[int]Config {
configs[VolumeISCSIServer] = Config{e2eVolumeRegistry, "iscsi", "2.0"}
configs[VolumeGlusterServer] = Config{e2eVolumeRegistry, "gluster", "1.0"}
configs[VolumeRBDServer] = Config{e2eVolumeRegistry, "rbd", "1.0.1"}
configs[WindowsServer] = Config{microsoftRegistry, "windows", "1809"}
return configs
}