Remove remnants of broken stuff - nvidia/autoscaling

Signed-off-by: Davanum Srinivas <davanum@gmail.com>
Davanum Srinivas 2024-09-19 19:19:26 -04:00
parent c9d6fd9ff7
commit 2fbbca6279
10 changed files with 0 additions and 3217 deletions

View File

@ -1,57 +0,0 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-gpu-device-plugin
namespace: kube-system
labels:
k8s-app: nvidia-gpu-device-plugin
addonmanager.kubernetes.io/mode: EnsureExists
spec:
selector:
matchLabels:
k8s-app: nvidia-gpu-device-plugin
template:
metadata:
labels:
k8s-app: nvidia-gpu-device-plugin
spec:
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists
tolerations:
- operator: "Exists"
effect: "NoExecute"
- operator: "Exists"
effect: "NoSchedule"
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: dev
hostPath:
path: /dev
containers:
- image: "registry.k8s.io/nvidia-gpu-device-plugin@sha256:4b036e8844920336fa48f36edeb7d4398f426d6a934ba022848deed2edbf09aa"
command: ["/usr/bin/nvidia-gpu-device-plugin", "-logtostderr"]
name: nvidia-gpu-device-plugin
resources:
requests:
cpu: 50m
memory: 10Mi
limits:
cpu: 50m
memory: 10Mi
securityContext:
privileged: true
volumeMounts:
- name: device-plugin
mountPath: /device-plugin
- name: dev
mountPath: /dev
updateStrategy:
type: RollingUpdate
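
For context, workloads consume the capacity this DaemonSet advertises by requesting the nvidia.com/gpu extended resource (the NVIDIAGPUResourceName constant removed further down in this commit). Below is a minimal, illustrative Go sketch of such a pod spec; the pod name and CUDA image are assumptions, not taken from the removed manifest.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// gpuPod builds a pod that asks the scheduler for one GPU via the
// nvidia.com/gpu extended resource advertised by the device plugin.
func gpuPod() *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "cuda-vector-add"}, // illustrative name
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{{
				Name:  "cuda-vector-add",
				Image: "nvidia/cuda:12.2.0-base-ubuntu22.04", // illustrative image
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						"nvidia.com/gpu": resource.MustParse("1"),
					},
				},
			}},
		},
	}
}

func main() {
	fmt.Println(gpuPod().Spec.Containers[0].Resources.Limits)
}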

View File

@ -2943,9 +2943,6 @@ EOF
sed -i -e "s@{{ metrics_server_memory_per_node }}@${metrics_server_memory_per_node}@g" "${metrics_server_yaml}"
sed -i -e "s@{{ metrics_server_min_cluster_size }}@${metrics_server_min_cluster_size}@g" "${metrics_server_yaml}"
fi
if [[ "${ENABLE_NVIDIA_GPU_DEVICE_PLUGIN:-}" == "true" ]]; then
setup-addon-manifests "addons" "device-plugins/nvidia-gpu"
fi
# Setting up the konnectivity-agent daemonset
if [[ "${RUN_KONNECTIVITY_PODS:-false}" == "true" ]]; then
setup-addon-manifests "addons" "konnectivity-agent"

View File

@ -1512,11 +1512,6 @@ EOF
if [ -n "${CLUSTER_SIGNING_DURATION:-}" ]; then
cat >>"$file" <<EOF
CLUSTER_SIGNING_DURATION: $(yaml-quote "${CLUSTER_SIGNING_DURATION}")
EOF
fi
if [[ "${NODE_ACCELERATORS:-}" == *"type=nvidia"* ]]; then
cat >>"$file" <<EOF
ENABLE_NVIDIA_GPU_DEVICE_PLUGIN: $(yaml-quote "true")
EOF
fi
if [ -n "${ADDON_MANAGER_LEADER_ELECTION:-}" ]; then

View File

@ -1,125 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package autoscaling
import (
"context"
"strings"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2eautoscaling "k8s.io/kubernetes/test/e2e/framework/autoscaling"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"github.com/onsi/gomega/gmeasure"
)
var _ = SIGDescribe(feature.ClusterSizeAutoscalingScaleUp, framework.WithSlow(), "Autoscaling", func() {
f := framework.NewDefaultFramework("autoscaling")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
var experiment *gmeasure.Experiment
ginkgo.Describe("Autoscaling a service", func() {
ginkgo.BeforeEach(func(ctx context.Context) {
// Check if Cloud Autoscaler is enabled by trying to get its ConfigMap.
_, err := f.ClientSet.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{})
if err != nil {
e2eskipper.Skipf("test expects Cluster Autoscaler to be enabled")
}
experiment = gmeasure.NewExperiment("Autoscaling a service")
ginkgo.AddReportEntry(experiment.Name, experiment)
})
ginkgo.Context("from 1 pod and 3 nodes to 8 pods and >=4 nodes", func() {
const nodesNum = 3 // Expect there to be 3 nodes before and after the test.
var nodeGroupName string // Set by BeforeEach, used by AfterEach to scale this node group down after the test.
var nodes *v1.NodeList // Set by BeforeEach, used by Measure to calculate CPU request based on node's sizes.
ginkgo.BeforeEach(func(ctx context.Context) {
// Make sure there is only 1 node group, otherwise this test becomes useless.
nodeGroups := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")
if len(nodeGroups) != 1 {
e2eskipper.Skipf("test expects 1 node group, found %d", len(nodeGroups))
}
nodeGroupName = nodeGroups[0]
// Make sure the node group has exactly 'nodesNum' nodes, otherwise this test becomes useless.
nodeGroupSize, err := framework.GroupSize(nodeGroupName)
framework.ExpectNoError(err)
if nodeGroupSize != nodesNum {
e2eskipper.Skipf("test expects %d nodes, found %d", nodesNum, nodeGroupSize)
}
// Make sure all nodes are schedulable, otherwise we are in some kind of a problem state.
nodes, err = e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
framework.ExpectNoError(err)
gomega.Expect(nodes.Items).To(gomega.HaveLen(nodeGroupSize), "not all nodes are schedulable")
})
ginkgo.AfterEach(func(ctx context.Context) {
// Attempt cleanup only if a node group was targeted for scale up.
// Otherwise the test was probably skipped and we'll get a gcloud error due to invalid parameters.
if len(nodeGroupName) > 0 {
// Scale down back to only 'nodesNum' nodes, as expected at the start of the test.
framework.ExpectNoError(framework.ResizeGroup(nodeGroupName, nodesNum))
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, nodesNum, 15*time.Minute))
}
})
ginkgo.It("takes less than 15 minutes", func(ctx context.Context) {
// Measured over multiple samples, scaling takes 10 +/- 2 minutes, so 15 minutes should be fully sufficient.
const timeToWait = 15 * time.Minute
// Calculate the CPU request of the service.
// This test expects that 8 pods will not fit in 'nodesNum' nodes, but will fit in >='nodesNum'+1 nodes.
// Make it so that 'nodesNum' pods fit perfectly per node.
nodeCpus := nodes.Items[0].Status.Allocatable[v1.ResourceCPU]
nodeCPUMillis := (&nodeCpus).MilliValue()
cpuRequestMillis := int64(nodeCPUMillis / nodesNum)
// Start the service we want to scale and wait for it to be up and running.
nodeMemoryBytes := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
nodeMemoryMB := (&nodeMemoryBytes).Value() / 1024 / 1024
memRequestMB := nodeMemoryMB / 10 // Ensure each pod takes not more than 10% of node's allocatable memory.
replicas := 1
resourceConsumer := e2eautoscaling.NewDynamicResourceConsumer(ctx, "resource-consumer", f.Namespace.Name, e2eautoscaling.KindDeployment, replicas, 0, 0, 0, cpuRequestMillis, memRequestMB, f.ClientSet, f.ScalesGetter, e2eautoscaling.Disable, e2eautoscaling.Idle)
ginkgo.DeferCleanup(resourceConsumer.CleanUp)
resourceConsumer.WaitForReplicas(ctx, replicas, 1*time.Minute) // Should finish ~immediately, so 1 minute is more than enough.
// Enable Horizontal Pod Autoscaler with 50% target utilization and
// scale up the CPU usage to trigger autoscaling to 8 pods for target to be satisfied.
targetCPUUtilizationPercent := int32(50)
hpa := e2eautoscaling.CreateCPUResourceHorizontalPodAutoscaler(ctx, resourceConsumer, targetCPUUtilizationPercent, 1, 10)
ginkgo.DeferCleanup(e2eautoscaling.DeleteHorizontalPodAutoscaler, resourceConsumer, hpa.Name)
cpuLoad := 8 * cpuRequestMillis * int64(targetCPUUtilizationPercent) / 100 // 8 pods utilized to the target level
resourceConsumer.ConsumeCPU(int(cpuLoad))
// Measure the time it takes for the service to scale to 8 pods with 50% CPU utilization each.
experiment.SampleDuration("total scale-up time", func(idx int) {
resourceConsumer.WaitForReplicas(ctx, 8, timeToWait)
}, gmeasure.SamplingConfig{N: 1}) // Increase N to run the test more than once.
})
})
})
})
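
The scale-up target above follows from the Horizontal Pod Autoscaler's utilization formula: each pod requests roughly nodeCPU/nodesNum millicores, and consuming 8 * request * 50% of CPU makes the desired replica count 8 at the 50% target. The standalone sketch below reproduces that arithmetic with illustrative numbers (a 2000m node, ~667m per-pod request); it mirrors the test's reasoning, not the HPA controller's implementation.

package main

import (
	"fmt"
	"math"
)

// desiredReplicas mirrors the core HPA formula:
// ceil(currentUtilization / targetUtilization * currentReplicas).
func desiredReplicas(totalUsageMillis, requestMillis int64, targetPercent int32, currentReplicas int) int {
	avgUtilization := float64(totalUsageMillis) / float64(int64(currentReplicas)*requestMillis) * 100
	return int(math.Ceil(avgUtilization / float64(targetPercent) * float64(currentReplicas)))
}

func main() {
	// Illustrative numbers: a 2000m node divided by nodesNum=3 gives a ~667m per-pod request.
	request := int64(667)
	cpuLoad := 8 * request * 50 / 100 // 8 pods utilized to the 50% target, as in the test
	fmt.Println(desiredReplicas(cpuLoad, request, 50, 1)) // 8, once the step-wise scale-up converges
}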

View File

@ -1,531 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package autoscaling
import (
"context"
"encoding/json"
"fmt"
"math"
"strings"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/strategicpatch"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
testutils "k8s.io/kubernetes/test/utils"
imageutils "k8s.io/kubernetes/test/utils/image"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
)
const (
memoryReservationTimeout = 5 * time.Minute
largeResizeTimeout = 8 * time.Minute
largeScaleUpTimeout = 10 * time.Minute
maxNodes = 1000
)
type clusterPredicates struct {
nodes int
}
type scaleUpTestConfig struct {
initialNodes int
initialPods int
extraPods *testutils.RCConfig
expectedResult *clusterPredicates
}
var _ = SIGDescribe("Cluster size autoscaler scalability", framework.WithSlow(), func() {
f := framework.NewDefaultFramework("autoscaling")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
var c clientset.Interface
var nodeCount int
var coresPerNode int
var memCapacityMb int
var originalSizes map[string]int
var sum int
ginkgo.BeforeEach(func(ctx context.Context) {
e2eskipper.SkipUnlessProviderIs("gce", "gke", "kubemark")
// Check if Cloud Autoscaler is enabled by trying to get its ConfigMap.
_, err := f.ClientSet.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{})
if err != nil {
e2eskipper.Skipf("test expects Cluster Autoscaler to be enabled")
}
c = f.ClientSet
if originalSizes == nil {
originalSizes = make(map[string]int)
sum = 0
for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
size, err := framework.GroupSize(mig)
framework.ExpectNoError(err)
ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size))
originalSizes[mig] = size
sum += size
}
}
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, sum, scaleUpTimeout))
nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
framework.ExpectNoError(err)
nodeCount = len(nodes.Items)
cpu := nodes.Items[0].Status.Capacity[v1.ResourceCPU]
mem := nodes.Items[0].Status.Capacity[v1.ResourceMemory]
coresPerNode = int((&cpu).MilliValue() / 1000)
memCapacityMb = int((&mem).Value() / 1024 / 1024)
gomega.Expect(nodeCount).To(gomega.Equal(sum))
if framework.ProviderIs("gke") {
val, err := isAutoscalerEnabled(3)
framework.ExpectNoError(err)
if !val {
err = enableAutoscaler("default-pool", 3, 5)
framework.ExpectNoError(err)
}
}
})
ginkgo.AfterEach(func(ctx context.Context) {
ginkgo.By("Restoring initial size of the cluster")
setMigSizes(originalSizes)
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount, scaleDownTimeout))
nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
s := time.Now()
makeSchedulableLoop:
for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
for _, n := range nodes.Items {
err = makeNodeSchedulable(ctx, c, &n, true)
switch err.(type) {
case CriticalAddonsOnlyError:
continue makeSchedulableLoop
default:
framework.ExpectNoError(err)
}
}
break
}
klog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
})
f.It("should scale up at all", feature.ClusterAutoscalerScalability1, func(ctx context.Context) {
perNodeReservation := int(float64(memCapacityMb) * 0.95)
replicasPerNode := 10
additionalNodes := maxNodes - nodeCount
replicas := additionalNodes * replicasPerNode
additionalReservation := additionalNodes * perNodeReservation
// saturate cluster
reservationCleanup := ReserveMemory(ctx, f, "some-pod", nodeCount*2, nodeCount*perNodeReservation, true, memoryReservationTimeout)
defer reservationCleanup()
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))
// configure pending pods & expected scale up
rcConfig := reserveMemoryRCConfig(f, "extra-pod-1", replicas, additionalReservation, largeScaleUpTimeout)
expectedResult := createClusterPredicates(nodeCount + additionalNodes)
config := createScaleUpTestConfig(nodeCount, nodeCount, rcConfig, expectedResult)
// run test
testCleanup := simpleScaleUpTest(ctx, f, config)
defer testCleanup()
})
f.It("should scale up twice", feature.ClusterAutoscalerScalability2, func(ctx context.Context) {
perNodeReservation := int(float64(memCapacityMb) * 0.95)
replicasPerNode := 10
additionalNodes1 := int(math.Ceil(0.7 * maxNodes))
additionalNodes2 := int(math.Ceil(0.25 * maxNodes))
if additionalNodes1+additionalNodes2 > maxNodes {
additionalNodes2 = maxNodes - additionalNodes1
}
replicas1 := additionalNodes1 * replicasPerNode
replicas2 := additionalNodes2 * replicasPerNode
klog.Infof("cores per node: %v", coresPerNode)
// saturate cluster
initialReplicas := nodeCount
reservationCleanup := ReserveMemory(ctx, f, "some-pod", initialReplicas, nodeCount*perNodeReservation, true, memoryReservationTimeout)
defer reservationCleanup()
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))
klog.Infof("Reserved successfully")
// configure pending pods & expected scale up #1
rcConfig := reserveMemoryRCConfig(f, "extra-pod-1", replicas1, additionalNodes1*perNodeReservation, largeScaleUpTimeout)
expectedResult := createClusterPredicates(nodeCount + additionalNodes1)
config := createScaleUpTestConfig(nodeCount, nodeCount, rcConfig, expectedResult)
// run test #1
tolerateUnreadyNodes := additionalNodes1 / 20
tolerateUnreadyPods := (initialReplicas + replicas1) / 20
testCleanup1 := simpleScaleUpTestWithTolerance(ctx, f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
defer testCleanup1()
klog.Infof("Scaled up once")
// configure pending pods & expected scale up #2
rcConfig2 := reserveMemoryRCConfig(f, "extra-pod-2", replicas2, additionalNodes2*perNodeReservation, largeScaleUpTimeout)
expectedResult2 := createClusterPredicates(nodeCount + additionalNodes1 + additionalNodes2)
config2 := createScaleUpTestConfig(nodeCount+additionalNodes1, nodeCount+additionalNodes2, rcConfig2, expectedResult2)
// run test #2
tolerateUnreadyNodes = maxNodes / 20
tolerateUnreadyPods = (initialReplicas + replicas1 + replicas2) / 20
testCleanup2 := simpleScaleUpTestWithTolerance(ctx, f, config2, tolerateUnreadyNodes, tolerateUnreadyPods)
defer testCleanup2()
klog.Infof("Scaled up twice")
})
f.It("should scale down empty nodes", feature.ClusterAutoscalerScalability3, func(ctx context.Context) {
perNodeReservation := int(float64(memCapacityMb) * 0.7)
replicas := int(math.Ceil(maxNodes * 0.7))
totalNodes := maxNodes
// resize cluster to totalNodes
newSizes := map[string]int{
anyKey(originalSizes): totalNodes,
}
setMigSizes(newSizes)
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))
// run replicas
rcConfig := reserveMemoryRCConfig(f, "some-pod", replicas, replicas*perNodeReservation, largeScaleUpTimeout)
expectedResult := createClusterPredicates(totalNodes)
config := createScaleUpTestConfig(totalNodes, totalNodes, rcConfig, expectedResult)
tolerateUnreadyNodes := totalNodes / 10
tolerateUnreadyPods := replicas / 10
testCleanup := simpleScaleUpTestWithTolerance(ctx, f, config, tolerateUnreadyNodes, tolerateUnreadyPods)
defer testCleanup()
// check if empty nodes are scaled down
framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
func(size int) bool {
return size <= replicas+3 // leaving space for non-evictable kube-system pods
}, scaleDownTimeout))
})
f.It("should scale down underutilized nodes", feature.ClusterAutoscalerScalability4, func(ctx context.Context) {
perPodReservation := int(float64(memCapacityMb) * 0.01)
// underutilizedNodes are 10% full
underutilizedPerNodeReplicas := 10
// fullNodes are 70% full
fullPerNodeReplicas := 70
totalNodes := maxNodes
underutilizedRatio := 0.3
maxDelta := 30
// resize cluster to totalNodes
newSizes := map[string]int{
anyKey(originalSizes): totalNodes,
}
setMigSizes(newSizes)
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))
// annotate all nodes with no-scale-down
ScaleDownDisabledKey := "cluster-autoscaler.kubernetes.io/scale-down-disabled"
nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
FieldSelector: fields.Set{
"spec.unschedulable": "false",
}.AsSelector().String(),
})
framework.ExpectNoError(err)
framework.ExpectNoError(addAnnotation(ctx, f, nodes.Items, ScaleDownDisabledKey, "true"))
// distribute pods using replication controllers taking up space that should
// be empty after pods are distributed
underutilizedNodesNum := int(float64(maxNodes) * underutilizedRatio)
fullNodesNum := totalNodes - underutilizedNodesNum
podDistribution := []podBatch{
{numNodes: fullNodesNum, podsPerNode: fullPerNodeReplicas},
{numNodes: underutilizedNodesNum, podsPerNode: underutilizedPerNodeReplicas}}
distributeLoad(ctx, f, f.Namespace.Name, "10-70", podDistribution, perPodReservation,
int(0.95*float64(memCapacityMb)), map[string]string{}, largeScaleUpTimeout)
// enable scale down again
framework.ExpectNoError(addAnnotation(ctx, f, nodes.Items, ScaleDownDisabledKey, "false"))
// wait for scale down to start. Node deletion takes a long time, so we just
// wait for a maximum of 30 nodes to be deleted
nodesToScaleDownCount := int(float64(totalNodes) * 0.1)
if nodesToScaleDownCount > maxDelta {
nodesToScaleDownCount = maxDelta
}
expectedSize := totalNodes - nodesToScaleDownCount
timeout := time.Duration(nodesToScaleDownCount)*time.Minute + scaleDownTimeout
framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, func(size int) bool {
return size <= expectedSize
}, timeout))
})
f.It("shouldn't scale down with underutilized nodes due to host port conflicts", feature.ClusterAutoscalerScalability5, func(ctx context.Context) {
fullReservation := int(float64(memCapacityMb) * 0.9)
hostPortPodReservation := int(float64(memCapacityMb) * 0.3)
totalNodes := maxNodes
reservedPort := 4321
// resize cluster to totalNodes
newSizes := map[string]int{
anyKey(originalSizes): totalNodes,
}
setMigSizes(newSizes)
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, totalNodes, largeResizeTimeout))
divider := int(float64(totalNodes) * 0.7)
fullNodesCount := divider
underutilizedNodesCount := totalNodes - fullNodesCount
ginkgo.By("Reserving full nodes")
// run RC1 w/o host port
cleanup := ReserveMemory(ctx, f, "filling-pod", fullNodesCount, fullNodesCount*fullReservation, true, largeScaleUpTimeout*2)
defer cleanup()
ginkgo.By("Reserving host ports on remaining nodes")
// run RC2 w/ host port
ginkgo.DeferCleanup(createHostPortPodsWithMemory, f, "underutilizing-host-port-pod", underutilizedNodesCount, reservedPort, underutilizedNodesCount*hostPortPodReservation, largeScaleUpTimeout)
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))
// wait and check scale down doesn't occur
ginkgo.By(fmt.Sprintf("Sleeping %v minutes...", scaleDownTimeout.Minutes()))
time.Sleep(scaleDownTimeout)
ginkgo.By("Checking if the number of nodes is as expected")
nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
framework.ExpectNoError(err)
klog.Infof("Nodes: %v, expected: %v", len(nodes.Items), totalNodes)
gomega.Expect(nodes.Items).To(gomega.HaveLen(totalNodes))
})
f.It("CA ignores unschedulable pods while scheduling schedulable pods", feature.ClusterAutoscalerScalability6, func(ctx context.Context) {
// Start a number of pods saturating existing nodes.
perNodeReservation := int(float64(memCapacityMb) * 0.80)
replicasPerNode := 10
initialPodReplicas := nodeCount * replicasPerNode
initialPodsTotalMemory := nodeCount * perNodeReservation
reservationCleanup := ReserveMemory(ctx, f, "initial-pod", initialPodReplicas, initialPodsTotalMemory, true /* wait for pods to run */, memoryReservationTimeout)
ginkgo.DeferCleanup(reservationCleanup)
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c))
// Configure a number of unschedulable pods.
unschedulableMemReservation := memCapacityMb * 2
unschedulablePodReplicas := 1000
totalMemReservation := unschedulableMemReservation * unschedulablePodReplicas
timeToWait := 5 * time.Minute
podsConfig := reserveMemoryRCConfig(f, "unschedulable-pod", unschedulablePodReplicas, totalMemReservation, timeToWait)
_ = e2erc.RunRC(ctx, *podsConfig) // Ignore error (it will occur because pods are unschedulable)
ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, podsConfig.Name)
// Ensure that no new nodes have been added so far.
readyNodeCount, _ := e2enode.TotalReady(ctx, f.ClientSet)
gomega.Expect(readyNodeCount).To(gomega.Equal(nodeCount))
// Start a number of schedulable pods to ensure CA reacts.
additionalNodes := maxNodes - nodeCount
replicas := additionalNodes * replicasPerNode
totalMemory := additionalNodes * perNodeReservation
rcConfig := reserveMemoryRCConfig(f, "extra-pod", replicas, totalMemory, largeScaleUpTimeout)
expectedResult := createClusterPredicates(nodeCount + additionalNodes)
config := createScaleUpTestConfig(nodeCount, initialPodReplicas, rcConfig, expectedResult)
// Test that scale up happens, allowing 1000 unschedulable pods not to be scheduled.
testCleanup := simpleScaleUpTestWithTolerance(ctx, f, config, 0, unschedulablePodReplicas)
ginkgo.DeferCleanup(testCleanup)
})
})
func anyKey(input map[string]int) string {
for k := range input {
return k
}
return ""
}
func simpleScaleUpTestWithTolerance(ctx context.Context, f *framework.Framework, config *scaleUpTestConfig, tolerateMissingNodeCount int, tolerateMissingPodCount int) func() error {
// resize cluster to start size
// run rc based on config
ginkgo.By(fmt.Sprintf("Running RC %v from config", config.extraPods.Name))
start := time.Now()
framework.ExpectNoError(e2erc.RunRC(ctx, *config.extraPods))
// check results
if tolerateMissingNodeCount > 0 {
// Tolerate some number of nodes not to be created.
minExpectedNodeCount := config.expectedResult.nodes - tolerateMissingNodeCount
framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
func(size int) bool { return size >= minExpectedNodeCount }, scaleUpTimeout))
} else {
framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, f.ClientSet, config.expectedResult.nodes, scaleUpTimeout))
}
klog.Infof("cluster is increased")
if tolerateMissingPodCount > 0 {
framework.ExpectNoError(waitForCaPodsReadyInNamespace(ctx, f, f.ClientSet, tolerateMissingPodCount))
} else {
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
}
timeTrack(start, fmt.Sprintf("Scale up to %v", config.expectedResult.nodes))
return func() error {
return e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, config.extraPods.Name)
}
}
func simpleScaleUpTest(ctx context.Context, f *framework.Framework, config *scaleUpTestConfig) func() error {
return simpleScaleUpTestWithTolerance(ctx, f, config, 0, 0)
}
func reserveMemoryRCConfig(f *framework.Framework, id string, replicas, megabytes int, timeout time.Duration) *testutils.RCConfig {
return &testutils.RCConfig{
Client: f.ClientSet,
Name: id,
Namespace: f.Namespace.Name,
Timeout: timeout,
Image: imageutils.GetPauseImageName(),
Replicas: replicas,
MemRequest: int64(1024 * 1024 * megabytes / replicas),
}
}
func createScaleUpTestConfig(nodes, pods int, extraPods *testutils.RCConfig, expectedResult *clusterPredicates) *scaleUpTestConfig {
return &scaleUpTestConfig{
initialNodes: nodes,
initialPods: pods,
extraPods: extraPods,
expectedResult: expectedResult,
}
}
func createClusterPredicates(nodes int) *clusterPredicates {
return &clusterPredicates{
nodes: nodes,
}
}
func addAnnotation(ctx context.Context, f *framework.Framework, nodes []v1.Node, key, value string) error {
for _, node := range nodes {
oldData, err := json.Marshal(node)
if err != nil {
return err
}
if node.Annotations == nil {
node.Annotations = make(map[string]string)
}
node.Annotations[key] = value
newData, err := json.Marshal(node)
if err != nil {
return err
}
patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, v1.Node{})
if err != nil {
return err
}
_, err = f.ClientSet.CoreV1().Nodes().Patch(ctx, node.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
if err != nil {
return err
}
}
return nil
}
func createHostPortPodsWithMemory(ctx context.Context, f *framework.Framework, id string, replicas, port, megabytes int, timeout time.Duration) func() error {
ginkgo.By("Running RC which reserves host port and memory")
request := int64(1024 * 1024 * megabytes / replicas)
config := &testutils.RCConfig{
Client: f.ClientSet,
Name: id,
Namespace: f.Namespace.Name,
Timeout: timeout,
Image: imageutils.GetPauseImageName(),
Replicas: replicas,
HostPorts: map[string]int{"port1": port},
MemRequest: request,
}
err := e2erc.RunRC(ctx, *config)
framework.ExpectNoError(err)
return func() error {
return e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, id)
}
}
type podBatch struct {
numNodes int
podsPerNode int
}
// distributeLoad distributes the pods in the way described by podDistribution,
// assuming all pods will have the same memory reservation and all nodes the same
// memory capacity. This allows us to generate the load on the cluster in the exact
// way that we want.
//
// To achieve this we do the following:
// 1. Create replication controllers that eat up all the space that should be
// empty after setup, making sure they end up on different nodes by specifying
// a conflicting host port
// 2. Create the target RC that will generate the load on the cluster
// 3. Remove the RCs created in step 1.
func distributeLoad(ctx context.Context, f *framework.Framework, namespace string, id string, podDistribution []podBatch,
podMemRequestMegabytes int, nodeMemCapacity int, labels map[string]string, timeout time.Duration) {
port := 8013
// Create load-distribution RCs with one pod per node, reserving all remaining
// memory to force the distribution of pods for the target RCs.
// The load-distribution RCs will be deleted on function return.
totalPods := 0
for i, podBatch := range podDistribution {
totalPods += podBatch.numNodes * podBatch.podsPerNode
remainingMem := nodeMemCapacity - podBatch.podsPerNode*podMemRequestMegabytes
replicas := podBatch.numNodes
cleanup := createHostPortPodsWithMemory(ctx, f, fmt.Sprintf("load-distribution%d", i), replicas, port, remainingMem*replicas, timeout)
defer cleanup()
}
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
// Create the target RC
rcConfig := reserveMemoryRCConfig(f, id, totalPods, totalPods*podMemRequestMegabytes, timeout)
framework.ExpectNoError(e2erc.RunRC(ctx, *rcConfig))
framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, f.ClientSet))
ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, id)
}
func timeTrack(start time.Time, name string) {
elapsed := time.Since(start)
klog.Infof("%s took %s", name, elapsed)
}
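
As noted in the distributeLoad comment above, the load-distribution RCs rely on host-port conflicts: two pods that request the same host port cannot be scheduled onto the same node, so N such replicas land on N distinct nodes. The following is a minimal, illustrative sketch of that kind of pod spec; the pod name and pause image tag are assumptions, while port 8013 matches the port used by distributeLoad.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// hostPortPod returns a pod that binds a fixed host port. Because two pods
// cannot bind the same host port on one node, N replicas of this spec end up
// on N distinct nodes - the placement trick the load-distribution RCs use.
func hostPortPod(name string, port int32) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name}, // illustrative name
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  "pause",
				Image: "registry.k8s.io/pause:3.9", // illustrative tag
				Ports: []v1.ContainerPort{{ContainerPort: port, HostPort: port}},
			}},
		},
	}
}

func main() {
	p := hostPortPod("load-distribution-example", 8013) // 8013 matches the port in distributeLoad
	fmt.Println(p.Spec.Containers[0].Ports[0].HostPort)
}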

File diff suppressed because it is too large

View File

@ -1,425 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package autoscaling
import (
"context"
"fmt"
"math"
"strings"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
)
// This test requires coredns to be installed on the cluster with autoscaling enabled.
// Compare your coredns manifest against the output of the command below:
// helm template coredns -n kube-system coredns/coredns --set k8sAppLabelOverride=kube-dns --set fullnameOverride=coredns --set autoscaler.enabled=true
// Constants used in dns-autoscaling test.
const (
DNSdefaultTimeout = 5 * time.Minute
ClusterAddonLabelKey = "k8s-app"
DNSLabelName = "kube-dns"
)
var _ = SIGDescribe("DNS horizontal autoscaling", func() {
f := framework.NewDefaultFramework("dns-autoscaling")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
var c clientset.Interface
var previousParams map[string]string
var configMapNames map[string]string
var originDNSReplicasCount int
var DNSParams1 DNSParamsLinear
var DNSParams2 DNSParamsLinear
var DNSParams3 DNSParamsLinear
ginkgo.BeforeEach(func(ctx context.Context) {
e2eskipper.SkipUnlessProviderIs("gce", "gke")
c = f.ClientSet
nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
framework.ExpectNoError(err)
nodeCount := len(nodes.Items)
ginkgo.By("Collecting original replicas count and DNS scaling params")
// Check if we are running CoreDNS or kube-dns; the only difference is the name of the autoscaling ConfigMap.
// The test should behave identically on both DNS providers.
provider, err := detectDNSProvider(ctx, c)
framework.ExpectNoError(err)
originDNSReplicasCount, err = getDNSReplicas(ctx, c)
framework.ExpectNoError(err)
configMapNames = map[string]string{
"kube-dns": "kube-dns-autoscaler",
"coredns": "coredns-autoscaler",
}
pcm, err := fetchDNSScalingConfigMap(ctx, c, configMapNames[provider])
framework.Logf("original DNS scaling params: %v", pcm)
framework.ExpectNoError(err)
previousParams = pcm.Data
if nodeCount <= 500 {
DNSParams1 = DNSParamsLinear{
nodesPerReplica: 1,
}
DNSParams2 = DNSParamsLinear{
nodesPerReplica: 2,
}
DNSParams3 = DNSParamsLinear{
nodesPerReplica: 3,
coresPerReplica: 3,
}
} else {
// In large clusters, avoid creating/deleting too many DNS pods;
// this is meant to be a correctness test, not a performance one.
// The default setup is: 256 cores/replica, 16 nodes/replica.
// With nodeCount > 500, nodes/13, nodes/14, nodes/15 and nodes/16
// are different numbers.
DNSParams1 = DNSParamsLinear{
nodesPerReplica: 13,
}
DNSParams2 = DNSParamsLinear{
nodesPerReplica: 14,
}
DNSParams3 = DNSParamsLinear{
nodesPerReplica: 15,
coresPerReplica: 15,
}
}
})
// This test is separated because it is slow and needs to run serially.
// It takes around 5 minutes to run on a 4-node cluster.
// TODO(upodroid) This test will be removed in 1.33 when kubeup is removed
// TODO: make it cloud provider agnostic or move it to cloud-provider-gcp repository
f.It(f.WithSerial(), f.WithSlow(), f.WithLabel("KubeUp"), f.WithLabel("sig-cloud-provider-gcp"), "kube-dns-autoscaler should scale kube-dns pods when cluster size changed", func(ctx context.Context) {
numNodes, err := e2enode.TotalRegistered(ctx, c)
framework.ExpectNoError(err)
configMapNames = map[string]string{
"kube-dns": "kube-dns-autoscaler",
"coredns": "coredns-autoscaler",
}
provider, err := detectDNSProvider(ctx, c)
framework.ExpectNoError(err)
ginkgo.By("Replace the dns autoscaling parameters with testing parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1)))
framework.ExpectNoError(err)
defer func() {
ginkgo.By("Restoring initial dns autoscaling parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], previousParams))
framework.ExpectNoError(err)
ginkgo.By("Wait for number of running and ready kube-dns pods recover")
label := labels.SelectorFromSet(labels.Set(map[string]string{ClusterAddonLabelKey: DNSLabelName}))
_, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, c, metav1.NamespaceSystem, label, originDNSReplicasCount, DNSdefaultTimeout)
framework.ExpectNoError(err)
}()
ginkgo.By("Wait for kube-dns scaled to expected number")
getExpectReplicasLinear := getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
originalSizes := make(map[string]int)
for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
size, err := framework.GroupSize(mig)
framework.ExpectNoError(err)
ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size))
originalSizes[mig] = size
}
ginkgo.By("Manually increase cluster size")
increasedSizes := make(map[string]int)
for key, val := range originalSizes {
increasedSizes[key] = val + 1
}
setMigSizes(increasedSizes)
err = WaitForClusterSizeFunc(ctx, c,
func(size int) bool { return size == numNodes+len(originalSizes) }, scaleUpTimeout)
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns scaled to expected number")
getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams3)))
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns scaled to expected number")
getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams3)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
ginkgo.By("Restoring cluster size")
setMigSizes(originalSizes)
err = e2enode.WaitForReadyNodes(ctx, c, numNodes, scaleDownTimeout)
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns scaled to expected number")
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
})
ginkgo.It("kube-dns-autoscaler should scale kube-dns pods in both nonfaulty and faulty scenarios", func(ctx context.Context) {
configMapNames = map[string]string{
"kube-dns": "kube-dns-autoscaler",
"coredns": "coredns-autoscaler",
}
provider, err := detectDNSProvider(ctx, c)
framework.ExpectNoError(err)
ginkgo.By("Replace the dns autoscaling parameters with testing parameters")
cm := packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1))
framework.Logf("Updating the following cm: %v", cm)
err = updateDNSScalingConfigMap(ctx, c, cm)
framework.ExpectNoError(err)
defer func() {
ginkgo.By("Restoring initial dns autoscaling parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], previousParams))
framework.ExpectNoError(err)
}()
ginkgo.By("Wait for kube-dns scaled to expected number")
getExpectReplicasLinear := getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
ginkgo.By("--- Scenario: should scale kube-dns based on changed parameters ---")
ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams3)))
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns scaled to expected number")
getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams3)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
ginkgo.By("--- Scenario: should re-create scaling parameters with default value when parameters got deleted ---")
ginkgo.By("Delete the ConfigMap for autoscaler")
err = deleteDNSScalingConfigMap(ctx, c, configMapNames[provider])
framework.ExpectNoError(err)
ginkgo.By("Wait for the ConfigMap got re-created")
_, err = waitForDNSConfigMapCreated(ctx, c, DNSdefaultTimeout, configMapNames[provider])
framework.ExpectNoError(err)
ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams2)))
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns/coredns scaled to expected number")
getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams2)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
ginkgo.By("--- Scenario: should recover after autoscaler pod got deleted ---")
ginkgo.By("Delete the autoscaler pod for kube-dns/coredns")
err = deleteDNSAutoscalerPod(ctx, c)
framework.ExpectNoError(err)
ginkgo.By("Replace the dns autoscaling parameters with another testing parameters")
err = updateDNSScalingConfigMap(ctx, c, packDNSScalingConfigMap(configMapNames[provider], packLinearParams(&DNSParams1)))
framework.ExpectNoError(err)
ginkgo.By("Wait for kube-dns/coredns scaled to expected number")
getExpectReplicasLinear = getExpectReplicasFuncLinear(ctx, c, &DNSParams1)
err = waitForDNSReplicasSatisfied(ctx, c, getExpectReplicasLinear, DNSdefaultTimeout)
framework.ExpectNoError(err)
})
})
// DNSParamsLinear holds the parameters of the autoscaler's linear mode that determine the number of DNS pods.
type DNSParamsLinear struct {
nodesPerReplica float64
coresPerReplica float64
min int
max int
}
type getExpectReplicasFunc func(c clientset.Interface) int
func getExpectReplicasFuncLinear(ctx context.Context, c clientset.Interface, params *DNSParamsLinear) getExpectReplicasFunc {
return func(c clientset.Interface) int {
var replicasFromNodes float64
var replicasFromCores float64
nodes, err := e2enode.GetReadyNodesIncludingTainted(ctx, c)
framework.ExpectNoError(err)
if params.nodesPerReplica > 0 {
replicasFromNodes = math.Ceil(float64(len(nodes.Items)) / params.nodesPerReplica)
}
if params.coresPerReplica > 0 {
replicasFromCores = math.Ceil(float64(getSchedulableCores(nodes.Items)) / params.coresPerReplica)
}
return int(math.Max(1.0, math.Max(replicasFromNodes, replicasFromCores)))
}
}
func getSchedulableCores(nodes []v1.Node) int64 {
var sc resource.Quantity
for _, node := range nodes {
if !node.Spec.Unschedulable {
sc.Add(node.Status.Allocatable[v1.ResourceCPU])
}
}
return sc.Value()
}
func detectDNSProvider(ctx context.Context, c clientset.Interface) (string, error) {
cm, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, "coredns-autoscaler", metav1.GetOptions{})
if cm != nil && err == nil {
return "coredns", nil
}
cm, err = c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, "kube-dns-autoscaler", metav1.GetOptions{})
if cm != nil && err == nil {
return "kube-dns", nil
}
return "", fmt.Errorf("the cluster doesn't have kube-dns or coredns autoscaling configured")
}
func fetchDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMapName string) (*v1.ConfigMap, error) {
cm, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, configMapName, metav1.GetOptions{})
if err != nil {
return nil, err
}
return cm, nil
}
func deleteDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMapName string) error {
if err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(ctx, configMapName, metav1.DeleteOptions{}); err != nil {
return err
}
framework.Logf("DNS autoscaling ConfigMap deleted.")
return nil
}
func packLinearParams(params *DNSParamsLinear) map[string]string {
paramsMap := make(map[string]string)
paramsMap["linear"] = fmt.Sprintf("{\"nodesPerReplica\": %v,\"coresPerReplica\": %v,\"min\": %v,\"max\": %v}",
params.nodesPerReplica,
params.coresPerReplica,
params.min,
params.max)
return paramsMap
}
func packDNSScalingConfigMap(configMapName string, params map[string]string) *v1.ConfigMap {
configMap := v1.ConfigMap{}
configMap.ObjectMeta.Name = configMapName
configMap.ObjectMeta.Namespace = metav1.NamespaceSystem
configMap.Data = params
return &configMap
}
func updateDNSScalingConfigMap(ctx context.Context, c clientset.Interface, configMap *v1.ConfigMap) error {
_, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Update(ctx, configMap, metav1.UpdateOptions{})
if err != nil {
return err
}
framework.Logf("DNS autoscaling ConfigMap updated.")
return nil
}
func getDNSReplicas(ctx context.Context, c clientset.Interface) (int, error) {
label := labels.SelectorFromSet(labels.Set(map[string]string{ClusterAddonLabelKey: DNSLabelName}))
listOpts := metav1.ListOptions{LabelSelector: label.String()}
deployments, err := c.AppsV1().Deployments(metav1.NamespaceSystem).List(ctx, listOpts)
if err != nil {
return 0, err
}
if len(deployments.Items) != 1 {
return 0, fmt.Errorf("expected 1 DNS deployment, got %v", len(deployments.Items))
}
deployment := deployments.Items[0]
return int(*(deployment.Spec.Replicas)), nil
}
func deleteDNSAutoscalerPod(ctx context.Context, c clientset.Interface) error {
selector, _ := labels.Parse(fmt.Sprintf("%s in (kube-dns-autoscaler, coredns-autoscaler)", ClusterAddonLabelKey))
listOpts := metav1.ListOptions{LabelSelector: selector.String()}
pods, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(ctx, listOpts)
if err != nil {
return err
}
if len(pods.Items) != 1 {
return fmt.Errorf("expected 1 autoscaler pod, got %v", len(pods.Items))
}
podName := pods.Items[0].Name
if err := c.CoreV1().Pods(metav1.NamespaceSystem).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil {
return err
}
framework.Logf("DNS autoscaling pod %v deleted.", podName)
return nil
}
func waitForDNSReplicasSatisfied(ctx context.Context, c clientset.Interface, getExpected getExpectReplicasFunc, timeout time.Duration) (err error) {
var current int
var expected int
framework.Logf("Waiting up to %v for kube-dns to reach expected replicas", timeout)
condition := func(ctx context.Context) (bool, error) {
current, err = getDNSReplicas(ctx, c)
if err != nil {
return false, err
}
expected = getExpected(c)
if current != expected {
framework.Logf("Replicas not as expected: got %v, expected %v", current, expected)
return false, nil
}
return true, nil
}
if err = wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, false, condition); err != nil {
return fmt.Errorf("err waiting for DNS replicas to satisfy %v, got %v: %w", expected, current, err)
}
framework.Logf("kube-dns reaches expected replicas: %v", expected)
return nil
}
func waitForDNSConfigMapCreated(ctx context.Context, c clientset.Interface, timeout time.Duration, configMapName string) (configMap *v1.ConfigMap, err error) {
framework.Logf("Waiting up to %v for DNS autoscaling ConfigMap to be re-created", timeout)
condition := func(ctx context.Context) (bool, error) {
configMap, err = fetchDNSScalingConfigMap(ctx, c, configMapName)
if err != nil {
return false, nil
}
return true, nil
}
if err = wait.PollUntilContextTimeout(ctx, time.Second, timeout, false, condition); err != nil {
return nil, fmt.Errorf("err waiting for DNS autoscaling ConfigMap got re-created: %w", err)
}
return configMap, nil
}
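
For reference, getExpectReplicasFuncLinear above mirrors the linear mode of the DNS horizontal autoscaler: replicas = max(1, ceil(cores/coresPerReplica), ceil(nodes/nodesPerReplica)), with a term skipped when its parameter is zero. A small standalone sketch of that arithmetic with illustrative cluster sizes:

package main

import (
	"fmt"
	"math"
)

// linearReplicas mirrors getExpectReplicasFuncLinear: each configured ratio
// contributes ceil(count/ratio), and the result is never below one replica.
func linearReplicas(cores, nodes int, coresPerReplica, nodesPerReplica float64) int {
	var fromCores, fromNodes float64
	if coresPerReplica > 0 {
		fromCores = math.Ceil(float64(cores) / coresPerReplica)
	}
	if nodesPerReplica > 0 {
		fromNodes = math.Ceil(float64(nodes) / nodesPerReplica)
	}
	return int(math.Max(1.0, math.Max(fromCores, fromNodes)))
}

func main() {
	// Illustrative cluster: 4 schedulable nodes with 4 cores each (16 cores total).
	fmt.Println(linearReplicas(16, 4, 256, 16)) // 1 with the default-style params noted above
	fmt.Println(linearReplicas(16, 4, 0, 1))    // 4 with DNSParams1-style nodesPerReplica: 1
}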

View File

@ -20,9 +20,4 @@ const (
// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8;
// it uses the device plugin mechanism.
NVIDIAGPUResourceName = "nvidia.com/gpu"
// GPUDevicePluginDSYAML is the official Google Device Plugin Daemonset NVIDIA GPU manifest for GKE
// TODO: Parametrize it by making it a feature in TestFramework.
// so we can override the daemonset in other setups (non COS).
GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
)
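
A cluster-level way to see the effect of a registered GPU device plugin is to sum this extended resource over node allocatable. The sketch below is an illustration only, not part of the e2e framework; it assumes a reachable cluster via the default kubeconfig loading rules.

package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a clientset from the default kubeconfig loading rules (assumption).
	restCfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
		clientcmd.NewDefaultClientConfigLoadingRules(), &clientcmd.ConfigOverrides{}).ClientConfig()
	if err != nil {
		panic(err)
	}
	c := kubernetes.NewForConfigOrDie(restCfg)

	nodes, err := c.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	total := int64(0)
	for _, n := range nodes.Items {
		// "nvidia.com/gpu" (NVIDIAGPUResourceName) appears in node allocatable
		// only while a GPU device plugin is registered with the kubelet.
		if q, ok := n.Status.Allocatable[v1.ResourceName("nvidia.com/gpu")]; ok {
			total += q.Value()
		}
	}
	fmt.Printf("allocatable nvidia.com/gpu across the cluster: %d\n", total)
}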

View File

@ -32,7 +32,6 @@ import (
internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
commontest "k8s.io/kubernetes/test/e2e/common"
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
@ -83,11 +82,6 @@ func updateImageAllowList(ctx context.Context) {
} else {
e2epod.ImagePrePullList.Insert(sriovDevicePluginImage)
}
if gpuDevicePluginImage, err := getGPUDevicePluginImage(ctx); err != nil {
klog.Errorln(err)
} else {
e2epod.ImagePrePullList.Insert(gpuDevicePluginImage)
}
if samplePluginImage, err := getContainerImageFromE2ETestDaemonset(SampleDevicePluginDSYAML); err != nil {
klog.Errorln(err)
} else {
@ -217,21 +211,6 @@ func PrePullAllImages() error {
return utilerrors.NewAggregate(pullErrs)
}
// getGPUDevicePluginImage returns the image of GPU device plugin.
func getGPUDevicePluginImage(ctx context.Context) (string, error) {
ds, err := e2emanifest.DaemonSetFromURL(ctx, e2egpu.GPUDevicePluginDSYAML)
if err != nil {
return "", fmt.Errorf("failed to parse the device plugin image: %w", err)
}
if ds == nil {
return "", fmt.Errorf("failed to parse the device plugin image: the extracted DaemonSet is nil")
}
if len(ds.Spec.Template.Spec.Containers) < 1 {
return "", fmt.Errorf("failed to parse the device plugin image: cannot extract the container from YAML")
}
return ds.Spec.Template.Spec.Containers[0].Image, nil
}
func getContainerImageFromE2ETestDaemonset(dsYamlPath string) (string, error) {
data, err := e2etestfiles.Read(dsYamlPath)
if err != nil {

View File

@ -1,27 +0,0 @@
#cloud-config
runcmd:
- modprobe configs
# Install GPU drivers - https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus
- cos-extensions install gpu
- mount --bind /var/lib/nvidia /var/lib/nvidia
- mount -o remount,exec /var/lib/nvidia /var/lib/nvidia
# Run nvidia-smi to verify installation
- /var/lib/nvidia/bin/nvidia-smi
# Remove build containers. They're very large.
- docker rm -f $(docker ps -aq)
# Standard installation proceeds
- mount /tmp /tmp -o remount,exec,suid
- usermod -a -G docker jenkins
- mkdir -p /var/lib/kubelet
- mkdir -p /home/kubernetes/containerized_mounter/rootfs
- mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/
- mount -o remount,exec /home/kubernetes/containerized_mounter/
- wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar
- tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs
- mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
- mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
- mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
- mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc
- mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev
- rm /tmp/mounter.tar