e2e framework: move resource gathering into framework/debug

This helps getting rid of the ssh dependency. The same init package as for
dumping namespaces takes care of adding the functionality back to framework
instances.
This commit is contained in:
Patrick Ohly 2022-08-25 18:19:16 +02:00
parent f9bc4f837b
commit 70d0824f01
3 changed files with 77 additions and 67 deletions

View File

@ -15,35 +15,47 @@ limitations under the License.
*/
// Package init sets debug.DumpAllNamespaceInfo as implementation in the framework
// and enables log size verification.
// and enables log size verification and resource gathering.
package init
import (
"sync"
"time"
"github.com/onsi/ginkgo/v2"
"k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/e2e/framework/debug"
e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
)
var (
// TODO: this variable used to be a field in framework.Framework. It is
// not clear how it was ever set. https://grep.app/search?q=AddonResourceConstraints
// returns only the default initialization with an empty map. Perhaps it can be removed?
// Constraints that passed to a check which is executed after data is gathered to
// see if 99% of results are within acceptable bounds. It has to be injected in the test,
// as expectations vary greatly. Constraints are grouped by the container names.
AddonResourceConstraints map[string]e2edebug.ResourceConstraint
)
func init() {
framework.NewFrameworkExtensions = append(framework.NewFrameworkExtensions,
func(f *framework.Framework) {
f.DumpAllNamespaceInfo = func(f *framework.Framework, ns string) {
debug.DumpAllNamespaceInfo(f.ClientSet, ns)
e2edebug.DumpAllNamespaceInfo(f.ClientSet, ns)
}
if framework.TestContext.GatherLogsSizes {
var (
wg sync.WaitGroup
closeChannel chan bool
verifier *debug.LogsSizeVerifier
verifier *e2edebug.LogsSizeVerifier
)
ginkgo.BeforeEach(func() {
wg.Add(1)
closeChannel = make(chan bool)
verifier = debug.NewLogsVerifier(f.ClientSet, closeChannel)
verifier = e2edebug.NewLogsVerifier(f.ClientSet, closeChannel)
go func() {
defer wg.Done()
verifier.Run()
@ -57,6 +69,44 @@ func init() {
})
})
}
if framework.TestContext.GatherKubeSystemResourceUsageData != "false" &&
framework.TestContext.GatherKubeSystemResourceUsageData != "none" {
ginkgo.BeforeEach(func() {
var nodeMode e2edebug.NodesSet
switch framework.TestContext.GatherKubeSystemResourceUsageData {
case "master":
nodeMode = e2edebug.MasterNodes
case "masteranddns":
nodeMode = e2edebug.MasterAndDNSNodes
default:
nodeMode = e2edebug.AllNodes
}
gatherer, err := e2edebug.NewResourceUsageGatherer(f.ClientSet, e2edebug.ResourceGathererOptions{
InKubemark: framework.ProviderIs("kubemark"),
Nodes: nodeMode,
ResourceDataGatheringPeriod: 60 * time.Second,
ProbeDuration: 15 * time.Second,
PrintVerboseLogs: false,
}, nil)
if err != nil {
framework.Logf("Error while creating NewResourceUsageGatherer: %v", err)
return
}
go gatherer.StartGatheringData()
ginkgo.DeferCleanup(func() {
ginkgo.By("Collecting resource usage data", func() {
summary, resourceViolationError := gatherer.StopAndSummarize([]int{90, 99, 100}, AddonResourceConstraints)
// Always record the summary, even if there was an error.
f.TestSummaries = append(f.TestSummaries, summary)
// Now fail if there was an error.
framework.ExpectNoError(resourceViolationError)
})
})
})
}
},
)
}

View File

@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
package debug
import (
"bufio"
@ -38,7 +38,7 @@ import (
clientset "k8s.io/client-go/kubernetes"
kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
// TODO: Remove the following imports (ref: https://github.com/kubernetes/kubernetes/issues/81245)
"k8s.io/kubernetes/test/e2e/framework"
e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
)
@ -91,7 +91,7 @@ func (s *ResourceUsageSummary) PrintHumanReadable() string {
// PrintJSON prints resource usage summary in JSON.
func (s *ResourceUsageSummary) PrintJSON() string {
return PrettyPrintJSON(*s)
return framework.PrettyPrintJSON(*s)
}
// SummaryKind returns string of ResourceUsageSummary
@ -198,13 +198,13 @@ func (w *resourceGatherWorker) singleProbe() {
} else {
nodeUsage, err := getOneTimeResourceUsageOnNode(w.c, w.nodeName, w.probeDuration, func() []string { return w.containerIDs })
if err != nil {
Logf("Error while reading data from %v: %v", w.nodeName, err)
framework.Logf("Error while reading data from %v: %v", w.nodeName, err)
return
}
for k, v := range nodeUsage {
data[k] = v
if w.printVerboseLogs {
Logf("Get container %v usage on node %v. CPUUsageInCores: %v, MemoryUsageInBytes: %v, MemoryWorkingSetInBytes: %v", k, w.nodeName, v.CPUUsageInCores, v.MemoryUsageInBytes, v.MemoryWorkingSetInBytes)
framework.Logf("Get container %v usage on node %v. CPUUsageInCores: %v, MemoryUsageInBytes: %v, MemoryWorkingSetInBytes: %v", k, w.nodeName, v.CPUUsageInCores, v.MemoryUsageInBytes, v.MemoryWorkingSetInBytes)
}
}
}
@ -290,13 +290,13 @@ func getOneTimeResourceUsageOnNode(
// getStatsSummary contacts kubelet for the container information.
func getStatsSummary(c clientset.Interface, nodeName string) (*kubeletstatsv1alpha1.Summary, error) {
ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
defer cancel()
data, err := c.CoreV1().RESTClient().Get().
Resource("nodes").
SubResource("proxy").
Name(fmt.Sprintf("%v:%v", nodeName, KubeletPort)).
Name(fmt.Sprintf("%v:%v", nodeName, framework.KubeletPort)).
Suffix("stats/summary").
Do(ctx).Raw()
@ -322,7 +322,7 @@ func removeUint64Ptr(ptr *uint64) uint64 {
func (w *resourceGatherWorker) gather(initialSleep time.Duration) {
defer utilruntime.HandleCrash()
defer w.wg.Done()
defer Logf("Closing worker for %v", w.nodeName)
defer framework.Logf("Closing worker for %v", w.nodeName)
defer func() { w.finished = true }()
select {
case <-time.After(initialSleep):
@ -384,7 +384,7 @@ func nodeHasControlPlanePods(c clientset.Interface, nodeName string) (bool, erro
return false, err
}
if len(podList.Items) < 1 {
Logf("Can't find any pods in namespace %s to grab metrics from", metav1.NamespaceSystem)
framework.Logf("Can't find any pods in namespace %s to grab metrics from", metav1.NamespaceSystem)
}
for _, pod := range podList.Items {
if regKubeScheduler.MatchString(pod.Name) || regKubeControllerManager.MatchString(pod.Name) {
@ -422,7 +422,7 @@ func NewResourceUsageGatherer(c clientset.Interface, options ResourceGathererOpt
if pods == nil {
pods, err = c.CoreV1().Pods("kube-system").List(context.TODO(), metav1.ListOptions{})
if err != nil {
Logf("Error while listing Pods: %v", err)
framework.Logf("Error while listing Pods: %v", err)
return nil, err
}
}
@ -458,7 +458,7 @@ func NewResourceUsageGatherer(c clientset.Interface, options ResourceGathererOpt
}
nodeList, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
Logf("Error while listing Nodes: %v", err)
framework.Logf("Error while listing Nodes: %v", err)
return nil, err
}
@ -510,7 +510,7 @@ func (g *ContainerResourceGatherer) StartGatheringData() {
// specified resource constraints.
func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int, constraints map[string]ResourceConstraint) (*ResourceUsageSummary, error) {
close(g.stopCh)
Logf("Closed stop channel. Waiting for %v workers", len(g.workers))
framework.Logf("Closed stop channel. Waiting for %v workers", len(g.workers))
finished := make(chan struct{}, 1)
go func() {
g.workerWg.Wait()
@ -518,7 +518,7 @@ func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int, constrai
}()
select {
case <-finished:
Logf("Waitgroup finished.")
framework.Logf("Waitgroup finished.")
case <-time.After(2 * time.Minute):
unfinished := make([]string, 0)
for i := range g.workers {
@ -526,11 +526,11 @@ func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int, constrai
unfinished = append(unfinished, g.workers[i].nodeName)
}
}
Logf("Timed out while waiting for waitgroup, some workers failed to finish: %v", unfinished)
framework.Logf("Timed out while waiting for waitgroup, some workers failed to finish: %v", unfinished)
}
if len(percentiles) == 0 {
Logf("Warning! Empty percentile list for stopAndPrintData.")
framework.Logf("Warning! Empty percentile list for stopAndPrintData.")
return &ResourceUsageSummary{}, fmt.Errorf("Failed to get any resource usage data")
}
data := make(map[int]ResourceUsagePerContainer)
@ -604,7 +604,7 @@ type kubemarkResourceUsage struct {
}
func getMasterUsageByPrefix(prefix string) (string, error) {
sshResult, err := e2essh.SSH(fmt.Sprintf("ps ax -o %%cpu,rss,command | tail -n +2 | grep %v | sed 's/\\s+/ /g'", prefix), APIAddress()+":22", TestContext.Provider)
sshResult, err := e2essh.SSH(fmt.Sprintf("ps ax -o %%cpu,rss,command | tail -n +2 | grep %v | sed 's/\\s+/ /g'", prefix), framework.APIAddress()+":22", framework.TestContext.Provider)
if err != nil {
return "", err
}
@ -617,7 +617,7 @@ func getKubemarkMasterComponentsResourceUsage() map[string]*kubemarkResourceUsag
// Get kubernetes component resource usage
sshResult, err := getMasterUsageByPrefix("kube")
if err != nil {
Logf("Error when trying to SSH to master machine. Skipping probe. %v", err)
framework.Logf("Error when trying to SSH to master machine. Skipping probe. %v", err)
return nil
}
scanner := bufio.NewScanner(strings.NewReader(sshResult))
@ -635,7 +635,7 @@ func getKubemarkMasterComponentsResourceUsage() map[string]*kubemarkResourceUsag
// Get etcd resource usage
sshResult, err = getMasterUsageByPrefix("bin/etcd")
if err != nil {
Logf("Error when trying to SSH to master machine. Skipping probe")
framework.Logf("Error when trying to SSH to master machine. Skipping probe")
return nil
}
scanner = bufio.NewScanner(strings.NewReader(sshResult))

View File

@ -99,12 +99,6 @@ type Framework struct {
NamespaceDeletionTimeout time.Duration
NamespacePodSecurityEnforceLevel admissionapi.Level // The pod security enforcement level for namespaces to be applied.
gatherer *ContainerResourceGatherer
// Constraints that passed to a check which is executed after data is gathered to
// see if 99% of results are within acceptable bounds. It has to be injected in the test,
// as expectations vary greatly. Constraints are grouped by the container names.
AddonResourceConstraints map[string]ResourceConstraint
// Flaky operation failures in an e2e test can be captured through this.
flakeReport *FlakeReport
@ -164,7 +158,6 @@ func NewDefaultFramework(baseName string) *Framework {
func NewFramework(baseName string, options Options, client clientset.Interface) *Framework {
f := &Framework{
BaseName: baseName,
AddonResourceConstraints: make(map[string]ResourceConstraint),
Options: options,
ClientSet: client,
Timeouts: NewTimeoutContextWithDefaults(),
@ -256,32 +249,6 @@ func (f *Framework) BeforeEach() {
f.UniqueName = fmt.Sprintf("%s-%08x", f.BaseName, rand.Int31())
}
if TestContext.GatherKubeSystemResourceUsageData != "false" && TestContext.GatherKubeSystemResourceUsageData != "none" {
var err error
var nodeMode NodesSet
switch TestContext.GatherKubeSystemResourceUsageData {
case "master":
nodeMode = MasterNodes
case "masteranddns":
nodeMode = MasterAndDNSNodes
default:
nodeMode = AllNodes
}
f.gatherer, err = NewResourceUsageGatherer(f.ClientSet, ResourceGathererOptions{
InKubemark: ProviderIs("kubemark"),
Nodes: nodeMode,
ResourceDataGatheringPeriod: 60 * time.Second,
ProbeDuration: 15 * time.Second,
PrintVerboseLogs: false,
}, nil)
if err != nil {
Logf("Error while creating NewResourceUsageGatherer: %v", err)
} else {
go f.gatherer.StartGatheringData()
}
}
f.flakeReport = NewFlakeReport()
}
@ -393,13 +360,6 @@ func (f *Framework) AfterEach() {
}
}()
if TestContext.GatherKubeSystemResourceUsageData != "false" && TestContext.GatherKubeSystemResourceUsageData != "none" && f.gatherer != nil {
ginkgo.By("Collecting resource usage data")
summary, resourceViolationError := f.gatherer.StopAndSummarize([]int{90, 99, 100}, f.AddonResourceConstraints)
defer ExpectNoError(resourceViolationError)
f.TestSummaries = append(f.TestSummaries, summary)
}
TestContext.CloudConfig.Provider.FrameworkAfterEach(f)
// Report any flakes that were observed in the e2e test and reset.