Refactor and clean up the code

This commit is contained in:
Chih-Chieh Yang 2022-08-02 14:29:03 +00:00
parent ed74de833d
commit 07e2bfe1cc

View File

@ -81,6 +81,72 @@ func setupWithAuthorizer(t testing.TB, maxReadonlyRequestsInFlight, maxMutatingR
return kubeConfig, tearDownFn
}
type SumAndCount struct {
Sum float64
Count int
}
type plMetrics struct {
execSeconds SumAndCount
seatUtil SumAndCount
availableSeats int
}
// metricSnapshot maps from a priority level label to
// a plMetrics struct containing APF metrics of interest
type metricSnapshot map[string]plMetrics
// Client request latency measurement
type clientLatencyMeasurement struct {
SumAndCount
SumSq float64 // latency sum of squares
Mu sync.Mutex
}
func (clm *clientLatencyMeasurement) reset() {
clm.Mu.Lock()
clm.Sum = 0
clm.Count = 0
clm.SumSq = 0
clm.Mu.Unlock()
}
func (clm *clientLatencyMeasurement) update(duration float64) {
clm.Mu.Lock()
clm.Count += 1
clm.Sum += duration
clm.SumSq += duration * duration
clm.Mu.Unlock()
}
func (clm *clientLatencyMeasurement) getStats() clientLatencyStats {
mean := clm.Sum / float64(clm.Count)
ss := clm.SumSq - mean*clm.Sum // reduced from ss := sumsq - 2*mean*sum + float64(count)*mean*mean
stdDev := math.Sqrt(ss / float64(clm.Count))
cv := stdDev / mean
return clientLatencyStats{mean: mean, stdDev: stdDev, cv: cv}
}
type clientLatencyStats struct {
mean float64 // latency average
stdDev float64 // latency standard deviation
cv float64 // latency coefficient of variation
}
type plMetricAvg struct {
reqExecution float64 // average request execution time
seatUtil float64 // average seat utilization
}
func intervalMetricAvg(snapshots map[string]metricSnapshot, t0 string, t1 string, plLabel string) plMetricAvg {
plmT0 := snapshots[t0][plLabel]
plmT1 := snapshots[t1][plLabel]
return plMetricAvg{
reqExecution: (plmT1.execSeconds.Sum - plmT0.execSeconds.Sum) / float64(plmT1.execSeconds.Count-plmT0.execSeconds.Count),
seatUtil: (plmT1.seatUtil.Sum - plmT0.seatUtil.Sum) / float64(plmT1.seatUtil.Count-plmT0.seatUtil.Count),
}
}
// This integration test checks the client-side expected concurrency and the server-side observed concurrency
// to make sure that they are close within a small error bound and that the priority levels are isolated.
// This test differs from TestPriorityLevelIsolation since TestPriorityLevelIsolation checks throughput instead
@ -117,65 +183,42 @@ func TestConcurrencyIsolation(t *testing.T) {
queueLength := 50
concurrencyShares := 100
priorityLevelNoxu1, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
plNoxu1, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
loopbackClient, "noxu1", concurrencyShares, queueLength)
if err != nil {
t.Error(err)
}
priorityLevelNoxu2, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
plNoxu2, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
loopbackClient, "noxu2", concurrencyShares, queueLength)
if err != nil {
t.Error(err)
}
availableSeats, err := getAvailableSeatsOfPriorityLevel(loopbackClient)
if err != nil {
t.Error(err)
}
t.Logf("noxu1 priority level concurrency limit: %v", availableSeats[priorityLevelNoxu1.Name])
t.Logf("noxu2 priority level concurrency limit: %v", availableSeats[priorityLevelNoxu2.Name])
if (availableSeats[priorityLevelNoxu1.Name] <= 4) || (availableSeats[priorityLevelNoxu2.Name] <= 4) {
t.Errorf("The number of available seats for test client priority levels are too small: (%v, %v). Expecting a number > 4",
availableSeats[priorityLevelNoxu1.Name], availableSeats[priorityLevelNoxu2.Name])
}
stopCh := make(chan struct{})
wg := sync.WaitGroup{}
// "elephant"
noxu1NumGoroutines := 5 + queueLength
var noxu1ClientRequestLatencySum float64
var noxu1ClientRequestLatencySumSq float64
var noxu1ClientRequestLatencyCount int32
var noxu1Mutex sync.Mutex
streamRequestsWithIndex(noxu1NumGoroutines, func(idx int) {
var noxu1LatMeasure clientLatencyMeasurement
wg.Add(noxu1NumGoroutines)
streamRequests(noxu1NumGoroutines, func() {
start := time.Now()
_, err := noxu1Client.CoreV1().Namespaces().Get(context.Background(), "default", metav1.GetOptions{})
duration := time.Since(start).Seconds()
noxu1Mutex.Lock()
noxu1ClientRequestLatencyCount += 1
noxu1ClientRequestLatencySum += duration
noxu1ClientRequestLatencySumSq += duration * duration
noxu1Mutex.Unlock()
noxu1LatMeasure.update(duration)
if err != nil {
t.Error(err)
}
}, &wg, stopCh)
// "mouse"
noxu2NumGoroutines := 3
var noxu2ClientRequestLatencySum float64
var noxu2ClientRequestLatencySumSq float64
var noxu2ClientRequestLatencyCount int32
var noxu2Mutex sync.Mutex
streamRequestsWithIndex(noxu2NumGoroutines, func(idx int) {
var noxu2LatMeasure clientLatencyMeasurement
wg.Add(noxu2NumGoroutines)
streamRequests(noxu2NumGoroutines, func() {
start := time.Now()
_, err := noxu2Client.CoreV1().Namespaces().Get(context.Background(), "default", metav1.GetOptions{})
duration := time.Since(start).Seconds()
noxu2Mutex.Lock()
noxu2ClientRequestLatencyCount += 1
noxu2ClientRequestLatencySum += duration
noxu2ClientRequestLatencySumSq += duration * duration
noxu2Mutex.Unlock()
noxu2LatMeasure.update(duration)
if err != nil {
t.Error(err)
}
@ -184,97 +227,101 @@ func TestConcurrencyIsolation(t *testing.T) {
// Warm up
time.Sleep(testWarmUpTime)
// Reset counters
noxu1Mutex.Lock()
noxu1ClientRequestLatencyCount = 0
noxu1ClientRequestLatencySum = 0
noxu1ClientRequestLatencySumSq = 0
noxu1Mutex.Unlock()
noxu2Mutex.Lock()
noxu2ClientRequestLatencyCount = 0
noxu2ClientRequestLatencySum = 0
noxu2ClientRequestLatencySumSq = 0
noxu2Mutex.Unlock()
earlierRequestExecutionSecondsSum, earlierRequestExecutionSecondsCount, earlierPLSeatUtilSum, earlierPLSeatUtilCount, err := getRequestExecutionMetrics(loopbackClient)
noxu1LatMeasure.reset()
noxu2LatMeasure.reset()
// Snapshots maps from a time label to a metricSnapshot
snapshots := make(map[string]metricSnapshot)
snapshots["t0"], err = getRequestMetricsSnapshot(loopbackClient)
if err != nil {
t.Error(err)
}
time.Sleep(testTime) // after warming up, the test enters a steady state
laterRequestExecutionSecondsSum, laterRequestExecutionSecondsCount, laterPLSeatUtilSum, laterPLSeatUtilCount, err := getRequestExecutionMetrics(loopbackClient)
snapshots["t1"], err = getRequestMetricsSnapshot(loopbackClient)
if err != nil {
t.Error(err)
}
if (earlierPLSeatUtilCount[priorityLevelNoxu1.Name] >= laterPLSeatUtilCount[priorityLevelNoxu1.Name]) || (earlierPLSeatUtilCount[priorityLevelNoxu2.Name] >= laterPLSeatUtilCount[priorityLevelNoxu2.Name]) {
t.Errorf("PLSeatUtilCount check failed: noxu1 earlier count %v, later count %v; noxu2 earlier count %v, later count %v",
earlierPLSeatUtilCount[priorityLevelNoxu1.Name], laterPLSeatUtilCount[priorityLevelNoxu1.Name], earlierPLSeatUtilCount[priorityLevelNoxu2.Name], laterPLSeatUtilCount[priorityLevelNoxu2.Name])
}
close(stopCh)
noxu1RequestExecutionSecondsAvg := (laterRequestExecutionSecondsSum[priorityLevelNoxu1.Name] - earlierRequestExecutionSecondsSum[priorityLevelNoxu1.Name]) / float64(laterRequestExecutionSecondsCount[priorityLevelNoxu1.Name]-earlierRequestExecutionSecondsCount[priorityLevelNoxu1.Name])
noxu2RequestExecutionSecondsAvg := (laterRequestExecutionSecondsSum[priorityLevelNoxu2.Name] - earlierRequestExecutionSecondsSum[priorityLevelNoxu2.Name]) / float64(laterRequestExecutionSecondsCount[priorityLevelNoxu2.Name]-earlierRequestExecutionSecondsCount[priorityLevelNoxu2.Name])
noxu1PLSeatUtilAvg := (laterPLSeatUtilSum[priorityLevelNoxu1.Name] - earlierPLSeatUtilSum[priorityLevelNoxu1.Name]) / float64(laterPLSeatUtilCount[priorityLevelNoxu1.Name]-earlierPLSeatUtilCount[priorityLevelNoxu1.Name])
noxu2PLSeatUtilAvg := (laterPLSeatUtilSum[priorityLevelNoxu2.Name] - earlierPLSeatUtilSum[priorityLevelNoxu2.Name]) / float64(laterPLSeatUtilCount[priorityLevelNoxu2.Name]-earlierPLSeatUtilCount[priorityLevelNoxu2.Name])
t.Logf("\nnoxu1RequestExecutionSecondsAvg %v\nnoxu2RequestExecutionSecondsAvg %v", noxu1RequestExecutionSecondsAvg, noxu2RequestExecutionSecondsAvg)
t.Logf("\nnoxu1PLSeatUtilAvg %v\nnoxu2PLSeatUtilAvg %v", noxu1PLSeatUtilAvg, noxu2PLSeatUtilAvg)
wg.Wait() // wait till the client goroutines finish before computing the statistics
noxu1ClientRequestLatencySecondsAvg, noxu1ClientRequestLatencySecondsSdev := computeClientRequestLatencyStats(noxu1ClientRequestLatencyCount, noxu1ClientRequestLatencySum, noxu1ClientRequestLatencySumSq)
noxu2ClientRequestLatencySecondsAvg, noxu2ClientRequestLatencySecondsSdev := computeClientRequestLatencyStats(noxu2ClientRequestLatencyCount, noxu2ClientRequestLatencySum, noxu2ClientRequestLatencySumSq)
t.Logf("\nnoxu1ClientRequestLatencyCount %v\nnoxu2ClientRequestLatencyCount %v", noxu1ClientRequestLatencyCount, noxu2ClientRequestLatencyCount)
t.Logf("\nnoxu1ClientRequestLatencySecondsAvg %v\nnoxu2ClientRequestLatencySecondsAvg %v", noxu1ClientRequestLatencySecondsAvg, noxu2ClientRequestLatencySecondsAvg)
t.Logf("\nnoxu1ClientRequestLatencySecondsSdev %v\nnoxu2ClientRequestLatencySecondsSdev %v", noxu1ClientRequestLatencySecondsSdev, noxu2ClientRequestLatencySecondsSdev)
allDispatchedReqCounts, rejectedReqCounts, err := getRequestCountOfPriorityLevel(loopbackClient)
// Check the assumptions of the test
noxu1T0 := snapshots["t0"][plNoxu1.Name]
noxu1T1 := snapshots["t1"][plNoxu1.Name]
noxu2T0 := snapshots["t0"][plNoxu2.Name]
noxu2T1 := snapshots["t1"][plNoxu2.Name]
if noxu1T0.seatUtil.Count >= noxu1T1.seatUtil.Count || noxu2T0.seatUtil.Count >= noxu2T1.seatUtil.Count {
t.Errorf("SeatUtilCount check failed: noxu1 t0 count %d, t1 count %d; noxu2 t0 count %d, t1 count %d",
noxu1T0.seatUtil.Count, noxu1T1.seatUtil.Count, noxu2T0.seatUtil.Count, noxu2T1.seatUtil.Count)
}
t.Logf("noxu1 priority level concurrency limit: %d", noxu1T0.availableSeats)
t.Logf("noxu2 priority level concurrency limit: %d", noxu2T0.availableSeats)
if (noxu1T0.availableSeats != noxu1T1.availableSeats) || (noxu2T0.availableSeats != noxu2T1.availableSeats) {
t.Errorf("The number of available seats changed: noxu1 (%d, %d) noxu2 (%d, %d)",
noxu1T0.availableSeats, noxu1T1.availableSeats, noxu2T0.availableSeats, noxu2T1.availableSeats)
}
if (noxu1T0.availableSeats <= 4) || (noxu2T0.availableSeats <= 4) {
t.Errorf("The number of available seats for test client priority levels are too small: (%d, %d). Expecting a number > 4",
noxu1T0.availableSeats, noxu2T0.availableSeats)
}
// No reuqests should be rejected under normal situations
_, rejectedReqCounts, err := getRequestCountOfPriorityLevel(loopbackClient)
if err != nil {
t.Error(err)
}
t.Logf("\nnoxu1APFRequestCount %v\nnoxu2APFRequestCount %v", allDispatchedReqCounts[priorityLevelNoxu1.Name], allDispatchedReqCounts[priorityLevelNoxu2.Name])
if rejectedReqCounts[priorityLevelNoxu1.Name] > 0 {
t.Errorf(`%v requests from the "elephant" stream were rejected unexpectedly`, rejectedReqCounts[priorityLevelNoxu2.Name])
if rejectedReqCounts[plNoxu1.Name] > 0 {
t.Errorf(`%d requests from the "elephant" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu1.Name])
}
if rejectedReqCounts[priorityLevelNoxu2.Name] > 0 {
t.Errorf(`%v requests from the "mouse" stream were rejected unexpectedly`, rejectedReqCounts[priorityLevelNoxu2.Name])
if rejectedReqCounts[plNoxu2.Name] > 0 {
t.Errorf(`%d requests from the "mouse" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu2.Name])
}
// Calculate APF server side metric averages during the test interval
noxu1Avg := intervalMetricAvg(snapshots, "t0", "t1", plNoxu1.Name)
noxu2Avg := intervalMetricAvg(snapshots, "t0", "t1", plNoxu2.Name)
t.Logf("\nnoxu1 avg request execution time %v\nnoxu2 avg request execution time %v", noxu1Avg.reqExecution, noxu2Avg.reqExecution)
t.Logf("\nnoxu1 avg seat utilization %v\nnoxu2 avg seat utilization %v", noxu1Avg.seatUtil, noxu2Avg.seatUtil)
// Wait till the client goroutines finish before computing the client side request latency statistics
wg.Wait()
noxu1LatStats := noxu1LatMeasure.getStats()
noxu2LatStats := noxu2LatMeasure.getStats()
t.Logf("noxu1 client request count %d duration mean %v stddev %v cv %v", noxu1LatMeasure.Count, noxu1LatStats.mean, noxu1LatStats.stdDev, noxu1LatStats.cv)
t.Logf("noxu2 client request count %d duration mean %v stddev %v cv %v", noxu2LatMeasure.Count, noxu2LatStats.mean, noxu2LatStats.stdDev, noxu2LatStats.cv)
// Calculate server-side observed concurrency
noxu1ObservedConcurrency := noxu1PLSeatUtilAvg * float64(availableSeats[priorityLevelNoxu1.Name])
noxu2ObservedConcurrency := noxu2PLSeatUtilAvg * float64(availableSeats[priorityLevelNoxu2.Name])
noxu1ObservedConcurrency := noxu1Avg.seatUtil * float64(noxu1T0.availableSeats)
noxu2ObservedConcurrency := noxu2Avg.seatUtil * float64(noxu2T0.availableSeats)
// Expected concurrency is derived from equal throughput assumption on both the client-side and the server-side
// Expected concurrency computed can sometimes be larger than the number of available seats. We use the number of available seats as an upper bound
noxu1ExpectedConcurrency := math.Min(float64(noxu1NumGoroutines)*noxu1RequestExecutionSecondsAvg/noxu1ClientRequestLatencySecondsAvg, float64(availableSeats[priorityLevelNoxu1.Name]))
noxu2ExpectedConcurrency := math.Min(float64(noxu2NumGoroutines)*noxu2RequestExecutionSecondsAvg/noxu2ClientRequestLatencySecondsAvg, float64(availableSeats[priorityLevelNoxu2.Name]))
noxu1ExpectedConcurrency := float64(noxu1NumGoroutines) * noxu1Avg.reqExecution / noxu1LatStats.mean
noxu2ExpectedConcurrency := float64(noxu2NumGoroutines) * noxu2Avg.reqExecution / noxu2LatStats.mean
t.Logf("Concurrency of noxu1:noxu2 - expected (%v:%v), observed (%v:%v)", noxu1ExpectedConcurrency, noxu2ExpectedConcurrency, noxu1ObservedConcurrency, noxu2ObservedConcurrency)
// Calculate the tolerable error margin and perform the final check
margin := 2 * math.Min(noxu1ClientRequestLatencySecondsSdev/noxu1ClientRequestLatencySecondsAvg, noxu2ClientRequestLatencySecondsSdev/noxu2ClientRequestLatencySecondsAvg)
t.Logf("\nnoxu1Margin %v\nnoxu2Margin %v", noxu1ClientRequestLatencySecondsSdev/noxu1ClientRequestLatencySecondsAvg, noxu2ClientRequestLatencySecondsSdev/noxu2ClientRequestLatencySecondsAvg)
margin := 2 * math.Min(noxu1LatStats.cv, noxu2LatStats.cv)
t.Logf("Error margin is %v", margin)
isConcurrencyExpected := func(name string, observed float64, expected float64) bool {
t.Logf("%v relative error is %v", name, math.Abs(expected-observed)/expected)
return math.Abs(expected-observed)/expected <= margin
}
if !isConcurrencyExpected(priorityLevelNoxu1.Name, noxu1ObservedConcurrency, noxu1ExpectedConcurrency) {
if !isConcurrencyExpected(plNoxu1.Name, noxu1ObservedConcurrency, noxu1ExpectedConcurrency) {
t.Errorf("Concurrency observed by noxu1 is off. Expected: %v, observed: %v", noxu1ExpectedConcurrency, noxu1ObservedConcurrency)
}
if !isConcurrencyExpected(priorityLevelNoxu2.Name, noxu2ObservedConcurrency, noxu2ExpectedConcurrency) {
if !isConcurrencyExpected(plNoxu2.Name, noxu2ObservedConcurrency, noxu2ExpectedConcurrency) {
t.Errorf("Concurrency observed by noxu2 is off. Expected: %v, observed: %v", noxu2ExpectedConcurrency, noxu2ObservedConcurrency)
}
// Check server-side APF measurements
if math.Abs(1-noxu1PLSeatUtilAvg) > 0.05 {
t.Errorf("noxu1PLSeatUtilAvg=%v is too far from expected=1.0", noxu1PLSeatUtilAvg)
// For the elephant noxu1, the avg seat utilization should be close to 1.0
if math.Abs(1-noxu1Avg.seatUtil) > 0.05 {
t.Errorf("noxu1PLSeatUtilAvg=%v is too far from expected=1.0", noxu1Avg.seatUtil)
}
// For the mouse noxu2, the observed concurrency should be close to the number of goroutines it uses
if math.Abs(1-noxu2ObservedConcurrency/float64(noxu2NumGoroutines)) > 0.05 {
t.Errorf("noxu2ObservedConcurrency=%v is too far from noxu2NumGoroutines=%v", noxu2ObservedConcurrency, noxu2NumGoroutines)
t.Errorf("noxu2ObservedConcurrency=%v is too far from noxu2NumGoroutines=%d", noxu2ObservedConcurrency, noxu2NumGoroutines)
}
}
func computeClientRequestLatencyStats(count int32, sum, sumsq float64) (float64, float64) {
mean := sum / float64(count)
ss := sumsq - mean*sum // reduced from ss := sumsq - 2*mean*sum + float64(count)*mean*mean
return mean, math.Sqrt(ss / float64(count))
}
func getRequestMetricsSnapshot(c clientset.Interface) (metricSnapshot, error) {
func getAvailableSeatsOfPriorityLevel(c clientset.Interface) (map[string]int, error) {
resp, err := getMetrics(c)
if err != nil {
return nil, err
@ -286,85 +333,40 @@ func getAvailableSeatsOfPriorityLevel(c clientset.Interface) (map[string]int, er
Opts: &expfmt.DecodeOptions{},
}
concurrency := make(map[string]int)
snapshot := metricSnapshot{}
for {
var v model.Vector
if err := decoder.Decode(&v); err != nil {
if err == io.EOF {
// Expected loop termination condition.
return concurrency, nil
return snapshot, nil
}
return nil, fmt.Errorf("failed decoding metrics: %v", err)
}
for _, metric := range v {
switch name := string(metric.Metric[model.MetricNameLabel]); name {
case requestConcurrencyLimitMetricsName:
concurrency[string(metric.Metric[labelPriorityLevel])] = int(metric.Value)
plLabel := string(metric.Metric[labelPriorityLevel])
entry := plMetrics{}
if v, ok := snapshot[plLabel]; ok {
entry = v
}
}
}
}
func getRequestExecutionMetrics(c clientset.Interface) (map[string]float64, map[string]int, map[string]float64, map[string]int, error) {
resp, err := getMetrics(c)
if err != nil {
return nil, nil, nil, nil, err
}
dec := expfmt.NewDecoder(strings.NewReader(string(resp)), expfmt.FmtText)
decoder := expfmt.SampleDecoder{
Dec: dec,
Opts: &expfmt.DecodeOptions{},
}
RequestExecutionSecondsSum := make(map[string]float64)
RequestExecutionSecondsCount := make(map[string]int)
PLSeatUtilSum := make(map[string]float64)
PLSeatUtilCount := make(map[string]int)
for {
var v model.Vector
if err := decoder.Decode(&v); err != nil {
if err == io.EOF {
// Expected loop termination condition.
return RequestExecutionSecondsSum, RequestExecutionSecondsCount,
PLSeatUtilSum, PLSeatUtilCount, nil
}
return nil, nil, nil, nil, fmt.Errorf("failed decoding metrics: %v", err)
}
for _, metric := range v {
switch name := string(metric.Metric[model.MetricNameLabel]); name {
case requestExecutionSecondsSumName:
RequestExecutionSecondsSum[string(metric.Metric[labelPriorityLevel])] = float64(metric.Value)
entry.execSeconds.Sum = float64(metric.Value)
case requestExecutionSecondsCountName:
RequestExecutionSecondsCount[string(metric.Metric[labelPriorityLevel])] = int(metric.Value)
entry.execSeconds.Count = int(metric.Value)
case priorityLevelSeatUtilSumName:
PLSeatUtilSum[string(metric.Metric[labelPriorityLevel])] = float64(metric.Value)
entry.seatUtil.Sum = float64(metric.Value)
case priorityLevelSeatUtilCountName:
PLSeatUtilCount[string(metric.Metric[labelPriorityLevel])] = int(metric.Value)
entry.seatUtil.Count = int(metric.Value)
case requestConcurrencyLimitMetricsName:
entry.availableSeats = int(metric.Value)
}
snapshot[plLabel] = entry
}
}
}
func streamRequestsWithIndex(parallel int, request func(idx int), wg *sync.WaitGroup, stopCh <-chan struct{}) {
wg.Add(parallel)
for i := 0; i < parallel; i++ {
go func(idx int) {
defer wg.Done()
for {
select {
case <-stopCh:
return
default:
request(idx)
}
}
}(i)
}
}
// Webhook authorizer code copied from staging/src/k8s.io/apiserver/plugin/pkg/authenticator/token/webhook/webhook_v1_test.go with minor changes
// V1Service mocks a remote service.
type V1Service interface {
@ -454,14 +456,12 @@ func NewV1TestServer(s V1Service, cert, key, caCert []byte) (*httptest.Server, e
type mockV1Service struct {
allow bool
statusCode int
called int
}
func (m *mockV1Service) Review(r *authorizationv1.SubjectAccessReview) {
if r.Spec.User == "noxu1" || r.Spec.User == "noxu2" {
time.Sleep(fakeworkDuration) // simulate fake work with sleep
}
m.called++
r.Status.Allowed = m.allow
}
func (m *mockV1Service) HTTPStatusCode() int { return m.statusCode }