Add regression test for CPUManager distribute NUMA algorithm

We observed this exact allocation attempt in a live cluster, where the
algorithm failed with an accounting error. This test verifies that the case
is now handled by the updates to the algorithm, and it guards against
regressing on it in the future.

"test" description="ensure previous failure encountered on live machine has been fixed (1/1)"
"combo remainderSet balance" combo=[2 4 6] remainderSet=[2 4 6] distribution=9 remainder=1 available=[14 2 4 4 0 3 4 1] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[2 4] distribution=9 remainder=1 available=[0 3 4 1 14 2 4 4] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[2 6] distribution=9 remainder=1 available=[1 14 2 4 4 0 3 4] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[4 6] distribution=9 remainder=1 available=[1 3 4 0 14 2 4 4] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[2] distribution=9 remainder=1 available=[4 0 3 4 1 14 2 4] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[4] distribution=9 remainder=1 available=[3 4 0 14 2 4 4 1] balance=4.031
"combo remainderSet balance" combo=[2 4 6] remainderSet=[6] distribution=9 remainder=1 available=[1 13 2 4 4 1 3 4] balance=3.606
"bestCombo found" distribution=9 bestCombo=[2 4 6] bestRemainder=[6]

Signed-off-by: Kevin Klues <kklues@nvidia.com>
This commit is contained in:
Kevin Klues 2021-11-24 20:45:53 +00:00
parent e284c74d93
commit f8511877e2
2 changed files with 291 additions and 0 deletions

View File

@ -845,6 +845,15 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
"",
mustParseCPUSet(t, "0-3,10-13,20-23,30-34,40-43,50-53,60-63,70-74"),
},
{
"ensure previous failure encountered on live machine has been fixed (1/1)",
topoDualSocketMultiNumaPerSocketHTLarge,
mustParseCPUSet(t, "0,128,30,31,158,159,43-47,171-175,62,63,190,191,75-79,203-207,94,96,222,223,101-111,229-239,126,127,254,255"),
28,
1,
"",
mustParseCPUSet(t, "43-47,75-79,96,101-105,171-174,203-206,229-232"),
},
}...)
for _, tc := range testCases {

View File

@ -610,4 +610,286 @@ var (
79: {CoreID: 39, SocketID: 3, NUMANodeID: 1},
},
}
/*
Topology from dual AMD EPYC 7742 64-Core Processor; lscpu excerpt
CPU(s): 256
On-line CPU(s) list: 0-255
Thread(s) per core: 2
Core(s) per socket: 64
Socket(s): 2
NUMA node(s): 8 (NPS=4)
NUMA node0 CPU(s): 0-15,128-143
NUMA node1 CPU(s): 16-31,144-159
NUMA node2 CPU(s): 32-47,160-175
NUMA node3 CPU(s): 48-63,176-191
NUMA node4 CPU(s): 64-79,192-207
NUMA node5 CPU(s): 80-95,208-223
NUMA node6 CPU(s): 96-111,224-239
NUMA node7 CPU(s): 112-127,240-255
*/
topoDualSocketMultiNumaPerSocketHTLarge = &topology.CPUTopology{
NumCPUs: 256,
NumSockets: 2,
NumCores: 128,
NumNUMANodes: 8,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
1: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
2: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
3: {CoreID: 3, SocketID: 0, NUMANodeID: 0},
4: {CoreID: 4, SocketID: 0, NUMANodeID: 0},
5: {CoreID: 5, SocketID: 0, NUMANodeID: 0},
6: {CoreID: 6, SocketID: 0, NUMANodeID: 0},
7: {CoreID: 7, SocketID: 0, NUMANodeID: 0},
8: {CoreID: 8, SocketID: 0, NUMANodeID: 0},
9: {CoreID: 9, SocketID: 0, NUMANodeID: 0},
10: {CoreID: 10, SocketID: 0, NUMANodeID: 0},
11: {CoreID: 11, SocketID: 0, NUMANodeID: 0},
12: {CoreID: 12, SocketID: 0, NUMANodeID: 0},
13: {CoreID: 13, SocketID: 0, NUMANodeID: 0},
14: {CoreID: 14, SocketID: 0, NUMANodeID: 0},
15: {CoreID: 15, SocketID: 0, NUMANodeID: 0},
16: {CoreID: 16, SocketID: 0, NUMANodeID: 1},
17: {CoreID: 17, SocketID: 0, NUMANodeID: 1},
18: {CoreID: 18, SocketID: 0, NUMANodeID: 1},
19: {CoreID: 19, SocketID: 0, NUMANodeID: 1},
20: {CoreID: 20, SocketID: 0, NUMANodeID: 1},
21: {CoreID: 21, SocketID: 0, NUMANodeID: 1},
22: {CoreID: 22, SocketID: 0, NUMANodeID: 1},
23: {CoreID: 23, SocketID: 0, NUMANodeID: 1},
24: {CoreID: 24, SocketID: 0, NUMANodeID: 1},
25: {CoreID: 25, SocketID: 0, NUMANodeID: 1},
26: {CoreID: 26, SocketID: 0, NUMANodeID: 1},
27: {CoreID: 27, SocketID: 0, NUMANodeID: 1},
28: {CoreID: 28, SocketID: 0, NUMANodeID: 1},
29: {CoreID: 29, SocketID: 0, NUMANodeID: 1},
30: {CoreID: 30, SocketID: 0, NUMANodeID: 1},
31: {CoreID: 31, SocketID: 0, NUMANodeID: 1},
32: {CoreID: 32, SocketID: 0, NUMANodeID: 2},
33: {CoreID: 33, SocketID: 0, NUMANodeID: 2},
34: {CoreID: 34, SocketID: 0, NUMANodeID: 2},
35: {CoreID: 35, SocketID: 0, NUMANodeID: 2},
36: {CoreID: 36, SocketID: 0, NUMANodeID: 2},
37: {CoreID: 37, SocketID: 0, NUMANodeID: 2},
38: {CoreID: 38, SocketID: 0, NUMANodeID: 2},
39: {CoreID: 39, SocketID: 0, NUMANodeID: 2},
40: {CoreID: 40, SocketID: 0, NUMANodeID: 2},
41: {CoreID: 41, SocketID: 0, NUMANodeID: 2},
42: {CoreID: 42, SocketID: 0, NUMANodeID: 2},
43: {CoreID: 43, SocketID: 0, NUMANodeID: 2},
44: {CoreID: 44, SocketID: 0, NUMANodeID: 2},
45: {CoreID: 45, SocketID: 0, NUMANodeID: 2},
46: {CoreID: 46, SocketID: 0, NUMANodeID: 2},
47: {CoreID: 47, SocketID: 0, NUMANodeID: 2},
48: {CoreID: 48, SocketID: 0, NUMANodeID: 3},
49: {CoreID: 49, SocketID: 0, NUMANodeID: 3},
50: {CoreID: 50, SocketID: 0, NUMANodeID: 3},
51: {CoreID: 51, SocketID: 0, NUMANodeID: 3},
52: {CoreID: 52, SocketID: 0, NUMANodeID: 3},
53: {CoreID: 53, SocketID: 0, NUMANodeID: 3},
54: {CoreID: 54, SocketID: 0, NUMANodeID: 3},
55: {CoreID: 55, SocketID: 0, NUMANodeID: 3},
56: {CoreID: 56, SocketID: 0, NUMANodeID: 3},
57: {CoreID: 57, SocketID: 0, NUMANodeID: 3},
58: {CoreID: 58, SocketID: 0, NUMANodeID: 3},
59: {CoreID: 59, SocketID: 0, NUMANodeID: 3},
60: {CoreID: 60, SocketID: 0, NUMANodeID: 3},
61: {CoreID: 61, SocketID: 0, NUMANodeID: 3},
62: {CoreID: 62, SocketID: 0, NUMANodeID: 3},
63: {CoreID: 63, SocketID: 0, NUMANodeID: 3},
64: {CoreID: 64, SocketID: 1, NUMANodeID: 4},
65: {CoreID: 65, SocketID: 1, NUMANodeID: 4},
66: {CoreID: 66, SocketID: 1, NUMANodeID: 4},
67: {CoreID: 67, SocketID: 1, NUMANodeID: 4},
68: {CoreID: 68, SocketID: 1, NUMANodeID: 4},
69: {CoreID: 69, SocketID: 1, NUMANodeID: 4},
70: {CoreID: 70, SocketID: 1, NUMANodeID: 4},
71: {CoreID: 71, SocketID: 1, NUMANodeID: 4},
72: {CoreID: 72, SocketID: 1, NUMANodeID: 4},
73: {CoreID: 73, SocketID: 1, NUMANodeID: 4},
74: {CoreID: 74, SocketID: 1, NUMANodeID: 4},
75: {CoreID: 75, SocketID: 1, NUMANodeID: 4},
76: {CoreID: 76, SocketID: 1, NUMANodeID: 4},
77: {CoreID: 77, SocketID: 1, NUMANodeID: 4},
78: {CoreID: 78, SocketID: 1, NUMANodeID: 4},
79: {CoreID: 79, SocketID: 1, NUMANodeID: 4},
80: {CoreID: 80, SocketID: 1, NUMANodeID: 5},
81: {CoreID: 81, SocketID: 1, NUMANodeID: 5},
82: {CoreID: 82, SocketID: 1, NUMANodeID: 5},
83: {CoreID: 83, SocketID: 1, NUMANodeID: 5},
84: {CoreID: 84, SocketID: 1, NUMANodeID: 5},
85: {CoreID: 85, SocketID: 1, NUMANodeID: 5},
86: {CoreID: 86, SocketID: 1, NUMANodeID: 5},
87: {CoreID: 87, SocketID: 1, NUMANodeID: 5},
88: {CoreID: 88, SocketID: 1, NUMANodeID: 5},
89: {CoreID: 89, SocketID: 1, NUMANodeID: 5},
90: {CoreID: 90, SocketID: 1, NUMANodeID: 5},
91: {CoreID: 91, SocketID: 1, NUMANodeID: 5},
92: {CoreID: 92, SocketID: 1, NUMANodeID: 5},
93: {CoreID: 93, SocketID: 1, NUMANodeID: 5},
94: {CoreID: 94, SocketID: 1, NUMANodeID: 5},
95: {CoreID: 95, SocketID: 1, NUMANodeID: 5},
96: {CoreID: 96, SocketID: 1, NUMANodeID: 6},
97: {CoreID: 97, SocketID: 1, NUMANodeID: 6},
98: {CoreID: 98, SocketID: 1, NUMANodeID: 6},
99: {CoreID: 99, SocketID: 1, NUMANodeID: 6},
100: {CoreID: 100, SocketID: 1, NUMANodeID: 6},
101: {CoreID: 101, SocketID: 1, NUMANodeID: 6},
102: {CoreID: 102, SocketID: 1, NUMANodeID: 6},
103: {CoreID: 103, SocketID: 1, NUMANodeID: 6},
104: {CoreID: 104, SocketID: 1, NUMANodeID: 6},
105: {CoreID: 105, SocketID: 1, NUMANodeID: 6},
106: {CoreID: 106, SocketID: 1, NUMANodeID: 6},
107: {CoreID: 107, SocketID: 1, NUMANodeID: 6},
108: {CoreID: 108, SocketID: 1, NUMANodeID: 6},
109: {CoreID: 109, SocketID: 1, NUMANodeID: 6},
110: {CoreID: 110, SocketID: 1, NUMANodeID: 6},
111: {CoreID: 111, SocketID: 1, NUMANodeID: 6},
112: {CoreID: 112, SocketID: 1, NUMANodeID: 7},
113: {CoreID: 113, SocketID: 1, NUMANodeID: 7},
114: {CoreID: 114, SocketID: 1, NUMANodeID: 7},
115: {CoreID: 115, SocketID: 1, NUMANodeID: 7},
116: {CoreID: 116, SocketID: 1, NUMANodeID: 7},
117: {CoreID: 117, SocketID: 1, NUMANodeID: 7},
118: {CoreID: 118, SocketID: 1, NUMANodeID: 7},
119: {CoreID: 119, SocketID: 1, NUMANodeID: 7},
120: {CoreID: 120, SocketID: 1, NUMANodeID: 7},
121: {CoreID: 121, SocketID: 1, NUMANodeID: 7},
122: {CoreID: 122, SocketID: 1, NUMANodeID: 7},
123: {CoreID: 123, SocketID: 1, NUMANodeID: 7},
124: {CoreID: 124, SocketID: 1, NUMANodeID: 7},
125: {CoreID: 125, SocketID: 1, NUMANodeID: 7},
126: {CoreID: 126, SocketID: 1, NUMANodeID: 7},
127: {CoreID: 127, SocketID: 1, NUMANodeID: 7},
128: {CoreID: 0, SocketID: 0, NUMANodeID: 0},
129: {CoreID: 1, SocketID: 0, NUMANodeID: 0},
130: {CoreID: 2, SocketID: 0, NUMANodeID: 0},
131: {CoreID: 3, SocketID: 0, NUMANodeID: 0},
132: {CoreID: 4, SocketID: 0, NUMANodeID: 0},
133: {CoreID: 5, SocketID: 0, NUMANodeID: 0},
134: {CoreID: 6, SocketID: 0, NUMANodeID: 0},
135: {CoreID: 7, SocketID: 0, NUMANodeID: 0},
136: {CoreID: 8, SocketID: 0, NUMANodeID: 0},
137: {CoreID: 9, SocketID: 0, NUMANodeID: 0},
138: {CoreID: 10, SocketID: 0, NUMANodeID: 0},
139: {CoreID: 11, SocketID: 0, NUMANodeID: 0},
140: {CoreID: 12, SocketID: 0, NUMANodeID: 0},
141: {CoreID: 13, SocketID: 0, NUMANodeID: 0},
142: {CoreID: 14, SocketID: 0, NUMANodeID: 0},
143: {CoreID: 15, SocketID: 0, NUMANodeID: 0},
144: {CoreID: 16, SocketID: 0, NUMANodeID: 1},
145: {CoreID: 17, SocketID: 0, NUMANodeID: 1},
146: {CoreID: 18, SocketID: 0, NUMANodeID: 1},
147: {CoreID: 19, SocketID: 0, NUMANodeID: 1},
148: {CoreID: 20, SocketID: 0, NUMANodeID: 1},
149: {CoreID: 21, SocketID: 0, NUMANodeID: 1},
150: {CoreID: 22, SocketID: 0, NUMANodeID: 1},
151: {CoreID: 23, SocketID: 0, NUMANodeID: 1},
152: {CoreID: 24, SocketID: 0, NUMANodeID: 1},
153: {CoreID: 25, SocketID: 0, NUMANodeID: 1},
154: {CoreID: 26, SocketID: 0, NUMANodeID: 1},
155: {CoreID: 27, SocketID: 0, NUMANodeID: 1},
156: {CoreID: 28, SocketID: 0, NUMANodeID: 1},
157: {CoreID: 29, SocketID: 0, NUMANodeID: 1},
158: {CoreID: 30, SocketID: 0, NUMANodeID: 1},
159: {CoreID: 31, SocketID: 0, NUMANodeID: 1},
160: {CoreID: 32, SocketID: 0, NUMANodeID: 2},
161: {CoreID: 33, SocketID: 0, NUMANodeID: 2},
162: {CoreID: 34, SocketID: 0, NUMANodeID: 2},
163: {CoreID: 35, SocketID: 0, NUMANodeID: 2},
164: {CoreID: 36, SocketID: 0, NUMANodeID: 2},
165: {CoreID: 37, SocketID: 0, NUMANodeID: 2},
166: {CoreID: 38, SocketID: 0, NUMANodeID: 2},
167: {CoreID: 39, SocketID: 0, NUMANodeID: 2},
168: {CoreID: 40, SocketID: 0, NUMANodeID: 2},
169: {CoreID: 41, SocketID: 0, NUMANodeID: 2},
170: {CoreID: 42, SocketID: 0, NUMANodeID: 2},
171: {CoreID: 43, SocketID: 0, NUMANodeID: 2},
172: {CoreID: 44, SocketID: 0, NUMANodeID: 2},
173: {CoreID: 45, SocketID: 0, NUMANodeID: 2},
174: {CoreID: 46, SocketID: 0, NUMANodeID: 2},
175: {CoreID: 47, SocketID: 0, NUMANodeID: 2},
176: {CoreID: 48, SocketID: 0, NUMANodeID: 3},
177: {CoreID: 49, SocketID: 0, NUMANodeID: 3},
178: {CoreID: 50, SocketID: 0, NUMANodeID: 3},
179: {CoreID: 51, SocketID: 0, NUMANodeID: 3},
180: {CoreID: 52, SocketID: 0, NUMANodeID: 3},
181: {CoreID: 53, SocketID: 0, NUMANodeID: 3},
182: {CoreID: 54, SocketID: 0, NUMANodeID: 3},
183: {CoreID: 55, SocketID: 0, NUMANodeID: 3},
184: {CoreID: 56, SocketID: 0, NUMANodeID: 3},
185: {CoreID: 57, SocketID: 0, NUMANodeID: 3},
186: {CoreID: 58, SocketID: 0, NUMANodeID: 3},
187: {CoreID: 59, SocketID: 0, NUMANodeID: 3},
188: {CoreID: 60, SocketID: 0, NUMANodeID: 3},
189: {CoreID: 61, SocketID: 0, NUMANodeID: 3},
190: {CoreID: 62, SocketID: 0, NUMANodeID: 3},
191: {CoreID: 63, SocketID: 0, NUMANodeID: 3},
192: {CoreID: 64, SocketID: 1, NUMANodeID: 4},
193: {CoreID: 65, SocketID: 1, NUMANodeID: 4},
194: {CoreID: 66, SocketID: 1, NUMANodeID: 4},
195: {CoreID: 67, SocketID: 1, NUMANodeID: 4},
196: {CoreID: 68, SocketID: 1, NUMANodeID: 4},
197: {CoreID: 69, SocketID: 1, NUMANodeID: 4},
198: {CoreID: 70, SocketID: 1, NUMANodeID: 4},
199: {CoreID: 71, SocketID: 1, NUMANodeID: 4},
200: {CoreID: 72, SocketID: 1, NUMANodeID: 4},
201: {CoreID: 73, SocketID: 1, NUMANodeID: 4},
202: {CoreID: 74, SocketID: 1, NUMANodeID: 4},
203: {CoreID: 75, SocketID: 1, NUMANodeID: 4},
204: {CoreID: 76, SocketID: 1, NUMANodeID: 4},
205: {CoreID: 77, SocketID: 1, NUMANodeID: 4},
206: {CoreID: 78, SocketID: 1, NUMANodeID: 4},
207: {CoreID: 79, SocketID: 1, NUMANodeID: 4},
208: {CoreID: 80, SocketID: 1, NUMANodeID: 5},
209: {CoreID: 81, SocketID: 1, NUMANodeID: 5},
210: {CoreID: 82, SocketID: 1, NUMANodeID: 5},
211: {CoreID: 83, SocketID: 1, NUMANodeID: 5},
212: {CoreID: 84, SocketID: 1, NUMANodeID: 5},
213: {CoreID: 85, SocketID: 1, NUMANodeID: 5},
214: {CoreID: 86, SocketID: 1, NUMANodeID: 5},
215: {CoreID: 87, SocketID: 1, NUMANodeID: 5},
216: {CoreID: 88, SocketID: 1, NUMANodeID: 5},
217: {CoreID: 89, SocketID: 1, NUMANodeID: 5},
218: {CoreID: 90, SocketID: 1, NUMANodeID: 5},
219: {CoreID: 91, SocketID: 1, NUMANodeID: 5},
220: {CoreID: 92, SocketID: 1, NUMANodeID: 5},
221: {CoreID: 93, SocketID: 1, NUMANodeID: 5},
222: {CoreID: 94, SocketID: 1, NUMANodeID: 5},
223: {CoreID: 95, SocketID: 1, NUMANodeID: 5},
224: {CoreID: 96, SocketID: 1, NUMANodeID: 6},
225: {CoreID: 97, SocketID: 1, NUMANodeID: 6},
226: {CoreID: 98, SocketID: 1, NUMANodeID: 6},
227: {CoreID: 99, SocketID: 1, NUMANodeID: 6},
228: {CoreID: 100, SocketID: 1, NUMANodeID: 6},
229: {CoreID: 101, SocketID: 1, NUMANodeID: 6},
230: {CoreID: 102, SocketID: 1, NUMANodeID: 6},
231: {CoreID: 103, SocketID: 1, NUMANodeID: 6},
232: {CoreID: 104, SocketID: 1, NUMANodeID: 6},
233: {CoreID: 105, SocketID: 1, NUMANodeID: 6},
234: {CoreID: 106, SocketID: 1, NUMANodeID: 6},
235: {CoreID: 107, SocketID: 1, NUMANodeID: 6},
236: {CoreID: 108, SocketID: 1, NUMANodeID: 6},
237: {CoreID: 109, SocketID: 1, NUMANodeID: 6},
238: {CoreID: 110, SocketID: 1, NUMANodeID: 6},
239: {CoreID: 111, SocketID: 1, NUMANodeID: 6},
240: {CoreID: 112, SocketID: 1, NUMANodeID: 7},
241: {CoreID: 113, SocketID: 1, NUMANodeID: 7},
242: {CoreID: 114, SocketID: 1, NUMANodeID: 7},
243: {CoreID: 115, SocketID: 1, NUMANodeID: 7},
244: {CoreID: 116, SocketID: 1, NUMANodeID: 7},
245: {CoreID: 117, SocketID: 1, NUMANodeID: 7},
246: {CoreID: 118, SocketID: 1, NUMANodeID: 7},
247: {CoreID: 119, SocketID: 1, NUMANodeID: 7},
248: {CoreID: 120, SocketID: 1, NUMANodeID: 7},
249: {CoreID: 121, SocketID: 1, NUMANodeID: 7},
250: {CoreID: 122, SocketID: 1, NUMANodeID: 7},
251: {CoreID: 123, SocketID: 1, NUMANodeID: 7},
252: {CoreID: 124, SocketID: 1, NUMANodeID: 7},
253: {CoreID: 125, SocketID: 1, NUMANodeID: 7},
254: {CoreID: 126, SocketID: 1, NUMANodeID: 7},
255: {CoreID: 127, SocketID: 1, NUMANodeID: 7},
},
}
)