mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-03-06 04:42:04 +00:00
Compare commits
722 Commits
2.5.0-alph
...
3.0.0-alph
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8cd1e50eb6 | ||
|
|
dd93d4ad5a | ||
|
|
6d6c068692 | ||
|
|
eab7c8f28f | ||
|
|
828574d27c | ||
|
|
334c7b3355 | ||
|
|
f9d3181533 | ||
|
|
3e9077f6ee | ||
|
|
830fb266e6 | ||
|
|
52133ef66e | ||
|
|
4d7f3edbaf | ||
|
|
5aa83754e5 | ||
|
|
97b7fe438a | ||
|
|
2cd964ca79 | ||
|
|
6a8e8dfc8e | ||
|
|
caada34f1d | ||
|
|
bfa86246f8 | ||
|
|
c280d6965b | ||
|
|
b61dda40b7 | ||
|
|
881c87a25c | ||
|
|
ca9d16e5ea | ||
|
|
99a7b4f3e1 | ||
|
|
d14e80e9fd | ||
|
|
cb7f9524be | ||
|
|
4813a3cef9 | ||
|
|
1f4b6e6460 | ||
|
|
4d07c86cf1 | ||
|
|
b0fa44165e | ||
|
|
a8176d0218 | ||
|
|
8a4e690089 | ||
|
|
8854b4de2c | ||
|
|
065305f4a1 | ||
|
|
1444d7ce42 | ||
|
|
2ae807fd29 | ||
|
|
8d4d98587f | ||
|
|
9516286f6d | ||
|
|
83a919a5ea | ||
|
|
c8d4ea84e3 | ||
|
|
e2968b177d | ||
|
|
d8ad16a34e | ||
|
|
b828190158 | ||
|
|
f791169efc | ||
|
|
8bbffc42cf | ||
|
|
e403838131 | ||
|
|
931251105b | ||
|
|
587c0c5e55 | ||
|
|
c5452faec6 | ||
|
|
2764bd7522 | ||
|
|
389ae97020 | ||
|
|
945e02227c | ||
|
|
578121124e | ||
|
|
869e408516 | ||
|
|
8d1cb1d513 | ||
|
|
62f05d4b48 | ||
|
|
9972487f6e | ||
|
|
c9358155a2 | ||
|
|
dd397ff1bf | ||
|
|
8b0e1859cb | ||
|
|
b337390c28 | ||
|
|
873e75b915 | ||
|
|
230a229052 | ||
|
|
889557ecb1 | ||
|
|
c9b5bde30b | ||
|
|
e6a5a5106d | ||
|
|
42eaf19b43 | ||
|
|
4d33b0541d | ||
|
|
7247575fa2 | ||
|
|
9803393f2f | ||
|
|
7503bdab6e | ||
|
|
b06bc82284 | ||
|
|
8d9135a7ce | ||
|
|
86ac653ba7 | ||
|
|
81fe51ab0b | ||
|
|
845c1c03cf | ||
|
|
993ae24080 | ||
|
|
adfad44efe | ||
|
|
9b1940e93e | ||
|
|
0aefab4d80 | ||
|
|
5457deb034 | ||
|
|
54147db921 | ||
|
|
638c2c4164 | ||
|
|
f0b58e38d2 | ||
|
|
fa0b11fc52 | ||
|
|
a67402cc1f | ||
|
|
229ff29c0f | ||
|
|
5c3155f7e2 | ||
|
|
4ab45e5c93 | ||
|
|
9dfd949f23 | ||
|
|
326eb2f910 | ||
|
|
50b0b7cc15 | ||
|
|
557229c39d | ||
|
|
1b01ea53d9 | ||
|
|
27c82018d1 | ||
|
|
6fddf031df | ||
|
|
f5aa6ae467 | ||
|
|
6e149b43f7 | ||
|
|
85f4e7caf6 | ||
|
|
129335714b | ||
|
|
71384b60f3 | ||
|
|
56d49b5073 | ||
|
|
b3147411e3 | ||
|
|
1ef3f8eac6 | ||
|
|
57c556a801 | ||
|
|
0e24f47a43 | ||
|
|
e764a726ab | ||
|
|
3f4dd92c2d | ||
|
|
a3127a03f3 | ||
|
|
427b29454a | ||
|
|
0337377838 | ||
|
|
c825065b27 | ||
|
|
e0194dcb5e | ||
|
|
534a4920b1 | ||
|
|
fa85fd584e | ||
|
|
0b4a91ec1a | ||
|
|
896478c92b | ||
|
|
68c265587c | ||
|
|
df79c8fe1d | ||
|
|
912641509e | ||
|
|
43045be8d1 | ||
|
|
0d7cb7eb16 | ||
|
|
eec9ac81ef | ||
|
|
402bfa0ce3 | ||
|
|
54f53d57ef | ||
|
|
6d56cdb9ac | ||
|
|
540303880e | ||
|
|
7c146a5d95 | ||
|
|
08a6581673 | ||
|
|
4331ef80d0 | ||
|
|
4c3bd6b1d1 | ||
|
|
72dbd1fcb4 | ||
|
|
960f2a7f70 | ||
|
|
e9988f0c68 | ||
|
|
cebbebbe8a | ||
|
|
758cc47b32 | ||
|
|
25be4d00fd | ||
|
|
62182db645 | ||
|
|
99654ce694 | ||
|
|
f4c3adf596 | ||
|
|
545ae3f0ee | ||
|
|
19eca71cd9 | ||
|
|
d8920b00cd | ||
|
|
2b01e9ba40 | ||
|
|
996a6b80bc | ||
|
|
f690b0aad0 | ||
|
|
d93e4b939d | ||
|
|
575b5eb5f5 | ||
|
|
9f49f7adca | ||
|
|
3c989521b1 | ||
|
|
274598ae56 | ||
|
|
1befbe6738 | ||
|
|
3d6156f6ec | ||
|
|
3f6123b4dd | ||
|
|
9ae2a45b38 | ||
|
|
0cc20f014d | ||
|
|
418a03a128 | ||
|
|
be31207f6e | ||
|
|
39974fbacc | ||
|
|
051181249c | ||
|
|
dc3b6f6592 | ||
|
|
201ff223f6 | ||
|
|
f3335c99ce | ||
|
|
9f0e4bb775 | ||
|
|
b424cf3c90 | ||
|
|
cda1919a0a | ||
|
|
1a25afcdf5 | ||
|
|
0024b8d10a | ||
|
|
80c68b80a8 | ||
|
|
d2584991eb | ||
|
|
458f6f42f6 | ||
|
|
58b0fc4794 | ||
|
|
0826a2157d | ||
|
|
939959e726 | ||
|
|
f6f96b8fee | ||
|
|
7a4183980e | ||
|
|
46fd7ce025 | ||
|
|
30da3fb954 | ||
|
|
f7ccf92dc8 | ||
|
|
33360f1710 | ||
|
|
386a523a05 | ||
|
|
13df57c393 | ||
|
|
f36bc8bc52 | ||
|
|
57c2d8b749 | ||
|
|
e57a1c831e | ||
|
|
ee3f5558ae | ||
|
|
c09634dbc7 | ||
|
|
2551924bda | ||
|
|
bee7915932 | ||
|
|
9cee52153b | ||
|
|
47a4142e0d | ||
|
|
e14e98bbeb | ||
|
|
5d3b53ee7b | ||
|
|
6a1fe85f10 | ||
|
|
5ea35ddcdc | ||
|
|
b646d7cb37 | ||
|
|
cb54ac6c6e | ||
|
|
bde6609b93 | ||
|
|
d88b1bf01c | ||
|
|
dd003ebe0e | ||
|
|
38957fe00b | ||
|
|
11b3f95140 | ||
|
|
948381bdbe | ||
|
|
3d20387a25 | ||
|
|
87d38ae49f | ||
|
|
2bb1eeaecc | ||
|
|
026aaeeccc | ||
|
|
fffcb81652 | ||
|
|
42ea854eb6 | ||
|
|
efdb92366b | ||
|
|
0e40ecf383 | ||
|
|
f59939a31f | ||
|
|
be68cf0712 | ||
|
|
4d89476c91 | ||
|
|
ac91fb7a12 | ||
|
|
090de2dae2 | ||
|
|
a1593322bd | ||
|
|
89b9ba8603 | ||
|
|
95fa0c70c3 | ||
|
|
5c1ccc376b | ||
|
|
4d234f5742 | ||
|
|
cfd5dae47c | ||
|
|
527b73a8e5 | ||
|
|
3bafafec58 | ||
|
|
5010c643c4 | ||
|
|
2d29791c19 | ||
|
|
f4eea832a1 | ||
|
|
071dd4c790 | ||
|
|
514b4e7235 | ||
|
|
d9e868f44e | ||
|
|
b33ad7e57a | ||
|
|
0189738283 | ||
|
|
cd2d8c6fe2 | ||
|
|
a1de394e51 | ||
|
|
44ec9684d8 | ||
|
|
0ddb34a38d | ||
|
|
7120afe4ed | ||
|
|
648d285a24 | ||
|
|
7dad7c89f3 | ||
|
|
fbb2e9bce9 | ||
|
|
acd3302bef | ||
|
|
635fa543a3 | ||
|
|
59cab9e835 | ||
|
|
1f363a386c | ||
|
|
4e48509ed9 | ||
|
|
18093251ec | ||
|
|
c29038a2e2 | ||
|
|
02a51e75a7 | ||
|
|
aa561b49f5 | ||
|
|
48ccd42339 | ||
|
|
2a4fbd6d8c | ||
|
|
433816cca2 | ||
|
|
2a94261df5 | ||
|
|
1e12d56512 | ||
|
|
a5a25ed13d | ||
|
|
96553e8bd2 | ||
|
|
c656457e90 | ||
|
|
99f5ca80fc | ||
|
|
0f9856c465 | ||
|
|
2c1efcc697 | ||
|
|
20f11877be | ||
|
|
ab5f1c9564 | ||
|
|
07231b2f3f | ||
|
|
c8a9052063 | ||
|
|
242992e3de | ||
|
|
8a697268d0 | ||
|
|
9c526292e7 | ||
|
|
e5be5cb086 | ||
|
|
5f936f268f | ||
|
|
323271403e | ||
|
|
0939f5181b | ||
|
|
58ff2bd5c9 | ||
|
|
ad055235a5 | ||
|
|
b2c0387993 | ||
|
|
12c1b9e6d6 | ||
|
|
1a78c3df2e | ||
|
|
f3907aa127 | ||
|
|
badbbcd8be | ||
|
|
916ffb75d7 | ||
|
|
afdc960424 | ||
|
|
4e30e11b31 | ||
|
|
bdf5e5229b | ||
|
|
469e098543 | ||
|
|
71db2dd5b8 | ||
|
|
8bb00a3dc8 | ||
|
|
2aedd4d12a | ||
|
|
bec22ad01f | ||
|
|
07f44c3e0a | ||
|
|
78c9718752 | ||
|
|
7d1953b52e | ||
|
|
468c73b3cb | ||
|
|
27b1bb5ed9 | ||
|
|
e32bf53318 | ||
|
|
f97d9b45c8 | ||
|
|
f9e96c6506 | ||
|
|
3880e0c077 | ||
|
|
2488a0f6c0 | ||
|
|
083ca5f217 | ||
|
|
03fca8b459 | ||
|
|
c70d3a2c35 | ||
|
|
612fd79bae | ||
|
|
d4417f210e | ||
|
|
93874cb3bb | ||
|
|
07b1367c2b | ||
|
|
133528dd14 | ||
|
|
f186a52b16 | ||
|
|
1b7d36fdb0 | ||
|
|
9ff10c0830 | ||
|
|
78e27de6c3 | ||
|
|
e227b4c404 | ||
|
|
72049350ae | ||
|
|
8eac22ac53 | ||
|
|
e7e7dc9dfe | ||
|
|
e422730c7f | ||
|
|
e11fcf7d3c | ||
|
|
c7dd10e5ed | ||
|
|
0bbbe70687 | ||
|
|
6fd40085ef | ||
|
|
98f041ed8e | ||
|
|
2c1b68d6e4 | ||
|
|
86123f49f2 | ||
|
|
ef925d40ce | ||
|
|
28995301b3 | ||
|
|
9941588c00 | ||
|
|
e89e6507a4 | ||
|
|
f30fe86dc1 | ||
|
|
553ec46115 | ||
|
|
0d33b28802 | ||
|
|
9766a285a4 | ||
|
|
90a7763ac6 | ||
|
|
d06dd8fcdc | ||
|
|
a305bafeef | ||
|
|
185360cb9a | ||
|
|
db2a4d6cdf | ||
|
|
bee7703436 | ||
|
|
ac5dbd8598 | ||
|
|
0b75522e1f | ||
|
|
93b61e0f07 | ||
|
|
bf3ddc125d | ||
|
|
55ed32e924 | ||
|
|
01fe09a4ee | ||
|
|
2e07538334 | ||
|
|
c84a425250 | ||
|
|
1d5448fbca | ||
|
|
a80eb33cd6 | ||
|
|
81acfc1286 | ||
|
|
9b93db0220 | ||
|
|
1ef0b7ded0 | ||
|
|
b6cb2c4ae3 | ||
|
|
e80e0c4645 | ||
|
|
bb26bd73b1 | ||
|
|
1a5ba31cb0 | ||
|
|
f23d7092e3 | ||
|
|
d5ee3fc856 | ||
|
|
721ca72a64 | ||
|
|
93c10dfd86 | ||
|
|
dfe6de7714 | ||
|
|
39ff85d610 | ||
|
|
71f24d8271 | ||
|
|
a1df6d0969 | ||
|
|
8619f2b3d6 | ||
|
|
52d42af636 | ||
|
|
c1c1e5152a | ||
|
|
6850ef99ae | ||
|
|
0bcb422fcb | ||
|
|
3c45c0715f | ||
|
|
3d38bb3005 | ||
|
|
aff6040555 | ||
|
|
8835db6b0f | ||
|
|
9cb15ab4c5 | ||
|
|
ff7874bc23 | ||
|
|
aefe11b9ba | ||
|
|
7deb87dcbc | ||
|
|
f811c8b60e | ||
|
|
06f398a34f | ||
|
|
fd4c26f9c1 | ||
|
|
4be7185aa4 | ||
|
|
10343b1f3d | ||
|
|
9887272db9 | ||
|
|
3ff0db05a7 | ||
|
|
234d7bca04 | ||
|
|
75e282b4c1 | ||
|
|
bdfee005fa | ||
|
|
4296e3069f | ||
|
|
d3da156eea | ||
|
|
e705ee07c5 | ||
|
|
8c0a60e191 | ||
|
|
278f843f92 | ||
|
|
641b736106 | ||
|
|
69ba1ae9e4 | ||
|
|
d2a9bc6674 | ||
|
|
9773838c01 | ||
|
|
aee9633ced | ||
|
|
8509de0aea | ||
|
|
6d59e8e197 | ||
|
|
5300ea23ad | ||
|
|
1d5c898d7f | ||
|
|
87887026f6 | ||
|
|
b0e090f40b | ||
|
|
ccd03e2cae | ||
|
|
45a00b4f02 | ||
|
|
48c201a1ac | ||
|
|
b9b6d70aae | ||
|
|
05ad026fc0 | ||
|
|
d96716b4d2 | ||
|
|
6cffd943be | ||
|
|
6ae87d9d66 | ||
|
|
45e5780e7c | ||
|
|
2599a06a56 | ||
|
|
8ffff40af4 | ||
|
|
626828696d | ||
|
|
97d8c6c0fa | ||
|
|
8cdd70f6c2 | ||
|
|
e19d04719f | ||
|
|
387ffa914e | ||
|
|
69f10afb71 | ||
|
|
21cc02d724 | ||
|
|
5b89c1df2f | ||
|
|
4f62a7618c | ||
|
|
6f8acb94c2 | ||
|
|
7cdee4980c | ||
|
|
426f38de94 | ||
|
|
392f1ecdf5 | ||
|
|
575df4dc4d | ||
|
|
db5048d52c | ||
|
|
1b845978f9 | ||
|
|
412441308b | ||
|
|
ae911d0cd3 | ||
|
|
05022975c8 | ||
|
|
aaa74e8a2b | ||
|
|
a57515bdae | ||
|
|
4ebf9d38b9 | ||
|
|
eff4e1017d | ||
|
|
eb24e97150 | ||
|
|
5d7fb7b7b0 | ||
|
|
d0ca2fcbbc | ||
|
|
a60dcff4d8 | ||
|
|
dbf50672e1 | ||
|
|
8e2847bd52 | ||
|
|
e9ada165ff | ||
|
|
adad9cef18 | ||
|
|
34bcef8846 | ||
|
|
815157bf02 | ||
|
|
5bd81ba232 | ||
|
|
f5099620f1 | ||
|
|
fe3c1d9cdd | ||
|
|
a238d8c6bd | ||
|
|
f981190621 | ||
|
|
f7b22eb777 | ||
|
|
8f10e13e07 | ||
|
|
430da47215 | ||
|
|
9c9e5984ba | ||
|
|
9d27c1fced | ||
|
|
9726f56fdc | ||
|
|
168f325c43 | ||
|
|
38a3188206 | ||
|
|
a0805742d6 | ||
|
|
24182d72d9 | ||
|
|
295a01f9b1 | ||
|
|
b8e98b175c | ||
|
|
e8d0be364f | ||
|
|
7ae11cad67 | ||
|
|
25b1317ead | ||
|
|
b9fc24ff3a | ||
|
|
c1476a174b | ||
|
|
002f2cd109 | ||
|
|
2e04833fb9 | ||
|
|
8b57bf97ab | ||
|
|
6d0ff901ab | ||
|
|
9b108d9937 | ||
|
|
894f661cc4 | ||
|
|
d759f6c3e5 | ||
|
|
3e2817f7b5 | ||
|
|
a9a3074828 | ||
|
|
9f81c2dbf0 | ||
|
|
5903815746 | ||
|
|
9658c6218e | ||
|
|
d2df1209a5 | ||
|
|
22b6a94a84 | ||
|
|
af2ef3f7a5 | ||
|
|
65f0cef16c | ||
|
|
3201ad0830 | ||
|
|
0706fb28ac | ||
|
|
2a09378dd9 | ||
|
|
640173cfc2 | ||
|
|
0136be22ca | ||
|
|
bd50d463b2 | ||
|
|
7c4049aabb | ||
|
|
03176a9e09 | ||
|
|
38ebbc705b | ||
|
|
78d45b434f | ||
|
|
c7b3941c96 | ||
|
|
6dbce7c3de | ||
|
|
6ecea84bc5 | ||
|
|
648b8d0aec | ||
|
|
96c8df40b5 | ||
|
|
5205efd9b4 | ||
|
|
d157f9b71e | ||
|
|
d862ca0590 | ||
|
|
d50937435d | ||
|
|
56591804b3 | ||
|
|
cb2b30970d | ||
|
|
60823abb9c | ||
|
|
4134beee39 | ||
|
|
fff832874e | ||
|
|
49361749ed | ||
|
|
27d903b76a | ||
|
|
d7b4ce049e | ||
|
|
43de5440e5 | ||
|
|
c9b291509d | ||
|
|
62d1ed0651 | ||
|
|
8a2b82ff51 | ||
|
|
6d00701ec9 | ||
|
|
122a85e222 | ||
|
|
35619b45aa | ||
|
|
b9315af092 | ||
|
|
10c13d719a | ||
|
|
d20bc5a4d2 | ||
|
|
c95ba63c0c | ||
|
|
34b80382b6 | ||
|
|
dfad5728a7 | ||
|
|
8e7c5975c6 | ||
|
|
4428ceae16 | ||
|
|
ffdc065b4c | ||
|
|
f295953183 | ||
|
|
2c238c8504 | ||
|
|
811ac6a8ce | ||
|
|
d8be0f8e9f | ||
|
|
7a5ccd1264 | ||
|
|
fa61bd43ee | ||
|
|
ce2e521a0f | ||
|
|
834f93ce8a | ||
|
|
d7aded7238 | ||
|
|
f4994e486b | ||
|
|
24a2b0f6a2 | ||
|
|
c88a48be21 | ||
|
|
9458cc0053 | ||
|
|
42c64b3d2c | ||
|
|
abad33eba0 | ||
|
|
04bd8f16f0 | ||
|
|
12f0ab120a | ||
|
|
e87eb13c4f | ||
|
|
8052fe62fa | ||
|
|
5d43718494 | ||
|
|
c67b9d2975 | ||
|
|
44814dce19 | ||
|
|
856c8e81f1 | ||
|
|
4f586d2a91 | ||
|
|
4b437d91f0 | ||
|
|
6ffdebd202 | ||
|
|
ee9ee77388 | ||
|
|
88fb9b72e2 | ||
|
|
0e2459d13e | ||
|
|
d1f2852d8b | ||
|
|
c39852e83f | ||
|
|
b4b9068cb7 | ||
|
|
b780be99d7 | ||
|
|
a475956abd | ||
|
|
71f59f3a7b | ||
|
|
c7ac55b6d7 | ||
|
|
8e2042d055 | ||
|
|
dbedea5086 | ||
|
|
e73b70baff | ||
|
|
f24a6e761f | ||
|
|
cf465feb02 | ||
|
|
34c4ac599c | ||
|
|
0aff5aaa39 | ||
|
|
557c4cfd00 | ||
|
|
04c8b52e04 | ||
|
|
7f76914422 | ||
|
|
13c2577004 | ||
|
|
97425a7fe6 | ||
|
|
4210646802 | ||
|
|
51fa4ab671 | ||
|
|
79fb4fc5cb | ||
|
|
271933fec0 | ||
|
|
c7dacb1211 | ||
|
|
61a167139c | ||
|
|
82ea018281 | ||
|
|
8aad2c59c5 | ||
|
|
2a1d394147 | ||
|
|
7bc4ab68c3 | ||
|
|
79d93f1fe7 | ||
|
|
e2f68c6093 | ||
|
|
d16097a805 | ||
|
|
9b863b0e01 | ||
|
|
bd5da4a7d9 | ||
|
|
ec250c10e9 | ||
|
|
33a8b70558 | ||
|
|
475e3bf38f | ||
|
|
70eda2fa6c | ||
|
|
4a1e13bd8f | ||
|
|
383be2203a | ||
|
|
c633780ba7 | ||
|
|
97d7b1845b | ||
|
|
1dd6f85a17 | ||
|
|
1b7fd19acb | ||
|
|
7772f7dd99 | ||
|
|
cc839772d3 | ||
|
|
2d5f11501c | ||
|
|
982c32358a | ||
|
|
da11c21b4a | ||
|
|
7ffe5a16f2 | ||
|
|
ea857bb1b8 | ||
|
|
9fdc88101f | ||
|
|
081f6de874 | ||
|
|
3f069c7acb | ||
|
|
666aee54d2 | ||
|
|
86d348e065 | ||
|
|
4b9b62bb3e | ||
|
|
c4dd029566 | ||
|
|
9fb9c80fd3 | ||
|
|
b6467ddd73 | ||
|
|
7580bb5a78 | ||
|
|
a88adabaae | ||
|
|
63c4da03a9 | ||
|
|
511f7f822d | ||
|
|
5b18575dfe | ||
|
|
1cf9469297 | ||
|
|
00a5b1bda9 | ||
|
|
be1bb7e39f | ||
|
|
c9f6496d6d | ||
|
|
2d35e6066d | ||
|
|
b0e439cb66 | ||
|
|
ab067cf074 | ||
|
|
ccb0183934 | ||
|
|
9d39362e30 | ||
|
|
18d27f7949 | ||
|
|
6948b4b360 | ||
|
|
b221a2590f | ||
|
|
2c218a07b9 | ||
|
|
dd4bd7f471 | ||
|
|
d02db3a268 | ||
|
|
0a6e7d443e | ||
|
|
7b20707197 | ||
|
|
411053e2bd | ||
|
|
832c33d5b5 | ||
|
|
b658dccc5f | ||
|
|
afbd60da27 | ||
|
|
5b6e45ed6c | ||
|
|
4b9e78b837 | ||
|
|
bc919cc54c | ||
|
|
cb8dd0f4fc | ||
|
|
81f6b48626 | ||
|
|
2629c9fc7b | ||
|
|
96bc3ec2e9 | ||
|
|
0239502781 | ||
|
|
1b931f4203 | ||
|
|
ef6d54a781 | ||
|
|
5d8438e939 | ||
|
|
963d03ea8a | ||
|
|
1719a8b491 | ||
|
|
bec59f9e39 | ||
|
|
f7ba21c86f | ||
|
|
90b2f5b776 | ||
|
|
2eeb5dc223 | ||
|
|
83979ece18 | ||
|
|
0ad89ebd7c | ||
|
|
b63774ec61 | ||
|
|
f385b21b05 | ||
|
|
baa67d8cc5 | ||
|
|
0e7f1a5e3a | ||
|
|
604a795073 | ||
|
|
f619c65b6a | ||
|
|
7ec42951f2 | ||
|
|
e6bc912439 | ||
|
|
33e244f284 | ||
|
|
dbb0c67523 | ||
|
|
0af13b469d | ||
|
|
b19bfac7cd | ||
|
|
4ec1967542 | ||
|
|
362201605e | ||
|
|
2256bcb6ab | ||
|
|
7b2ff02647 | ||
|
|
29e569aa92 | ||
|
|
6012c19707 | ||
|
|
aabcebbf58 | ||
|
|
32f92e75cc | ||
|
|
4443bb68a4 | ||
|
|
d136c9c240 | ||
|
|
66aa07649b | ||
|
|
8d8c0388fa | ||
|
|
86977ff780 | ||
|
|
78f30c33c6 | ||
|
|
6e79042aa0 | ||
|
|
9b6f24b2ee | ||
|
|
c3776b1792 | ||
|
|
9c22d9554e | ||
|
|
c108bb7a2a | ||
|
|
bf98c99f14 | ||
|
|
92c00c7e84 | ||
|
|
6e9e4e8ce5 | ||
|
|
532d53977e | ||
|
|
6a47b82c81 | ||
|
|
9d5e7ee0d4 | ||
|
|
f8cc5d1ad8 | ||
|
|
698e45f403 | ||
|
|
761e8313de | ||
|
|
4f551e3428 | ||
|
|
a83a16e32c | ||
|
|
95e45fab38 | ||
|
|
c31cd0e81a | ||
|
|
9c1c219a3f | ||
|
|
1118a3d2da | ||
|
|
9d5b03a1b7 | ||
|
|
eff7c7e0ff | ||
|
|
d0d3787233 | ||
|
|
465d3a5506 | ||
|
|
b975f2e8d2 | ||
|
|
962d05ec86 | ||
|
|
b39caf43f1 | ||
|
|
354cd3b9b6 | ||
|
|
485aeabb6b | ||
|
|
4405b188e8 | ||
|
|
e62bc8e7f3 | ||
|
|
8980d04e25 | ||
|
|
98750d792b | ||
|
|
59c7165ee1 | ||
|
|
ff17c756d2 | ||
|
|
1cad3a4696 | ||
|
|
33c953ace4 | ||
|
|
39a35b693a | ||
|
|
d8f39fb269 | ||
|
|
a2f5c1768e |
2
.github/workflows/add-pr-sizing-label.yaml
vendored
2
.github/workflows/add-pr-sizing-label.yaml
vendored
@@ -33,6 +33,8 @@ jobs:
|
||||
GITHUB_TOKEN: ${{ secrets.KATA_GITHUB_ACTIONS_PR_SIZE_TOKEN }}
|
||||
run: |
|
||||
pr=${{ github.event.number }}
|
||||
# Removing man-db, workflow kept failing, fixes: #4480
|
||||
sudo apt -y remove --purge man-db
|
||||
sudo apt -y install diffstat patchutils
|
||||
|
||||
pr-add-size-label.sh -p "$pr"
|
||||
|
||||
7
.github/workflows/commit-message-check.yaml
vendored
7
.github/workflows/commit-message-check.yaml
vendored
@@ -63,7 +63,8 @@ jobs:
|
||||
# the entire commit message.
|
||||
#
|
||||
# - Body lines *can* be longer than the maximum if they start
|
||||
# with a non-alphabetic character.
|
||||
# with a non-alphabetic character or if there is no whitespace in
|
||||
# the line.
|
||||
#
|
||||
# This allows stack traces, log files snippets, emails, long URLs,
|
||||
# etc to be specified. Some of these naturally "work" as they start
|
||||
@@ -74,8 +75,8 @@ jobs:
|
||||
#
|
||||
# - A SoB comment can be any length (as it is unreasonable to penalise
|
||||
# people with long names/email addresses :)
|
||||
pattern: '^.+(\n([a-zA-Z].{0,149}|[^a-zA-Z\n].*|Signed-off-by:.*|))+$'
|
||||
error: 'Body line too long (max 72)'
|
||||
pattern: '^.+(\n([a-zA-Z].{0,150}|[^a-zA-Z\n].*|[^\s\n]*|Signed-off-by:.*|))+$'
|
||||
error: 'Body line too long (max 150)'
|
||||
post_error: ${{ env.error_msg }}
|
||||
|
||||
- name: Check Fixes
|
||||
|
||||
7
.github/workflows/docs-url-alive-check.yaml
vendored
7
.github/workflows/docs-url-alive-check.yaml
vendored
@@ -10,35 +10,32 @@ jobs:
|
||||
go-version: [1.17.x]
|
||||
os: [ubuntu-20.04]
|
||||
runs-on: ${{ matrix.os }}
|
||||
# don't run this action on forks
|
||||
if: github.repository_owner == 'kata-containers'
|
||||
env:
|
||||
target_branch: ${{ github.base_ref }}
|
||||
steps:
|
||||
- name: Install Go
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
env:
|
||||
GOPATH: ${{ runner.workspace }}/kata-containers
|
||||
- name: Set env
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
echo "GOPATH=${{ github.workspace }}" >> $GITHUB_ENV
|
||||
echo "${{ github.workspace }}/bin" >> $GITHUB_PATH
|
||||
- name: Checkout code
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
path: ./src/github.com/${{ github.repository }}
|
||||
- name: Setup
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && ./ci/setup.sh
|
||||
env:
|
||||
GOPATH: ${{ runner.workspace }}/kata-containers
|
||||
# docs url alive check
|
||||
- name: Docs URL Alive Check
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && make docs-url-alive-check
|
||||
|
||||
1
.github/workflows/kata-deploy-push.yaml
vendored
1
.github/workflows/kata-deploy-push.yaml
vendored
@@ -24,6 +24,7 @@ jobs:
|
||||
- firecracker
|
||||
- rootfs-image
|
||||
- rootfs-initrd
|
||||
- virtiofsd
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Install docker
|
||||
|
||||
2
.github/workflows/kata-deploy-test.yaml
vendored
2
.github/workflows/kata-deploy-test.yaml
vendored
@@ -1,4 +1,5 @@
|
||||
on:
|
||||
workflow_dispatch: # this is used to trigger the workflow on non-main branches
|
||||
issue_comment:
|
||||
types: [created, edited]
|
||||
|
||||
@@ -47,6 +48,7 @@ jobs:
|
||||
- rootfs-image
|
||||
- rootfs-initrd
|
||||
- shim-v2
|
||||
- virtiofsd
|
||||
steps:
|
||||
- name: get-PR-ref
|
||||
id: get-PR-ref
|
||||
|
||||
5
.github/workflows/release.yaml
vendored
5
.github/workflows/release.yaml
vendored
@@ -1,8 +1,8 @@
|
||||
name: Publish Kata 2.x release artifacts
|
||||
name: Publish Kata release artifacts
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '2.*'
|
||||
- '[0-9]+.[0-9]+.[0-9]+*'
|
||||
|
||||
jobs:
|
||||
build-asset:
|
||||
@@ -17,6 +17,7 @@ jobs:
|
||||
- rootfs-image
|
||||
- rootfs-initrd
|
||||
- shim-v2
|
||||
- virtiofsd
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Install docker
|
||||
|
||||
9
.github/workflows/snap-release.yaml
vendored
9
.github/workflows/snap-release.yaml
vendored
@@ -1,8 +1,9 @@
|
||||
name: Release Kata 2.x in snapcraft store
|
||||
name: Release Kata in snapcraft store
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '2.*'
|
||||
- '[0-9]+.[0-9]+.[0-9]+*'
|
||||
|
||||
jobs:
|
||||
release-snap:
|
||||
runs-on: ubuntu-20.04
|
||||
@@ -19,6 +20,8 @@ jobs:
|
||||
|
||||
- name: Build snap
|
||||
run: |
|
||||
# Removing man-db, workflow kept failing, fixes: #4480
|
||||
sudo apt -y remove --purge man-db
|
||||
sudo apt-get install -y git git-extras
|
||||
kata_url="https://github.com/kata-containers/kata-containers"
|
||||
latest_version=$(git ls-remote --tags ${kata_url} | egrep -o "refs.*" | egrep -v "\-alpha|\-rc|{}" | egrep -o "[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+" | sort -V -r | head -1)
|
||||
@@ -26,7 +29,7 @@ jobs:
|
||||
# Check semantic versioning format (x.y.z) and if the current tag is the latest tag
|
||||
if echo "${current_version}" | grep -q "^[[:digit:]]\+\.[[:digit:]]\+\.[[:digit:]]\+$" && echo -e "$latest_version\n$current_version" | sort -C -V; then
|
||||
# Current version is the latest version, build it
|
||||
snapcraft -d snap --destructive-mode
|
||||
snapcraft snap --debug --destructive-mode
|
||||
fi
|
||||
|
||||
- name: Upload snap
|
||||
|
||||
2
.github/workflows/snap.yaml
vendored
2
.github/workflows/snap.yaml
vendored
@@ -24,4 +24,4 @@ jobs:
|
||||
- name: Build snap
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
snapcraft -d snap --destructive-mode
|
||||
snapcraft snap --debug --destructive-mode
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,4 +10,5 @@ src/agent/kata-agent.service
|
||||
src/agent/protocols/src/*.rs
|
||||
!src/agent/protocols/src/lib.rs
|
||||
build
|
||||
src/tools/log-parser/kata-log-parser
|
||||
|
||||
|
||||
10
Makefile
10
Makefile
@@ -6,24 +6,23 @@
|
||||
# List of available components
|
||||
COMPONENTS =
|
||||
|
||||
COMPONENTS += libs
|
||||
COMPONENTS += agent
|
||||
COMPONENTS += runtime
|
||||
COMPONENTS += runtime-rs
|
||||
|
||||
# List of available tools
|
||||
TOOLS =
|
||||
|
||||
TOOLS += agent-ctl
|
||||
TOOLS += trace-forwarder
|
||||
TOOLS += runk
|
||||
TOOLS += log-parser
|
||||
|
||||
STANDARD_TARGETS = build check clean install test vendor
|
||||
|
||||
default: all
|
||||
|
||||
all: logging-crate-tests build
|
||||
|
||||
logging-crate-tests:
|
||||
make -C src/libs/logging
|
||||
|
||||
include utils.mk
|
||||
include ./tools/packaging/kata-deploy/local-build/Makefile
|
||||
|
||||
@@ -47,7 +46,6 @@ docs-url-alive-check:
|
||||
binary-tarball \
|
||||
default \
|
||||
install-binary-tarball \
|
||||
logging-crate-tests \
|
||||
static-checks \
|
||||
docs-url-alive-check
|
||||
|
||||
|
||||
@@ -71,6 +71,7 @@ See the [official documentation](docs) including:
|
||||
- [Developer guide](docs/Developer-Guide.md)
|
||||
- [Design documents](docs/design)
|
||||
- [Architecture overview](docs/design/architecture)
|
||||
- [Architecture 3.0 overview](docs/design/architecture_3.0/)
|
||||
|
||||
## Configuration
|
||||
|
||||
@@ -117,7 +118,10 @@ The table below lists the core parts of the project:
|
||||
|-|-|-|
|
||||
| [runtime](src/runtime) | core | Main component run by a container manager and providing a containerd shimv2 runtime implementation. |
|
||||
| [agent](src/agent) | core | Management process running inside the virtual machine / POD that sets up the container environment. |
|
||||
| [libraries](src/libs) | core | Library crates shared by multiple Kata Container components or published to [`crates.io`](https://crates.io/index.html) |
|
||||
| [`dragonball`](src/dragonball) | core | An optional built-in VMM brings out-of-the-box Kata Containers experience with optimizations on container workloads |
|
||||
| [documentation](docs) | documentation | Documentation common to all components (such as design and install documentation). |
|
||||
| [libraries](src/libs) | core | Library crates shared by multiple Kata Container components or published to [`crates.io`](https://crates.io/index.html) |
|
||||
| [tests](https://github.com/kata-containers/tests) | tests | Excludes unit tests which live with the main code. |
|
||||
|
||||
### Additional components
|
||||
@@ -131,6 +135,7 @@ The table below lists the remaining parts of the project:
|
||||
| [osbuilder](tools/osbuilder) | infrastructure | Tool to create "mini O/S" rootfs and initrd images and kernel for the hypervisor. |
|
||||
| [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
|
||||
| [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. |
|
||||
| [`runk`](src/tools/runk) | utility | Standard OCI container runtime based on the agent. |
|
||||
| [`ci`](https://github.com/kata-containers/ci) | CI | Continuous Integration configuration files and scripts. |
|
||||
| [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. |
|
||||
|
||||
@@ -138,7 +143,7 @@ The table below lists the remaining parts of the project:
|
||||
|
||||
Kata Containers is now
|
||||
[available natively for most distributions](docs/install/README.md#packaged-installation-methods).
|
||||
However, packaging scripts and metadata are still used to generate snap and GitHub releases. See
|
||||
However, packaging scripts and metadata are still used to generate [snap](snap/local) and GitHub releases. See
|
||||
the [components](#components) section for further details.
|
||||
|
||||
## Glossary of Terms
|
||||
|
||||
@@ -11,10 +11,10 @@ runtimedir=$cidir/../src/runtime
|
||||
|
||||
build_working_packages() {
|
||||
# working packages:
|
||||
device_api=$runtimedir/virtcontainers/device/api
|
||||
device_config=$runtimedir/virtcontainers/device/config
|
||||
device_drivers=$runtimedir/virtcontainers/device/drivers
|
||||
device_manager=$runtimedir/virtcontainers/device/manager
|
||||
device_api=$runtimedir/pkg/device/api
|
||||
device_config=$runtimedir/pkg/device/config
|
||||
device_drivers=$runtimedir/pkg/device/drivers
|
||||
device_manager=$runtimedir/pkg/device/manager
|
||||
rc_pkg_dir=$runtimedir/pkg/resourcecontrol/
|
||||
utils_pkg_dir=$runtimedir/virtcontainers/utils
|
||||
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Copyright (c) 2020 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
|
||||
cidir=$(dirname "$0")
|
||||
source "${cidir}/lib.sh"
|
||||
|
||||
run_go_test
|
||||
13
ci/lib.sh
13
ci/lib.sh
@@ -18,6 +18,13 @@ clone_tests_repo()
|
||||
{
|
||||
if [ -d "$tests_repo_dir" ]; then
|
||||
[ -n "${CI:-}" ] && return
|
||||
# git config --global --add safe.directory will always append
|
||||
# the target to .gitconfig without checking the existence of
|
||||
# the target, so it's better to check it before adding the target repo.
|
||||
local sd="$(git config --global --get safe.directory ${tests_repo_dir} || true)"
|
||||
if [ -z "${sd}" ]; then
|
||||
git config --global --add safe.directory ${tests_repo_dir}
|
||||
fi
|
||||
pushd "${tests_repo_dir}"
|
||||
git checkout "${branch}"
|
||||
git pull
|
||||
@@ -39,12 +46,6 @@ run_static_checks()
|
||||
bash "$tests_repo_dir/.ci/static-checks.sh" "$@"
|
||||
}
|
||||
|
||||
run_go_test()
|
||||
{
|
||||
clone_tests_repo
|
||||
bash "$tests_repo_dir/.ci/go-test.sh"
|
||||
}
|
||||
|
||||
run_docs_url_alive_check()
|
||||
{
|
||||
clone_tests_repo
|
||||
|
||||
@@ -116,7 +116,7 @@ detailed below.
|
||||
The Kata logs appear in the `containerd` log files, along with logs from `containerd` itself.
|
||||
|
||||
For more information about `containerd` debug, please see the
|
||||
[`containerd` documentation](https://github.com/containerd/containerd/blob/master/docs/getting-started.md).
|
||||
[`containerd` documentation](https://github.com/containerd/containerd/blob/main/docs/getting-started.md).
|
||||
|
||||
#### Enabling full `containerd` debug
|
||||
|
||||
@@ -425,7 +425,7 @@ To build utilizing the same options as Kata, you should make use of the `configu
|
||||
$ cd $your_qemu_directory
|
||||
$ $packaging_dir/scripts/configure-hypervisor.sh kata-qemu > kata.cfg
|
||||
$ eval ./configure "$(cat kata.cfg)"
|
||||
$ make -j $(nproc)
|
||||
$ make -j $(nproc --ignore=1)
|
||||
$ sudo -E make install
|
||||
```
|
||||
|
||||
@@ -465,7 +465,7 @@ script and paste its output directly into a
|
||||
> [runtime](../src/runtime) repository.
|
||||
|
||||
To perform analysis on Kata logs, use the
|
||||
[`kata-log-parser`](https://github.com/kata-containers/tests/tree/main/cmd/log-parser)
|
||||
[`kata-log-parser`](../src/tools/log-parser)
|
||||
tool, which can convert the logs into formats (e.g. JSON, TOML, XML, and YAML).
|
||||
|
||||
See [Set up a debug console](#set-up-a-debug-console).
|
||||
@@ -700,11 +700,11 @@ options to have the kernel boot messages logged into the system journal.
|
||||
For generic information on enabling debug in the configuration file, see the
|
||||
[Enable full debug](#enable-full-debug) section.
|
||||
|
||||
The kernel boot messages will appear in the `containerd` or `CRI-O` log appropriately,
|
||||
The kernel boot messages will appear in the `kata` logs (and in the `containerd` or `CRI-O` log appropriately).
|
||||
such as:
|
||||
|
||||
```bash
|
||||
$ sudo journalctl -t containerd
|
||||
$ sudo journalctl -t kata
|
||||
-- Logs begin at Thu 2020-02-13 16:20:40 UTC, end at Thu 2020-02-13 16:30:23 UTC. --
|
||||
...
|
||||
time="2020-09-15T14:56:23.095113803+08:00" level=debug msg="reading guest console" console-protocol=unix console-url=/run/vc/vm/ab9f633385d4987828d342e47554fc6442445b32039023eeddaa971c1bb56791/console.sock pid=107642 sandbox=ab9f633385d4987828d342e47554fc6442445b32039023eeddaa971c1bb56791 source=virtcontainers subsystem=sandbox vmconsole="[ 0.395399] brd: module loaded"
|
||||
@@ -714,3 +714,4 @@ time="2020-09-15T14:56:23.105268162+08:00" level=debug msg="reading guest consol
|
||||
time="2020-09-15T14:56:23.121121598+08:00" level=debug msg="reading guest console" console-protocol=unix console-url=/run/vc/vm/ab9f633385d4987828d342e47554fc6442445b32039023eeddaa971c1bb56791/console.sock pid=107642 sandbox=ab9f633385d4987828d342e47554fc6442445b32039023eeddaa971c1bb56791 source=virtcontainers subsystem=sandbox vmconsole="[ 0.421324] memmap_init_zone_device initialised 32768 pages in 12ms"
|
||||
...
|
||||
```
|
||||
Refer to the [kata-log-parser documentation](../src/tools/log-parser/README.md) which is useful to fetch these.
|
||||
|
||||
@@ -46,7 +46,7 @@ The following link shows the latest list of limitations:
|
||||
# Contributing
|
||||
|
||||
If you would like to work on resolving a limitation, please refer to the
|
||||
[contributors guide](https://github.com/kata-containers/community/blob/master/CONTRIBUTING.md).
|
||||
[contributors guide](https://github.com/kata-containers/community/blob/main/CONTRIBUTING.md).
|
||||
If you wish to raise an issue for a new limitation, either
|
||||
[raise an issue directly on the runtime](https://github.com/kata-containers/kata-containers/issues/new)
|
||||
or see the
|
||||
@@ -60,17 +60,26 @@ This section lists items that might be possible to fix.
|
||||
## OCI CLI commands
|
||||
|
||||
### Docker and Podman support
|
||||
Currently Kata Containers does not support Docker or Podman.
|
||||
Currently Kata Containers does not support Podman.
|
||||
|
||||
See issue https://github.com/kata-containers/kata-containers/issues/722 for more information.
|
||||
|
||||
Docker supports Kata Containers since 22.06:
|
||||
|
||||
```bash
|
||||
$ sudo docker run --runtime io.containerd.kata.v2
|
||||
```
|
||||
|
||||
Kata Containers works perfectly with containerd, we recommend to use
|
||||
containerd's Docker-style command line tool [`nerdctl`](https://github.com/containerd/nerdctl).
|
||||
|
||||
## Runtime commands
|
||||
|
||||
### checkpoint and restore
|
||||
|
||||
The runtime does not provide `checkpoint` and `restore` commands. There
|
||||
are discussions about using VM save and restore to give us a
|
||||
`[criu](https://github.com/checkpoint-restore/criu)`-like functionality,
|
||||
[`criu`](https://github.com/checkpoint-restore/criu)-like functionality,
|
||||
which might provide a solution.
|
||||
|
||||
Note that the OCI standard does not specify `checkpoint` and `restore`
|
||||
@@ -93,6 +102,42 @@ All other configurations are supported and are working properly.
|
||||
|
||||
## Networking
|
||||
|
||||
### Host network
|
||||
|
||||
Host network (`nerdctl/docker run --net=host`or [Kubernetes `HostNetwork`](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#hosts-namespaces)) is not supported.
|
||||
It is not possible to directly access the host networking configuration
|
||||
from within the VM.
|
||||
|
||||
The `--net=host` option can still be used with `runc` containers and
|
||||
inter-mixed with running Kata Containers, thus enabling use of `--net=host`
|
||||
when necessary.
|
||||
|
||||
It should be noted, currently passing the `--net=host` option into a
|
||||
Kata Container may result in the Kata Container networking setup
|
||||
modifying, re-configuring and therefore possibly breaking the host
|
||||
networking setup. Do not use `--net=host` with Kata Containers.
|
||||
|
||||
### Support for joining an existing VM network
|
||||
|
||||
Docker supports the ability for containers to join another container's
|
||||
namespace with the `docker run --net=container:<name|id>` syntax. This allows
|
||||
multiple containers to share a common network namespace and the network
|
||||
interfaces placed in the network namespace. Kata Containers does not
|
||||
support network namespace sharing. If a Kata Container is setup to
|
||||
share the network namespace of a `runc` container, the runtime
|
||||
effectively takes over all the network interfaces assigned to the
|
||||
namespace and binds them to the VM. Consequently, the `runc` container loses
|
||||
its network connectivity.
|
||||
|
||||
### docker run --link
|
||||
|
||||
The runtime does not support the `docker run --link` command. This
|
||||
command is now deprecated by docker and we have no intention of adding support.
|
||||
Equivalent functionality can be achieved with the newer docker networking commands.
|
||||
|
||||
See more documentation at
|
||||
[docs.docker.com](https://docs.docker.com/network/links/).
|
||||
|
||||
## Resource management
|
||||
|
||||
Due to the way VMs differ in their CPU and memory allocation, and sharing
|
||||
|
||||
@@ -4,11 +4,11 @@
|
||||
## Requirements
|
||||
|
||||
- [hub](https://github.com/github/hub)
|
||||
* Using an [application token](https://github.com/settings/tokens) is required for hub.
|
||||
* Using an [application token](https://github.com/settings/tokens) is required for hub (set to a GITHUB_TOKEN environment variable).
|
||||
|
||||
- GitHub permissions to push tags and create releases in Kata repositories.
|
||||
|
||||
- GPG configured to sign git tags. https://help.github.com/articles/generating-a-new-gpg-key/
|
||||
- GPG configured to sign git tags. https://docs.github.com/en/authentication/managing-commit-signature-verification/generating-a-new-gpg-key
|
||||
|
||||
- You should configure your GitHub to use your ssh keys (to push to branches). See https://help.github.com/articles/adding-a-new-ssh-key-to-your-github-account/.
|
||||
* As an alternative, configure hub to push and fork with HTTPS, `git config --global hub.protocol https` (Not tested yet) *
|
||||
@@ -48,7 +48,7 @@
|
||||
### Merge all bump version Pull requests
|
||||
|
||||
- The above step will create a GitHub pull request in the Kata projects. Trigger the CI using `/test` command on each bump Pull request.
|
||||
- Trigger the test-kata-deploy workflow on the kata-containers repository bump Pull request using `/test_kata_deploy` (monitor under the "action" tab).
|
||||
- Trigger the `test-kata-deploy` workflow which is under the `Actions` tab on the repository GitHub page (make sure to select the correct branch and validate it passes).
|
||||
- Check any failures and fix if needed.
|
||||
- Work with the Kata approvers to verify that the CI works and the pull requests are merged.
|
||||
|
||||
|
||||
@@ -277,7 +277,9 @@ mod tests {
|
||||
|
||||
## Temporary files
|
||||
|
||||
Always delete temporary files on success.
|
||||
Use `t.TempDir()` to create temporary directory. The directory created by
|
||||
`t.TempDir()` is automatically removed when the test and all its subtests
|
||||
complete.
|
||||
|
||||
### Golang temporary files
|
||||
|
||||
@@ -286,11 +288,7 @@ func TestSomething(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// Create a temporary directory
|
||||
tmpdir, err := os.MkdirTemp("", "")
|
||||
assert.NoError(err)
|
||||
|
||||
// Delete it at the end of the test
|
||||
defer os.RemoveAll(tmpdir)
|
||||
tmpdir := t.TempDir()
|
||||
|
||||
// Add test logic that will use the tmpdir here...
|
||||
}
|
||||
@@ -322,7 +320,7 @@ mod tests {
|
||||
|
||||
## Test user
|
||||
|
||||
[Unit tests are run *twice*](https://github.com/kata-containers/tests/blob/main/.ci/go-test.sh):
|
||||
[Unit tests are run *twice*](../src/runtime/go-test.sh):
|
||||
|
||||
- as the current user
|
||||
- as the `root` user (if different to the current user)
|
||||
|
||||
@@ -79,7 +79,7 @@ a "`BUG: feature X not implemented see {bug-url}`" type error.
|
||||
- Don't use multiple log calls when a single log call could be used.
|
||||
|
||||
- Use structured logging where possible to allow
|
||||
[standard tooling](https://github.com/kata-containers/tests/tree/main/cmd/log-parser)
|
||||
[standard tooling](../src/tools/log-parser)
|
||||
be able to extract the log fields.
|
||||
|
||||
### Names
|
||||
|
||||
@@ -11,7 +11,8 @@ Kata Containers design documents:
|
||||
- [`Inotify` support](inotify.md)
|
||||
- [Metrics(Kata 2.0)](kata-2-0-metrics.md)
|
||||
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
|
||||
|
||||
- [Design for direct-assigned volume](direct-blk-device-assignment.md)
|
||||
- [Design for core-scheduling](core-scheduling.md)
|
||||
---
|
||||
|
||||
- [Design proposals](proposals)
|
||||
|
||||
@@ -17,7 +17,7 @@ Kubelet instance is responsible for managing the lifecycle of pods
|
||||
within the nodes and eventually relies on a container runtime to
|
||||
handle execution. The Kubelet architecture decouples lifecycle
|
||||
management from container execution through a dedicated gRPC based
|
||||
[Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/container-runtime-interface-v1.md).
|
||||
[Container Runtime Interface (CRI)](https://github.com/kubernetes/design-proposals-archive/blob/main/node/container-runtime-interface-v1.md).
|
||||
|
||||
In other words, a Kubelet is a CRI client and expects a CRI
|
||||
implementation to handle the server side of the interface.
|
||||
|
||||
@@ -1,5 +1,17 @@
|
||||
# Storage
|
||||
|
||||
## Limits
|
||||
|
||||
Kata Containers is [compatible](README.md#compatibility) with existing
|
||||
standards and runtime. From the perspective of storage, this means no
|
||||
limits are placed on the amount of storage a container
|
||||
[workload](README.md#workload) may use.
|
||||
|
||||
Since cgroups are not able to set limits on storage allocation, if you
|
||||
wish to constrain the amount of storage a container uses, consider
|
||||
using an existing facility such as `quota(1)` limits or
|
||||
[device mapper](#devicemapper) limits.
|
||||
|
||||
## virtio SCSI
|
||||
|
||||
If a block-based graph driver is [configured](README.md#configuration),
|
||||
@@ -20,7 +32,7 @@ For virtio-fs, the [runtime](README.md#runtime) starts one `virtiofsd` daemon
|
||||
## Devicemapper
|
||||
|
||||
The
|
||||
[devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/master/snapshots/devmapper)
|
||||
[devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/main/snapshots/devmapper)
|
||||
is a special case. The `snapshotter` uses dedicated block devices
|
||||
rather than formatted filesystems, and operates at the block level
|
||||
rather than the file level. This knowledge is used to directly use the
|
||||
|
||||
169
docs/design/architecture_3.0/README.md
Normal file
169
docs/design/architecture_3.0/README.md
Normal file
@@ -0,0 +1,169 @@
|
||||
# Kata 3.0 Architecture
|
||||
## Overview
|
||||
In cloud-native scenarios, there is an increased demand for container startup speed, resource consumption, stability, and security, areas where the present Kata Containers runtime is challenged relative to other runtimes. To achieve this, we propose a solid, field-tested and secure Rust version of the kata-runtime.
|
||||
|
||||
Also, we provide the following designs:
|
||||
|
||||
- Turn key solution with builtin `Dragonball` Sandbox
|
||||
- Async I/O to reduce resource consumption
|
||||
- Extensible framework for multiple services, runtimes and hypervisors
|
||||
- Lifecycle management for sandbox and container associated resources
|
||||
|
||||
### Rationale for choosing Rust
|
||||
|
||||
We chose Rust because it is designed as a system language with a focus on efficiency.
|
||||
In contrast to Go, Rust makes a variety of design trade-offs in order to obtain
|
||||
good execution performance, with innovative techniques that, in contrast to C or
|
||||
C++, provide reasonable protection against common memory errors (buffer
|
||||
overflow, invalid pointers, range errors), error checking (ensuring errors are
|
||||
dealt with), thread safety, ownership of resources, and more.
|
||||
|
||||
These benefits were verified in our project when the Kata Containers guest agent
|
||||
was rewritten in Rust. We notably saw a significant reduction in memory usage
|
||||
with the Rust-based implementation.
|
||||
|
||||
|
||||
## Design
|
||||
### Architecture
|
||||

|
||||
### Built-in VMM
|
||||
#### Current Kata 2.x architecture
|
||||

|
||||
As shown in the figure, runtime and VMM are separate processes. The runtime process forks the VMM process and interacts through the inter-process RPC. Typically, process interaction consumes more resources than peers within the process, and it will result in relatively low efficiency. At the same time, the cost of resource operation and maintenance should be considered. For example, when performing resource recovery under abnormal conditions, the exception of any process must be detected by others and activate the appropriate resource recovery process. If there are additional processes, the recovery becomes even more difficult.
|
||||
#### How To Support Built-in VMM
|
||||
We provide `Dragonball` Sandbox to enable built-in VMM by integrating VMM's function into the Rust library. We could perform VMM-related functionalities by using the library. Because runtime and VMM are in the same process, there is a benefit in terms of message processing speed and API synchronization. It can also guarantee the consistency of the runtime and the VMM life cycle, reducing resource recovery and exception handling maintenance, as shown in the figure:
|
||||

|
||||
### Async Support
|
||||
#### Why Need Async
|
||||
**Async is already in stable Rust and allows us to write async code**
|
||||
|
||||
- Async provides significantly reduced CPU and memory overhead, especially for workloads with a large amount of IO-bound tasks
|
||||
- Async is zero-cost in Rust, which means that you only pay for what you use. Specifically, you can use async without heap allocations and dynamic dispatch, which greatly improves efficiency
|
||||
- For more (see [Why Async?](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) and [The State of Asynchronous Rust](https://rust-lang.github.io/async-book/01_getting_started/03_state_of_async_rust.html)).
|
||||
|
||||
**There may be several problems if implementing kata-runtime with Sync Rust**
|
||||
|
||||
- Too many threads with a new TTRPC connection
|
||||
- TTRPC threads: reaper thread(1) + listener thread(1) + client handler(2)
|
||||
- Add 3 I/O threads with a new container
|
||||
- In Sync mode, implementing a timeout mechanism is challenging. For example, in TTRPC API interaction, the timeout mechanism is difficult to align with Golang
|
||||
#### How To Support Async
|
||||
The number of OS threads used by the kata-runtime is controlled by `TOKIO_RUNTIME_WORKER_THREADS`, which is 2 threads by default. TTRPC and container-related threads run as `tokio` tasks in a unified manner, and related dependencies need to be switched to Async, such as Timer, File, Netlink, etc. With the help of Async, we can easily support non-blocking I/O and timers. Currently, we only utilize Async for kata-runtime. The built-in VMM keeps the OS thread because it can ensure that the threads are controllable.
|
||||
|
||||
**For N tokio worker threads and M containers**
|
||||
|
||||
- Sync runtime(both OS thread and `tokio` task are OS thread but without `tokio` worker thread) OS thread number: 4 + 12*M
|
||||
- Async runtime(only OS thread is OS thread) OS thread number: 2 + N
|
||||
```shell
|
||||
├─ main(OS thread)
|
||||
├─ async-logger(OS thread)
|
||||
└─ tokio worker(N * OS thread)
|
||||
├─ agent log forwarder(1 * tokio task)
|
||||
├─ health check thread(1 * tokio task)
|
||||
├─ TTRPC reaper thread(M * tokio task)
|
||||
├─ TTRPC listener thread(M * tokio task)
|
||||
├─ TTRPC client handler thread(7 * M * tokio task)
|
||||
├─ container stdin io thread(M * tokio task)
|
||||
├─ container stdin io thread(M * tokio task)
|
||||
└─ container stdin io thread(M * tokio task)
|
||||
```
|
||||
### Extensible Framework
|
||||
The Kata 3.x runtime is designed with the extension of service, runtime, and hypervisor, combined with configuration to meet the needs of different scenarios. At present, the service provides a register mechanism to support multiple services. Services could interact with runtime through messages. In addition, the runtime handler handles messages from services. To meet the needs of a binary that supports multiple runtimes and hypervisors, the startup must obtain the runtime handler type and hypervisor type through configuration.
|
||||
|
||||

|
||||
### Resource Manager
|
||||
In our case, there will be a variety of resources, and every resource has several subtypes. Especially for `Virt-Container`, every subtype of resource has different operations. And there may be dependencies, such as the share-fs rootfs and the share-fs volume will use share-fs resources to share files to the VM. Currently, network and share-fs are regarded as sandbox resources, while rootfs, volume, and cgroup are regarded as container resources. Also, we abstract a common interface for each resource and use subclass operations to evaluate the differences between different subtypes.
|
||||

|
||||
|
||||
## Roadmap
|
||||
|
||||
- Stage 1 (June): provide basic features (current delivered)
|
||||
- Stage 2 (September): support common features
|
||||
- Stage 3: support full features
|
||||
|
||||
| **Class** | **Sub-Class** | **Development Stage** |
|
||||
| -------------------------- | ------------------- | --------------------- |
|
||||
| Service | task service | Stage 1 |
|
||||
| | extend service | Stage 3 |
|
||||
| | image service | Stage 3 |
|
||||
| Runtime handler | `Virt-Container` | Stage 1 |
|
||||
| | `Wasm-Container` | Stage 3 |
|
||||
| | `Linux-Container` | Stage 3 |
|
||||
| Endpoint | VETH Endpoint | Stage 1 |
|
||||
| | Physical Endpoint | Stage 2 |
|
||||
| | Tap Endpoint | Stage 2 |
|
||||
| | `Tuntap` Endpoint | Stage 2 |
|
||||
| | `IPVlan` Endpoint | Stage 3 |
|
||||
| | `MacVlan` Endpoint | Stage 3 |
|
||||
| | MACVTAP Endpoint | Stage 3 |
|
||||
| | `VhostUserEndpoint` | Stage 3 |
|
||||
| Network Interworking Model | Tc filter | Stage 1 |
|
||||
| | `MacVtap` | Stage 3 |
|
||||
| Storage | Virtio-fs | Stage 1 |
|
||||
| | `nydus` | Stage 2 |
|
||||
| Hypervisor | `Dragonball` | Stage 1 |
|
||||
| | QEMU | Stage 2 |
|
||||
| | ACRN | Stage 3 |
|
||||
| | Cloud Hypervisor | Stage 3 |
|
||||
| | Firecracker | Stage 3 |
|
||||
|
||||
## FAQ
|
||||
|
||||
- Are the "service", "message dispatcher" and "runtime handler" all part of the single Kata 3.x runtime binary?
|
||||
|
||||
Yes. They are components in Kata 3.x runtime. And they will be packed into one binary.
|
||||
1. Service is an interface, which is responsible for handling multiple services like the task service, image service, etc.
|
||||
2. Message dispatcher, it is used to match multiple requests from the service module.
|
||||
3. Runtime handler is used to deal with the operation for sandbox and container.
|
||||
- What is the name of the Kata 3.x runtime binary?
|
||||
|
||||
Apparently we can't use `containerd-shim-v2-kata` because it's already used. We are facing the hardest issue of "naming" again. Any suggestions are welcome.
|
||||
Internally we use `containerd-shim-v2-rund`.
|
||||
|
||||
- Is the Kata 3.x design compatible with the containerd shimv2 architecture?
|
||||
|
||||
Yes. It is designed to follow the functionality of go version kata. And it implements the `containerd shim v2` interface/protocol.
|
||||
|
||||
- How will users migrate to the Kata 3.x architecture?
|
||||
|
||||
The migration plan will be provided before the Kata 3.x is merging into the main branch.
|
||||
|
||||
- Is `Dragonball` limited to its own built-in VMM? Can the `Dragonball` system be configured to work using an external `Dragonball` VMM/hypervisor?
|
||||
|
||||
The `Dragonball` could work as an external hypervisor. However, stability and performance is challenging in this case. Built in VMM could optimise the container overhead, and it's easy to maintain stability.
|
||||
|
||||
`runD` is the `containerd-shim-v2` counterpart of `runC` and can run a pod/containers. `Dragonball` is a `microvm`/VMM that is designed to run container workloads. Instead of `microvm`/VMM, we sometimes refer to it as secure sandbox.
|
||||
|
||||
- QEMU, Cloud Hypervisor and Firecracker support are planned, but how would that work? Would they run in a separate process?
|
||||
|
||||
Yes. They are unable to work as a built-in VMM.
|
||||
|
||||
- What is `upcall`?
|
||||
|
||||
The `upcall` is used to hotplug CPU/memory/MMIO devices, and it solves two issues.
|
||||
1. avoid dependency on PCI/ACPI
|
||||
2. avoid dependency on `udevd` within guest and get deterministic results for hotplug operations. So `upcall` is an alternative to ACPI based CPU/memory/device hotplug. And we may cooperate with the community to add support for ACPI based CPU/memory/device hotplug if needed.
|
||||
|
||||
`Dbs-upcall` is a `vsock-based` direct communication tool between VMM and guests. The server side of the `upcall` is a driver in guest kernel (kernel patches are needed for this feature) and it'll start to serve the requests once the kernel has started. And the client side is in VMM , it'll be a thread that communicates with VSOCK through `uds`. We have accomplished device hotplug / hot-unplug directly through `upcall` in order to avoid virtualization of ACPI to minimize virtual machine's overhead. And there could be many other usage through this direct communication channel. It's already open source.
|
||||
https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall
|
||||
|
||||
- The URL below says the kernel patches work with 4.19, but do they also work with 5.15+ ?
|
||||
|
||||
Forward compatibility should be achievable, we have ported it to 5.10 based kernel.
|
||||
|
||||
- Are these patches platform-specific or would they work for any architecture that supports VSOCK?
|
||||
|
||||
It's almost platform-independent, but some messages related to CPU hotplug are platform-dependent.
|
||||
|
||||
- Could the kernel driver be replaced with a userland daemon in the guest using loopback VSOCK?
|
||||
|
||||
We need to create device nodes for hot-added CPU/memory/devices, so it's not easy for userspace daemon to do these tasks.
|
||||
|
||||
- The fact that `upcall` allows communication between the VMM and the guest suggests that this architecture might be incompatible with https://github.com/confidential-containers where the VMM should have no knowledge of what happens inside the VM.
|
||||
|
||||
1. `TDX` doesn't support CPU/memory hotplug yet.
|
||||
2. For ACPI based device hotplug, it depends on the ACPI `DSDT` table, and the guest kernel will execute `ASL` code to handle those hotplug events. It should be easier to audit VSOCK based communication than ACPI `ASL` methods.
|
||||
|
||||
- What is the security boundary for the monolithic / "Built-in VMM" case?
|
||||
|
||||
It has the security boundary of virtualization. More details will be provided in next stage.
|
||||
BIN
docs/design/architecture_3.0/images/architecture.png
Normal file
BIN
docs/design/architecture_3.0/images/architecture.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 95 KiB |
BIN
docs/design/architecture_3.0/images/built_in_vmm.png
Normal file
BIN
docs/design/architecture_3.0/images/built_in_vmm.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 66 KiB |
BIN
docs/design/architecture_3.0/images/framework.png
Normal file
BIN
docs/design/architecture_3.0/images/framework.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 136 KiB |
BIN
docs/design/architecture_3.0/images/not_built_in_vmm.png
Normal file
BIN
docs/design/architecture_3.0/images/not_built_in_vmm.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
BIN
docs/design/architecture_3.0/images/resourceManager.png
Normal file
BIN
docs/design/architecture_3.0/images/resourceManager.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 139 KiB |
File diff suppressed because one or more lines are too long
12
docs/design/core-scheduling.md
Normal file
12
docs/design/core-scheduling.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Core scheduling
|
||||
|
||||
Core scheduling is a Linux kernel feature that allows only trusted tasks to run concurrently on
|
||||
CPUs sharing compute resources (for example, hyper-threads on a core).
|
||||
|
||||
Containerd versions >= 1.6.4 leverage this to treat all of the processes associated with a
|
||||
given pod or container to be a single group of trusted tasks. To indicate this should be carried
|
||||
out, containerd sets the `SCHED_CORE` environment variable for each shim it spawns. When this is
|
||||
set, the Kata Containers shim implementation uses the `prctl` syscall to create a new core scheduling
|
||||
domain for the shim process itself as well as future VMM processes it will start.
|
||||
|
||||
For more details on the core scheduling feature, see the [Linux documentation](https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html).
|
||||
253
docs/design/direct-blk-device-assignment.md
Normal file
253
docs/design/direct-blk-device-assignment.md
Normal file
@@ -0,0 +1,253 @@
|
||||
# Motivation
|
||||
Today, there exist a few gaps between Container Storage Interface (CSI) and virtual machine (VM) based runtimes such as Kata Containers
|
||||
that prevent them from working together smoothly.
|
||||
|
||||
First, it’s cumbersome to use a persistent volume (PV) with Kata Containers. Today, for a PV with Filesystem volume mode, Virtio-fs
|
||||
is the only way to surface it inside a Kata Container guest VM. But often mounting the filesystem (FS) within the guest operating system (OS) is
|
||||
desired due to performance benefits, availability of native FS features and security benefits over the Virtio-fs mechanism.
|
||||
|
||||
Second, it’s difficult if not impossible to resize a PV online with Kata Containers. While a PV can be expanded on the host OS,
|
||||
the updated metadata needs to be propagated to the guest OS in order for the application container to use the expanded volume.
|
||||
Currently, there is not a way to propagate the PV metadata from the host OS to the guest OS without restarting the Pod sandbox.
|
||||
|
||||
# Proposed Solution
|
||||
|
||||
Because of the OS boundary, these features cannot be implemented in the CSI node driver plugin running on the host OS
|
||||
as is normally done in the runc container. Instead, they can be done by the Kata Containers agent inside the guest OS,
|
||||
but it requires the CSI driver to pass the relevant information to the Kata Containers runtime.
|
||||
An ideal long term solution would be to have the `kubelet` coordinating the communication between the CSI driver and
|
||||
the container runtime, as described in [KEP-2857](https://github.com/kubernetes/enhancements/pull/2893/files).
|
||||
However, as the KEP is still under review, we would like to propose a short/medium term solution to unblock our use case.
|
||||
|
||||
The proposed solution is built on top of a previous [proposal](https://github.com/egernst/kata-containers/blob/da-proposal/docs/design/direct-assign-volume.md)
|
||||
described by Eric Ernst. The previous proposal has two gaps:
|
||||
|
||||
1. Writing a `csiPlugin.json` file to the volume root path introduced a security risk. A malicious user can gain unauthorized
|
||||
access to a block device by writing their own `csiPlugin.json` to the above location through an ephemeral CSI plugin.
|
||||
|
||||
2. The proposal didn't describe how to establish a mapping between a volume and a kata sandbox, which is needed for
|
||||
implementing CSI volume resize and volume stat collection APIs.
|
||||
|
||||
This document particularly focuses on how to address these two gaps.
|
||||
|
||||
## Assumptions and Limitations
|
||||
1. The proposal assumes that a block device volume will only be used by one Pod on a node at a time, which we believe
|
||||
is the most common pattern in Kata Containers use cases. It’s also unsafe to have the same block device attached to more than
|
||||
one Kata pod. In the context of Kubernetes, the `PersistentVolumeClaim` (PVC) needs to have the `accessMode` as `ReadWriteOncePod`.
|
||||
2. More advanced Kubernetes volume features such as, `fsGroup`, `fsGroupChangePolicy`, and `subPath` are not supported.
|
||||
|
||||
## End User Interface
|
||||
|
||||
1. The user specifies a PV as a direct-assigned volume. How a PV is specified as a direct-assigned volume is left for each CSI implementation to decide.
|
||||
There are a few options for reference:
|
||||
1. A storage class parameter specifies whether it's a direct-assigned volume. This avoids any lookups of PVC
|
||||
or Pod information from the CSI plugin (as external provisioner takes care of these). However, all PVs in the storage class with the parameter set
|
||||
will have host mounts skipped.
|
||||
2. Use a PVC annotation. This approach requires the CSI plugins have `--extra-create-metadata` [set](https://kubernetes-csi.github.io/docs/external-provisioner.html#persistentvolumeclaim-and-persistentvolume-parameters)
|
||||
to be able to perform a lookup of the PVC annotations from the API server. Pro: API server lookup of annotations only required during creation of PV.
|
||||
Con: The CSI plugin will always skip host mounting of the PV.
|
||||
3. The CSI plugin can also lookup pod `runtimeclass` during `NodePublish`. This approach can be found in the [ALIBABA CSI plugin](https://github.com/kubernetes-sigs/alibaba-cloud-csi-driver/blob/master/pkg/disk/nodeserver.go#L248).
|
||||
2. The CSI node driver delegates the direct assigned volume to the Kata Containers runtime. The CSI node driver APIs need to
|
||||
be modified to pass the volume mount information and collect volume information to/from the Kata Containers runtime by invoking `kata-runtime` command line commands.
|
||||
* **NodePublishVolume** -- It invokes `kata-runtime direct-volume add --volume-path [volumePath] --mount-info [mountInfo]`
|
||||
to propagate the volume mount information to the Kata Containers runtime for it to carry out the filesystem mount operation.
|
||||
The `volumePath` is the [target_path](https://github.com/container-storage-interface/spec/blob/master/csi.proto#L1364) in the CSI `NodePublishVolumeRequest`.
|
||||
The `mountInfo` is a serialized JSON string.
|
||||
* **NodeGetVolumeStats** -- It invokes `kata-runtime direct-volume stats --volume-path [volumePath]` to retrieve the filesystem stats of direct-assigned volume.
|
||||
* **NodeExpandVolume** -- It invokes `kata-runtime direct-volume resize --volume-path [volumePath] --size [size]` to send a resize request to the Kata Containers runtime to
|
||||
resize the direct-assigned volume.
|
||||
* **NodeStageVolume/NodeUnStageVolume** -- It invokes `kata-runtime direct-volume remove --volume-path [volumePath]` to remove the persisted metadata of a direct-assigned volume.
|
||||
|
||||
The `mountInfo` object is defined as follows:
|
||||
```Golang
|
||||
type MountInfo struct {
|
||||
// The type of the volume (ie. block)
|
||||
VolumeType string `json:"volume-type"`
|
||||
// The device backing the volume.
|
||||
Device string `json:"device"`
|
||||
// The filesystem type to be mounted on the volume.
|
||||
FsType string `json:"fstype"`
|
||||
// Additional metadata to pass to the agent regarding this volume.
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
// Additional mount options.
|
||||
Options []string `json:"options,omitempty"`
|
||||
}
|
||||
```
|
||||
Notes: given that the `mountInfo` is persisted to the disk by the Kata runtime, it shouldn't container any secrets (such as SMB mount password).
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Kata runtime
|
||||
Instead of the CSI node driver writing the mount info into a `csiPlugin.json` file under the volume root,
|
||||
as described in the original proposal, here we propose that the CSI node driver passes the mount information to
|
||||
the Kata Containers runtime through a new `kata-runtime` commandline command. The `kata-runtime` then writes the mount
|
||||
information to a `mount-info.json` file in a predefined location (`/run/kata-containers/shared/direct-volumes/[volume_path]/`).
|
||||
|
||||
When the Kata Containers runtime starts a container, it verifies whether a volume mount is a direct-assigned volume by checking
|
||||
whether there is a `mountInfo` file under the computed Kata `direct-volumes` directory. If it is, the runtime parses the `mountInfo` file,
|
||||
updates the mount spec with the data in `mountInfo`. The updated mount spec is then passed to the Kata agent in the guest VM together
|
||||
with other mounts. The Kata Containers runtime also creates a file named by the sandbox id under the `direct-volumes/[volume_path]/`
|
||||
directory. The reason for adding a sandbox id file is to establish a mapping between the volume and the sandbox using it.
|
||||
Later, when the Kata Containers runtime handles the `get-stats` and `resize` commands, it uses the sandbox id to identify
|
||||
the endpoint of the corresponding `containerd-shim-kata-v2`.
|
||||
|
||||
### containerd-shim-kata-v2 changes
|
||||
`containerd-shim-kata-v2` provides an API for sandbox management through a Unix domain socket. Two new handlers are proposed: `/direct-volume/stats` and `/direct-volume/resize`:
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
$ curl --unix-socket "$shim_socket_path" -I -X GET 'http://localhost/direct-volume/stats/[urlSafeVolumePath]'
|
||||
$ curl --unix-socket "$shim_socket_path" -I -X POST 'http://localhost/direct-volume/resize' -d '{ "volumePath"": [volumePath], "Size": "123123" }'
|
||||
```
|
||||
|
||||
The shim then forwards the corresponding request to the `kata-agent` to carry out the operations inside the guest VM. For `resize` operation,
|
||||
the Kata runtime also needs to notify the hypervisor to resize the block device (e.g. call `block_resize` in QEMU).
|
||||
|
||||
### Kata agent changes
|
||||
|
||||
The mount spec of a direct-assigned volume is passed to `kata-agent` through the existing `Storage` GRPC object.
|
||||
Two new APIs and three new GRPC objects are added to GRPC protocol between the shim and agent for resizing and getting volume stats:
|
||||
```protobuf
|
||||
|
||||
rpc GetVolumeStats(VolumeStatsRequest) returns (VolumeStatsResponse);
|
||||
rpc ResizeVolume(ResizeVolumeRequest) returns (google.protobuf.Empty);
|
||||
|
||||
message VolumeStatsRequest {
|
||||
// The volume path on the guest outside the container
|
||||
string volume_guest_path = 1;
|
||||
}
|
||||
|
||||
message ResizeVolumeRequest {
|
||||
// Full VM guest path of the volume (outside the container)
|
||||
string volume_guest_path = 1;
|
||||
uint64 size = 2;
|
||||
}
|
||||
|
||||
// This should be kept in sync with CSI NodeGetVolumeStatsResponse (https://github.com/container-storage-interface/spec/blob/v1.5.0/csi.proto)
|
||||
message VolumeStatsResponse {
|
||||
// This field is OPTIONAL.
|
||||
repeated VolumeUsage usage = 1;
|
||||
// Information about the current condition of the volume.
|
||||
// This field is OPTIONAL.
|
||||
// This field MUST be specified if the VOLUME_CONDITION node
|
||||
// capability is supported.
|
||||
VolumeCondition volume_condition = 2;
|
||||
}
|
||||
message VolumeUsage {
|
||||
enum Unit {
|
||||
UNKNOWN = 0;
|
||||
BYTES = 1;
|
||||
INODES = 2;
|
||||
}
|
||||
// The available capacity in specified Unit. This field is OPTIONAL.
|
||||
// The value of this field MUST NOT be negative.
|
||||
uint64 available = 1;
|
||||
|
||||
// The total capacity in specified Unit. This field is REQUIRED.
|
||||
// The value of this field MUST NOT be negative.
|
||||
uint64 total = 2;
|
||||
|
||||
// The used capacity in specified Unit. This field is OPTIONAL.
|
||||
// The value of this field MUST NOT be negative.
|
||||
uint64 used = 3;
|
||||
|
||||
// Units by which values are measured. This field is REQUIRED.
|
||||
Unit unit = 4;
|
||||
}
|
||||
|
||||
// VolumeCondition represents the current condition of a volume.
|
||||
message VolumeCondition {
|
||||
|
||||
// Normal volumes are available for use and operating optimally.
|
||||
// An abnormal volume does not meet these criteria.
|
||||
// This field is REQUIRED.
|
||||
bool abnormal = 1;
|
||||
|
||||
// The message describing the condition of the volume.
|
||||
// This field is REQUIRED.
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### Step by step walk-through
|
||||
|
||||
Given the following definition:
|
||||
```YAML
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: app
|
||||
spec:
|
||||
runtimeClassName: kata-qemu
|
||||
containers:
|
||||
- name: app
|
||||
image: centos
|
||||
command: ["/bin/sh"]
|
||||
args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 5; done"]
|
||||
volumeMounts:
|
||||
- name: persistent-storage
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: persistent-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: ebs-claim
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
annotations:
|
||||
skip-hostmount: "true"
|
||||
name: ebs-claim
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOncePod
|
||||
volumeMode: Filesystem
|
||||
storageClassName: ebs-sc
|
||||
resources:
|
||||
requests:
|
||||
storage: 4Gi
|
||||
---
|
||||
kind: StorageClass
|
||||
apiVersion: storage.k8s.io/v1
|
||||
metadata:
|
||||
name: ebs-sc
|
||||
provisioner: ebs.csi.aws.com
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
parameters:
|
||||
csi.storage.k8s.io/fstype: ext4
|
||||
|
||||
```
|
||||
Let’s assume that changes have been made in the `aws-ebs-csi-driver` node driver.
|
||||
|
||||
**Node publish volume**
|
||||
1. In the node CSI driver, the `NodePublishVolume` API invokes: `kata-runtime direct-volume add --volume-path "/kubelet/a/b/c/d/sdf" --mount-info "{\"Device\": \"/dev/sdf\", \"fstype\": \"ext4\"}"`.
|
||||
2. The `Kata-runtime` writes the mount-info JSON to a file called `mountInfo.json` under `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
|
||||
|
||||
**Node unstage volume**
|
||||
1. In the node CSI driver, the `NodeUnstageVolume` API invokes: `kata-runtime direct-volume remove --volume-path "/kubelet/a/b/c/d/sdf"`.
|
||||
2. Kata-runtime deletes the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
|
||||
|
||||
**Use the volume in sandbox**
|
||||
1. Upon the request to start a container, the `containerd-shim-kata-v2` examines the container spec,
|
||||
and iterates through the mounts. For each mount, if there is a `mountInfo.json` file under `/run/kata-containers/shared/direct-volumes/[mount source path]`,
|
||||
it generates a `storage` GRPC object after overwriting the mount spec with the information in `mountInfo.json`.
|
||||
2. The shim sends the storage objects to kata-agent through TTRPC.
|
||||
3. The shim writes a file with the sandbox id as the name under `/run/kata-containers/shared/direct-volumes/[mount source path]`.
|
||||
4. The kata-agent mounts the storage objects for the container.
|
||||
|
||||
**Node expand volume**
|
||||
1. In the node CSI driver, the `NodeExpandVolume` API invokes: `kata-runtime direct-volume resize --volume-path "/kubelet/a/b/c/d/sdf" --size 8Gi`.
|
||||
2. The Kata runtime checks whether there is a sandbox id file under the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
|
||||
3. The Kata runtime identifies the shim instance through the sandbox id, and sends a GRPC request to resize the volume.
|
||||
4. The shim handles the request, asks the hypervisor to resize the block device and sends a GRPC request to Kata agent to resize the filesystem.
|
||||
5. Kata agent receives the request and resizes the filesystem.
|
||||
|
||||
**Node get volume stats**
|
||||
1. In the node CSI driver, the `NodeGetVolumeStats` API invokes: `kata-runtime direct-volume stats --volume-path "/kubelet/a/b/c/d/sdf"`.
|
||||
2. The Kata runtime checks whether there is a sandbox id file under the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
|
||||
3. The Kata runtime identifies the shim instance through the sandbox id, and sends a GRPC request to get the volume stats.
|
||||
4. The shim handles the request and forwards it to the Kata agent.
|
||||
5. Kata agent receives the request and returns the filesystem stats.
|
||||
@@ -12,7 +12,7 @@ The OCI [runtime specification][linux-config] provides guidance on where the con
|
||||
> [`cgroupsPath`][cgroupspath]: (string, OPTIONAL) path to the cgroups. It can be used to either control the cgroups
|
||||
> hierarchy for containers or to run a new process in an existing container
|
||||
|
||||
Cgroups are hierarchical, and this can be seen with the following pod example:
|
||||
The cgroups are hierarchical, and this can be seen with the following pod example:
|
||||
|
||||
- Pod 1: `cgroupsPath=/kubepods/pod1`
|
||||
- Container 1: `cgroupsPath=/kubepods/pod1/container1`
|
||||
@@ -247,14 +247,14 @@ cgroup size and constraints accordingly.
|
||||
|
||||
# Supported cgroups
|
||||
|
||||
Kata Containers currently only supports cgroups `v1`.
|
||||
Kata Containers currently supports cgroups `v1` and `v2`.
|
||||
|
||||
In the following sections each cgroup is described briefly.
|
||||
|
||||
## Cgroups V1
|
||||
## cgroups v1
|
||||
|
||||
`Cgroups V1` are under a [`tmpfs`][1] filesystem mounted at `/sys/fs/cgroup`, where each cgroup is
|
||||
mounted under a separate cgroup filesystem. A `Cgroups v1` hierarchy may look like the following
|
||||
`cgroups v1` are under a [`tmpfs`][1] filesystem mounted at `/sys/fs/cgroup`, where each cgroup is
|
||||
mounted under a separate cgroup filesystem. A `cgroups v1` hierarchy may look like the following
|
||||
diagram:
|
||||
|
||||
```
|
||||
@@ -301,13 +301,12 @@ diagram:
|
||||
A process can join a cgroup by writing its process id (`pid`) to `cgroup.procs` file,
|
||||
or join a cgroup partially by writing the task (thread) id (`tid`) to the `tasks` file.
|
||||
|
||||
Kata Containers only supports `v1`.
|
||||
To know more about `cgroups v1`, see [cgroupsv1(7)][2].
|
||||
|
||||
## Cgroups V2
|
||||
## cgroups v2
|
||||
|
||||
`Cgroups v2` are also known as unified cgroups, unlike `cgroups v1`, the cgroups are
|
||||
mounted under the same cgroup filesystem. A `Cgroups v2` hierarchy may look like the following
|
||||
`cgroups v2` are also known as unified cgroups, unlike `cgroups v1`, the cgroups are
|
||||
mounted under the same cgroup filesystem. A `cgroups v2` hierarchy may look like the following
|
||||
diagram:
|
||||
|
||||
```
|
||||
@@ -354,8 +353,6 @@ Same as `cgroups v1`, a process can join the cgroup by writing its process id (`
|
||||
`cgroup.procs` file, or join a cgroup partially by writing the task (thread) id (`tid`) to
|
||||
`cgroup.threads` file.
|
||||
|
||||
Kata Containers does not support cgroups `v2` on the host.
|
||||
|
||||
### Distro Support
|
||||
|
||||
Many Linux distributions do not yet support `cgroups v2`, as it is quite a recent addition.
|
||||
|
||||
@@ -51,6 +51,7 @@ The `kata-monitor` management agent should be started on each node where the Kat
|
||||
> **Note**: a *node* running Kata containers will be either a single host system or a worker node belonging to a K8s cluster capable of running Kata pods.
|
||||
|
||||
- Aggregate sandbox metrics running on the node, adding the `sandbox_id` label to them.
|
||||
- Attach the additional `cri_uid`, `cri_name` and `cri_namespace` labels to the sandbox metrics, tracking the `uid`, `name` and `namespace` Kubernetes pod metadata.
|
||||
- Expose a new Prometheus target, allowing all node metrics coming from the Kata shim to be collected by Prometheus indirectly. This simplifies the targets count in Prometheus and avoids exposing shim's metrics by `ip:port`.
|
||||
|
||||
Only one `kata-monitor` process runs on each node.
|
||||
|
||||
@@ -39,7 +39,7 @@ Details of each solution and a summary are provided below.
|
||||
Kata Containers with QEMU has complete compatibility with Kubernetes.
|
||||
|
||||
Depending on the host architecture, Kata Containers supports various machine types,
|
||||
for example `pc` and `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers
|
||||
for example `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers
|
||||
machine type is `q35`. The machine type and its [`Machine accelerators`](#machine-accelerators) can
|
||||
be changed by editing the runtime [`configuration`](architecture/README.md#configuration) file.
|
||||
|
||||
@@ -60,9 +60,8 @@ Machine accelerators are architecture specific and can be used to improve the pe
|
||||
and enable specific features of the machine types. The following machine accelerators
|
||||
are used in Kata Containers:
|
||||
|
||||
- NVDIMM: This machine accelerator is x86 specific and only supported by `pc` and
|
||||
`q35` machine types. `nvdimm` is used to provide the root filesystem as a persistent
|
||||
memory device to the Virtual Machine.
|
||||
- NVDIMM: This machine accelerator is x86 specific and only supported by `q35` machine types.
|
||||
`nvdimm` is used to provide the root filesystem as a persistent memory device to the Virtual Machine.
|
||||
|
||||
#### Hotplug devices
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
- [Run Kata containers with `crictl`](run-kata-with-crictl.md)
|
||||
- [Run Kata Containers with Kubernetes](run-kata-with-k8s.md)
|
||||
- [How to use Kata Containers and Containerd](containerd-kata.md)
|
||||
- [How to use Kata Containers and CRI (containerd) with Kubernetes](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
- [How to use Kata Containers and containerd with Kubernetes](how-to-use-k8s-with-containerd-and-kata.md)
|
||||
- [Kata Containers and service mesh for Kubernetes](service-mesh.md)
|
||||
- [How to import Kata Containers logs into Fluentd](how-to-import-kata-logs-with-fluentd.md)
|
||||
|
||||
@@ -15,6 +15,11 @@
|
||||
- `qemu`
|
||||
- `cloud-hypervisor`
|
||||
- `firecracker`
|
||||
|
||||
In the case of `firecracker` the use of a block device `snapshotter` is needed
|
||||
for the VM rootfs. Refer to the following guide for additional configuration
|
||||
steps:
|
||||
- [Setup Kata containers with `firecracker`](how-to-use-kata-containers-with-firecracker.md)
|
||||
- `ACRN`
|
||||
|
||||
While `qemu` , `cloud-hypervisor` and `firecracker` work out of the box with installation of Kata,
|
||||
|
||||
@@ -40,7 +40,7 @@ use `RuntimeClass` instead of the deprecated annotations.
|
||||
### Containerd Runtime V2 API: Shim V2 API
|
||||
|
||||
The [`containerd-shim-kata-v2` (short as `shimv2` in this documentation)](../../src/runtime/cmd/containerd-shim-kata-v2/)
|
||||
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/master/runtime/v2) for Kata.
|
||||
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/runtime/v2) for Kata.
|
||||
With `shimv2`, Kubernetes can launch Pod and OCI-compatible containers with one shim per Pod. Prior to `shimv2`, `2N+1`
|
||||
shims (i.e. a `containerd-shim` and a `kata-shim` for each container and the Pod sandbox itself) and no standalone `kata-proxy`
|
||||
process were used, even with VSOCK not available.
|
||||
@@ -72,7 +72,6 @@ $ command -v containerd
|
||||
|
||||
### Install CNI plugins
|
||||
|
||||
> **Note:** You do not need to install CNI plugins if you do not want to use containerd with Kubernetes.
|
||||
> If you have installed Kubernetes with `kubeadm`, you might have already installed the CNI plugins.
|
||||
|
||||
You can manually install CNI plugins as follows:
|
||||
@@ -94,8 +93,8 @@ $ popd
|
||||
You can install the `cri-tools` from source code:
|
||||
|
||||
```bash
|
||||
$ go get github.com/kubernetes-incubator/cri-tools
|
||||
$ pushd $GOPATH/src/github.com/kubernetes-incubator/cri-tools
|
||||
$ go get github.com/kubernetes-sigs/cri-tools
|
||||
$ pushd $GOPATH/src/github.com/kubernetes-sigs/cri-tools
|
||||
$ make
|
||||
$ sudo -E make install
|
||||
$ popd
|
||||
@@ -131,74 +130,42 @@ For
|
||||
|
||||
The `RuntimeClass` is suggested.
|
||||
|
||||
The following configuration includes three runtime classes:
|
||||
The following configuration includes two runtime classes:
|
||||
- `plugins.cri.containerd.runtimes.runc`: the runc, and it is the default runtime.
|
||||
- `plugins.cri.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/master/runtime/v2#binary-naming))
|
||||
- `plugins.cri.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/main/runtime/v2#binary-naming))
|
||||
where the dot-connected string `io.containerd.kata.v2` is translated to `containerd-shim-kata-v2` (i.e. the
|
||||
binary name of the Kata implementation of [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/master/runtime/v2)).
|
||||
- `plugins.cri.containerd.runtimes.katacli`: the `containerd-shim-runc-v1` calls `kata-runtime`, which is the legacy process.
|
||||
binary name of the Kata implementation of [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/runtime/v2)).
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
no_pivot = false
|
||||
[plugins.cri.containerd.runtimes]
|
||||
[plugins.cri.containerd.runtimes.runc]
|
||||
runtime_type = "io.containerd.runc.v1"
|
||||
[plugins.cri.containerd.runtimes.runc.options]
|
||||
NoPivotRoot = false
|
||||
NoNewKeyring = false
|
||||
ShimCgroup = ""
|
||||
IoUid = 0
|
||||
IoGid = 0
|
||||
BinaryName = "runc"
|
||||
Root = ""
|
||||
CriuPath = ""
|
||||
SystemdCgroup = false
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
|
||||
privileged_without_host_devices = false
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
|
||||
BinaryName = ""
|
||||
CriuImagePath = ""
|
||||
CriuPath = ""
|
||||
CriuWorkPath = ""
|
||||
IoGid = 0
|
||||
[plugins.cri.containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
[plugins.cri.containerd.runtimes.katacli]
|
||||
runtime_type = "io.containerd.runc.v1"
|
||||
[plugins.cri.containerd.runtimes.katacli.options]
|
||||
NoPivotRoot = false
|
||||
NoNewKeyring = false
|
||||
ShimCgroup = ""
|
||||
IoUid = 0
|
||||
IoGid = 0
|
||||
BinaryName = "/usr/bin/kata-runtime"
|
||||
Root = ""
|
||||
CriuPath = ""
|
||||
SystemdCgroup = false
|
||||
```
|
||||
|
||||
From Containerd v1.2.4 and Kata v1.6.0, there is a new runtime option supported, which allows you to specify a specific Kata configuration file as follows:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
privileged_without_host_devices = true
|
||||
[plugins.cri.containerd.runtimes.kata.options]
|
||||
ConfigPath = "/etc/kata-containers/config.toml"
|
||||
privileged_without_host_devices = true
|
||||
pod_annotations = ["io.katacontainers.*"]
|
||||
container_annotations = ["io.katacontainers.*"]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
`privileged_without_host_devices` tells containerd that a privileged Kata container should not have direct access to all host devices. If unset, containerd will pass all host devices to Kata container, which may cause security issues.
|
||||
|
||||
`pod_annotations` is the list of pod annotations passed to both the pod sandbox as well as container through the OCI config.
|
||||
|
||||
`container_annotations` is the list of container annotations passed through to the OCI config of the containers.
|
||||
|
||||
This `ConfigPath` option is optional. If you do not specify it, shimv2 first tries to get the configuration file from the environment variable `KATA_CONF_FILE`. If neither are set, shimv2 will use the default Kata configuration file paths (`/etc/kata-containers/configuration.toml` and `/usr/share/defaults/kata-containers/configuration.toml`).
|
||||
|
||||
If you use Containerd older than v1.2.4 or a version of Kata older than v1.6.0 and also want to specify a configuration file, you can use the following workaround, since the shimv2 accepts an environment variable, `KATA_CONF_FILE` for the configuration file path. Then, you can create a
|
||||
shell script with the following:
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
KATA_CONF_FILE=/etc/kata-containers/firecracker.toml containerd-shim-kata-v2 $@
|
||||
```
|
||||
|
||||
Name it as `/usr/local/bin/containerd-shim-katafc-v2` and reference it in the configuration of containerd:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd.runtimes.kata-firecracker]
|
||||
runtime_type = "io.containerd.katafc.v2"
|
||||
```
|
||||
|
||||
#### Kata Containers as the runtime for untrusted workload
|
||||
|
||||
For cases without `RuntimeClass` support, we can use the legacy annotation method to support using Kata Containers
|
||||
@@ -218,28 +185,8 @@ and then, run an untrusted workload with Kata Containers:
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
```
|
||||
|
||||
For the earlier versions of Kata Containers and containerd that do not support Runtime V2 (Shim API), you can use the following alternative configuration:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
|
||||
# "plugins.cri.containerd.default_runtime" is the runtime to use in containerd.
|
||||
[plugins.cri.containerd.default_runtime]
|
||||
# runtime_type is the runtime type to use in containerd e.g. io.containerd.runtime.v1.linux
|
||||
runtime_type = "io.containerd.runtime.v1.linux"
|
||||
|
||||
# "plugins.cri.containerd.untrusted_workload_runtime" is a runtime to run untrusted workloads on it.
|
||||
[plugins.cri.containerd.untrusted_workload_runtime]
|
||||
# runtime_type is the runtime type to use in containerd e.g. io.containerd.runtime.v1.linux
|
||||
runtime_type = "io.containerd.runtime.v1.linux"
|
||||
|
||||
# runtime_engine is the name of the runtime engine used by containerd.
|
||||
runtime_engine = "/usr/bin/kata-runtime"
|
||||
```
|
||||
|
||||
You can find more information on the [Containerd config documentation](https://github.com/containerd/cri/blob/master/docs/config.md)
|
||||
|
||||
|
||||
#### Kata Containers as the default runtime
|
||||
|
||||
If you want to set Kata Containers as the only runtime in the deployment, you can simply configure as follows:
|
||||
@@ -250,15 +197,6 @@ If you want to set Kata Containers as the only runtime in the deployment, you ca
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
```
|
||||
|
||||
Alternatively, for the earlier versions of Kata Containers and containerd that do not support Runtime V2 (Shim API), you can use the following alternative configuration:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
[plugins.cri.containerd.default_runtime]
|
||||
runtime_type = "io.containerd.runtime.v1.linux"
|
||||
runtime_engine = "/usr/bin/kata-runtime"
|
||||
```
|
||||
|
||||
### Configuration for `cri-tools`
|
||||
|
||||
> **Note:** If you skipped the [Install `cri-tools`](#install-cri-tools) section, you can skip this section too.
|
||||
@@ -312,10 +250,12 @@ To run a container with Kata Containers through the containerd command line, you
|
||||
|
||||
```bash
|
||||
$ sudo ctr image pull docker.io/library/busybox:latest
|
||||
$ sudo ctr run --runtime io.containerd.run.kata.v2 -t --rm docker.io/library/busybox:latest hello sh
|
||||
$ sudo ctr run --cni --runtime io.containerd.run.kata.v2 -t --rm docker.io/library/busybox:latest hello sh
|
||||
```
|
||||
|
||||
This launches a BusyBox container named `hello`, and it will be removed by `--rm` after it quits.
|
||||
The `--cni` flag enables CNI networking for the container. Without this flag, a container with just a
|
||||
loopback interface is created.
|
||||
|
||||
### Launch Pods with `crictl` command line
|
||||
|
||||
|
||||
@@ -45,6 +45,9 @@ spec:
|
||||
- name: containerdsocket
|
||||
mountPath: /run/containerd/containerd.sock
|
||||
readOnly: true
|
||||
- name: sbs
|
||||
mountPath: /run/vc/sbs/
|
||||
readOnly: true
|
||||
terminationGracePeriodSeconds: 30
|
||||
volumes:
|
||||
- name: containerdtask
|
||||
@@ -53,3 +56,6 @@ spec:
|
||||
- name: containerdsocket
|
||||
hostPath:
|
||||
path: /run/containerd/containerd.sock
|
||||
- name: sbs
|
||||
hostPath:
|
||||
path: /run/vc/sbs/
|
||||
|
||||
@@ -68,7 +68,7 @@ the Kata logs import to the EFK stack.
|
||||
> stack they are able to utilise in order to modify and test as necessary.
|
||||
|
||||
Minikube by default
|
||||
[configures](https://github.com/kubernetes/minikube/blob/master/deploy/iso/minikube-iso/board/coreos/minikube/rootfs-overlay/etc/systemd/journald.conf)
|
||||
[configures](https://github.com/kubernetes/minikube/blob/master/deploy/iso/minikube-iso/board/minikube/x86_64/rootfs-overlay/etc/systemd/journald.conf)
|
||||
the `systemd-journald` with the
|
||||
[`Storage=volatile`](https://www.freedesktop.org/software/systemd/man/journald.conf.html) option,
|
||||
which results in the journal being stored in `/run/log/journal`. Unfortunately, the Minikube EFK
|
||||
@@ -163,7 +163,7 @@ sub-filter on, for instance, the `SYSLOG_IDENTIFIER` to differentiate the Kata c
|
||||
on the `PRIORITY` to filter out critical issues etc.
|
||||
|
||||
Kata generates a significant amount of Kata specific information, which can be seen as
|
||||
[`logfmt`](https://github.com/kata-containers/tests/tree/main/cmd/log-parser#logfile-requirements).
|
||||
[`logfmt`](../../src/tools/log-parser/README.md#logfile-requirements).
|
||||
data contained in the `MESSAGE` field. Imported as-is, there is no easy way to filter on that data
|
||||
in Kibana:
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ Also you should ensure that `kubectl` working correctly.
|
||||
> **Note**: More information about Kubernetes integrations:
|
||||
> - [Run Kata Containers with Kubernetes](run-kata-with-k8s.md)
|
||||
> - [How to use Kata Containers and Containerd](containerd-kata.md)
|
||||
> - [How to use Kata Containers and CRI (containerd plugin) with Kubernetes](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
> - [How to use Kata Containers and containerd with Kubernetes](how-to-use-k8s-with-containerd-and-kata.md)
|
||||
|
||||
## Configure Prometheus
|
||||
|
||||
|
||||
@@ -91,6 +91,7 @@ There are several kinds of Kata configurations and they are listed below.
|
||||
| `io.katacontainers.config.hypervisor.virtio_fs_daemon` | string | virtio-fs `vhost-user` daemon path |
|
||||
| `io.katacontainers.config.hypervisor.virtio_fs_extra_args` | string | extra options passed to `virtiofs` daemon |
|
||||
| `io.katacontainers.config.hypervisor.enable_guest_swap` | `boolean` | enable swap in the guest |
|
||||
| `io.katacontainers.config.hypervisor.use_legacy_serial` | `boolean` | uses legacy serial device for guest's console (QEMU) |
|
||||
|
||||
## Container Options
|
||||
| Key | Value Type | Comments |
|
||||
@@ -172,7 +173,7 @@ kind: Pod
|
||||
metadata:
|
||||
name: pod2
|
||||
annotations:
|
||||
io.katacontainers.config.runtime.disable_guest_seccomp: false
|
||||
io.katacontainers.config.runtime.disable_guest_seccomp: "false"
|
||||
spec:
|
||||
runtimeClassName: kata
|
||||
containers:
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
# How to use Kata Containers and CRI (containerd plugin) with Kubernetes
|
||||
# How to use Kata Containers and containerd with Kubernetes
|
||||
|
||||
This document describes how to set up a single-machine Kubernetes (k8s) cluster.
|
||||
|
||||
The Kubernetes cluster will use the
|
||||
[CRI containerd](https://github.com/containerd/containerd/) and
|
||||
[Kata Containers](https://katacontainers.io) to launch untrusted workloads.
|
||||
[containerd](https://github.com/containerd/containerd/) and
|
||||
[Kata Containers](https://katacontainers.io) to launch workloads.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Kubernetes, Kubelet, `kubeadm`
|
||||
- containerd with `cri` plug-in
|
||||
- containerd
|
||||
- Kata Containers
|
||||
|
||||
> **Note:** For information about the supported versions of these components,
|
||||
@@ -149,7 +149,7 @@ $ sudo -E kubectl taint nodes --all node-role.kubernetes.io/master-
|
||||
|
||||
## Create runtime class for Kata Containers
|
||||
|
||||
By default, all pods are created with the default runtime configured in CRI containerd plugin.
|
||||
By default, all pods are created with the default runtime configured in containerd.
|
||||
From Kubernetes v1.12, users can use [`RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/#runtime-class) to specify a different runtime for Pods.
|
||||
|
||||
```bash
|
||||
@@ -166,7 +166,7 @@ $ sudo -E kubectl apply -f runtime.yaml
|
||||
|
||||
## Run pod in Kata Containers
|
||||
|
||||
If a pod has the `runtimeClassName` set to `kata`, the CRI plugin runs the pod with the
|
||||
If a pod has the `runtimeClassName` set to `kata`, the CRI runs the pod with the
|
||||
[Kata Containers runtime](../../src/runtime/README.md).
|
||||
|
||||
- Create a pod configuration that uses the Kata Containers runtime
|
||||
254
docs/how-to/how-to-use-kata-containers-with-firecracker.md
Normal file
254
docs/how-to/how-to-use-kata-containers-with-firecracker.md
Normal file
@@ -0,0 +1,254 @@
|
||||
# Configure Kata Containers to use Firecracker
|
||||
|
||||
This document provides an overview on how to run Kata Containers with the AWS Firecracker hypervisor.
|
||||
|
||||
## Introduction
|
||||
|
||||
AWS Firecracker is an open source virtualization technology that is purpose-built for creating and managing secure, multi-tenant container and function-based services that provide serverless operational models. AWS Firecracker runs workloads in lightweight virtual machines, called `microVMs`, which combine the security and isolation properties provided by hardware virtualization technology with the speed and flexibility of Containers.
|
||||
|
||||
Please refer to AWS Firecracker [documentation](https://github.com/firecracker-microvm/firecracker/blob/main/docs/getting-started.md) for more details.
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
This document requires the presence of Kata Containers on your system. Install using the instructions available through the following links:
|
||||
|
||||
- Kata Containers [automated installation](../install/README.md)
|
||||
|
||||
- Kata Containers manual installation: Automated installation does not seem to be supported for Clear Linux, so please use [manual installation](../Developer-Guide.md) steps.
|
||||
> **Note:** Create rootfs image and not initrd image.
|
||||
|
||||
## Install AWS Firecracker
|
||||
|
||||
Kata Containers currently only supports AWS Firecracker v0.23.1 (support for newer versions is [in progress](https://github.com/kata-containers/kata-containers/pull/1519)).
|
||||
To install Firecracker we need to get the `firecracker` and `jailer` binaries:
|
||||
|
||||
```bash
|
||||
$ release_url="https://github.com/firecracker-microvm/firecracker/releases"
|
||||
$ version="v0.23.1"
|
||||
$ arch=`uname -m`
|
||||
$ curl ${release_url}/download/${version}/firecracker-${version}-${arch} -o firecracker
|
||||
$ curl ${release_url}/download/${version}/jailer-${version}-${arch} -o jailer
|
||||
$ chmod +x jailer firecracker
|
||||
```
|
||||
|
||||
To make the binaries available from the default system `PATH` it is recommended to move them to `/usr/local/bin` or add a symbolic link:
|
||||
|
||||
```bash
|
||||
$ sudo ln -s $(pwd)/firecracker /usr/local/bin
|
||||
$ sudo ln -s $(pwd)/jailer /usr/local/bin
|
||||
```
|
||||
|
||||
More details can be found in [AWS Firecracker docs](https://github.com/firecracker-microvm/firecracker/blob/main/docs/getting-started.md)
|
||||
|
||||
In order to run Kata with AWS Firecracker a block device as the backing store for a VM is required. To interact with `containerd` and Kata we use the `devmapper` `snapshotter`.
|
||||
|
||||
## Configure `devmapper`
|
||||
|
||||
To check support for your `containerd` installation, you can run:
|
||||
|
||||
```
|
||||
$ ctr plugins ls |grep devmapper
|
||||
```
|
||||
|
||||
If the output of the above command is:
|
||||
|
||||
```
|
||||
io.containerd.snapshotter.v1 devmapper linux/amd64 ok
|
||||
```
|
||||
then you can skip this section and move on to `Configure Kata Containers with AWS Firecracker`
|
||||
|
||||
If the output of the above command is:
|
||||
|
||||
```
|
||||
io.containerd.snapshotter.v1 devmapper linux/amd64 error
|
||||
```
|
||||
|
||||
then we need to setup `devmapper` `snapshotter`. Based on a [very useful
|
||||
guide](https://docs.docker.com/storage/storagedriver/device-mapper-driver/)
|
||||
from docker, we can set it up using the following scripts:
|
||||
|
||||
> **Note:** The following scripts assume a 100G sparse file for storing container images, a 10G sparse file for the thin-provisioning pool and 10G base image files for any sandboxed container created. This means that we will need at least 10GB free space.
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
DATA_DIR=/var/lib/containerd/devmapper
|
||||
POOL_NAME=devpool
|
||||
|
||||
mkdir -p ${DATA_DIR}
|
||||
|
||||
# Create data file
|
||||
sudo touch "${DATA_DIR}/data"
|
||||
sudo truncate -s 100G "${DATA_DIR}/data"
|
||||
|
||||
# Create metadata file
|
||||
sudo touch "${DATA_DIR}/meta"
|
||||
sudo truncate -s 10G "${DATA_DIR}/meta"
|
||||
|
||||
# Allocate loop devices
|
||||
DATA_DEV=$(sudo losetup --find --show "${DATA_DIR}/data")
|
||||
META_DEV=$(sudo losetup --find --show "${DATA_DIR}/meta")
|
||||
|
||||
# Define thin-pool parameters.
|
||||
# See https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt for details.
|
||||
SECTOR_SIZE=512
|
||||
DATA_SIZE="$(sudo blockdev --getsize64 -q ${DATA_DEV})"
|
||||
LENGTH_IN_SECTORS=$(bc <<< "${DATA_SIZE}/${SECTOR_SIZE}")
|
||||
DATA_BLOCK_SIZE=128
|
||||
LOW_WATER_MARK=32768
|
||||
|
||||
# Create a thin-pool device
|
||||
sudo dmsetup create "${POOL_NAME}" \
|
||||
--table "0 ${LENGTH_IN_SECTORS} thin-pool ${META_DEV} ${DATA_DEV} ${DATA_BLOCK_SIZE} ${LOW_WATER_MARK}"
|
||||
|
||||
cat << EOF
|
||||
#
|
||||
# Add this to your config.toml configuration file and restart `containerd` daemon
|
||||
#
|
||||
[plugins]
|
||||
[plugins.devmapper]
|
||||
pool_name = "${POOL_NAME}"
|
||||
root_path = "${DATA_DIR}"
|
||||
base_image_size = "10GB"
|
||||
discard_blocks = true
|
||||
EOF
|
||||
```
|
||||
|
||||
Make it executable and run it:
|
||||
|
||||
```bash
|
||||
$ sudo chmod +x ~/scripts/devmapper/create.sh
|
||||
$ cd ~/scripts/devmapper/
|
||||
$ sudo ./create.sh
|
||||
```
|
||||
|
||||
Now, we can add the `devmapper` configuration provided from the script to `/etc/containerd/config.toml`.
|
||||
> **Note:** If you are using the default `containerd` configuration (`containerd config default >> /etc/containerd/config.toml`), you may need to edit the existing `[plugins."io.containerd.snapshotter.v1.devmapper"]` configuration.
|
||||
Save and restart `containerd`:
|
||||
|
||||
|
||||
```bash
|
||||
$ sudo systemctl restart containerd
|
||||
```
|
||||
|
||||
We can use `dmsetup` to verify that the thin-pool was created successfully.
|
||||
|
||||
```bash
|
||||
$ sudo dmsetup ls
|
||||
```
|
||||
|
||||
We should also check that `devmapper` is registered and running:
|
||||
|
||||
```bash
|
||||
$ sudo ctr plugins ls | grep devmapper
|
||||
```
|
||||
|
||||
This script needs to be run only once, while setting up the `devmapper` `snapshotter` for `containerd`. Afterwards, make sure that on each reboot, the thin-pool is initialized from the same data directory. Otherwise, all the fetched containers (or the ones that you have created) will be re-initialized. A simple script that re-creates the thin-pool from the same data directory is shown below:
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
DATA_DIR=/var/lib/containerd/devmapper
|
||||
POOL_NAME=devpool
|
||||
|
||||
# Allocate loop devices
|
||||
DATA_DEV=$(sudo losetup --find --show "${DATA_DIR}/data")
|
||||
META_DEV=$(sudo losetup --find --show "${DATA_DIR}/meta")
|
||||
|
||||
# Define thin-pool parameters.
|
||||
# See https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt for details.
|
||||
SECTOR_SIZE=512
|
||||
DATA_SIZE="$(sudo blockdev --getsize64 -q ${DATA_DEV})"
|
||||
LENGTH_IN_SECTORS=$(bc <<< "${DATA_SIZE}/${SECTOR_SIZE}")
|
||||
DATA_BLOCK_SIZE=128
|
||||
LOW_WATER_MARK=32768
|
||||
|
||||
# Create a thin-pool device
|
||||
sudo dmsetup create "${POOL_NAME}" \
|
||||
--table "0 ${LENGTH_IN_SECTORS} thin-pool ${META_DEV} ${DATA_DEV} ${DATA_BLOCK_SIZE} ${LOW_WATER_MARK}"
|
||||
```
|
||||
|
||||
We can create a systemd service to run the above script on each reboot:
|
||||
|
||||
```bash
|
||||
$ sudo nano /lib/systemd/system/devmapper_reload.service
|
||||
```
|
||||
|
||||
The service file:
|
||||
|
||||
```
|
||||
[Unit]
|
||||
Description=Devmapper reload script
|
||||
|
||||
[Service]
|
||||
ExecStart=/path/to/script/reload.sh
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
Enable the newly created service:
|
||||
|
||||
```bash
|
||||
$ sudo systemctl daemon-reload
|
||||
$ sudo systemctl enable devmapper_reload.service
|
||||
$ sudo systemctl start devmapper_reload.service
|
||||
```
|
||||
|
||||
## Configure Kata Containers with AWS Firecracker
|
||||
|
||||
To configure Kata Containers with AWS Firecracker, copy the generated `configuration-fc.toml` file when building the `kata-runtime` to either `/etc/kata-containers/configuration-fc.toml` or `/usr/share/defaults/kata-containers/configuration-fc.toml`.
|
||||
|
||||
The following command shows full paths to the `configuration.toml` files that the runtime loads. It will use the first path that exists. (Please make sure the kernel and image paths are set correctly in the `configuration.toml` file)
|
||||
|
||||
```bash
|
||||
$ sudo kata-runtime --show-default-config-paths
|
||||
```
|
||||
|
||||
## Configure `containerd`
|
||||
Next, we need to configure containerd. Add a file in your path (e.g. `/usr/local/bin/containerd-shim-kata-fc-v2`) with the following contents:
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
KATA_CONF_FILE=/etc/kata-containers/configuration-fc.toml /usr/local/bin/containerd-shim-kata-v2 $@
|
||||
```
|
||||
> **Note:** You may need to edit the paths of the configuration file and the `containerd-shim-kata-v2` to correspond to your setup.
|
||||
|
||||
Make it executable:
|
||||
|
||||
```bash
|
||||
$ sudo chmod +x /usr/local/bin/containerd-shim-kata-fc-v2
|
||||
```
|
||||
|
||||
Add the relevant section in `containerd`’s `config.toml` file (`/etc/containerd/config.toml`):
|
||||
|
||||
```
|
||||
[plugins.cri.containerd.runtimes]
|
||||
[plugins.cri.containerd.runtimes.kata-fc]
|
||||
runtime_type = "io.containerd.kata-fc.v2"
|
||||
```
|
||||
|
||||
> **Note:** If you are using the default `containerd` configuration (`containerd config default >> /etc/containerd/config.toml`),
|
||||
> the configuration should change to :
|
||||
```
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-fc]
|
||||
runtime_type = "io.containerd.kata-fc.v2"
|
||||
```
|
||||
|
||||
Restart `containerd`:
|
||||
|
||||
```bash
|
||||
$ sudo systemctl restart containerd
|
||||
```
|
||||
|
||||
## Verify the installation
|
||||
|
||||
We are now ready to launch a container using Kata with Firecracker to verify that everything worked:
|
||||
|
||||
```bash
|
||||
$ sudo ctr images pull --snapshotter devmapper docker.io/library/ubuntu:latest
|
||||
$ sudo ctr run --snapshotter devmapper --runtime io.containerd.kata-fc.v2 -t --rm docker.io/library/ubuntu:latest ubuntu-kata-fc-test uname -a
|
||||
```
|
||||
@@ -31,7 +31,7 @@ See below example config:
|
||||
[plugins.cri]
|
||||
[plugins.cri.containerd]
|
||||
[plugins.cri.containerd.runtimes.runc]
|
||||
runtime_type = "io.containerd.runc.v1"
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
privileged_without_host_devices = false
|
||||
[plugins.cri.containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
@@ -40,7 +40,7 @@ See below example config:
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
- [Kata Containers with Containerd and CRI documentation](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
- [How to use Kata Containers and containerd with Kubernetes](how-to-use-k8s-with-containerd-and-kata.md)
|
||||
- [Containerd CRI config documentation](https://github.com/containerd/containerd/blob/main/docs/cri/config.md)
|
||||
|
||||
#### CRI-O
|
||||
|
||||
@@ -15,7 +15,7 @@ After choosing one CRI implementation, you must make the appropriate configurati
|
||||
to ensure it integrates with Kata Containers.
|
||||
|
||||
Kata Containers 1.5 introduced the `shimv2` for containerd 1.2.0, reducing the components
|
||||
required to spawn pods and containers, and this is the preferred way to run Kata Containers with Kubernetes ([as documented here](../how-to/how-to-use-k8s-with-cri-containerd-and-kata.md#configure-containerd-to-use-kata-containers)).
|
||||
required to spawn pods and containers, and this is the preferred way to run Kata Containers with Kubernetes ([as documented here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-containerd-to-use-kata-containers)).
|
||||
|
||||
An equivalent shim implementation for CRI-O is planned.
|
||||
|
||||
@@ -57,7 +57,7 @@ content shown below:
|
||||
|
||||
To customize containerd to select Kata Containers runtime, follow our
|
||||
"Configure containerd to use Kata Containers" internal documentation
|
||||
[here](../how-to/how-to-use-k8s-with-cri-containerd-and-kata.md#configure-containerd-to-use-kata-containers).
|
||||
[here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-containerd-to-use-kata-containers).
|
||||
|
||||
## Install Kubernetes
|
||||
|
||||
@@ -85,7 +85,7 @@ Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-tim
|
||||
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
|
||||
```
|
||||
For more information about containerd see the "Configure Kubelet to use containerd"
|
||||
documentation [here](../how-to/how-to-use-k8s-with-cri-containerd-and-kata.md#configure-kubelet-to-use-containerd).
|
||||
documentation [here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-kubelet-to-use-containerd).
|
||||
|
||||
## Run a Kubernetes pod with Kata Containers
|
||||
|
||||
@@ -99,7 +99,18 @@ $ sudo systemctl restart kubelet
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --cri-socket /var/run/crio/crio.sock --pod-network-cidr=10.244.0.0/16
|
||||
|
||||
# If using containerd
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --cri-socket /run/containerd/containerd.sock --pod-network-cidr=10.244.0.0/16
|
||||
$ cat <<EOF | tee kubeadm-config.yaml
|
||||
apiVersion: kubeadm.k8s.io/v1beta3
|
||||
kind: InitConfiguration
|
||||
nodeRegistration:
|
||||
criSocket: "/run/containerd/containerd.sock"
|
||||
---
|
||||
kind: KubeletConfiguration
|
||||
apiVersion: kubelet.config.k8s.io/v1beta1
|
||||
cgroupDriver: cgroupfs
|
||||
podCIDR: "10.244.0.0/16"
|
||||
EOF
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --config kubeadm-config.yaml
|
||||
|
||||
$ export KUBECONFIG=/etc/kubernetes/admin.conf
|
||||
```
|
||||
|
||||
@@ -33,6 +33,7 @@ are available, their default values and how each setting can be used.
|
||||
[Cloud Hypervisor] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-clh.toml` |
|
||||
[Firecracker] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-fc.toml` |
|
||||
[QEMU] | C | all | Type 2 ([KVM]) | `configuration-qemu.toml` |
|
||||
[`Dragonball`] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-dragonball.toml` |
|
||||
|
||||
## Determine currently configured hypervisor
|
||||
|
||||
@@ -52,6 +53,7 @@ the hypervisors:
|
||||
[Cloud Hypervisor] | Low latency, small memory footprint, small attack surface | Minimal | | excellent | excellent | High performance modern cloud workloads | |
|
||||
[Firecracker] | Very slimline | Extremely minimal | Doesn't support all device types | excellent | excellent | Serverless / FaaS | |
|
||||
[QEMU] | Lots of features | Lots | | good | good | Good option for most users | | All users |
|
||||
[`Dragonball`] | Built-in VMM, low CPU and memory overhead| Minimal | | excellent | excellent | Optimized for most container workloads | `out-of-the-box` Kata Containers experience |
|
||||
|
||||
For further details, see the [Virtualization in Kata Containers](design/virtualization.md) document and the official documentation for each hypervisor.
|
||||
|
||||
@@ -60,3 +62,4 @@ For further details, see the [Virtualization in Kata Containers](design/virtuali
|
||||
[Firecracker]: https://github.com/firecracker-microvm/firecracker
|
||||
[KVM]: https://en.wikipedia.org/wiki/Kernel-based_Virtual_Machine
|
||||
[QEMU]: http://www.qemu-project.org
|
||||
[`Dragonball`]: https://github.com/openanolis/dragonball-sandbox
|
||||
|
||||
@@ -79,3 +79,6 @@ versions. This is not recommended for normal users.
|
||||
* [upgrading document](../Upgrading.md)
|
||||
* [developer guide](../Developer-Guide.md)
|
||||
* [runtime documentation](../../src/runtime/README.md)
|
||||
|
||||
## Kata Containers 3.0 rust runtime installation
|
||||
* [installation guide](../install/kata-containers-3.0-rust-runtime-installation-guide.md)
|
||||
|
||||
@@ -19,12 +19,6 @@
|
||||
> - If you decide to proceed and install a Kata Containers release, you can
|
||||
> still check for the latest version of Kata Containers by running
|
||||
> `kata-runtime check --only-list-releases`.
|
||||
>
|
||||
> - These instructions will not work for Fedora 31 and higher since those
|
||||
> distribution versions only support cgroups version 2 by default. However,
|
||||
> Kata Containers currently requires cgroups version 1 (on the host side). See
|
||||
> https://github.com/kata-containers/kata-containers/issues/927 for further
|
||||
> details.
|
||||
|
||||
## Install Kata Containers
|
||||
|
||||
@@ -81,7 +75,7 @@
|
||||
- Download the standard `systemd(1)` service file and install to
|
||||
`/etc/systemd/system/`:
|
||||
|
||||
- https://raw.githubusercontent.com/containerd/containerd/master/containerd.service
|
||||
- https://raw.githubusercontent.com/containerd/containerd/main/containerd.service
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
# Kata Containers 3.0 rust runtime installation
|
||||
The following is an overview of the different installation methods available.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Kata Containers 3.0 rust runtime requires nested virtualization or bare metal. Check
|
||||
[hardware requirements](/src/runtime/README.md#hardware-requirements) to see if your system is capable of running Kata
|
||||
Containers.
|
||||
|
||||
### Platform support
|
||||
|
||||
Kata Containers 3.0 rust runtime currently runs on 64-bit systems supporting the following
|
||||
architectures:
|
||||
|
||||
> **Notes:**
|
||||
> For other architectures, see https://github.com/kata-containers/kata-containers/issues/4320
|
||||
|
||||
| Architecture | Virtualization technology |
|
||||
|-|-|
|
||||
| `x86_64`| [Intel](https://www.intel.com) VT-x |
|
||||
| `aarch64` ("`arm64`")| [ARM](https://www.arm.com) Hyp |
|
||||
|
||||
## Packaged installation methods
|
||||
|
||||
| Installation method | Description | Automatic updates | Use case | Availability
|
||||
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- |
|
||||
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. | No |
|
||||
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. | No |
|
||||
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. | No |
|
||||
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No |
|
||||
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No |
|
||||
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes |
|
||||
|
||||
### Kata Deploy Installation
|
||||
`ToDo`
|
||||
### Official packages
|
||||
`ToDo`
|
||||
### Snap Installation
|
||||
`ToDo`
|
||||
### Automatic Installation
|
||||
`ToDo`
|
||||
### Manual Installation
|
||||
`ToDo`
|
||||
|
||||
## Build from source installation
|
||||
|
||||
### Rust Environment Set Up
|
||||
|
||||
* Download `Rustup` and install `Rust`
|
||||
> **Notes:**
|
||||
> Rust version 1.58 is needed
|
||||
|
||||
Example for `x86_64`
|
||||
```
|
||||
$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
$ source $HOME/.cargo/env
|
||||
$ rustup install 1.58
|
||||
$ rustup default 1.58-x86_64-unknown-linux-gnu
|
||||
```
|
||||
|
||||
* Musl support for fully static binary
|
||||
|
||||
Example for `x86_64`
|
||||
```
|
||||
$ rustup target add x86_64-unknown-linux-musl
|
||||
```
|
||||
* [Musl `libc`](http://musl.libc.org/) install
|
||||
|
||||
Example for musl 1.2.3
|
||||
```
|
||||
$ curl -O https://git.musl-libc.org/cgit/musl/snapshot/musl-1.2.3.tar.gz
|
||||
$ tar vxf musl-1.2.3.tar.gz
|
||||
$ cd musl-1.2.3/
|
||||
$ ./configure --prefix=/usr/local/
|
||||
$ make && sudo make install
|
||||
```
|
||||
|
||||
|
||||
### Install Kata 3.0 Rust Runtime Shim
|
||||
|
||||
```
|
||||
$ git clone https://github.com/kata-containers/kata-containers.git
|
||||
$ cd kata-containers/src/runtime-rs
|
||||
$ make && sudo make install
|
||||
```
|
||||
After running the command above, the default config file `configuration.toml` will be installed under `/usr/share/defaults/kata-containers/`, and the binary file `containerd-shim-kata-v2` will be installed under `/usr/local/bin`.
|
||||
|
||||
### Build Kata Containers Kernel
|
||||
Follow the [Kernel installation guide](/tools/packaging/kernel/README.md).
|
||||
|
||||
### Build Kata Rootfs
|
||||
Follow the [Rootfs installation guide](../../tools/osbuilder/rootfs-builder/README.md).
|
||||
|
||||
### Build Kata Image
|
||||
Follow the [Image installation guide](../../tools/osbuilder/image-builder/README.md).
|
||||
|
||||
### Install Containerd
|
||||
|
||||
Follow the [Containerd installation guide](container-manager/containerd/containerd-install.md).
|
||||
|
||||
|
||||
@@ -3,4 +3,4 @@
|
||||
Kata Containers supports passing certain GPUs from the host into the container. Select the GPU vendor for detailed information:
|
||||
|
||||
- [Intel](Intel-GPU-passthrough-and-Kata.md)
|
||||
- [Nvidia](Nvidia-GPU-passthrough-and-Kata.md)
|
||||
- [NVIDIA](NVIDIA-GPU-passthrough-and-Kata.md)
|
||||
|
||||
592
docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md
Normal file
592
docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md
Normal file
@@ -0,0 +1,592 @@
|
||||
# Using NVIDIA GPU device with Kata Containers
|
||||
|
||||
An NVIDIA GPU device can be passed to a Kata Containers container using GPU
|
||||
passthrough (NVIDIA GPU pass-through mode) as well as GPU mediated passthrough
|
||||
(NVIDIA `vGPU` mode).
|
||||
|
||||
NVIDIA GPU pass-through mode, an entire physical GPU is directly assigned to one
|
||||
VM, bypassing the NVIDIA Virtual GPU Manager. In this mode of operation, the GPU
|
||||
is accessed exclusively by the NVIDIA driver running in the VM to which it is
|
||||
assigned. The GPU is not shared among VMs.
|
||||
|
||||
NVIDIA Virtual GPU (`vGPU`) enables multiple virtual machines (VMs) to have
|
||||
simultaneous, direct access to a single physical GPU, using the same NVIDIA
|
||||
graphics drivers that are deployed on non-virtualized operating systems. By
|
||||
doing this, NVIDIA `vGPU` provides VMs with unparalleled graphics performance,
|
||||
compute performance, and application compatibility, together with the
|
||||
cost-effectiveness and scalability brought about by sharing a GPU among multiple
|
||||
workloads. A `vGPU` can be either time-sliced or Multi-Instance GPU (MIG)-backed
|
||||
with [MIG-slices](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
|
||||
|
||||
| Technology | Description | Behavior | Detail |
|
||||
| --- | --- | --- | --- |
|
||||
| NVIDIA GPU pass-through mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation |
|
||||
| NVIDIA vGPU time-sliced | GPU time-sliced | Physical GPU time-sliced for multiple VMs | Mediated passthrough |
|
||||
| NVIDIA vGPU MIG-backed | GPU with MIG-slices | Physical GPU MIG-sliced for multiple VMs | Mediated passthrough |
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
NVIDIA GPUs Recommended for Virtualization:
|
||||
|
||||
- NVIDIA Tesla (T4, M10, P6, V100 or newer)
|
||||
- NVIDIA Quadro RTX 6000/8000
|
||||
|
||||
## Host BIOS Requirements
|
||||
|
||||
Some hardware requires a larger PCI BARs window, for example, NVIDIA Tesla P100,
|
||||
K40m
|
||||
|
||||
```sh
|
||||
$ lspci -s d0:00.0 -vv | grep Region
|
||||
Region 0: Memory at e7000000 (32-bit, non-prefetchable) [size=16M]
|
||||
Region 1: Memory at 222800000000 (64-bit, prefetchable) [size=32G] # Above 4G
|
||||
Region 3: Memory at 223810000000 (64-bit, prefetchable) [size=32M]
|
||||
```
|
||||
|
||||
For large BARs devices, MMIO mapping above 4G address space should be `enabled`
|
||||
in the PCI configuration of the BIOS.
|
||||
|
||||
Some hardware vendors use a different name in BIOS, such as:
|
||||
|
||||
- Above 4G Decoding
|
||||
- Memory Hole for PCI MMIO
|
||||
- Memory Mapped I/O above 4GB
|
||||
|
||||
If one is using a GPU based on the Ampere architecture and later additionally
|
||||
SR-IOV needs to be enabled for the `vGPU` use-case.
|
||||
|
||||
The following steps outline the workflow for using an NVIDIA GPU with Kata.
|
||||
|
||||
## Host Kernel Requirements
|
||||
|
||||
The following configurations need to be enabled on your host kernel:
|
||||
|
||||
- `CONFIG_VFIO`
|
||||
- `CONFIG_VFIO_IOMMU_TYPE1`
|
||||
- `CONFIG_VFIO_MDEV`
|
||||
- `CONFIG_VFIO_MDEV_DEVICE`
|
||||
- `CONFIG_VFIO_PCI`
|
||||
|
||||
Your host kernel needs to be booted with `intel_iommu=on` on the kernel command
|
||||
line.
|
||||
|
||||
## Install and configure Kata Containers
|
||||
|
||||
To use non-large BARs devices (for example, NVIDIA Tesla T4), you need Kata
|
||||
version 1.3.0 or above. Follow the [Kata Containers setup
|
||||
instructions](../install/README.md) to install the latest version of Kata.
|
||||
|
||||
To use large BARs devices (for example, NVIDIA Tesla P100), you need Kata
|
||||
version 1.11.0 or above.
|
||||
|
||||
The following configuration in the Kata `configuration.toml` file as shown below
|
||||
can work:
|
||||
|
||||
Hotplug for PCI devices with small BARs by `acpi_pcihp` (Linux's ACPI PCI
|
||||
Hotplug driver):
|
||||
|
||||
```sh
|
||||
machine_type = "q35"
|
||||
|
||||
hotplug_vfio_on_root_bus = false
|
||||
```
|
||||
|
||||
Hotplug for PCIe devices with large BARs by `pciehp` (Linux's PCIe Hotplug
|
||||
driver):
|
||||
|
||||
```sh
|
||||
machine_type = "q35"
|
||||
|
||||
hotplug_vfio_on_root_bus = true
|
||||
pcie_root_port = 1
|
||||
```
|
||||
|
||||
## Build Kata Containers kernel with GPU support
|
||||
|
||||
The default guest kernel installed with Kata Containers does not provide GPU
|
||||
support. To use an NVIDIA GPU with Kata Containers, you need to build a kernel
|
||||
with the necessary GPU support.
|
||||
|
||||
The following kernel config options need to be enabled:
|
||||
|
||||
```sh
|
||||
# Support PCI/PCIe device hotplug (Required for large BARs device)
|
||||
CONFIG_HOTPLUG_PCI_PCIE=y
|
||||
|
||||
# Support for loading modules (Required for load NVIDIA drivers)
|
||||
CONFIG_MODULES=y
|
||||
CONFIG_MODULE_UNLOAD=y
|
||||
|
||||
# Enable the MMIO access method for PCIe devices (Required for large BARs device)
|
||||
CONFIG_PCI_MMCONFIG=y
|
||||
```
|
||||
|
||||
The following kernel config options need to be disabled:
|
||||
|
||||
```sh
|
||||
# Disable Open Source NVIDIA driver nouveau
|
||||
# It conflicts with NVIDIA official driver
|
||||
CONFIG_DRM_NOUVEAU=n
|
||||
```
|
||||
|
||||
> **Note**: `CONFIG_DRM_NOUVEAU` is normally disabled by default.
|
||||
It is worth checking that it is not enabled in your kernel configuration to
|
||||
prevent any conflicts.
|
||||
|
||||
Build the Kata Containers kernel with the previous config options, using the
|
||||
instructions described in [Building Kata Containers
|
||||
kernel](../../tools/packaging/kernel). For further details on building and
|
||||
installing guest kernels, see [the developer
|
||||
guide](../Developer-Guide.md#install-guest-kernel-images).
|
||||
|
||||
There is an easy way to build a guest kernel that supports NVIDIA GPU:
|
||||
|
||||
```sh
|
||||
## Build guest kernel with ../../tools/packaging/kernel
|
||||
|
||||
# Prepare (download guest kernel source, generate .config)
|
||||
$ ./build-kernel.sh -v 5.15.23 -g nvidia -f setup
|
||||
|
||||
# Build guest kernel
|
||||
$ ./build-kernel.sh -v 5.15.23 -g nvidia build
|
||||
|
||||
# Install guest kernel
|
||||
$ sudo -E ./build-kernel.sh -v 5.15.23 -g nvidia install
|
||||
```
|
||||
|
||||
To build NVIDIA Driver in Kata container, `linux-headers` are required.
|
||||
This is a way to generate deb packages for `linux-headers`:
|
||||
|
||||
> **Note**:
|
||||
> Run `make rpm-pkg` to build the rpm package.
|
||||
> Run `make deb-pkg` to build the deb package.
|
||||
>
|
||||
|
||||
```sh
|
||||
$ cd kata-linux-5.15.23-89
|
||||
$ make deb-pkg
|
||||
```
|
||||
Before using the new guest kernel, please update the `kernel` parameters in
|
||||
`configuration.toml`.
|
||||
|
||||
```sh
|
||||
kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container"
|
||||
```
|
||||
|
||||
## NVIDIA GPU pass-through mode with Kata Containers
|
||||
|
||||
Use the following steps to pass an NVIDIA GPU device in pass-through mode with Kata:
|
||||
|
||||
1. Find the Bus-Device-Function (BDF) for the GPU device on the host:
|
||||
|
||||
```sh
|
||||
$ sudo lspci -nn -D | grep -i nvidia
|
||||
0000:d0:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:20b9] (rev a1)
|
||||
```
|
||||
|
||||
> PCI address `0000:d0:00.0` is assigned to the hardware GPU device.
|
||||
> `10de:20b9` is the device ID of the hardware GPU device.
|
||||
|
||||
2. Find the IOMMU group for the GPU device:
|
||||
|
||||
```sh
|
||||
$ BDF="0000:d0:00.0"
|
||||
$ readlink -e /sys/bus/pci/devices/$BDF/iommu_group
|
||||
```
|
||||
|
||||
The previous output shows that the GPU belongs to IOMMU group 192. The next
|
||||
step is to bind the GPU to the VFIO-PCI driver.
|
||||
|
||||
```sh
|
||||
$ BDF="0000:d0:00.0"
|
||||
$ DEV="/sys/bus/pci/devices/$BDF"
|
||||
$ echo "vfio-pci" > $DEV/driver_override
|
||||
$ echo $BDF > $DEV/driver/unbind
|
||||
$ echo $BDF > /sys/bus/pci/drivers_probe
|
||||
# To return the device to the standard driver, we simply clear the
|
||||
# driver_override and reprobe the device, ex:
|
||||
$ echo > $DEV/driver_override
|
||||
$ echo $BDF > $DEV/driver/unbind
|
||||
$ echo $BDF > /sys/bus/pci/drivers_probe
|
||||
```
|
||||
|
||||
3. Check the IOMMU group number under `/dev/vfio`:
|
||||
|
||||
```sh
|
||||
$ ls -l /dev/vfio
|
||||
total 0
|
||||
crw------- 1 zvonkok zvonkok 243, 0 Mar 18 03:06 192
|
||||
crw-rw-rw- 1 root root 10, 196 Mar 18 02:27 vfio
|
||||
```
|
||||
|
||||
4. Start a Kata container with the GPU device:
|
||||
|
||||
```sh
|
||||
# You may need to `modprobe vhost-vsock` if you get
|
||||
# host system doesn't support vsock: stat /dev/vhost-vsock
|
||||
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch uname -r
|
||||
```
|
||||
|
||||
5. Run `lspci` within the container to verify the GPU device is seen in the list
|
||||
of the PCI devices. Note the vendor-device id of the GPU (`10de:20b9`) in the `lspci` output.
|
||||
|
||||
```sh
|
||||
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -nn | grep '10de:20b9'"
|
||||
```
|
||||
|
||||
6. Additionally, you can check the PCI BARs space of the NVIDIA GPU device in the container:
|
||||
|
||||
```sh
|
||||
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -s 02:00.0 -vv | grep Region"
|
||||
```
|
||||
|
||||
> **Note**: If the output lists the BAR memory regions of the device, the BAR space of the NVIDIA
|
||||
> GPU has been successfully allocated.
|
||||
|
||||
## NVIDIA vGPU mode with Kata Containers
|
||||
|
||||
NVIDIA vGPU is a licensed product on all supported GPU boards. A software license
|
||||
is required to enable all vGPU features within the guest VM. NVIDIA vGPU manager
|
||||
needs to be installed on the host to configure GPUs in vGPU mode. See [NVIDIA Virtual GPU Software Documentation v14.0 through 14.1](https://docs.nvidia.com/grid/14.0/) for more details.
|
||||
|
||||
### NVIDIA vGPU time-sliced
|
||||
|
||||
In the time-sliced mode, the GPU is not partitioned and the workload uses the
|
||||
whole GPU and shares access to the GPU engines. Processes are scheduled in
|
||||
series. The best effort scheduler is the default one and can be exchanged by
|
||||
other scheduling policies; see the documentation above for how to do that.
|
||||
|
||||
Beware: if you had `MIG` enabled before, disable `MIG` on the GPU if you want
|
||||
to use `time-sliced` `vGPU`.
|
||||
|
||||
```sh
|
||||
$ sudo nvidia-smi -mig 0
|
||||
```
|
||||
|
||||
Enable the virtual functions for the physical GPU in the `sysfs` file system.
|
||||
|
||||
```sh
|
||||
$ sudo /usr/lib/nvidia/sriov-manage -e 0000:41:00.0
|
||||
```
|
||||
|
||||
Get the `BDF` of the available virtual function on the GPU, and choose one for the
|
||||
following steps.
|
||||
|
||||
```sh
|
||||
$ cd /sys/bus/pci/devices/0000:41:00.0/
|
||||
$ ls -l | grep virtfn
|
||||
```
|
||||
|
||||
#### List all available vGPU instances
|
||||
|
||||
The following shell snippet will walk the `sysfs` and only print instances
|
||||
that are available, that can be created.
|
||||
|
||||
```sh
|
||||
# The 00.0 is often the PF of the device; the VFs have the function number in the
|
||||
# BDF incremented by some value, so e.g. the very first VF is 0000:41:00.4
|
||||
|
||||
cd /sys/bus/pci/devices/0000:41:00.0/
|
||||
|
||||
for vf in $(ls -d virtfn*)
|
||||
do
|
||||
BDF=$(basename $(readlink -f $vf))
|
||||
for md in $(ls -d $vf/mdev_supported_types/*)
|
||||
do
|
||||
AVAIL=$(cat $md/available_instances)
|
||||
NAME=$(cat $md/name)
|
||||
DIR=$(basename $md)
|
||||
|
||||
if [ $AVAIL -gt 0 ]; then
|
||||
echo "| BDF | INSTANCES | NAME | DIR |"
|
||||
echo "+--------------+-----------+----------------+------------+"
|
||||
printf "| %12s |%10d |%15s | %10s |\n\n" "$BDF" "$AVAIL" "$NAME" "$DIR"
|
||||
fi
|
||||
|
||||
done
|
||||
done
|
||||
```
|
||||
|
||||
If there are available instances you get something like this (for the first VF),
|
||||
beware that the output is highly dependent on the GPU you have, if there is no
|
||||
output check again if `MIG` is really disabled.
|
||||
|
||||
```sh
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-4C | nvidia-692 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-8C | nvidia-693 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-10C | nvidia-694 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-16C | nvidia-695 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-20C | nvidia-696 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-40C | nvidia-697 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 | GRID A100D-80C | nvidia-698 |
|
||||
|
||||
```
|
||||
|
||||
Change to the `mdev_supported_types` directory for the virtual function on which
|
||||
you want to create the `vGPU`. Taking the first output as an example:
|
||||
|
||||
```sh
|
||||
$ cd virtfn0/mdev_supported_types/nvidia-692
|
||||
$ UUIDGEN=$(uuidgen)
|
||||
$ sudo bash -c "echo $UUIDGEN > create"
|
||||
```
|
||||
|
||||
Confirm that the `vGPU` was created. You should see the `UUID` pointing to a
|
||||
subdirectory of the `sysfs` space.
|
||||
|
||||
```sh
|
||||
$ ls -l /sys/bus/mdev/devices/
|
||||
```
|
||||
|
||||
Get the `IOMMU` group number and verify there is a `VFIO` device created to use
|
||||
with Kata.
|
||||
|
||||
```sh
|
||||
$ ls -l /sys/bus/mdev/devices/*/
|
||||
$ ls -l /dev/vfio
|
||||
```
|
||||
|
||||
Use the `VFIO` device created in the same way as in the pass-through use-case.
|
||||
Beware that the guest needs the NVIDIA guest drivers, so one would need to build
|
||||
a new guest `OS` image.
|
||||
|
||||
### NVIDIA vGPU MIG-backed
|
||||
|
||||
We're not going into detail what `MIG` is but briefly it is a technology to
|
||||
partition the hardware into independent instances with guaranteed quality of
|
||||
service. For more details see [NVIDIA Multi-Instance GPU User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
|
||||
|
||||
First enable `MIG` mode for a GPU, depending on the platform you're running
|
||||
a reboot would be necessary. Some platforms support GPU reset.
|
||||
|
||||
```sh
|
||||
$ sudo nvidia-smi -mig 1
|
||||
```
|
||||
|
||||
If the platform supports a GPU reset one can run, otherwise you will get a
|
||||
warning to reboot the server.
|
||||
|
||||
```sh
|
||||
$ sudo nvidia-smi --gpu-reset
|
||||
```
|
||||
|
||||
The driver per default provides a number of profiles that users can opt-in when
|
||||
configuring the MIG feature.
|
||||
|
||||
```sh
|
||||
$ sudo nvidia-smi mig -lgip
|
||||
+-----------------------------------------------------------------------------+
|
||||
| GPU instance profiles: |
|
||||
| GPU Name ID Instances Memory P2P SM DEC ENC |
|
||||
| Free/Total GiB CE JPEG OFA |
|
||||
|=============================================================================|
|
||||
| 0 MIG 1g.10gb 19 7/7 9.50 No 14 0 0 |
|
||||
| 1 0 0 |
|
||||
+-----------------------------------------------------------------------------+
|
||||
| 0 MIG 1g.10gb+me 20 1/1 9.50 No 14 1 0 |
|
||||
| 1 1 1 |
|
||||
+-----------------------------------------------------------------------------+
|
||||
| 0 MIG 2g.20gb 14 3/3 19.50 No 28 1 0 |
|
||||
| 2 0 0 |
|
||||
+-----------------------------------------------------------------------------+
|
||||
...
|
||||
```
|
||||
|
||||
Create the GPU instances that correspond to the `vGPU` types of the `MIG-backed`
|
||||
`vGPUs` that you will create [NVIDIA A100 PCIe 80GB Virtual GPU Types](https://docs.nvidia.com/grid/13.0/grid-vgpu-user-guide/index.html#vgpu-types-nvidia-a100-pcie-80gb).
|
||||
|
||||
```sh
|
||||
# MIG 1g.10gb --> vGPU A100D-1-10C
|
||||
$ sudo nvidia-smi mig -cgi 19
|
||||
```
|
||||
|
||||
List the GPU instances and get the GPU instance id to create the compute
|
||||
instance.
|
||||
|
||||
```sh
|
||||
$ sudo nvidia-smi mig -lgi # list the created GPU instances
|
||||
$ sudo nvidia-smi mig -cci -gi 9 # each GPU instance can have several compute
|
||||
# instances. Instance -> Workload
|
||||
```
|
||||
|
||||
Verify that the compute instances were created within the GPU instance
|
||||
|
||||
```sh
|
||||
$ nvidia-smi
|
||||
... snip ...
|
||||
+-----------------------------------------------------------------------------+
|
||||
| MIG devices: |
|
||||
+------------------+----------------------+-----------+-----------------------+
|
||||
| GPU GI CI MIG | Memory-Usage | Vol| Shared |
|
||||
| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG|
|
||||
| | | ECC| |
|
||||
|==================+======================+===========+=======================|
|
||||
| 0 9 0 0 | 0MiB / 9728MiB | 14 0 | 1 0 0 0 0 |
|
||||
| | 0MiB / 4095MiB | | |
|
||||
+------------------+----------------------+-----------+-----------------------+
|
||||
... snip ...
|
||||
```
|
||||
|
||||
We can use the [snippet](#list-all-available-vgpu-instances) from before to list
|
||||
the available `vGPU` instances, this time `MIG-backed`.
|
||||
|
||||
```sh
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.4 | 1 |GRID A100D-1-10C | nvidia-699 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:00.5 | 1 |GRID A100D-1-10C | nvidia-699 |
|
||||
|
||||
| BDF | INSTANCES | NAME | DIR |
|
||||
+--------------+-----------+----------------+------------+
|
||||
| 0000:41:01.6 | 1 |GRID A100D-1-10C | nvidia-699 |
|
||||
... snip ...
|
||||
```
|
||||
|
||||
Repeat the steps after the [snippet](#list-all-available-vgpu-instances) listing
|
||||
to create the corresponding `mdev` device and use the guest `OS` created in the
|
||||
previous section with `time-sliced` `vGPUs`.
|
||||
|
||||
## Install NVIDIA Driver + Toolkit in Kata Containers Guest OS
|
||||
|
||||
Consult the [Developer-Guide](https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#create-a-rootfs-image) on how to create a
|
||||
rootfs base image for a distribution of your choice. This is going to be used as
|
||||
a base for a NVIDIA enabled guest OS. Use the `EXTRA_PKGS` variable to install
|
||||
all the needed packages to compile the drivers. Also copy the kernel development
|
||||
packages from the previous `make deb-pkg` into `$ROOTFS_DIR`.
|
||||
|
||||
```sh
|
||||
export EXTRA_PKGS="gcc make curl gnupg"
|
||||
```
|
||||
|
||||
Having the `$ROOTFS_DIR` exported in the previous step we can now install all the
|
||||
needed parts in the guest OS. In this case, we have an Ubuntu based rootfs.
|
||||
|
||||
First of all, mount the special filesystems into the rootfs
|
||||
|
||||
```sh
|
||||
$ sudo mount -t sysfs -o ro none ${ROOTFS_DIR}/sys
|
||||
$ sudo mount -t proc -o ro none ${ROOTFS_DIR}/proc
|
||||
$ sudo mount -t tmpfs none ${ROOTFS_DIR}/tmp
|
||||
$ sudo mount -o bind,ro /dev ${ROOTFS_DIR}/dev
|
||||
$ sudo mount -t devpts none ${ROOTFS_DIR}/dev/pts
|
||||
```
|
||||
|
||||
Now we can enter `chroot`
|
||||
|
||||
```sh
|
||||
$ sudo chroot ${ROOTFS_DIR}
|
||||
```
|
||||
|
||||
Inside the rootfs one is going to install the drivers and toolkit to enable the
|
||||
easy creation of GPU containers with Kata. We can also use this rootfs for any
|
||||
other container not specifically only for GPUs.
|
||||
|
||||
As a prerequisite install the copied kernel development packages
|
||||
|
||||
```sh
|
||||
$ sudo dpkg -i *.deb
|
||||
```
|
||||
|
||||
Get the driver run file, since we need to build the driver against a kernel that
|
||||
is not running on the host we need the ability to specify the exact version we
|
||||
want the driver to build against. Take the kernel version one used for building
|
||||
the NVIDIA kernel (`5.15.23-nvidia-gpu`).
|
||||
|
||||
```sh
|
||||
$ wget https://us.download.nvidia.com/XFree86/Linux-x86_64/510.54/NVIDIA-Linux-x86_64-510.54.run
|
||||
$ chmod +x NVIDIA-Linux-x86_64-510.54.run
|
||||
# Extract the source files so we can run the installer with arguments
|
||||
$ ./NVIDIA-Linux-x86_64-510.54.run -x
|
||||
$ cd NVIDIA-Linux-x86_64-510.54
|
||||
$ ./nvidia-installer -k 5.15.23-nvidia-gpu
|
||||
```
|
||||
|
||||
Having the drivers installed we need to install the toolkit which will take care
|
||||
of providing the right bits into the container.
|
||||
|
||||
```sh
|
||||
$ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
|
||||
$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
$ curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
$ apt update
|
||||
$ apt install nvidia-container-toolkit
|
||||
```
|
||||
|
||||
Create the hook execution file for Kata:
|
||||
|
||||
```
|
||||
# Content of $ROOTFS_DIR/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh
|
||||
|
||||
#!/bin/bash -x
|
||||
|
||||
/usr/bin/nvidia-container-toolkit -debug $@
|
||||
```
|
||||
|
||||
As the last step one can do some cleanup of files or package caches. Build the
|
||||
rootfs and configure it for use with Kata according to the development guide.
|
||||
|
||||
Enable the `guest_hook_path` in Kata's `configuration.toml`
|
||||
|
||||
```sh
|
||||
guest_hook_path = "/usr/share/oci/hooks"
|
||||
```
|
||||
|
||||
One has built a NVIDIA rootfs, kernel and now we can run any GPU container
|
||||
without installing the drivers into the container. Check NVIDIA device status
|
||||
with `nvidia-smi`
|
||||
|
||||
```sh
|
||||
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/nvidia/cuda:11.6.0-base-ubuntu20.04" cuda nvidia-smi
|
||||
Fri Mar 18 10:36:59 2022
|
||||
+-----------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 510.54 Driver Version: 510.54 CUDA Version: 11.6 |
|
||||
|-------------------------------+----------------------+----------------------+
|
||||
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|
||||
| | | MIG M. |
|
||||
|===============================+======================+======================|
|
||||
| 0 NVIDIA A30X Off | 00000000:02:00.0 Off | 0 |
|
||||
| N/A 38C P0 67W / 230W | 0MiB / 24576MiB | 0% Default |
|
||||
| | | Disabled |
|
||||
+-------------------------------+----------------------+----------------------+
|
||||
|
||||
+-----------------------------------------------------------------------------+
|
||||
| Processes: |
|
||||
| GPU GI CI PID Type Process name GPU Memory |
|
||||
| ID ID Usage |
|
||||
|=============================================================================|
|
||||
| No running processes found |
|
||||
+-----------------------------------------------------------------------------+
|
||||
```
|
||||
|
||||
As the last step one can remove the additional packages and files that were added
|
||||
to the `$ROOTFS_DIR` to keep it as small as possible.
|
||||
|
||||
## References
|
||||
|
||||
- [Configuring a VM for GPU Pass-Through by Using the QEMU Command Line](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#using-gpu-pass-through-red-hat-el-qemu-cli)
|
||||
- https://gitlab.com/nvidia/container-images/driver/-/tree/master
|
||||
- https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers
|
||||
@@ -1,293 +0,0 @@
|
||||
# Using Nvidia GPU device with Kata Containers
|
||||
|
||||
An Nvidia GPU device can be passed to a Kata Containers container using GPU passthrough
|
||||
(Nvidia GPU pass-through mode) as well as GPU mediated passthrough (Nvidia vGPU mode).
|
||||
|
||||
Nvidia GPU pass-through mode, an entire physical GPU is directly assigned to one VM,
|
||||
bypassing the Nvidia Virtual GPU Manager. In this mode of operation, the GPU is accessed
|
||||
exclusively by the Nvidia driver running in the VM to which it is assigned.
|
||||
The GPU is not shared among VMs.
|
||||
|
||||
Nvidia Virtual GPU (vGPU) enables multiple virtual machines (VMs) to have simultaneous,
|
||||
direct access to a single physical GPU, using the same Nvidia graphics drivers that are
|
||||
deployed on non-virtualized operating systems. By doing this, Nvidia vGPU provides VMs
|
||||
with unparalleled graphics performance, compute performance, and application compatibility,
|
||||
together with the cost-effectiveness and scalability brought about by sharing a GPU
|
||||
among multiple workloads.
|
||||
|
||||
| Technology | Description | Behaviour | Detail |
|
||||
| --- | --- | --- | --- |
|
||||
| Nvidia GPU pass-through mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation |
|
||||
| Nvidia vGPU mode | GPU sharing | Physical GPU shared by multiple VMs | Mediated passthrough |
|
||||
|
||||
## Hardware Requirements
|
||||
Nvidia GPUs Recommended for Virtualization:
|
||||
|
||||
- Nvidia Tesla (T4, M10, P6, V100 or newer)
|
||||
- Nvidia Quadro RTX 6000/8000
|
||||
|
||||
## Host BIOS Requirements
|
||||
|
||||
Some hardware requires a larger PCI BARs window, for example, Nvidia Tesla P100, K40m
|
||||
```
|
||||
$ lspci -s 04:00.0 -vv | grep Region
|
||||
Region 0: Memory at c6000000 (32-bit, non-prefetchable) [size=16M]
|
||||
Region 1: Memory at 383800000000 (64-bit, prefetchable) [size=16G] #above 4G
|
||||
Region 3: Memory at 383c00000000 (64-bit, prefetchable) [size=32M]
|
||||
```
|
||||
|
||||
For large BARs devices, MMIO mapping above 4G address space should be `enabled`
|
||||
in the PCI configuration of the BIOS.
|
||||
|
||||
Some hardware vendors use different names in the BIOS, such as:
|
||||
|
||||
- Above 4G Decoding
|
||||
- Memory Hole for PCI MMIO
|
||||
- Memory Mapped I/O above 4GB
|
||||
|
||||
The following steps outline the workflow for using an Nvidia GPU with Kata.
|
||||
|
||||
## Host Kernel Requirements
|
||||
The following configurations need to be enabled on your host kernel:
|
||||
|
||||
- `CONFIG_VFIO`
|
||||
- `CONFIG_VFIO_IOMMU_TYPE1`
|
||||
- `CONFIG_VFIO_MDEV`
|
||||
- `CONFIG_VFIO_MDEV_DEVICE`
|
||||
- `CONFIG_VFIO_PCI`
|
||||
|
||||
Your host kernel needs to be booted with `intel_iommu=on` on the kernel command line.
|
||||
|
||||
## Install and configure Kata Containers
|
||||
To use non-large BARs devices (for example, Nvidia Tesla T4), you need Kata version 1.3.0 or above.
|
||||
Follow the [Kata Containers setup instructions](../install/README.md)
|
||||
to install the latest version of Kata.
|
||||
|
||||
To use large BARs devices (for example, Nvidia Tesla P100), you need Kata version 1.11.0 or above.
|
||||
|
||||
The following configuration in the Kata `configuration.toml` file as shown below can work:
|
||||
|
||||
Hotplug for PCI devices by `acpi_pcihp` (Linux's ACPI PCI Hotplug driver):
|
||||
```
|
||||
machine_type = "q35"
|
||||
|
||||
hotplug_vfio_on_root_bus = false
|
||||
```
|
||||
|
||||
Hotplug for PCIe devices by `pciehp` (Linux's PCIe Hotplug driver):
|
||||
```
|
||||
machine_type = "q35"
|
||||
|
||||
hotplug_vfio_on_root_bus = true
|
||||
pcie_root_port = 1
|
||||
```
|
||||
|
||||
## Build Kata Containers kernel with GPU support
|
||||
The default guest kernel installed with Kata Containers does not provide GPU support.
|
||||
To use an Nvidia GPU with Kata Containers, you need to build a kernel with the
|
||||
necessary GPU support.
|
||||
|
||||
The following kernel config options need to be enabled:
|
||||
```
|
||||
# Support PCI/PCIe device hotplug (Required for large BARs device)
|
||||
CONFIG_HOTPLUG_PCI_PCIE=y
|
||||
|
||||
# Support for loading modules (Required for load Nvidia drivers)
|
||||
CONFIG_MODULES=y
|
||||
CONFIG_MODULE_UNLOAD=y
|
||||
|
||||
# Enable the MMIO access method for PCIe devices (Required for large BARs device)
|
||||
CONFIG_PCI_MMCONFIG=y
|
||||
```
|
||||
|
||||
The following kernel config options need to be disabled:
|
||||
```
|
||||
# Disable Open Source Nvidia driver nouveau
|
||||
# It conflicts with Nvidia official driver
|
||||
CONFIG_DRM_NOUVEAU=n
|
||||
```
|
||||
> **Note**: `CONFIG_DRM_NOUVEAU` is normally disabled by default.
|
||||
It is worth checking that it is not enabled in your kernel configuration to prevent any conflicts.
|
||||
|
||||
|
||||
Build the Kata Containers kernel with the previous config options,
|
||||
using the instructions described in [Building Kata Containers kernel](../../tools/packaging/kernel).
|
||||
For further details on building and installing guest kernels,
|
||||
see [the developer guide](../Developer-Guide.md#install-guest-kernel-images).
|
||||
|
||||
There is an easy way to build a guest kernel that supports Nvidia GPU:
|
||||
```
|
||||
## Build guest kernel with ../../tools/packaging/kernel
|
||||
|
||||
# Prepare (download guest kernel source, generate .config)
|
||||
$ ./build-kernel.sh -v 4.19.86 -g nvidia -f setup
|
||||
|
||||
# Build guest kernel
|
||||
$ ./build-kernel.sh -v 4.19.86 -g nvidia build
|
||||
|
||||
# Install guest kernel
|
||||
$ sudo -E ./build-kernel.sh -v 4.19.86 -g nvidia install
|
||||
/usr/share/kata-containers/vmlinux-nvidia-gpu.container -> vmlinux-4.19.86-70-nvidia-gpu
|
||||
/usr/share/kata-containers/vmlinuz-nvidia-gpu.container -> vmlinuz-4.19.86-70-nvidia-gpu
|
||||
```
|
||||
|
||||
To build Nvidia Driver in Kata container, `kernel-devel` is required.
|
||||
This is a way to generate rpm packages for `kernel-devel`:
|
||||
```
|
||||
$ cd kata-linux-4.19.86-68
|
||||
$ make rpm-pkg
|
||||
Output RPMs:
|
||||
~/rpmbuild/RPMS/x86_64/kernel-devel-4.19.86_nvidia_gpu-1.x86_64.rpm
|
||||
```
|
||||
> **Note**:
|
||||
> - `kernel-devel` should be installed in the Kata container before running the Nvidia driver installer.
|
||||
> - Run `make deb-pkg` to build the deb package.
|
||||
|
||||
Before using the new guest kernel, please update the `kernel` parameters in `configuration.toml`.
|
||||
```
|
||||
kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container"
|
||||
```
|
||||
|
||||
## Nvidia GPU pass-through mode with Kata Containers
|
||||
Use the following steps to pass an Nvidia GPU device in pass-through mode with Kata:
|
||||
|
||||
1. Find the Bus-Device-Function (BDF) for GPU device on host:
|
||||
```
|
||||
$ sudo lspci -nn -D | grep -i nvidia
|
||||
0000:04:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:15f8] (rev a1)
|
||||
0000:84:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:15f8] (rev a1)
|
||||
```
|
||||
> PCI address `0000:04:00.0` is assigned to the hardware GPU device.
|
||||
> `10de:15f8` is the device ID of the hardware GPU device.
|
||||
|
||||
2. Find the IOMMU group for the GPU device:
|
||||
```
|
||||
$ BDF="0000:04:00.0"
|
||||
$ readlink -e /sys/bus/pci/devices/$BDF/iommu_group
|
||||
/sys/kernel/iommu_groups/45
|
||||
```
|
||||
The previous output shows that the GPU belongs to IOMMU group 45.
|
||||
|
||||
3. Check the IOMMU group number under `/dev/vfio`:
|
||||
```
|
||||
$ ls -l /dev/vfio
|
||||
total 0
|
||||
crw------- 1 root root 248, 0 Feb 28 09:57 45
|
||||
crw------- 1 root root 248, 1 Feb 28 09:57 54
|
||||
crw-rw-rw- 1 root root 10, 196 Feb 28 09:57 vfio
|
||||
```
|
||||
|
||||
4. Start a Kata container with GPU device:
|
||||
```
|
||||
$ sudo docker run -it --runtime=kata-runtime --cap-add=ALL --device /dev/vfio/45 centos /bin/bash
|
||||
```
|
||||
|
||||
5. Run `lspci` within the container to verify the GPU device is seen in the list
|
||||
of the PCI devices. Note the vendor-device id of the GPU (`10de:15f8`) in the `lspci` output.
|
||||
```
|
||||
$ lspci -nn -D | grep '10de:15f8'
|
||||
0000:01:01.0 3D controller [0302]: NVIDIA Corporation GP100GL [Tesla P100 PCIe 16GB] [10de:15f8] (rev a1)
|
||||
```
|
||||
|
||||
6. Additionally, you can check the PCI BARs space of the Nvidia GPU device in the container:
|
||||
```
|
||||
$ lspci -s 01:01.0 -vv | grep Region
|
||||
Region 0: Memory at c0000000 (32-bit, non-prefetchable) [disabled] [size=16M]
|
||||
Region 1: Memory at 4400000000 (64-bit, prefetchable) [disabled] [size=16G]
|
||||
Region 3: Memory at 4800000000 (64-bit, prefetchable) [disabled] [size=32M]
|
||||
```
|
||||
> **Note**: If you see a message similar to the above, the BAR space of the Nvidia
|
||||
> GPU has been successfully allocated.
|
||||
|
||||
## Nvidia vGPU mode with Kata Containers
|
||||
|
||||
Nvidia vGPU is a licensed product on all supported GPU boards. A software license
|
||||
is required to enable all vGPU features within the guest VM.
|
||||
|
||||
> **Note**: There is no suitable test environment, so it is not written here.
|
||||
|
||||
|
||||
## Install Nvidia Driver in Kata Containers
|
||||
Download the official Nvidia driver from
|
||||
[https://www.nvidia.com/Download/index.aspx](https://www.nvidia.com/Download/index.aspx),
|
||||
for example `NVIDIA-Linux-x86_64-418.87.01.run`.
|
||||
|
||||
Install the `kernel-devel`(generated in the previous steps) for guest kernel:
|
||||
```
|
||||
$ sudo rpm -ivh kernel-devel-4.19.86_nvidia_gpu-1.x86_64.rpm
|
||||
```
|
||||
|
||||
Here is an example to extract, compile and install Nvidia driver:
|
||||
```
|
||||
## Extract
|
||||
$ sh ./NVIDIA-Linux-x86_64-418.87.01.run -x
|
||||
|
||||
## Compile and install (It will take some time)
|
||||
$ cd NVIDIA-Linux-x86_64-418.87.01
|
||||
$ sudo ./nvidia-installer -a -q --ui=none \
|
||||
--no-cc-version-check \
|
||||
--no-opengl-files --no-install-libglvnd \
|
||||
--kernel-source-path=/usr/src/kernels/`uname -r`
|
||||
```
|
||||
|
||||
Or just run one command line:
|
||||
```
|
||||
$ sudo sh ./NVIDIA-Linux-x86_64-418.87.01.run -a -q --ui=none \
|
||||
--no-cc-version-check \
|
||||
--no-opengl-files --no-install-libglvnd \
|
||||
--kernel-source-path=/usr/src/kernels/`uname -r`
|
||||
```
|
||||
|
||||
To view detailed logs of the installer:
|
||||
```
|
||||
$ tail -f /var/log/nvidia-installer.log
|
||||
```
|
||||
|
||||
Load Nvidia driver module manually
|
||||
```
|
||||
# Optional(generate modules.dep and map files for Nvidia driver)
|
||||
$ sudo depmod
|
||||
|
||||
# Load module
|
||||
$ sudo modprobe nvidia-drm
|
||||
|
||||
# Check module
|
||||
$ lsmod | grep nvidia
|
||||
nvidia_drm 45056 0
|
||||
nvidia_modeset 1093632 1 nvidia_drm
|
||||
nvidia 18202624 1 nvidia_modeset
|
||||
drm_kms_helper 159744 1 nvidia_drm
|
||||
drm 364544 3 nvidia_drm,drm_kms_helper
|
||||
i2c_core 65536 3 nvidia,drm_kms_helper,drm
|
||||
ipmi_msghandler 49152 1 nvidia
|
||||
```
|
||||
|
||||
|
||||
Check Nvidia device status with `nvidia-smi`
|
||||
```
|
||||
$ nvidia-smi
|
||||
Tue Mar 3 00:03:49 2020
|
||||
+-----------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 418.87.01 Driver Version: 418.87.01 CUDA Version: 10.1 |
|
||||
|-------------------------------+----------------------+----------------------+
|
||||
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|
||||
|===============================+======================+======================|
|
||||
| 0 Tesla P100-PCIE... Off | 00000000:01:01.0 Off | 0 |
|
||||
| N/A 27C P0 25W / 250W | 0MiB / 16280MiB | 0% Default |
|
||||
+-------------------------------+----------------------+----------------------+
|
||||
|
||||
+-----------------------------------------------------------------------------+
|
||||
| Processes: GPU Memory |
|
||||
| GPU PID Type Process name Usage |
|
||||
|=============================================================================|
|
||||
| No running processes found |
|
||||
+-----------------------------------------------------------------------------+
|
||||
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- [Configuring a VM for GPU Pass-Through by Using the QEMU Command Line](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#using-gpu-pass-through-red-hat-el-qemu-cli)
|
||||
- https://gitlab.com/nvidia/container-images/driver/-/tree/master
|
||||
- https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers
|
||||
@@ -279,8 +279,8 @@ $ export KERNEL_EXTRAVERSION=$(awk '/^EXTRAVERSION =/{print $NF}' $GOPATH/$LINUX
|
||||
$ export KERNEL_ROOTFS_DIR=${KERNEL_MAJOR_VERSION}.${KERNEL_PATHLEVEL}.${KERNEL_SUBLEVEL}${KERNEL_EXTRAVERSION}
|
||||
$ cd $QAT_SRC
|
||||
$ KERNEL_SOURCE_ROOT=$GOPATH/$LINUX_VER ./configure --enable-icp-sriov=guest
|
||||
$ sudo -E make all -j$(nproc)
|
||||
$ sudo -E make INSTALL_MOD_PATH=$ROOTFS_DIR qat-driver-install -j$(nproc)
|
||||
$ sudo -E make all -j $($(nproc ${CI:+--ignore 1}))
|
||||
$ sudo -E make INSTALL_MOD_PATH=$ROOTFS_DIR qat-driver-install -j $($(nproc ${CI:+--ignore 1}))
|
||||
```
|
||||
|
||||
The `usdm_drv` module also needs to be copied into the rootfs modules path and
|
||||
@@ -312,7 +312,7 @@ working properly with the Kata Containers VM.
|
||||
|
||||
### Build OpenSSL Intel® QAT engine container
|
||||
|
||||
Use the OpenSSL Intel® QAT [Dockerfile](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/master/demo/openssl-qat-engine)
|
||||
Use the OpenSSL Intel® QAT [Dockerfile](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/openssl-qat-engine)
|
||||
to build a container image with an optimized OpenSSL engine for
|
||||
Intel® QAT. Using `docker build` with the Kata Containers runtime can sometimes
|
||||
have issues. Therefore, make sure that `runc` is the default Docker container
|
||||
@@ -444,7 +444,7 @@ $ sudo docker save -o openssl-qat-engine.tar openssl-qat-engine:latest
|
||||
$ sudo ctr -n=k8s.io images import openssl-qat-engine.tar
|
||||
```
|
||||
|
||||
The [Intel® QAT Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/master/cmd/qat_plugin/README.md)
|
||||
The [Intel® QAT Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md)
|
||||
needs to be started so that the virtual functions can be discovered and
|
||||
used by Kubernetes.
|
||||
|
||||
|
||||
@@ -18,16 +18,13 @@ CONFIG_X86_SGX_KVM=y
|
||||
|
||||
* Kubernetes cluster configured with:
|
||||
* [`kata-deploy`](../../tools/packaging/kata-deploy) based Kata Containers installation
|
||||
* [Intel SGX Kubernetes device plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/cmd/sgx_plugin#deploying-with-pre-built-images)
|
||||
* [Intel SGX Kubernetes device plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/cmd/sgx_plugin#deploying-with-pre-built-images) and associated components including [operator](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/operator/README.md) and dependencies
|
||||
|
||||
> Note: Kata Containers supports creating VM sandboxes with Intel® SGX enabled
|
||||
> using [cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor/) and [QEMU](https://www.qemu.org/) VMMs only.
|
||||
|
||||
### Kata Containers Configuration
|
||||
|
||||
Before running a Kata Container make sure that your version of `crio` or `containerd`
|
||||
supports annotations.
|
||||
|
||||
For `containerd` check in `/etc/containerd/config.toml` that the list of `pod_annotations` passed
|
||||
to the `sandbox` are: `["io.katacontainers.*", "sgx.intel.com/epc"]`.
|
||||
|
||||
@@ -99,4 +96,4 @@ because socket passthrough is not supported. An alternative is to deploy the `ae
|
||||
container.
|
||||
* Projects like [Gramine Shielded Containers (GSC)](https://gramine-gsc.readthedocs.io/en/latest/) are
|
||||
also known to work. For GSC specifically, the Kata guest kernel needs to have the `CONFIG_NUMA=y`
|
||||
enabled and at least one CPU online when running the GSC container.
|
||||
enabled and at least one CPU online when running the GSC container. The Kata Containers guest kernel currently has `CONFIG_NUMA=y` enabled by default.
|
||||
|
||||
@@ -22,21 +22,35 @@ $ sudo snap install kata-containers --classic
|
||||
|
||||
## Build and install snap image
|
||||
|
||||
Run next command at the root directory of the packaging repository.
|
||||
Run the command below which will use the packaging Makefile to build the snap image:
|
||||
|
||||
```sh
|
||||
$ make snap
|
||||
$ make -C tools/packaging snap
|
||||
```
|
||||
|
||||
> **Warning:**
|
||||
>
|
||||
> By default, `snapcraft` will create a clean virtual machine
|
||||
> environment to build the snap in using the `multipass` tool.
|
||||
>
|
||||
> However, `multipass` is silently disabled when `--destructive-mode` is
|
||||
> used.
|
||||
>
|
||||
> Since building the Kata Containers package currently requires
|
||||
> `--destructive-mode`, the snap will be built using the host
|
||||
> environment. To avoid parts of the build auto-detecting additional
|
||||
> features to enable (for example for QEMU), we recommend that you
|
||||
> only run the snap build in a minimal host environment.
|
||||
|
||||
To install the resulting snap image, snap must be put in [classic mode][3] and the
|
||||
security confinement must be disabled (*--classic*). Also since the resulting snap
|
||||
has not been signed the verification of signature must be omitted (*--dangerous*).
|
||||
security confinement must be disabled (`--classic`). Also since the resulting snap
|
||||
has not been signed the verification of signature must be omitted (`--dangerous`).
|
||||
|
||||
```sh
|
||||
$ sudo snap install --classic --dangerous kata-containers_[VERSION]_[ARCH].snap
|
||||
$ sudo snap install --classic --dangerous "kata-containers_${version}_${arch}.snap"
|
||||
```
|
||||
|
||||
Replace `VERSION` with the current version of Kata Containers and `ARCH` with
|
||||
Replace `${version}` with the current version of Kata Containers and `${arch}` with
|
||||
the system architecture.
|
||||
|
||||
## Configure Kata Containers
|
||||
@@ -76,12 +90,12 @@ then a new configuration file can be [created](#configure-kata-containers)
|
||||
and [configured][7].
|
||||
|
||||
[1]: https://docs.snapcraft.io/snaps/intro
|
||||
[2]: ../docs/design/architecture/README.md#root-filesystem-image
|
||||
[2]: ../../docs/design/architecture/README.md#root-filesystem-image
|
||||
[3]: https://docs.snapcraft.io/reference/confinement#classic
|
||||
[4]: https://github.com/kata-containers/runtime#configuration
|
||||
[4]: https://github.com/kata-containers/kata-containers/tree/main/src/runtime#configuration
|
||||
[5]: https://docs.docker.com/engine/reference/commandline/dockerd
|
||||
[6]: ../docs/install/docker/ubuntu-docker-install.md
|
||||
[7]: ../docs/Developer-Guide.md#configure-to-use-initrd-or-rootfs-image
|
||||
[6]: ../../docs/install/docker/ubuntu-docker-install.md
|
||||
[7]: ../../docs/Developer-Guide.md#configure-to-use-initrd-or-rootfs-image
|
||||
[8]: https://snapcraft.io/kata-containers
|
||||
[9]: ../docs/Developer-Guide.md#run-kata-containers-with-docker
|
||||
[10]: ../docs/Developer-Guide.md#run-kata-containers-with-kubernetes
|
||||
[9]: ../../docs/Developer-Guide.md#run-kata-containers-with-docker
|
||||
[10]: ../../docs/Developer-Guide.md#run-kata-containers-with-kubernetes
|
||||
114
snap/local/snap-common.sh
Normal file
114
snap/local/snap-common.sh
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Copyright (c) 2022 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Description: Idempotent script to be sourced by all parts in a
|
||||
# snapcraft config file.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
# XXX: Bash-specific code. zsh doesn't support this option and that *does*
|
||||
# matter if this script is run sourced... since it'll be using zsh! ;)
|
||||
[ -n "$BASH_VERSION" ] && set -o errtrace
|
||||
|
||||
[ -n "${DEBUG:-}" ] && set -o xtrace
|
||||
|
||||
die()
|
||||
{
|
||||
echo >&2 "ERROR: $0: $*"
|
||||
}
|
||||
|
||||
[ -n "${SNAPCRAFT_STAGE:-}" ] ||\
|
||||
die "must be sourced from a snapcraft config file"
|
||||
|
||||
snap_yq_version=3.4.1
|
||||
|
||||
snap_common_install_yq()
|
||||
{
|
||||
export yq="${SNAPCRAFT_STAGE}/bin/yq"
|
||||
|
||||
local yq_pkg
|
||||
yq_pkg="github.com/mikefarah/yq"
|
||||
|
||||
local yq_url
|
||||
yq_url="https://${yq_pkg}/releases/download/${snap_yq_version}/yq_${goos}_${goarch}"
|
||||
curl -o "${yq}" -L "${yq_url}"
|
||||
chmod +x "${yq}"
|
||||
}
|
||||
|
||||
# Function that should be called for each snap "part" in
|
||||
# snapcraft.yaml.
|
||||
snap_common_main()
|
||||
{
|
||||
# Architecture
|
||||
arch="$(uname -m)"
|
||||
|
||||
case "${arch}" in
|
||||
aarch64)
|
||||
goarch="arm64"
|
||||
qemu_arch="${arch}"
|
||||
;;
|
||||
|
||||
ppc64le)
|
||||
goarch="ppc64le"
|
||||
qemu_arch="ppc64"
|
||||
;;
|
||||
|
||||
s390x)
|
||||
goarch="${arch}"
|
||||
qemu_arch="${arch}"
|
||||
;;
|
||||
|
||||
x86_64)
|
||||
goarch="amd64"
|
||||
qemu_arch="${arch}"
|
||||
;;
|
||||
|
||||
*) die "unsupported architecture: ${arch}" ;;
|
||||
esac
|
||||
|
||||
dpkg_arch=$(dpkg --print-architecture)
|
||||
|
||||
# golang
|
||||
#
|
||||
# We need the O/S name in golang format, but since we don't
|
||||
# know if the godeps part has run, we don't know if golang is
|
||||
# available yet, hence fall back to a standard system command.
|
||||
goos="$(go env GOOS &>/dev/null || true)"
|
||||
[ -z "$goos" ] && goos=$(uname -s|tr '[A-Z]' '[a-z]')
|
||||
|
||||
export GOROOT="${SNAPCRAFT_STAGE}"
|
||||
export GOPATH="${GOROOT}/gopath"
|
||||
export GO111MODULE="auto"
|
||||
|
||||
mkdir -p "${GOPATH}/bin"
|
||||
export PATH="${GOPATH}/bin:${PATH}"
|
||||
|
||||
# Proxy
|
||||
export http_proxy="${http_proxy:-}"
|
||||
export https_proxy="${https_proxy:-}"
|
||||
|
||||
# Binaries
|
||||
mkdir -p "${SNAPCRAFT_STAGE}/bin"
|
||||
|
||||
export PATH="$PATH:${SNAPCRAFT_STAGE}/bin"
|
||||
|
||||
# YAML query tool
|
||||
export yq="${SNAPCRAFT_STAGE}/bin/yq"
|
||||
|
||||
# Kata paths
|
||||
export kata_dir=$(printf "%s/src/github.com/%s/%s" \
|
||||
"${GOPATH}" \
|
||||
"${SNAPCRAFT_PROJECT_NAME}" \
|
||||
"${SNAPCRAFT_PROJECT_NAME}")
|
||||
|
||||
export versions_file="${kata_dir}/versions.yaml"
|
||||
|
||||
[ -n "${yq:-}" ] && [ -x "${yq:-}" ] || snap_common_install_yq
|
||||
}
|
||||
|
||||
snap_common_main
|
||||
@@ -1,4 +1,5 @@
|
||||
name: kata-containers
|
||||
website: https://github.com/kata-containers/kata-containers
|
||||
summary: Build lightweight VMs that seamlessly plug into the containers ecosystem
|
||||
description: |
|
||||
Kata Containers is an open source project and community working to build a
|
||||
@@ -18,20 +19,18 @@ parts:
|
||||
- git
|
||||
- git-extras
|
||||
override-pull: |
|
||||
version="9999"
|
||||
kata_url="https://github.com/kata-containers/kata-containers"
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
if echo "${GITHUB_REF}" | grep -q -E "^refs/tags"; then
|
||||
version=$(echo ${GITHUB_REF} | cut -d/ -f3)
|
||||
version="9999"
|
||||
|
||||
if echo "${GITHUB_REF:-}" | grep -q -E "^refs/tags"; then
|
||||
version=$(echo ${GITHUB_REF:-} | cut -d/ -f3)
|
||||
git checkout ${version}
|
||||
fi
|
||||
|
||||
snapcraftctl set-grade "stable"
|
||||
snapcraftctl set-version "${version}"
|
||||
|
||||
# setup GOPATH - this repo dir should be there
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
mkdir -p $(dirname ${kata_dir})
|
||||
ln -sf $(realpath "${SNAPCRAFT_STAGE}/..") ${kata_dir}
|
||||
|
||||
@@ -43,31 +42,46 @@ parts:
|
||||
build-packages:
|
||||
- curl
|
||||
override-build: |
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
# put everything in stage
|
||||
cd ${SNAPCRAFT_STAGE}
|
||||
cd "${SNAPCRAFT_STAGE}"
|
||||
|
||||
yq_path="./yq"
|
||||
yq_pkg="github.com/mikefarah/yq"
|
||||
goos="linux"
|
||||
case "$(uname -m)" in
|
||||
aarch64) goarch="arm64";;
|
||||
ppc64le) goarch="ppc64le";;
|
||||
x86_64) goarch="amd64";;
|
||||
s390x) goarch="s390x";;
|
||||
*) echo "unsupported architecture: $(uname -m)"; exit 1;;
|
||||
esac
|
||||
|
||||
yq_version=3.4.1
|
||||
yq_url="https://${yq_pkg}/releases/download/${yq_version}/yq_${goos}_${goarch}"
|
||||
curl -o "${yq_path}" -L "${yq_url}"
|
||||
chmod +x "${yq_path}"
|
||||
|
||||
kata_dir=gopath/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
version="$(${yq_path} r ${kata_dir}/versions.yaml languages.golang.meta.newest-version)"
|
||||
version="$(${yq} r ${kata_dir}/versions.yaml languages.golang.meta.newest-version)"
|
||||
tarfile="go${version}.${goos}-${goarch}.tar.gz"
|
||||
curl -LO https://golang.org/dl/${tarfile}
|
||||
tar -xf ${tarfile} --strip-components=1
|
||||
|
||||
rustdeps:
|
||||
after: [metadata]
|
||||
plugin: nil
|
||||
prime:
|
||||
- -*
|
||||
build-packages:
|
||||
- curl
|
||||
override-build: |
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
# put everything in stage
|
||||
cd "${SNAPCRAFT_STAGE}"
|
||||
|
||||
version="$(${yq} r ${kata_dir}/versions.yaml languages.rust.meta.newest-version)"
|
||||
if ! command -v rustup > /dev/null; then
|
||||
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain ${version}
|
||||
fi
|
||||
|
||||
export PATH=${PATH}:${HOME}/.cargo/bin
|
||||
rustup toolchain install ${version}
|
||||
rustup default ${version}
|
||||
if [ "${arch}" == "ppc64le" ] || [ "${arch}" == "s390x" ] ; then
|
||||
[ "${arch}" == "ppc64le" ] && arch="powerpc64le"
|
||||
rustup target add ${arch}-unknown-linux-gnu
|
||||
else
|
||||
rustup target add ${arch}-unknown-linux-musl
|
||||
$([ "$(whoami)" != "root" ] && echo sudo) ln -sf /usr/bin/g++ /bin/musl-g++
|
||||
fi
|
||||
rustup component add rustfmt
|
||||
|
||||
image:
|
||||
after: [godeps, qemu, kernel]
|
||||
plugin: nil
|
||||
@@ -80,28 +94,17 @@ parts:
|
||||
- uidmap
|
||||
- gnupg2
|
||||
override-build: |
|
||||
[ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "s390x" ] && sudo apt-get --no-install-recommends install -y protobuf-compiler
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
yq=${SNAPCRAFT_STAGE}/yq
|
||||
[ "${arch}" = "ppc64le" ] || [ "${arch}" = "s390x" ] && sudo apt-get --no-install-recommends install -y protobuf-compiler
|
||||
|
||||
# set GOPATH
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
|
||||
export GOROOT=${SNAPCRAFT_STAGE}
|
||||
export PATH="${GOROOT}/bin:${PATH}"
|
||||
export GO111MODULE="auto"
|
||||
|
||||
http_proxy=${http_proxy:-""}
|
||||
https_proxy=${https_proxy:-""}
|
||||
if [ -n "$http_proxy" ]; then
|
||||
echo "Setting proxy $http_proxy"
|
||||
sudo -E systemctl set-environment http_proxy=$http_proxy || true
|
||||
sudo -E systemctl set-environment https_proxy=$https_proxy || true
|
||||
sudo -E systemctl set-environment http_proxy="$http_proxy" || true
|
||||
sudo -E systemctl set-environment https_proxy="$https_proxy" || true
|
||||
fi
|
||||
|
||||
# Copy yq binary. It's used in the container
|
||||
mkdir -p "${GOPATH}/bin/"
|
||||
cp -a "${yq}" "${GOPATH}/bin/"
|
||||
|
||||
echo "Unmasking docker service"
|
||||
@@ -112,63 +115,54 @@ parts:
|
||||
echo "Starting docker"
|
||||
sudo -E systemctl start docker || true
|
||||
|
||||
cd ${kata_dir}/tools/osbuilder
|
||||
cd "${kata_dir}/tools/osbuilder"
|
||||
|
||||
# build image
|
||||
export AGENT_INIT=yes
|
||||
export USE_DOCKER=1
|
||||
export DEBUG=1
|
||||
arch="$(uname -m)"
|
||||
initrd_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.initrd.architecture.${arch}.name)
|
||||
image_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.image.architecture.${arch}.name)
|
||||
case "$arch" in
|
||||
x86_64)
|
||||
# In some build systems it's impossible to build a rootfs image, try with the initrd image
|
||||
sudo -E PATH=$PATH make image DISTRO=${image_distro} || sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
|
||||
sudo -E PATH=$PATH make image DISTRO="${image_distro}" || sudo -E PATH="$PATH" make initrd DISTRO="${initrd_distro}"
|
||||
;;
|
||||
|
||||
aarch64|ppc64le|s390x)
|
||||
sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
|
||||
sudo -E PATH="$PATH" make initrd DISTRO="${initrd_distro}"
|
||||
;;
|
||||
|
||||
*) echo "unsupported architecture: $(uname -m)"; exit 1;;
|
||||
*) die "unsupported architecture: ${arch}" ;;
|
||||
esac
|
||||
|
||||
# Install image
|
||||
kata_image_dir=${SNAPCRAFT_PART_INSTALL}/usr/share/kata-containers
|
||||
mkdir -p ${kata_image_dir}
|
||||
cp kata-containers*.img ${kata_image_dir}
|
||||
kata_image_dir="${SNAPCRAFT_PART_INSTALL}/usr/share/kata-containers"
|
||||
mkdir -p "${kata_image_dir}"
|
||||
cp kata-containers*.img "${kata_image_dir}"
|
||||
|
||||
runtime:
|
||||
after: [godeps, image, cloud-hypervisor]
|
||||
plugin: nil
|
||||
build-attributes: [no-patchelf]
|
||||
override-build: |
|
||||
# set GOPATH
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
export GOROOT=${SNAPCRAFT_STAGE}
|
||||
export PATH="${GOROOT}/bin:${PATH}"
|
||||
export GO111MODULE="auto"
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
cd ${kata_dir}/src/runtime
|
||||
cd "${kata_dir}/src/runtime"
|
||||
|
||||
# setup arch
|
||||
arch=$(uname -m)
|
||||
if [ ${arch} = "ppc64le" ]; then
|
||||
arch="ppc64"
|
||||
fi
|
||||
qemu_cmd="qemu-system-${qemu_arch}"
|
||||
|
||||
# build and install runtime
|
||||
make \
|
||||
PREFIX=/snap/${SNAPCRAFT_PROJECT_NAME}/current/usr \
|
||||
PREFIX="/snap/${SNAPCRAFT_PROJECT_NAME}/current/usr" \
|
||||
SKIP_GO_VERSION_CHECK=1 \
|
||||
QEMUCMD=qemu-system-$arch
|
||||
QEMUCMD="${qemu_cmd}"
|
||||
|
||||
make install \
|
||||
PREFIX=/usr \
|
||||
DESTDIR=${SNAPCRAFT_PART_INSTALL} \
|
||||
DESTDIR="${SNAPCRAFT_PART_INSTALL}" \
|
||||
SKIP_GO_VERSION_CHECK=1 \
|
||||
QEMUCMD=qemu-system-$arch
|
||||
QEMUCMD="${qemu_cmd}"
|
||||
|
||||
if [ ! -f ${SNAPCRAFT_PART_INSTALL}/../../image/install/usr/share/kata-containers/kata-containers.img ]; then
|
||||
sed -i -e "s|^image =.*|initrd = \"/snap/${SNAPCRAFT_PROJECT_NAME}/current/usr/share/kata-containers/kata-containers-initrd.img\"|" \
|
||||
@@ -185,44 +179,37 @@ parts:
|
||||
- bison
|
||||
- flex
|
||||
override-build: |
|
||||
yq=${SNAPCRAFT_STAGE}/yq
|
||||
export PATH="${PATH}:${SNAPCRAFT_STAGE}"
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
versions_file="${kata_dir}/versions.yaml"
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
kernel_version="$(${yq} r $versions_file assets.kernel.version)"
|
||||
#Remove extra 'v'
|
||||
kernel_version=${kernel_version#v}
|
||||
kernel_version="${kernel_version#v}"
|
||||
|
||||
[ "$(uname -m)" = "s390x" ] && sudo apt-get --no-install-recommends install -y libssl-dev
|
||||
[ "${arch}" = "s390x" ] && sudo apt-get --no-install-recommends install -y libssl-dev
|
||||
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
export GO111MODULE="auto"
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
|
||||
cd ${kata_dir}/tools/packaging/kernel
|
||||
cd "${kata_dir}/tools/packaging/kernel"
|
||||
kernel_dir_prefix="kata-linux-"
|
||||
|
||||
# Setup and build kernel
|
||||
./build-kernel.sh -v ${kernel_version} -d setup
|
||||
./build-kernel.sh -v "${kernel_version}" -d setup
|
||||
cd ${kernel_dir_prefix}*
|
||||
make -j $(($(nproc)-1)) EXTRAVERSION=".container"
|
||||
make -j $(nproc ${CI:+--ignore 1}) EXTRAVERSION=".container"
|
||||
|
||||
kernel_suffix=${kernel_version}.container
|
||||
kata_kernel_dir=${SNAPCRAFT_PART_INSTALL}/usr/share/kata-containers
|
||||
mkdir -p ${kata_kernel_dir}
|
||||
kernel_suffix="${kernel_version}.container"
|
||||
kata_kernel_dir="${SNAPCRAFT_PART_INSTALL}/usr/share/kata-containers"
|
||||
mkdir -p "${kata_kernel_dir}"
|
||||
|
||||
# Install bz kernel
|
||||
make install INSTALL_PATH=${kata_kernel_dir} EXTRAVERSION=".container" || true
|
||||
vmlinuz_name=vmlinuz-${kernel_suffix}
|
||||
ln -sf ${vmlinuz_name} ${kata_kernel_dir}/vmlinuz.container
|
||||
make install INSTALL_PATH="${kata_kernel_dir}" EXTRAVERSION=".container" || true
|
||||
vmlinuz_name="vmlinuz-${kernel_suffix}"
|
||||
ln -sf "${vmlinuz_name}" "${kata_kernel_dir}/vmlinuz.container"
|
||||
|
||||
# Install raw kernel
|
||||
vmlinux_path=vmlinux
|
||||
[ "$(uname -m)" = "s390x" ] && vmlinux_path=arch/s390/boot/compressed/vmlinux
|
||||
vmlinux_name=vmlinux-${kernel_suffix}
|
||||
cp ${vmlinux_path} ${kata_kernel_dir}/${vmlinux_name}
|
||||
ln -sf ${vmlinux_name} ${kata_kernel_dir}/vmlinux.container
|
||||
vmlinux_path="vmlinux"
|
||||
[ "${arch}" = "s390x" ] && vmlinux_path="arch/s390/boot/compressed/vmlinux"
|
||||
vmlinux_name="vmlinux-${kernel_suffix}"
|
||||
cp "${vmlinux_path}" "${kata_kernel_dir}/${vmlinux_name}"
|
||||
ln -sf "${vmlinux_name}" "${kata_kernel_dir}/vmlinux.container"
|
||||
|
||||
qemu:
|
||||
plugin: make
|
||||
@@ -249,12 +236,8 @@ parts:
|
||||
- libselinux1-dev
|
||||
- ninja-build
|
||||
override-build: |
|
||||
yq=${SNAPCRAFT_STAGE}/yq
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
export GO111MODULE="auto"
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
versions_file="${kata_dir}/versions.yaml"
|
||||
branch="$(${yq} r ${versions_file} assets.hypervisor.qemu.version)"
|
||||
url="$(${yq} r ${versions_file} assets.hypervisor.qemu.url)"
|
||||
commit=""
|
||||
@@ -262,11 +245,11 @@ parts:
|
||||
patches_version_dir="${kata_dir}/tools/packaging/qemu/patches/tag_patches/${branch}"
|
||||
|
||||
# download source
|
||||
qemu_dir=${SNAPCRAFT_STAGE}/qemu
|
||||
qemu_dir="${SNAPCRAFT_STAGE}/qemu"
|
||||
rm -rf "${qemu_dir}"
|
||||
git clone --depth 1 --branch ${branch} --single-branch ${url} "${qemu_dir}"
|
||||
cd ${qemu_dir}
|
||||
[ -z "${commit}" ] || git checkout ${commit}
|
||||
cd "${qemu_dir}"
|
||||
[ -z "${commit}" ] || git checkout "${commit}"
|
||||
|
||||
[ -n "$(ls -A ui/keycodemapdb)" ] || git clone --depth 1 https://github.com/qemu/keycodemapdb ui/keycodemapdb/
|
||||
[ -n "$(ls -A capstone)" ] || git clone --depth 1 https://github.com/qemu/capstone capstone
|
||||
@@ -277,10 +260,10 @@ parts:
|
||||
${kata_dir}/tools/packaging/scripts/apply_patches.sh "${patches_version_dir}"
|
||||
|
||||
# Only x86_64 supports libpmem
|
||||
[ "$(uname -m)" = "x86_64" ] && sudo apt-get --no-install-recommends install -y apt-utils ca-certificates libpmem-dev
|
||||
[ "${arch}" = "x86_64" ] && sudo apt-get --no-install-recommends install -y apt-utils ca-certificates libpmem-dev
|
||||
|
||||
configure_hypervisor=${kata_dir}/tools/packaging/scripts/configure-hypervisor.sh
|
||||
chmod +x ${configure_hypervisor}
|
||||
configure_hypervisor="${kata_dir}/tools/packaging/scripts/configure-hypervisor.sh"
|
||||
chmod +x "${configure_hypervisor}"
|
||||
# static build. The --prefix, --libdir, --libexecdir, --datadir arguments are
|
||||
# based on PREFIX and set by configure-hypervisor.sh
|
||||
echo "$(PREFIX=/snap/${SNAPCRAFT_PROJECT_NAME}/current/usr ${configure_hypervisor} -s kata-qemu) \
|
||||
@@ -290,17 +273,17 @@ parts:
|
||||
# Copy QEMU configurations (Kconfigs)
|
||||
case "${branch}" in
|
||||
"v5.1.0")
|
||||
cp -a ${kata_dir}/tools/packaging/qemu/default-configs/* default-configs
|
||||
cp -a "${kata_dir}"/tools/packaging/qemu/default-configs/* default-configs
|
||||
;;
|
||||
|
||||
*)
|
||||
cp -a ${kata_dir}/tools/packaging/qemu/default-configs/* configs/devices/
|
||||
cp -a "${kata_dir}"/tools/packaging/qemu/default-configs/* configs/devices/
|
||||
;;
|
||||
esac
|
||||
|
||||
# build and install
|
||||
make -j $(($(nproc)-1))
|
||||
make install DESTDIR=${SNAPCRAFT_PART_INSTALL}
|
||||
make -j $(nproc ${CI:+--ignore 1})
|
||||
make install DESTDIR="${SNAPCRAFT_PART_INSTALL}"
|
||||
prime:
|
||||
- -snap/
|
||||
- -usr/bin/qemu-ga
|
||||
@@ -316,26 +299,67 @@ parts:
|
||||
# Hack: move qemu to /
|
||||
"snap/kata-containers/current/": "./"
|
||||
|
||||
virtiofsd:
|
||||
plugin: nil
|
||||
after: [godeps, rustdeps]
|
||||
override-build: |
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
# Currently, powerpc makes use of the QEMU's C implementation.
|
||||
# The other platforms make use of the new rust virtiofsd.
|
||||
#
|
||||
# See "tools/packaging/scripts/configure-hypervisor.sh".
|
||||
if [ "${arch}" == "ppc64le" ]
|
||||
then
|
||||
echo "INFO: Building QEMU's C version of virtiofsd"
|
||||
# Handled by the 'qemu' part, so nothing more to do here.
|
||||
exit 0
|
||||
else
|
||||
echo "INFO: Building rust version of virtiofsd"
|
||||
fi
|
||||
|
||||
cd "${kata_dir}"
|
||||
|
||||
export PATH=${PATH}:${HOME}/.cargo/bin
|
||||
# Download the rust implementation of virtiofsd
|
||||
tools/packaging/static-build/virtiofsd/build-static-virtiofsd.sh
|
||||
sudo install \
|
||||
--owner='root' \
|
||||
--group='root' \
|
||||
--mode=0755 \
|
||||
-D \
|
||||
--target-directory="${SNAPCRAFT_PART_INSTALL}/usr/libexec/" \
|
||||
virtiofsd/virtiofsd
|
||||
|
||||
cloud-hypervisor:
|
||||
plugin: nil
|
||||
after: [godeps]
|
||||
override-build: |
|
||||
arch=$(uname -m)
|
||||
if [ "{$arch}" == "aarch64" ] || [ "${arch}" == "x64_64" ]; then
|
||||
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
|
||||
|
||||
if [ "${arch}" == "aarch64" ] || [ "${arch}" == "x86_64" ]; then
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install ca-certificates curl gnupg lsb-release
|
||||
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
|
||||
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
curl -fsSL https://download.docker.com/linux/ubuntu/gpg |\
|
||||
sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
|
||||
distro_codename=$(lsb_release -cs)
|
||||
echo "deb [arch=${dpkg_arch} signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu ${distro_codename} stable" |\
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install docker-ce docker-ce-cli containerd.io
|
||||
sudo systemctl start docker.socket
|
||||
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
cd ${kata_dir}
|
||||
cd "${SNAPCRAFT_PROJECT_DIR}"
|
||||
sudo -E NO_TTY=true make cloud-hypervisor-tarball
|
||||
tar xvJpf build/kata-static-cloud-hypervisor.tar.xz -C /tmp/
|
||||
install -D /tmp/opt/kata/bin/cloud-hypervisor ${SNAPCRAFT_PART_INSTALL}/usr/bin/cloud-hypervisor
|
||||
|
||||
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-cloud-hypervisor.tar.xz"
|
||||
tmpdir=$(mktemp -d)
|
||||
|
||||
tar -xvJpf "${tarfile}" -C "${tmpdir}"
|
||||
|
||||
install -D "${tmpdir}/opt/kata/bin/cloud-hypervisor" "${SNAPCRAFT_PART_INSTALL}/usr/bin/cloud-hypervisor"
|
||||
|
||||
rm -rf "${tmpdir}"
|
||||
fi
|
||||
|
||||
apps:
|
||||
|
||||
878
src/agent/Cargo.lock
generated
878
src/agent/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -7,19 +7,20 @@ edition = "2018"
|
||||
[dependencies]
|
||||
oci = { path = "../libs/oci" }
|
||||
rustjail = { path = "rustjail" }
|
||||
protocols = { path = "../libs/protocols" }
|
||||
protocols = { path = "../libs/protocols", features = ["async"] }
|
||||
lazy_static = "1.3.0"
|
||||
ttrpc = { version = "0.5.0", features = ["async", "protobuf-codec"], default-features = false }
|
||||
protobuf = "=2.14.0"
|
||||
ttrpc = { version = "0.6.0", features = ["async"], default-features = false }
|
||||
protobuf = "2.27.0"
|
||||
libc = "0.2.58"
|
||||
nix = "0.23.0"
|
||||
nix = "0.24.1"
|
||||
capctl = "0.2.0"
|
||||
serde_json = "1.0.39"
|
||||
scan_fmt = "0.2.3"
|
||||
scopeguard = "1.0.0"
|
||||
thiserror = "1.0.26"
|
||||
regex = "1.5.4"
|
||||
regex = "1.5.5"
|
||||
serial_test = "0.5.1"
|
||||
kata-sys-util = { path = "../libs/kata-sys-util" }
|
||||
sysinfo = "0.23.0"
|
||||
|
||||
# Async helpers
|
||||
@@ -32,7 +33,7 @@ tokio = { version = "1.14.0", features = ["full"] }
|
||||
tokio-vsock = "0.3.1"
|
||||
|
||||
netlink-sys = { version = "0.7.0", features = ["tokio_socket",]}
|
||||
rtnetlink = "0.8.0"
|
||||
rtnetlink = "0.11.0"
|
||||
netlink-packet-utils = "0.4.1"
|
||||
ipnetwork = "0.17.0"
|
||||
|
||||
@@ -76,3 +77,8 @@ lto = true
|
||||
|
||||
[features]
|
||||
seccomp = ["rustjail/seccomp"]
|
||||
standard-oci-runtime = ["rustjail/standard-oci-runtime"]
|
||||
|
||||
[[bin]]
|
||||
name = "kata-agent"
|
||||
path = "src/main.rs"
|
||||
|
||||
@@ -14,10 +14,6 @@ PROJECT_COMPONENT = kata-agent
|
||||
|
||||
TARGET = $(PROJECT_COMPONENT)
|
||||
|
||||
SOURCES := \
|
||||
$(shell find . 2>&1 | grep -E '.*\.rs$$') \
|
||||
Cargo.toml
|
||||
|
||||
VERSION_FILE := ./VERSION
|
||||
VERSION := $(shell grep -v ^\# $(VERSION_FILE))
|
||||
COMMIT_NO := $(shell git rev-parse HEAD 2>/dev/null || true)
|
||||
@@ -37,8 +33,16 @@ ifeq ($(SECCOMP),yes)
|
||||
override EXTRA_RUSTFEATURES += seccomp
|
||||
endif
|
||||
|
||||
##VAR STANDARD_OCI_RUNTIME=yes|no define if agent enables standard oci runtime feature
|
||||
STANDARD_OCI_RUNTIME := no
|
||||
|
||||
# Enable standard oci runtime feature of rust build
|
||||
ifeq ($(STANDARD_OCI_RUNTIME),yes)
|
||||
override EXTRA_RUSTFEATURES += standard-oci-runtime
|
||||
endif
|
||||
|
||||
ifneq ($(EXTRA_RUSTFEATURES),)
|
||||
override EXTRA_RUSTFEATURES := --features $(EXTRA_RUSTFEATURES)
|
||||
override EXTRA_RUSTFEATURES := --features "$(EXTRA_RUSTFEATURES)"
|
||||
endif
|
||||
|
||||
include ../../utils.mk
|
||||
@@ -103,20 +107,17 @@ endef
|
||||
##TARGET default: build code
|
||||
default: $(TARGET) show-header
|
||||
|
||||
$(TARGET): $(GENERATED_CODE) logging-crate-tests $(TARGET_PATH)
|
||||
$(TARGET): $(GENERATED_CODE) $(TARGET_PATH)
|
||||
|
||||
logging-crate-tests:
|
||||
make -C $(CWD)/../libs/logging
|
||||
|
||||
$(TARGET_PATH): $(SOURCES) | show-summary
|
||||
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
|
||||
$(TARGET_PATH): show-summary
|
||||
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) $(if $(findstring release,$(BUILD_TYPE)),--release) $(EXTRA_RUSTFEATURES)
|
||||
|
||||
$(GENERATED_FILES): %: %.in
|
||||
@sed $(foreach r,$(GENERATED_REPLACEMENTS),-e 's|@$r@|$($r)|g') "$<" > "$@"
|
||||
|
||||
##TARGET optimize: optimized build
|
||||
optimize: $(SOURCES) | show-summary show-header
|
||||
@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
|
||||
optimize: show-summary show-header
|
||||
@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) $(if $(findstring release,$(BUILD_TYPE)),--release) $(EXTRA_RUSTFEATURES)
|
||||
|
||||
##TARGET install: install agent
|
||||
install: install-services
|
||||
@@ -199,7 +200,6 @@ codecov-html: check_tarpaulin
|
||||
|
||||
.PHONY: \
|
||||
help \
|
||||
logging-crate-tests \
|
||||
optimize \
|
||||
show-header \
|
||||
show-summary \
|
||||
|
||||
@@ -16,17 +16,18 @@ scopeguard = "1.0.0"
|
||||
capctl = "0.2.0"
|
||||
lazy_static = "1.3.0"
|
||||
libc = "0.2.58"
|
||||
protobuf = "=2.14.0"
|
||||
protobuf = "2.27.0"
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.1.2"
|
||||
scan_fmt = "0.2.6"
|
||||
regex = "1.5.4"
|
||||
regex = "1.5.5"
|
||||
path-absolutize = "1.2.0"
|
||||
anyhow = "1.0.32"
|
||||
cgroups = { package = "cgroups-rs", version = "0.2.8" }
|
||||
rlimit = "0.5.3"
|
||||
cfg-if = "0.1.0"
|
||||
|
||||
tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] }
|
||||
tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros", "rt"] }
|
||||
futures = "0.3.17"
|
||||
async-trait = "0.1.31"
|
||||
inotify = "0.9.2"
|
||||
@@ -38,3 +39,4 @@ tempfile = "3.1.0"
|
||||
|
||||
[features]
|
||||
seccomp = ["libseccomp"]
|
||||
standard-oci-runtime = []
|
||||
|
||||
@@ -391,7 +391,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool
|
||||
|
||||
if let Some(swappiness) = memory.swappiness {
|
||||
if (0..=100).contains(&swappiness) {
|
||||
mem_controller.set_swappiness(swappiness as u64)?;
|
||||
mem_controller.set_swappiness(swappiness)?;
|
||||
} else {
|
||||
return Err(anyhow!(
|
||||
"invalid value:{}. valid memory swappiness range is 0-100",
|
||||
@@ -590,9 +590,9 @@ fn get_cpuacct_stats(cg: &cgroups::Cgroup) -> SingularPtrField<CpuUsage> {
|
||||
|
||||
let h = lines_to_map(&cpuacct.stat);
|
||||
let usage_in_usermode =
|
||||
(((*h.get("user").unwrap() * NANO_PER_SECOND) as f64) / *CLOCK_TICKS) as u64;
|
||||
(((*h.get("user").unwrap_or(&0) * NANO_PER_SECOND) as f64) / *CLOCK_TICKS) as u64;
|
||||
let usage_in_kernelmode =
|
||||
(((*h.get("system").unwrap() * NANO_PER_SECOND) as f64) / *CLOCK_TICKS) as u64;
|
||||
(((*h.get("system").unwrap_or(&0) * NANO_PER_SECOND) as f64) / *CLOCK_TICKS) as u64;
|
||||
|
||||
let total_usage = cpuacct.usage;
|
||||
|
||||
@@ -623,9 +623,9 @@ fn get_cpuacct_stats(cg: &cgroups::Cgroup) -> SingularPtrField<CpuUsage> {
|
||||
let cpu_controller: &CpuController = get_controller_or_return_singular_none!(cg);
|
||||
let stat = cpu_controller.cpu().stat;
|
||||
let h = lines_to_map(&stat);
|
||||
let usage_in_usermode = *h.get("user_usec").unwrap();
|
||||
let usage_in_kernelmode = *h.get("system_usec").unwrap();
|
||||
let total_usage = *h.get("usage_usec").unwrap();
|
||||
let usage_in_usermode = *h.get("user_usec").unwrap_or(&0);
|
||||
let usage_in_kernelmode = *h.get("system_usec").unwrap_or(&0);
|
||||
let total_usage = *h.get("usage_usec").unwrap_or(&0);
|
||||
let percpu_usage = vec![];
|
||||
|
||||
SingularPtrField::some(CpuUsage {
|
||||
@@ -911,9 +911,8 @@ pub fn get_paths() -> Result<HashMap<String, String>> {
|
||||
Ok(m)
|
||||
}
|
||||
|
||||
pub fn get_mounts() -> Result<HashMap<String, String>> {
|
||||
pub fn get_mounts(paths: &HashMap<String, String>) -> Result<HashMap<String, String>> {
|
||||
let mut m = HashMap::new();
|
||||
let paths = get_paths()?;
|
||||
|
||||
for l in fs::read_to_string(MOUNTS)?.lines() {
|
||||
let p: Vec<&str> = l.splitn(2, " - ").collect();
|
||||
@@ -951,7 +950,7 @@ impl Manager {
|
||||
let mut m = HashMap::new();
|
||||
|
||||
let paths = get_paths()?;
|
||||
let mounts = get_mounts()?;
|
||||
let mounts = get_mounts(&paths)?;
|
||||
|
||||
for key in paths.keys() {
|
||||
let mnt = mounts.get(key);
|
||||
|
||||
79
src/agent/rustjail/src/console.rs
Normal file
79
src/agent/rustjail/src/console.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Copyright 2021 Sony Group Corporation
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use nix::errno::Errno;
|
||||
use nix::pty;
|
||||
use nix::sys::{socket, uio};
|
||||
use nix::unistd::{self, dup2};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::path::Path;
|
||||
|
||||
pub fn setup_console_socket(csocket_path: &str) -> Result<Option<RawFd>> {
|
||||
if csocket_path.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let socket_fd = socket::socket(
|
||||
socket::AddressFamily::Unix,
|
||||
socket::SockType::Stream,
|
||||
socket::SockFlag::empty(),
|
||||
None,
|
||||
)?;
|
||||
|
||||
match socket::connect(
|
||||
socket_fd,
|
||||
&socket::SockAddr::Unix(socket::UnixAddr::new(Path::new(csocket_path))?),
|
||||
) {
|
||||
Ok(()) => Ok(Some(socket_fd)),
|
||||
Err(errno) => Err(anyhow!("failed to open console fd: {}", errno)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn setup_master_console(socket_fd: RawFd) -> Result<()> {
|
||||
let pseudo = pty::openpty(None, None)?;
|
||||
|
||||
let pty_name: &[u8] = b"/dev/ptmx";
|
||||
let iov = [uio::IoVec::from_slice(pty_name)];
|
||||
let fds = [pseudo.master];
|
||||
let cmsg = socket::ControlMessage::ScmRights(&fds);
|
||||
|
||||
socket::sendmsg(socket_fd, &iov, &[cmsg], socket::MsgFlags::empty(), None)?;
|
||||
|
||||
unistd::setsid()?;
|
||||
let ret = unsafe { libc::ioctl(pseudo.slave, libc::TIOCSCTTY) };
|
||||
Errno::result(ret).map_err(|e| anyhow!(e).context("ioctl TIOCSCTTY"))?;
|
||||
|
||||
dup2(pseudo.slave, std::io::stdin().as_raw_fd())?;
|
||||
dup2(pseudo.slave, std::io::stdout().as_raw_fd())?;
|
||||
dup2(pseudo.slave, std::io::stderr().as_raw_fd())?;
|
||||
|
||||
unistd::close(socket_fd)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::os::unix::net::UnixListener;
|
||||
use tempfile::{self, tempdir};
|
||||
|
||||
const CONSOLE_SOCKET: &str = "console-socket";
|
||||
|
||||
#[test]
|
||||
fn test_setup_console_socket() {
|
||||
let dir = tempdir()
|
||||
.map_err(|e| anyhow!(e).context("tempdir failed"))
|
||||
.unwrap();
|
||||
let socket_path = dir.path().join(CONSOLE_SOCKET);
|
||||
|
||||
let _listener = UnixListener::bind(&socket_path).unwrap();
|
||||
|
||||
let ret = setup_console_socket(socket_path.to_str().unwrap());
|
||||
|
||||
assert!(ret.is_ok());
|
||||
}
|
||||
}
|
||||
@@ -23,6 +23,8 @@ use crate::cgroups::fs::Manager as FsManager;
|
||||
#[cfg(test)]
|
||||
use crate::cgroups::mock::Manager as FsManager;
|
||||
use crate::cgroups::Manager;
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
use crate::console;
|
||||
use crate::log_child;
|
||||
use crate::process::Process;
|
||||
#[cfg(feature = "seccomp")]
|
||||
@@ -40,7 +42,7 @@ use nix::pty;
|
||||
use nix::sched::{self, CloneFlags};
|
||||
use nix::sys::signal::{self, Signal};
|
||||
use nix::sys::stat::{self, Mode};
|
||||
use nix::unistd::{self, fork, ForkResult, Gid, Pid, Uid};
|
||||
use nix::unistd::{self, fork, ForkResult, Gid, Pid, Uid, User};
|
||||
use std::os::unix::fs::MetadataExt;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
|
||||
@@ -62,9 +64,7 @@ use rlimit::{setrlimit, Resource, Rlim};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use crate::utils;
|
||||
|
||||
const EXEC_FIFO_FILENAME: &str = "exec.fifo";
|
||||
pub const EXEC_FIFO_FILENAME: &str = "exec.fifo";
|
||||
|
||||
const INIT: &str = "INIT";
|
||||
const NO_PIVOT: &str = "NO_PIVOT";
|
||||
@@ -74,6 +74,7 @@ const CLOG_FD: &str = "CLOG_FD";
|
||||
const FIFO_FD: &str = "FIFO_FD";
|
||||
const HOME_ENV_KEY: &str = "HOME";
|
||||
const PIDNS_FD: &str = "PIDNS_FD";
|
||||
const CONSOLE_SOCKET_FD: &str = "CONSOLE_SOCKET_FD";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ContainerStatus {
|
||||
@@ -82,7 +83,7 @@ pub struct ContainerStatus {
|
||||
}
|
||||
|
||||
impl ContainerStatus {
|
||||
fn new() -> Self {
|
||||
pub fn new() -> Self {
|
||||
ContainerStatus {
|
||||
pre_status: ContainerState::Created,
|
||||
cur_status: ContainerState::Created,
|
||||
@@ -99,6 +100,12 @@ impl ContainerStatus {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ContainerStatus {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub type Config = CreateOpts;
|
||||
type NamespaceType = String;
|
||||
|
||||
@@ -106,7 +113,7 @@ lazy_static! {
|
||||
// This locker ensures the child exit signal will be received by the right receiver.
|
||||
pub static ref WAIT_PID_LOCKER: Arc<Mutex<bool>> = Arc::new(Mutex::new(false));
|
||||
|
||||
static ref NAMESPACES: HashMap<&'static str, CloneFlags> = {
|
||||
pub static ref NAMESPACES: HashMap<&'static str, CloneFlags> = {
|
||||
let mut m = HashMap::new();
|
||||
m.insert("user", CloneFlags::CLONE_NEWUSER);
|
||||
m.insert("ipc", CloneFlags::CLONE_NEWIPC);
|
||||
@@ -119,7 +126,7 @@ lazy_static! {
|
||||
};
|
||||
|
||||
// type to name hashmap, better to be in NAMESPACES
|
||||
static ref TYPETONAME: HashMap<&'static str, &'static str> = {
|
||||
pub static ref TYPETONAME: HashMap<&'static str, &'static str> = {
|
||||
let mut m = HashMap::new();
|
||||
m.insert("ipc", "ipc");
|
||||
m.insert("user", "user");
|
||||
@@ -215,7 +222,7 @@ pub trait BaseContainer {
|
||||
async fn start(&mut self, p: Process) -> Result<()>;
|
||||
async fn run(&mut self, p: Process) -> Result<()>;
|
||||
async fn destroy(&mut self) -> Result<()>;
|
||||
fn exec(&mut self) -> Result<()>;
|
||||
async fn exec(&mut self) -> Result<()>;
|
||||
}
|
||||
|
||||
// LinuxContainer protected by Mutex
|
||||
@@ -236,6 +243,8 @@ pub struct LinuxContainer {
|
||||
pub status: ContainerStatus,
|
||||
pub created: SystemTime,
|
||||
pub logger: Logger,
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
pub console_socket: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -359,7 +368,6 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
log_child!(cfd_log, "child process start run");
|
||||
let buf = read_sync(crfd)?;
|
||||
let spec_str = std::str::from_utf8(&buf)?;
|
||||
@@ -379,6 +387,9 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
|
||||
let cm: FsManager = serde_json::from_str(cm_str)?;
|
||||
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
let csocket_fd = console::setup_console_socket(&std::env::var(CONSOLE_SOCKET_FD)?)?;
|
||||
|
||||
let p = if spec.process.is_some() {
|
||||
spec.process.as_ref().unwrap()
|
||||
} else {
|
||||
@@ -576,14 +587,20 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
|
||||
// only change stdio devices owner when user
|
||||
// isn't root.
|
||||
if guser.uid != 0 {
|
||||
set_stdio_permissions(guser.uid)?;
|
||||
if !uid.is_root() {
|
||||
set_stdio_permissions(uid)?;
|
||||
}
|
||||
|
||||
setid(uid, gid)?;
|
||||
|
||||
if !guser.additional_gids.is_empty() {
|
||||
setgroups(guser.additional_gids.as_slice()).map_err(|e| {
|
||||
let gids: Vec<Gid> = guser
|
||||
.additional_gids
|
||||
.iter()
|
||||
.map(|gid| Gid::from_raw(*gid))
|
||||
.collect();
|
||||
|
||||
unistd::setgroups(&gids).map_err(|e| {
|
||||
let _ = write_sync(
|
||||
cwfd,
|
||||
SYNC_FAILED,
|
||||
@@ -623,11 +640,6 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
capabilities::drop_privileges(cfd_log, c)?;
|
||||
}
|
||||
|
||||
if init {
|
||||
// notify parent to run poststart hooks
|
||||
write_sync(cwfd, SYNC_SUCCESS, "")?;
|
||||
}
|
||||
|
||||
let args = oci_process.args.to_vec();
|
||||
let env = oci_process.env.to_vec();
|
||||
|
||||
@@ -649,12 +661,17 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// set the "HOME" env getting from "/etc/passwd", if
|
||||
// there's no uid entry in /etc/passwd, set "/" as the
|
||||
// home env.
|
||||
if env::var_os(HOME_ENV_KEY).is_none() {
|
||||
let home_dir = utils::home_dir(guser.uid).unwrap_or_else(|_| String::from("/"));
|
||||
env::set_var(HOME_ENV_KEY, home_dir);
|
||||
// try to set "HOME" env by uid
|
||||
if let Ok(Some(user)) = User::from_uid(Uid::from_raw(guser.uid)) {
|
||||
if let Ok(user_home_dir) = user.dir.into_os_string().into_string() {
|
||||
env::set_var(HOME_ENV_KEY, user_home_dir);
|
||||
}
|
||||
}
|
||||
// set default home dir as "/" if "HOME" env is still empty
|
||||
if env::var_os(HOME_ENV_KEY).is_none() {
|
||||
env::set_var(HOME_ENV_KEY, String::from("/"));
|
||||
}
|
||||
}
|
||||
|
||||
let exec_file = Path::new(&args[0]);
|
||||
@@ -670,10 +687,19 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
let _ = unistd::close(crfd);
|
||||
let _ = unistd::close(cwfd);
|
||||
|
||||
unistd::setsid().context("create a new session")?;
|
||||
if oci_process.terminal {
|
||||
unsafe {
|
||||
libc::ioctl(0, libc::TIOCSCTTY);
|
||||
cfg_if::cfg_if! {
|
||||
if #[cfg(feature = "standard-oci-runtime")] {
|
||||
if let Some(csocket_fd) = csocket_fd {
|
||||
console::setup_master_console(csocket_fd)?;
|
||||
} else {
|
||||
return Err(anyhow!("failed to get console master socket fd"));
|
||||
}
|
||||
}
|
||||
else {
|
||||
unistd::setsid().context("create a new session")?;
|
||||
unsafe { libc::ioctl(0, libc::TIOCSCTTY) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -705,7 +731,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
// within the container to the specified user.
|
||||
// The ownership needs to match because it is created outside of
|
||||
// the container and needs to be localized.
|
||||
fn set_stdio_permissions(uid: libc::uid_t) -> Result<()> {
|
||||
fn set_stdio_permissions(uid: Uid) -> Result<()> {
|
||||
let meta = fs::metadata("/dev/null")?;
|
||||
let fds = [
|
||||
std::io::stdin().as_raw_fd(),
|
||||
@@ -720,19 +746,13 @@ fn set_stdio_permissions(uid: libc::uid_t) -> Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
// According to the POSIX specification, -1 is used to indicate that owner and group
|
||||
// are not to be changed. Since uid_t and gid_t are unsigned types, we have to wrap
|
||||
// around to get -1.
|
||||
let gid = 0u32.wrapping_sub(1);
|
||||
|
||||
// We only change the uid owner (as it is possible for the mount to
|
||||
// prefer a different gid, and there's no reason for us to change it).
|
||||
// The reason why we don't just leave the default uid=X mount setup is
|
||||
// that users expect to be able to actually use their console. Without
|
||||
// this code, you couldn't effectively run as a non-root user inside a
|
||||
// container and also have a console set up.
|
||||
let res = unsafe { libc::fchown(*fd, uid, gid) };
|
||||
Errno::result(res).map_err(|e| anyhow!(e).context("set stdio permissions failed"))?;
|
||||
unistd::fchown(*fd, Some(uid), None).with_context(|| "set stdio permissions failed")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -928,6 +948,14 @@ impl BaseContainer for LinuxContainer {
|
||||
|
||||
let exec_path = std::env::current_exe()?;
|
||||
let mut child = std::process::Command::new(exec_path);
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut console_name = PathBuf::from("");
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
if !self.console_socket.as_os_str().is_empty() {
|
||||
console_name = self.console_socket.clone();
|
||||
}
|
||||
|
||||
let mut child = child
|
||||
.arg("init")
|
||||
.stdin(child_stdin)
|
||||
@@ -937,7 +965,8 @@ impl BaseContainer for LinuxContainer {
|
||||
.env(NO_PIVOT, format!("{}", self.config.no_pivot_root))
|
||||
.env(CRFD_FD, format!("{}", crfd))
|
||||
.env(CWFD_FD, format!("{}", cwfd))
|
||||
.env(CLOG_FD, format!("{}", cfd_log));
|
||||
.env(CLOG_FD, format!("{}", cfd_log))
|
||||
.env(CONSOLE_SOCKET_FD, console_name);
|
||||
|
||||
if p.init {
|
||||
child = child.env(FIFO_FD, format!("{}", fifofd));
|
||||
@@ -1020,7 +1049,7 @@ impl BaseContainer for LinuxContainer {
|
||||
self.start(p).await?;
|
||||
|
||||
if init {
|
||||
self.exec()?;
|
||||
self.exec().await?;
|
||||
self.status.transition(ContainerState::Running);
|
||||
}
|
||||
|
||||
@@ -1032,7 +1061,19 @@ impl BaseContainer for LinuxContainer {
|
||||
let st = self.oci_state()?;
|
||||
|
||||
for pid in self.processes.keys() {
|
||||
signal::kill(Pid::from_raw(*pid), Some(Signal::SIGKILL))?;
|
||||
match signal::kill(Pid::from_raw(*pid), Some(Signal::SIGKILL)) {
|
||||
Err(Errno::ESRCH) => {
|
||||
info!(
|
||||
self.logger,
|
||||
"kill encounters ESRCH, pid: {}, container: {}",
|
||||
pid,
|
||||
self.id.clone()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(err) => return Err(anyhow!(err)),
|
||||
Ok(_) => continue,
|
||||
}
|
||||
}
|
||||
|
||||
if spec.hooks.is_some() {
|
||||
@@ -1051,12 +1092,22 @@ impl BaseContainer for LinuxContainer {
|
||||
fs::remove_dir_all(&self.root)?;
|
||||
|
||||
if let Some(cgm) = self.cgroup_manager.as_mut() {
|
||||
// Kill all of the processes created in this container to prevent
|
||||
// the leak of some daemon process when this container shared pidns
|
||||
// with the sandbox.
|
||||
let pids = cgm.get_pids().context("get cgroup pids")?;
|
||||
for i in pids {
|
||||
if let Err(e) = signal::kill(Pid::from_raw(i), Signal::SIGKILL) {
|
||||
warn!(self.logger, "kill the process {} error: {:?}", i, e);
|
||||
}
|
||||
}
|
||||
|
||||
cgm.destroy().context("destroy cgroups")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn exec(&mut self) -> Result<()> {
|
||||
async fn exec(&mut self) -> Result<()> {
|
||||
let fifo = format!("{}/{}", &self.root, EXEC_FIFO_FILENAME);
|
||||
let fd = fcntl::open(fifo.as_str(), OFlag::O_WRONLY, Mode::from_bits_truncate(0))?;
|
||||
let data: &[u8] = &[0];
|
||||
@@ -1068,6 +1119,26 @@ impl BaseContainer for LinuxContainer {
|
||||
.as_secs();
|
||||
|
||||
self.status.transition(ContainerState::Running);
|
||||
|
||||
let spec = self
|
||||
.config
|
||||
.spec
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("OCI spec was not found"))?;
|
||||
let st = self.oci_state()?;
|
||||
|
||||
// run poststart hook
|
||||
if spec.hooks.is_some() {
|
||||
info!(self.logger, "poststart hook");
|
||||
let hooks = spec
|
||||
.hooks
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("OCI hooks were not found"))?;
|
||||
for h in hooks.poststart.iter() {
|
||||
execute_hook(&self.logger, h, &st).await?;
|
||||
}
|
||||
}
|
||||
|
||||
unistd::close(fd)?;
|
||||
|
||||
Ok(())
|
||||
@@ -1110,7 +1181,7 @@ fn do_exec(args: &[String]) -> ! {
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
fn update_namespaces(logger: &Logger, spec: &mut Spec, init_pid: RawFd) -> Result<()> {
|
||||
pub fn update_namespaces(logger: &Logger, spec: &mut Spec, init_pid: RawFd) -> Result<()> {
|
||||
info!(logger, "updating namespaces");
|
||||
let linux = spec
|
||||
.linux
|
||||
@@ -1289,20 +1360,6 @@ async fn join_namespaces(
|
||||
// notify child run prestart hooks completed
|
||||
info!(logger, "notify child run prestart hook completed!");
|
||||
write_async(pipe_w, SYNC_SUCCESS, "").await?;
|
||||
|
||||
info!(logger, "notify child parent ready to run poststart hook!");
|
||||
// wait to run poststart hook
|
||||
read_async(pipe_r).await?;
|
||||
info!(logger, "get ready to run poststart hook!");
|
||||
|
||||
// run poststart hook
|
||||
if spec.hooks.is_some() {
|
||||
info!(logger, "poststart hook");
|
||||
let hooks = spec.hooks.as_ref().unwrap();
|
||||
for h in hooks.poststart.iter() {
|
||||
execute_hook(&logger, h, st).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(logger, "wait for child process ready to run exec");
|
||||
@@ -1419,14 +1476,16 @@ impl LinuxContainer {
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
logger: logger.new(o!("module" => "rustjail", "subsystem" => "container", "cid" => id)),
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
console_socket: Path::new("").to_path_buf(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn setgroups(grps: &[libc::gid_t]) -> Result<()> {
|
||||
let ret = unsafe { libc::setgroups(grps.len(), grps.as_ptr() as *const libc::gid_t) };
|
||||
Errno::result(ret).map(drop)?;
|
||||
Ok(())
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
pub fn set_console_socket(&mut self, console_socket: &Path) -> Result<()> {
|
||||
self.console_socket = console_socket.to_path_buf();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
use std::fs::OpenOptions;
|
||||
@@ -1460,7 +1519,7 @@ use std::process::Stdio;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
|
||||
pub async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
|
||||
let logger = logger.new(o!("action" => "execute-hook"));
|
||||
|
||||
let binary = PathBuf::from(h.path.as_str());
|
||||
@@ -1598,6 +1657,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::process::Process;
|
||||
use crate::skip_if_not_root;
|
||||
use nix::unistd::Uid;
|
||||
use std::fs;
|
||||
use std::os::unix::fs::MetadataExt;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
@@ -1743,7 +1803,7 @@ mod tests {
|
||||
let old_uid = meta.uid();
|
||||
|
||||
let uid = 1000;
|
||||
set_stdio_permissions(uid).unwrap();
|
||||
set_stdio_permissions(Uid::from_raw(uid)).unwrap();
|
||||
|
||||
let meta = fs::metadata("/dev/stdin").unwrap();
|
||||
assert_eq!(meta.uid(), uid);
|
||||
@@ -1755,7 +1815,7 @@ mod tests {
|
||||
assert_eq!(meta.uid(), uid);
|
||||
|
||||
// restore the uid
|
||||
set_stdio_permissions(old_uid).unwrap();
|
||||
set_stdio_permissions(Uid::from_raw(old_uid)).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -2036,9 +2096,10 @@ mod tests {
|
||||
assert!(ret.is_ok(), "Expecting Ok, Got {:?}", ret);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_linuxcontainer_exec() {
|
||||
let ret = new_linux_container_and_then(|mut c: LinuxContainer| c.exec());
|
||||
#[tokio::test]
|
||||
async fn test_linuxcontainer_exec() {
|
||||
let (c, _dir) = new_linux_container();
|
||||
let ret = c.unwrap().exec().await;
|
||||
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
|
||||
}
|
||||
|
||||
|
||||
@@ -30,6 +30,8 @@ extern crate regex;
|
||||
|
||||
pub mod capabilities;
|
||||
pub mod cgroups;
|
||||
#[cfg(feature = "standard-oci-runtime")]
|
||||
pub mod console;
|
||||
pub mod container;
|
||||
pub mod mount;
|
||||
pub mod pipestream;
|
||||
@@ -39,7 +41,6 @@ pub mod seccomp;
|
||||
pub mod specconv;
|
||||
pub mod sync;
|
||||
pub mod sync_with_async;
|
||||
pub mod utils;
|
||||
pub mod validator;
|
||||
|
||||
use std::collections::HashMap;
|
||||
@@ -265,7 +266,7 @@ pub fn resources_grpc_to_oci(res: &grpc::LinuxResources) -> oci::LinuxResources
|
||||
swap: Some(mem.Swap),
|
||||
kernel: Some(mem.Kernel),
|
||||
kernel_tcp: Some(mem.KernelTCP),
|
||||
swappiness: Some(mem.Swappiness as i64),
|
||||
swappiness: Some(mem.Swappiness),
|
||||
disable_oom_killer: Some(mem.DisableOOMKiller),
|
||||
})
|
||||
} else {
|
||||
@@ -512,6 +513,7 @@ pub fn grpc_to_oci(grpc: &grpc::Spec) -> oci::Spec {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
#[macro_export]
|
||||
macro_rules! skip_if_not_root {
|
||||
() => {
|
||||
@@ -521,4 +523,595 @@ mod tests {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Parameters:
|
||||
//
|
||||
// 1: expected Result
|
||||
// 2: actual Result
|
||||
// 3: string used to identify the test on error
|
||||
#[macro_export]
|
||||
macro_rules! assert_result {
|
||||
($expected_result:expr, $actual_result:expr, $msg:expr) => {
|
||||
if $expected_result.is_ok() {
|
||||
let expected_value = $expected_result.as_ref().unwrap();
|
||||
let actual_value = $actual_result.unwrap();
|
||||
assert!(*expected_value == actual_value, "{}", $msg);
|
||||
} else {
|
||||
assert!($actual_result.is_err(), "{}", $msg);
|
||||
|
||||
let expected_error = $expected_result.as_ref().unwrap_err();
|
||||
let expected_error_msg = format!("{:?}", expected_error);
|
||||
|
||||
let actual_error_msg = format!("{:?}", $actual_result.unwrap_err());
|
||||
|
||||
assert!(expected_error_msg == actual_error_msg, "{}", $msg);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_process_grpc_to_oci() {
|
||||
#[derive(Debug)]
|
||||
struct TestData {
|
||||
grpcproc: grpc::Process,
|
||||
result: oci::Process,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// All fields specified
|
||||
grpcproc: grpc::Process {
|
||||
Terminal: true,
|
||||
ConsoleSize: protobuf::SingularPtrField::<grpc::Box>::some(grpc::Box {
|
||||
Height: 123,
|
||||
Width: 456,
|
||||
..Default::default()
|
||||
}),
|
||||
User: protobuf::SingularPtrField::<grpc::User>::some(grpc::User {
|
||||
UID: 1234,
|
||||
GID: 5678,
|
||||
AdditionalGids: Vec::from([910, 1112]),
|
||||
Username: String::from("username"),
|
||||
..Default::default()
|
||||
}),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([String::from("env")])),
|
||||
Cwd: String::from("cwd"),
|
||||
Capabilities: protobuf::SingularPtrField::some(grpc::LinuxCapabilities {
|
||||
Bounding: protobuf::RepeatedField::from(Vec::from([String::from("bnd")])),
|
||||
Effective: protobuf::RepeatedField::from(Vec::from([String::from("eff")])),
|
||||
Inheritable: protobuf::RepeatedField::from(Vec::from([String::from(
|
||||
"inher",
|
||||
)])),
|
||||
Permitted: protobuf::RepeatedField::from(Vec::from([String::from("perm")])),
|
||||
Ambient: protobuf::RepeatedField::from(Vec::from([String::from("amb")])),
|
||||
..Default::default()
|
||||
}),
|
||||
Rlimits: protobuf::RepeatedField::from(Vec::from([
|
||||
grpc::POSIXRlimit {
|
||||
Type: String::from("r#type"),
|
||||
Hard: 123,
|
||||
Soft: 456,
|
||||
..Default::default()
|
||||
},
|
||||
grpc::POSIXRlimit {
|
||||
Type: String::from("r#type2"),
|
||||
Hard: 789,
|
||||
Soft: 1011,
|
||||
..Default::default()
|
||||
},
|
||||
])),
|
||||
NoNewPrivileges: true,
|
||||
ApparmorProfile: String::from("apparmor profile"),
|
||||
OOMScoreAdj: 123456,
|
||||
SelinuxLabel: String::from("Selinux Label"),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Process {
|
||||
terminal: true,
|
||||
console_size: Some(oci::Box {
|
||||
height: 123,
|
||||
width: 456,
|
||||
}),
|
||||
user: oci::User {
|
||||
uid: 1234,
|
||||
gid: 5678,
|
||||
additional_gids: Vec::from([910, 1112]),
|
||||
username: String::from("username"),
|
||||
},
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env")]),
|
||||
cwd: String::from("cwd"),
|
||||
capabilities: Some(oci::LinuxCapabilities {
|
||||
bounding: Vec::from([String::from("bnd")]),
|
||||
effective: Vec::from([String::from("eff")]),
|
||||
inheritable: Vec::from([String::from("inher")]),
|
||||
permitted: Vec::from([String::from("perm")]),
|
||||
ambient: Vec::from([String::from("amb")]),
|
||||
}),
|
||||
rlimits: Vec::from([
|
||||
oci::PosixRlimit {
|
||||
r#type: String::from("r#type"),
|
||||
hard: 123,
|
||||
soft: 456,
|
||||
},
|
||||
oci::PosixRlimit {
|
||||
r#type: String::from("r#type2"),
|
||||
hard: 789,
|
||||
soft: 1011,
|
||||
},
|
||||
]),
|
||||
no_new_privileges: true,
|
||||
apparmor_profile: String::from("apparmor profile"),
|
||||
oom_score_adj: Some(123456),
|
||||
selinux_label: String::from("Selinux Label"),
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// None ConsoleSize
|
||||
grpcproc: grpc::Process {
|
||||
ConsoleSize: protobuf::SingularPtrField::<grpc::Box>::none(),
|
||||
OOMScoreAdj: 0,
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Process {
|
||||
console_size: None,
|
||||
oom_score_adj: Some(0),
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// None User
|
||||
grpcproc: grpc::Process {
|
||||
User: protobuf::SingularPtrField::<grpc::User>::none(),
|
||||
OOMScoreAdj: 0,
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Process {
|
||||
user: oci::User {
|
||||
uid: 0,
|
||||
gid: 0,
|
||||
additional_gids: vec![],
|
||||
username: String::from(""),
|
||||
},
|
||||
oom_score_adj: Some(0),
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// None Capabilities
|
||||
grpcproc: grpc::Process {
|
||||
Capabilities: protobuf::SingularPtrField::none(),
|
||||
OOMScoreAdj: 0,
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Process {
|
||||
capabilities: None,
|
||||
oom_score_adj: Some(0),
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = process_grpc_to_oci(&d.grpcproc);
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
assert_eq!(d.result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_root_grpc_to_oci() {
|
||||
#[derive(Debug)]
|
||||
struct TestData {
|
||||
grpcroot: grpc::Root,
|
||||
result: oci::Root,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// Default fields
|
||||
grpcroot: grpc::Root {
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Root {
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// Specified fields, readonly false
|
||||
grpcroot: grpc::Root {
|
||||
Path: String::from("path"),
|
||||
Readonly: false,
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Root {
|
||||
path: String::from("path"),
|
||||
readonly: false,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// Specified fields, readonly true
|
||||
grpcroot: grpc::Root {
|
||||
Path: String::from("path"),
|
||||
Readonly: true,
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Root {
|
||||
path: String::from("path"),
|
||||
readonly: true,
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = root_grpc_to_oci(&d.grpcroot);
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
assert_eq!(d.result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hooks_grpc_to_oci() {
|
||||
#[derive(Debug)]
|
||||
struct TestData {
|
||||
grpchooks: grpc::Hooks,
|
||||
result: oci::Hooks,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// Default fields
|
||||
grpchooks: grpc::Hooks {
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Hooks {
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// All specified
|
||||
grpchooks: grpc::Hooks {
|
||||
Prestart: protobuf::RepeatedField::from(Vec::from([
|
||||
grpc::Hook {
|
||||
Path: String::from("prestartpath"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
},
|
||||
grpc::Hook {
|
||||
Path: String::from("prestartpath2"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg3"),
|
||||
String::from("arg4"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env3"),
|
||||
String::from("env4"),
|
||||
])),
|
||||
Timeout: 25,
|
||||
..Default::default()
|
||||
},
|
||||
])),
|
||||
Poststart: protobuf::RepeatedField::from(Vec::from([grpc::Hook {
|
||||
Path: String::from("poststartpath"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
}])),
|
||||
Poststop: protobuf::RepeatedField::from(Vec::from([grpc::Hook {
|
||||
Path: String::from("poststoppath"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
}])),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Hooks {
|
||||
prestart: Vec::from([
|
||||
oci::Hook {
|
||||
path: String::from("prestartpath"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
},
|
||||
oci::Hook {
|
||||
path: String::from("prestartpath2"),
|
||||
args: Vec::from([String::from("arg3"), String::from("arg4")]),
|
||||
env: Vec::from([String::from("env3"), String::from("env4")]),
|
||||
timeout: Some(25),
|
||||
},
|
||||
]),
|
||||
poststart: Vec::from([oci::Hook {
|
||||
path: String::from("poststartpath"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
}]),
|
||||
poststop: Vec::from([oci::Hook {
|
||||
path: String::from("poststoppath"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
}]),
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
// Prestart empty
|
||||
grpchooks: grpc::Hooks {
|
||||
Prestart: protobuf::RepeatedField::from(Vec::from([])),
|
||||
Poststart: protobuf::RepeatedField::from(Vec::from([grpc::Hook {
|
||||
Path: String::from("poststartpath"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
}])),
|
||||
Poststop: protobuf::RepeatedField::from(Vec::from([grpc::Hook {
|
||||
Path: String::from("poststoppath"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
}])),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Hooks {
|
||||
prestart: Vec::from([]),
|
||||
poststart: Vec::from([oci::Hook {
|
||||
path: String::from("poststartpath"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
}]),
|
||||
poststop: Vec::from([oci::Hook {
|
||||
path: String::from("poststoppath"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
}]),
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = hooks_grpc_to_oci(&d.grpchooks);
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
assert_eq!(d.result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mount_grpc_to_oci() {
|
||||
#[derive(Debug)]
|
||||
struct TestData {
|
||||
grpcmount: grpc::Mount,
|
||||
result: oci::Mount,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// Default fields
|
||||
grpcmount: grpc::Mount {
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Mount {
|
||||
..Default::default()
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
grpcmount: grpc::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
field_type: String::from("fieldtype"),
|
||||
options: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("option1"),
|
||||
String::from("option2"),
|
||||
])),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
r#type: String::from("fieldtype"),
|
||||
options: Vec::from([String::from("option1"), String::from("option2")]),
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
grpcmount: grpc::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
field_type: String::from("fieldtype"),
|
||||
options: protobuf::RepeatedField::from(Vec::new()),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
r#type: String::from("fieldtype"),
|
||||
options: Vec::new(),
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
grpcmount: grpc::Mount {
|
||||
destination: String::new(),
|
||||
source: String::from("source"),
|
||||
field_type: String::from("fieldtype"),
|
||||
options: protobuf::RepeatedField::from(Vec::from([String::from("option1")])),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Mount {
|
||||
destination: String::new(),
|
||||
source: String::from("source"),
|
||||
r#type: String::from("fieldtype"),
|
||||
options: Vec::from([String::from("option1")]),
|
||||
},
|
||||
},
|
||||
TestData {
|
||||
grpcmount: grpc::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
field_type: String::new(),
|
||||
options: protobuf::RepeatedField::from(Vec::from([String::from("option1")])),
|
||||
..Default::default()
|
||||
},
|
||||
result: oci::Mount {
|
||||
destination: String::from("destination"),
|
||||
source: String::from("source"),
|
||||
r#type: String::new(),
|
||||
options: Vec::from([String::from("option1")]),
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = mount_grpc_to_oci(&d.grpcmount);
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
assert_eq!(d.result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hook_grpc_to_oci<'a>() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
grpchook: &'a [grpc::Hook],
|
||||
result: Vec<oci::Hook>,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// Default fields
|
||||
grpchook: &[
|
||||
grpc::Hook {
|
||||
Timeout: 0,
|
||||
..Default::default()
|
||||
},
|
||||
grpc::Hook {
|
||||
Timeout: 0,
|
||||
..Default::default()
|
||||
},
|
||||
],
|
||||
result: vec![
|
||||
oci::Hook {
|
||||
timeout: Some(0),
|
||||
..Default::default()
|
||||
},
|
||||
oci::Hook {
|
||||
timeout: Some(0),
|
||||
..Default::default()
|
||||
},
|
||||
],
|
||||
},
|
||||
TestData {
|
||||
// Specified fields
|
||||
grpchook: &[
|
||||
grpc::Hook {
|
||||
Path: String::from("path"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg1"),
|
||||
String::from("arg2"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env1"),
|
||||
String::from("env2"),
|
||||
])),
|
||||
Timeout: 10,
|
||||
..Default::default()
|
||||
},
|
||||
grpc::Hook {
|
||||
Path: String::from("path2"),
|
||||
Args: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("arg3"),
|
||||
String::from("arg4"),
|
||||
])),
|
||||
Env: protobuf::RepeatedField::from(Vec::from([
|
||||
String::from("env3"),
|
||||
String::from("env4"),
|
||||
])),
|
||||
Timeout: 20,
|
||||
..Default::default()
|
||||
},
|
||||
],
|
||||
result: vec![
|
||||
oci::Hook {
|
||||
path: String::from("path"),
|
||||
args: Vec::from([String::from("arg1"), String::from("arg2")]),
|
||||
env: Vec::from([String::from("env1"), String::from("env2")]),
|
||||
timeout: Some(10),
|
||||
},
|
||||
oci::Hook {
|
||||
path: String::from("path2"),
|
||||
args: Vec::from([String::from("arg3"), String::from("arg4")]),
|
||||
env: Vec::from([String::from("env3"), String::from("env4")]),
|
||||
timeout: Some(20),
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = hook_grpc_to_oci(d.grpchook);
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
assert_eq!(d.result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,16 +32,21 @@ use crate::log_child;
|
||||
|
||||
// Info reveals information about a particular mounted filesystem. This
|
||||
// struct is populated from the content in the /proc/<pid>/mountinfo file.
|
||||
#[derive(std::fmt::Debug)]
|
||||
#[derive(std::fmt::Debug, PartialEq)]
|
||||
pub struct Info {
|
||||
mount_point: String,
|
||||
optional: String,
|
||||
fstype: String,
|
||||
}
|
||||
|
||||
const MOUNTINFOFORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}";
|
||||
const MOUNTINFO_FORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}";
|
||||
const MOUNTINFO_PATH: &str = "/proc/self/mountinfo";
|
||||
const PROC_PATH: &str = "/proc";
|
||||
|
||||
const ERR_FAILED_PARSE_MOUNTINFO: &str = "failed to parse mountinfo file";
|
||||
const ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS: &str =
|
||||
"failed to parse final fields in mountinfo file";
|
||||
|
||||
// since libc didn't defined this const for musl, thus redefined it here.
|
||||
#[cfg(all(target_os = "linux", target_env = "gnu", not(target_arch = "s390x")))]
|
||||
const PROC_SUPER_MAGIC: libc::c_long = 0x00009fa0;
|
||||
@@ -518,7 +523,7 @@ pub fn pivot_rootfs<P: ?Sized + NixPath + std::fmt::Debug>(path: &P) -> Result<(
|
||||
}
|
||||
|
||||
fn rootfs_parent_mount_private(path: &str) -> Result<()> {
|
||||
let mount_infos = parse_mount_table()?;
|
||||
let mount_infos = parse_mount_table(MOUNTINFO_PATH)?;
|
||||
|
||||
let mut max_len = 0;
|
||||
let mut mount_point = String::from("");
|
||||
@@ -546,8 +551,8 @@ fn rootfs_parent_mount_private(path: &str) -> Result<()> {
|
||||
|
||||
// Parse /proc/self/mountinfo because comparing Dev and ino does not work from
|
||||
// bind mounts
|
||||
fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
let file = File::open("/proc/self/mountinfo")?;
|
||||
fn parse_mount_table(mountinfo_path: &str) -> Result<Vec<Info>> {
|
||||
let file = File::open(mountinfo_path)?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut infos = Vec::new();
|
||||
|
||||
@@ -569,7 +574,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
|
||||
let (_id, _parent, _major, _minor, _root, mount_point, _opts, optional) = scan_fmt!(
|
||||
&line,
|
||||
MOUNTINFOFORMAT,
|
||||
MOUNTINFO_FORMAT,
|
||||
i32,
|
||||
i32,
|
||||
i32,
|
||||
@@ -578,12 +583,17 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
String,
|
||||
String,
|
||||
String
|
||||
)?;
|
||||
)
|
||||
.map_err(|_| anyhow!(ERR_FAILED_PARSE_MOUNTINFO))?;
|
||||
|
||||
let fields: Vec<&str> = line.split(" - ").collect();
|
||||
if fields.len() == 2 {
|
||||
let (fstype, _source, _vfs_opts) =
|
||||
scan_fmt!(fields[1], "{} {} {}", String, String, String)?;
|
||||
let final_fields: Vec<&str> = fields[1].split_whitespace().collect();
|
||||
|
||||
if final_fields.len() != 3 {
|
||||
return Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS));
|
||||
}
|
||||
let fstype = final_fields[0].to_string();
|
||||
|
||||
let mut optional_new = String::new();
|
||||
if optional != "-" {
|
||||
@@ -598,7 +608,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
|
||||
infos.push(info);
|
||||
} else {
|
||||
return Err(anyhow!("failed to parse mount info file".to_string()));
|
||||
return Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -619,7 +629,7 @@ fn chroot<P: ?Sized + NixPath>(_path: &P) -> Result<(), nix::Error> {
|
||||
|
||||
pub fn ms_move_root(rootfs: &str) -> Result<bool> {
|
||||
unistd::chdir(rootfs)?;
|
||||
let mount_infos = parse_mount_table()?;
|
||||
let mount_infos = parse_mount_table(MOUNTINFO_PATH)?;
|
||||
|
||||
let root_path = Path::new(rootfs);
|
||||
let abs_root_buf = root_path.absolutize()?;
|
||||
@@ -770,18 +780,31 @@ fn mount_from(
|
||||
Path::new(&dest).parent().unwrap()
|
||||
};
|
||||
|
||||
let _ = fs::create_dir_all(&dir).map_err(|e| {
|
||||
fs::create_dir_all(&dir).map_err(|e| {
|
||||
log_child!(
|
||||
cfd_log,
|
||||
"create dir {}: {}",
|
||||
dir.to_str().unwrap(),
|
||||
e.to_string()
|
||||
)
|
||||
});
|
||||
);
|
||||
e
|
||||
})?;
|
||||
|
||||
// make sure file exists so we can bind over it
|
||||
if !src.is_dir() {
|
||||
let _ = OpenOptions::new().create(true).write(true).open(&dest);
|
||||
let _ = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&dest)
|
||||
.map_err(|e| {
|
||||
log_child!(
|
||||
cfd_log,
|
||||
"open/create dest error. {}: {:?}",
|
||||
dest.as_str(),
|
||||
e
|
||||
);
|
||||
e
|
||||
})?;
|
||||
}
|
||||
src.to_str().unwrap().to_string()
|
||||
} else {
|
||||
@@ -794,8 +817,10 @@ fn mount_from(
|
||||
}
|
||||
};
|
||||
|
||||
let _ = stat::stat(dest.as_str())
|
||||
.map_err(|e| log_child!(cfd_log, "dest stat error. {}: {:?}", dest.as_str(), e));
|
||||
let _ = stat::stat(dest.as_str()).map_err(|e| {
|
||||
log_child!(cfd_log, "dest stat error. {}: {:?}", dest.as_str(), e);
|
||||
e
|
||||
})?;
|
||||
|
||||
mount(
|
||||
Some(src.as_str()),
|
||||
@@ -1046,10 +1071,12 @@ fn readonly_path(path: &str) -> Result<()> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::assert_result;
|
||||
use crate::skip_if_not_root;
|
||||
use std::fs::create_dir;
|
||||
use std::fs::create_dir_all;
|
||||
use std::fs::remove_dir_all;
|
||||
use std::io;
|
||||
use std::os::unix::fs;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use tempfile::tempdir;
|
||||
@@ -1286,6 +1313,113 @@ mod tests {
|
||||
let ret = stat::stat(path);
|
||||
assert!(ret.is_ok(), "Should pass. Got: {:?}", ret);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mount_from() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
source: &'a str,
|
||||
destination: &'a str,
|
||||
r#type: &'a str,
|
||||
flags: MsFlags,
|
||||
error_contains: &'a str,
|
||||
|
||||
// if true, a directory will be created at path in source
|
||||
make_source_directory: bool,
|
||||
// if true, a file will be created at path in source
|
||||
make_source_file: bool,
|
||||
}
|
||||
|
||||
impl Default for TestData<'_> {
|
||||
fn default() -> Self {
|
||||
TestData {
|
||||
source: "tmp",
|
||||
destination: "dest",
|
||||
r#type: "tmpfs",
|
||||
flags: MsFlags::empty(),
|
||||
error_contains: "",
|
||||
make_source_directory: true,
|
||||
make_source_file: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
flags: MsFlags::MS_BIND,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
r#type: "bind",
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
r#type: "cgroup2",
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
r#type: "bind",
|
||||
make_source_directory: false,
|
||||
error_contains: &format!("{}", std::io::Error::from_raw_os_error(libc::ENOENT)),
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
r#type: "bind",
|
||||
make_source_directory: false,
|
||||
make_source_file: true,
|
||||
..Default::default()
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
let tempdir = tempdir().unwrap();
|
||||
|
||||
let (rfd, wfd) = unistd::pipe2(OFlag::O_CLOEXEC).unwrap();
|
||||
defer!({
|
||||
unistd::close(rfd).unwrap();
|
||||
unistd::close(wfd).unwrap();
|
||||
});
|
||||
|
||||
let source_path = tempdir.path().join(d.source).to_str().unwrap().to_string();
|
||||
if d.make_source_directory {
|
||||
std::fs::create_dir_all(&source_path).unwrap();
|
||||
} else if d.make_source_file {
|
||||
std::fs::write(&source_path, []).unwrap();
|
||||
}
|
||||
|
||||
let mount = Mount {
|
||||
source: source_path,
|
||||
destination: d.destination.to_string(),
|
||||
r#type: d.r#type.to_string(),
|
||||
options: vec![],
|
||||
};
|
||||
|
||||
let result = mount_from(
|
||||
wfd,
|
||||
&mount,
|
||||
tempdir.path().to_str().unwrap(),
|
||||
d.flags,
|
||||
"",
|
||||
"",
|
||||
);
|
||||
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
|
||||
if d.error_contains.is_empty() {
|
||||
assert!(result.is_ok(), "{}", msg);
|
||||
} else {
|
||||
assert!(result.is_err(), "{}", msg);
|
||||
|
||||
let error_msg = format!("{}", result.unwrap_err());
|
||||
assert!(error_msg.contains(d.error_contains), "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_check_proc_mount() {
|
||||
let mount = oci::Mount {
|
||||
@@ -1401,6 +1535,121 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_mount_table() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
mountinfo_data: Option<&'a str>,
|
||||
result: Result<Vec<Info>>,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
mountinfo_data: Some(
|
||||
"22 933 0:20 / /sys rw,nodev shared:2 - sysfs sysfs rw,noexec",
|
||||
),
|
||||
result: Ok(vec![Info {
|
||||
mount_point: "/sys".to_string(),
|
||||
optional: "shared:2".to_string(),
|
||||
fstype: "sysfs".to_string(),
|
||||
}]),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some(
|
||||
r#"22 933 0:20 / /sys rw,nodev - sysfs sysfs rw,noexec
|
||||
81 13 1:2 / /tmp/dir rw shared:2 - tmpfs tmpfs rw"#,
|
||||
),
|
||||
result: Ok(vec![
|
||||
Info {
|
||||
mount_point: "/sys".to_string(),
|
||||
optional: "".to_string(),
|
||||
fstype: "sysfs".to_string(),
|
||||
},
|
||||
Info {
|
||||
mount_point: "/tmp/dir".to_string(),
|
||||
optional: "shared:2".to_string(),
|
||||
fstype: "tmpfs".to_string(),
|
||||
},
|
||||
]),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some(
|
||||
"22 933 0:20 /foo\040-\040bar /sys rw,nodev shared:2 - sysfs sysfs rw,noexec",
|
||||
),
|
||||
result: Ok(vec![Info {
|
||||
mount_point: "/sys".to_string(),
|
||||
optional: "shared:2".to_string(),
|
||||
fstype: "sysfs".to_string(),
|
||||
}]),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some(""),
|
||||
result: Ok(vec![]),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("invalid line data - sysfs sysfs rw"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("22 96 0:21 / /sys rw,noexec - sysfs"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("22 96 0:21 / /sys rw,noexec - sysfs sysfs rw rw"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("22 96 0:21 / /sys rw,noexec shared:2 - x - x"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("-"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("--"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some("- -"),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some(" - "),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: Some(
|
||||
r#"22 933 0:20 / /sys rw,nodev - sysfs sysfs rw,noexec
|
||||
invalid line
|
||||
81 13 1:2 / /tmp/dir rw shared:2 - tmpfs tmpfs rw"#,
|
||||
),
|
||||
result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)),
|
||||
},
|
||||
TestData {
|
||||
mountinfo_data: None,
|
||||
result: Err(anyhow!(io::Error::from_raw_os_error(libc::ENOENT))),
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let tempdir = tempdir().unwrap();
|
||||
let mountinfo_path = tempdir.path().join("mountinfo");
|
||||
|
||||
if let Some(mountinfo_data) = d.mountinfo_data {
|
||||
std::fs::write(&mountinfo_path, mountinfo_data).unwrap();
|
||||
}
|
||||
|
||||
let result = parse_mount_table(mountinfo_path.to_str().unwrap());
|
||||
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
|
||||
assert_result!(d.result, result, msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dev_rel_path() {
|
||||
// Valid device paths
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
use libc::pid_t;
|
||||
use std::fs::File;
|
||||
use std::os::unix::io::RawFd;
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use nix::errno::Errno;
|
||||
@@ -28,7 +28,6 @@ macro_rules! close_process_stream {
|
||||
($self: ident, $stream:ident, $stream_type: ident) => {
|
||||
if $self.$stream.is_some() {
|
||||
$self.close_stream(StreamType::$stream_type);
|
||||
let _ = unistd::close($self.$stream.unwrap());
|
||||
$self.$stream = None;
|
||||
}
|
||||
};
|
||||
@@ -137,19 +136,25 @@ impl Process {
|
||||
info!(logger, "before create console socket!");
|
||||
|
||||
if !p.tty {
|
||||
info!(logger, "created console socket!");
|
||||
if cfg!(feature = "standard-oci-runtime") {
|
||||
p.stdin = Some(std::io::stdin().as_raw_fd());
|
||||
p.stdout = Some(std::io::stdout().as_raw_fd());
|
||||
p.stderr = Some(std::io::stderr().as_raw_fd());
|
||||
} else {
|
||||
info!(logger, "created console socket!");
|
||||
|
||||
let (stdin, pstdin) = unistd::pipe2(OFlag::O_CLOEXEC)?;
|
||||
p.parent_stdin = Some(pstdin);
|
||||
p.stdin = Some(stdin);
|
||||
let (stdin, pstdin) = unistd::pipe2(OFlag::O_CLOEXEC)?;
|
||||
p.parent_stdin = Some(pstdin);
|
||||
p.stdin = Some(stdin);
|
||||
|
||||
let (pstdout, stdout) = create_extended_pipe(OFlag::O_CLOEXEC, pipe_size)?;
|
||||
p.parent_stdout = Some(pstdout);
|
||||
p.stdout = Some(stdout);
|
||||
let (pstdout, stdout) = create_extended_pipe(OFlag::O_CLOEXEC, pipe_size)?;
|
||||
p.parent_stdout = Some(pstdout);
|
||||
p.stdout = Some(stdout);
|
||||
|
||||
let (pstderr, stderr) = create_extended_pipe(OFlag::O_CLOEXEC, pipe_size)?;
|
||||
p.parent_stderr = Some(pstderr);
|
||||
p.stderr = Some(stderr);
|
||||
let (pstderr, stderr) = create_extended_pipe(OFlag::O_CLOEXEC, pipe_size)?;
|
||||
p.parent_stderr = Some(pstderr);
|
||||
p.stderr = Some(stderr);
|
||||
}
|
||||
}
|
||||
Ok(p)
|
||||
}
|
||||
@@ -219,7 +224,7 @@ impl Process {
|
||||
Some(writer)
|
||||
}
|
||||
|
||||
pub fn close_stream(&mut self, stream_type: StreamType) {
|
||||
fn close_stream(&mut self, stream_type: StreamType) {
|
||||
let _ = self.readers.remove(&stream_type);
|
||||
let _ = self.writers.remove(&stream_type);
|
||||
}
|
||||
@@ -284,5 +289,11 @@ mod tests {
|
||||
// group of the calling process.
|
||||
process.pid = 0;
|
||||
assert!(process.signal(libc::SIGCONT).is_ok());
|
||||
|
||||
if cfg!(feature = "standard-oci-runtime") {
|
||||
assert_eq!(process.stdin.unwrap(), std::io::stdin().as_raw_fd());
|
||||
assert_eq!(process.stdout.unwrap(), std::io::stdout().as_raw_fd());
|
||||
assert_eq!(process.stderr.unwrap(), std::io::stderr().as_raw_fd());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
use oci::Spec;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone)]
|
||||
pub struct CreateOpts {
|
||||
pub cgroup_name: String,
|
||||
pub use_systemd_cgroup: bool,
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
// Copyright (c) 2021 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use libc::gid_t;
|
||||
use libc::uid_t;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
|
||||
const PASSWD_FILE: &str = "/etc/passwd";
|
||||
|
||||
// An entry from /etc/passwd
|
||||
#[derive(Debug, PartialEq, PartialOrd)]
|
||||
pub struct PasswdEntry {
|
||||
// username
|
||||
pub name: String,
|
||||
// user password
|
||||
pub passwd: String,
|
||||
// user id
|
||||
pub uid: uid_t,
|
||||
// group id
|
||||
pub gid: gid_t,
|
||||
// user Information
|
||||
pub gecos: String,
|
||||
// home directory
|
||||
pub dir: String,
|
||||
// User's Shell
|
||||
pub shell: String,
|
||||
}
|
||||
|
||||
// get an entry for a given `uid` from `/etc/passwd`
|
||||
fn get_entry_by_uid(uid: uid_t, path: &str) -> Result<PasswdEntry> {
|
||||
let file = File::open(path).with_context(|| format!("open file {}", path))?;
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
let mut line = String::new();
|
||||
loop {
|
||||
line.clear();
|
||||
match reader.read_line(&mut line) {
|
||||
Ok(0) => return Err(anyhow!(format!("file {} is empty", path))),
|
||||
Ok(_) => (),
|
||||
Err(e) => {
|
||||
return Err(anyhow!(format!(
|
||||
"failed to read file {} with {:?}",
|
||||
path, e
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
if line.starts_with('#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = line.split(':').map(|part| part.trim()).collect();
|
||||
if parts.len() != 7 {
|
||||
continue;
|
||||
}
|
||||
|
||||
match parts[2].parse() {
|
||||
Err(_e) => continue,
|
||||
Ok(new_uid) => {
|
||||
if uid != new_uid {
|
||||
continue;
|
||||
}
|
||||
|
||||
let entry = PasswdEntry {
|
||||
name: parts[0].to_string(),
|
||||
passwd: parts[1].to_string(),
|
||||
uid: new_uid,
|
||||
gid: parts[3].parse().unwrap_or(0),
|
||||
gecos: parts[4].to_string(),
|
||||
dir: parts[5].to_string(),
|
||||
shell: parts[6].to_string(),
|
||||
};
|
||||
|
||||
return Ok(entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn home_dir(uid: uid_t) -> Result<String> {
|
||||
get_entry_by_uid(uid, PASSWD_FILE).map(|entry| entry.dir)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::io::Write;
|
||||
use tempfile::Builder;
|
||||
|
||||
#[test]
|
||||
fn test_get_entry_by_uid() {
|
||||
let tmpdir = Builder::new().tempdir().unwrap();
|
||||
let tmpdir_path = tmpdir.path().to_str().unwrap();
|
||||
let temp_passwd = format!("{}/passwd", tmpdir_path);
|
||||
|
||||
let mut tempf = File::create(temp_passwd.as_str()).unwrap();
|
||||
let passwd_entries = "root:x:0:0:root:/root0:/bin/bash
|
||||
root:x:1:0:root:/root1:/bin/bash
|
||||
#root:x:1:0:root:/rootx:/bin/bash
|
||||
root:x:2:0:root:/root2:/bin/bash
|
||||
root:x:3:0:root:/root3
|
||||
root:x:3:0:root:/root3:/bin/bash";
|
||||
writeln!(tempf, "{}", passwd_entries).unwrap();
|
||||
|
||||
let entry = get_entry_by_uid(0, temp_passwd.as_str()).unwrap();
|
||||
assert_eq!(entry.dir.as_str(), "/root0");
|
||||
|
||||
let entry = get_entry_by_uid(1, temp_passwd.as_str()).unwrap();
|
||||
assert_eq!(entry.dir.as_str(), "/root1");
|
||||
|
||||
let entry = get_entry_by_uid(2, temp_passwd.as_str()).unwrap();
|
||||
assert_eq!(entry.dir.as_str(), "/root2");
|
||||
|
||||
let entry = get_entry_by_uid(3, temp_passwd.as_str()).unwrap();
|
||||
assert_eq!(entry.dir.as_str(), "/root3");
|
||||
}
|
||||
}
|
||||
@@ -432,6 +432,8 @@ fn get_container_pipe_size(param: &str) -> Result<i32> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::assert_result;
|
||||
|
||||
use super::*;
|
||||
use anyhow::anyhow;
|
||||
use std::fs::File;
|
||||
@@ -439,32 +441,6 @@ mod tests {
|
||||
use std::time;
|
||||
use tempfile::tempdir;
|
||||
|
||||
// Parameters:
|
||||
//
|
||||
// 1: expected Result
|
||||
// 2: actual Result
|
||||
// 3: string used to identify the test on error
|
||||
macro_rules! assert_result {
|
||||
($expected_result:expr, $actual_result:expr, $msg:expr) => {
|
||||
if $expected_result.is_ok() {
|
||||
let expected_level = $expected_result.as_ref().unwrap();
|
||||
let actual_level = $actual_result.unwrap();
|
||||
assert!(*expected_level == actual_level, "{}", $msg);
|
||||
} else {
|
||||
let expected_error = $expected_result.as_ref().unwrap_err();
|
||||
let expected_error_msg = format!("{:?}", expected_error);
|
||||
|
||||
if let Err(actual_error) = $actual_result {
|
||||
let actual_error_msg = format!("{:?}", actual_error);
|
||||
|
||||
assert!(expected_error_msg == actual_error_msg, "{}", $msg);
|
||||
} else {
|
||||
assert!(expected_error_msg == "expected error, got OK", "{}", $msg);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new() {
|
||||
let config: AgentConfig = Default::default();
|
||||
|
||||
@@ -9,7 +9,7 @@ use anyhow::{anyhow, Result};
|
||||
use nix::fcntl::{self, FcntlArg, FdFlag, OFlag};
|
||||
use nix::libc::{STDERR_FILENO, STDIN_FILENO, STDOUT_FILENO};
|
||||
use nix::pty::{openpty, OpenptyResult};
|
||||
use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
|
||||
use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr};
|
||||
use nix::sys::stat::Mode;
|
||||
use nix::sys::wait;
|
||||
use nix::unistd::{self, close, dup2, fork, setsid, ForkResult, Pid};
|
||||
@@ -67,7 +67,7 @@ pub async fn debug_console_handler(
|
||||
SockFlag::SOCK_CLOEXEC,
|
||||
None,
|
||||
)?;
|
||||
let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, port);
|
||||
let addr = VsockAddr::new(libc::VMADDR_CID_ANY, port);
|
||||
socket::bind(listenfd, &addr)?;
|
||||
socket::listen(listenfd, 1)?;
|
||||
|
||||
|
||||
@@ -22,12 +22,11 @@ extern crate slog;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use clap::{AppSettings, Parser};
|
||||
use nix::fcntl::OFlag;
|
||||
use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
|
||||
use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr};
|
||||
use nix::unistd::{self, dup, Pid};
|
||||
use std::env;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs::{self, File};
|
||||
use std::os::unix::ffi::OsStrExt;
|
||||
use std::os::unix::fs as unixfs;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::path::Path;
|
||||
@@ -111,10 +110,6 @@ enum SubCommand {
|
||||
fn announce(logger: &Logger, config: &AgentConfig) {
|
||||
info!(logger, "announce";
|
||||
"agent-commit" => version::VERSION_COMMIT,
|
||||
|
||||
// Avoid any possibility of confusion with the old agent
|
||||
"agent-type" => "rust",
|
||||
|
||||
"agent-version" => version::AGENT_VERSION,
|
||||
"api-version" => version::API_VERSION,
|
||||
"config" => format!("{:?}", config),
|
||||
@@ -133,7 +128,7 @@ async fn create_logger_task(rfd: RawFd, vsock_port: u32, shutdown: Receiver<bool
|
||||
None,
|
||||
)?;
|
||||
|
||||
let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, vsock_port);
|
||||
let addr = VsockAddr::new(libc::VMADDR_CID_ANY, vsock_port);
|
||||
socket::bind(listenfd, &addr)?;
|
||||
socket::listen(listenfd, 1)?;
|
||||
|
||||
@@ -214,7 +209,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
if config.log_level == slog::Level::Trace {
|
||||
// Redirect ttrpc log calls to slog iff full debug requested
|
||||
ttrpc_log_guard = Ok(slog_stdlog::init().map_err(|e| e)?);
|
||||
ttrpc_log_guard = Ok(slog_stdlog::init()?);
|
||||
}
|
||||
|
||||
if config.tracing {
|
||||
@@ -382,27 +377,13 @@ fn init_agent_as_init(logger: &Logger, unified_cgroup_hierarchy: bool) -> Result
|
||||
let contents_array: Vec<&str> = contents.split(' ').collect();
|
||||
let hostname = contents_array[0].trim();
|
||||
|
||||
if sethostname(OsStr::new(hostname)).is_err() {
|
||||
if unistd::sethostname(OsStr::new(hostname)).is_err() {
|
||||
warn!(logger, "failed to set hostname");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
fn sethostname(hostname: &OsStr) -> Result<()> {
|
||||
let size = hostname.len() as usize;
|
||||
|
||||
let result =
|
||||
unsafe { libc::sethostname(hostname.as_bytes().as_ptr() as *const libc::c_char, size) };
|
||||
|
||||
if result != 0 {
|
||||
Err(anyhow!("failed to set hostname"))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// The Rust standard library had suppressed the default SIGPIPE behavior,
|
||||
// see https://github.com/rust-lang/rust/pull/13158.
|
||||
// Since the parent's signal handler would be inherited by it's child process,
|
||||
@@ -416,3 +397,59 @@ fn reset_sigpipe() {
|
||||
|
||||
use crate::config::AgentConfig;
|
||||
use std::os::unix::io::{FromRawFd, RawFd};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::test_utils::test_utils::TestUserType;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_logger_task() {
|
||||
#[derive(Debug)]
|
||||
struct TestData {
|
||||
vsock_port: u32,
|
||||
test_user: TestUserType,
|
||||
result: Result<()>,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
// non-root user cannot use privileged vsock port
|
||||
vsock_port: 1,
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
result: Err(anyhow!(nix::errno::Errno::from_i32(libc::EACCES))),
|
||||
},
|
||||
TestData {
|
||||
// passing vsock_port 0 causes logger task to write to stdout
|
||||
vsock_port: 0,
|
||||
test_user: TestUserType::Any,
|
||||
result: Ok(()),
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
if d.test_user == TestUserType::RootOnly {
|
||||
skip_if_not_root!();
|
||||
} else if d.test_user == TestUserType::NonRootOnly {
|
||||
skip_if_root!();
|
||||
}
|
||||
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
let (rfd, wfd) = unistd::pipe2(OFlag::O_CLOEXEC).unwrap();
|
||||
defer!({
|
||||
// rfd is closed by the use of PipeStream in the crate_logger_task function,
|
||||
// but we will attempt to close in case of a failure
|
||||
let _ = unistd::close(rfd);
|
||||
unistd::close(wfd).unwrap();
|
||||
});
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = channel(true);
|
||||
|
||||
shutdown_tx.send(true).unwrap();
|
||||
let result = create_logger_task(rfd, d.vsock_port, shutdown_rx).await;
|
||||
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
assert_result!(d.result, result, msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use nix::mount::MsFlags;
|
||||
use nix::unistd::Gid;
|
||||
use nix::unistd::{Gid, Uid};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
@@ -29,6 +29,7 @@ use crate::device::{
|
||||
use crate::linux_abi::*;
|
||||
use crate::pci;
|
||||
use crate::protocols::agent::Storage;
|
||||
use crate::protocols::types::FSGroupChangePolicy;
|
||||
use crate::Sandbox;
|
||||
#[cfg(target_arch = "s390x")]
|
||||
use crate::{ccw, device::get_virtio_blk_ccw_device_name};
|
||||
@@ -43,6 +44,11 @@ pub const MOUNT_GUEST_TAG: &str = "kataShared";
|
||||
// Allocating an FSGroup that owns the pod's volumes
|
||||
const FS_GID: &str = "fsgid";
|
||||
|
||||
const RW_MASK: u32 = 0o660;
|
||||
const RO_MASK: u32 = 0o440;
|
||||
const EXEC_MASK: u32 = 0o110;
|
||||
const MODE_SETGID: u32 = 0o2000;
|
||||
|
||||
#[rustfmt::skip]
|
||||
lazy_static! {
|
||||
pub static ref FLAGS: HashMap<&'static str, (bool, MsFlags)> = {
|
||||
@@ -85,11 +91,11 @@ lazy_static! {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct InitMount {
|
||||
fstype: &'static str,
|
||||
src: &'static str,
|
||||
dest: &'static str,
|
||||
options: Vec<&'static str>,
|
||||
pub struct InitMount<'a> {
|
||||
fstype: &'a str,
|
||||
src: &'a str,
|
||||
dest: &'a str,
|
||||
options: Vec<&'a str>,
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
@@ -115,7 +121,7 @@ lazy_static!{
|
||||
|
||||
#[rustfmt::skip]
|
||||
lazy_static! {
|
||||
pub static ref INIT_ROOTFS_MOUNTS: Vec<InitMount> = vec![
|
||||
pub static ref INIT_ROOTFS_MOUNTS: Vec<InitMount<'static>> = vec![
|
||||
InitMount{fstype: "proc", src: "proc", dest: "/proc", options: vec!["nosuid", "nodev", "noexec"]},
|
||||
InitMount{fstype: "sysfs", src: "sysfs", dest: "/sys", options: vec!["nosuid", "nodev", "noexec"]},
|
||||
InitMount{fstype: "devtmpfs", src: "dev", dest: "/dev", options: vec!["nosuid"]},
|
||||
@@ -222,7 +228,7 @@ async fn ephemeral_storage_handler(
|
||||
let meta = fs::metadata(&storage.mount_point)?;
|
||||
let mut permission = meta.permissions();
|
||||
|
||||
let o_mode = meta.mode() | 0o2000;
|
||||
let o_mode = meta.mode() | MODE_SETGID;
|
||||
permission.set_mode(o_mode);
|
||||
fs::set_permissions(&storage.mount_point, permission)?;
|
||||
}
|
||||
@@ -272,7 +278,7 @@ async fn local_storage_handler(
|
||||
|
||||
if need_set_fsgid {
|
||||
// set SetGid mode mask.
|
||||
o_mode |= 0o2000;
|
||||
o_mode |= MODE_SETGID;
|
||||
}
|
||||
permission.set_mode(o_mode);
|
||||
|
||||
@@ -489,7 +495,9 @@ fn common_storage_handler(logger: &Logger, storage: &Storage) -> Result<String>
|
||||
// Mount the storage device.
|
||||
let mount_point = storage.mount_point.to_string();
|
||||
|
||||
mount_storage(logger, storage).and(Ok(mount_point))
|
||||
mount_storage(logger, storage)?;
|
||||
set_ownership(logger, storage)?;
|
||||
Ok(mount_point)
|
||||
}
|
||||
|
||||
// nvdimm_storage_handler handles the storage for NVDIMM driver.
|
||||
@@ -573,6 +581,91 @@ fn mount_storage(logger: &Logger, storage: &Storage) -> Result<()> {
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub fn set_ownership(logger: &Logger, storage: &Storage) -> Result<()> {
|
||||
let logger = logger.new(o!("subsystem" => "mount", "fn" => "set_ownership"));
|
||||
|
||||
// If fsGroup is not set, skip performing ownership change
|
||||
if storage.fs_group.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
let fs_group = storage.get_fs_group();
|
||||
|
||||
let mut read_only = false;
|
||||
let opts_vec: Vec<String> = storage.options.to_vec();
|
||||
if opts_vec.contains(&String::from("ro")) {
|
||||
read_only = true;
|
||||
}
|
||||
|
||||
let mount_path = Path::new(&storage.mount_point);
|
||||
let metadata = mount_path.metadata().map_err(|err| {
|
||||
error!(logger, "failed to obtain metadata for mount path";
|
||||
"mount-path" => mount_path.to_str(),
|
||||
"error" => err.to_string(),
|
||||
);
|
||||
err
|
||||
})?;
|
||||
|
||||
if fs_group.group_change_policy == FSGroupChangePolicy::OnRootMismatch
|
||||
&& metadata.gid() == fs_group.group_id
|
||||
{
|
||||
let mut mask = if read_only { RO_MASK } else { RW_MASK };
|
||||
mask |= EXEC_MASK;
|
||||
|
||||
// With fsGroup change policy to OnRootMismatch, if the current
|
||||
// gid of the mount path root directory matches the desired gid
|
||||
// and the current permission of mount path root directory is correct,
|
||||
// then ownership change will be skipped.
|
||||
let current_mode = metadata.permissions().mode();
|
||||
if (mask & current_mode == mask) && (current_mode & MODE_SETGID != 0) {
|
||||
info!(logger, "skipping ownership change for volume";
|
||||
"mount-path" => mount_path.to_str(),
|
||||
"fs-group" => fs_group.group_id.to_string(),
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
info!(logger, "performing recursive ownership change";
|
||||
"mount-path" => mount_path.to_str(),
|
||||
"fs-group" => fs_group.group_id.to_string(),
|
||||
);
|
||||
recursive_ownership_change(
|
||||
mount_path,
|
||||
None,
|
||||
Some(Gid::from_raw(fs_group.group_id)),
|
||||
read_only,
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub fn recursive_ownership_change(
|
||||
path: &Path,
|
||||
uid: Option<Uid>,
|
||||
gid: Option<Gid>,
|
||||
read_only: bool,
|
||||
) -> Result<()> {
|
||||
let mut mask = if read_only { RO_MASK } else { RW_MASK };
|
||||
if path.is_dir() {
|
||||
for entry in fs::read_dir(&path)? {
|
||||
recursive_ownership_change(entry?.path().as_path(), uid, gid, read_only)?;
|
||||
}
|
||||
mask |= EXEC_MASK;
|
||||
mask |= MODE_SETGID;
|
||||
}
|
||||
nix::unistd::chown(path, uid, gid)?;
|
||||
|
||||
if gid.is_some() {
|
||||
let metadata = path.metadata()?;
|
||||
let mut permission = metadata.permissions();
|
||||
let target_mode = metadata.mode() | mask;
|
||||
permission.set_mode(target_mode);
|
||||
fs::set_permissions(path, permission)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Looks for `mount_point` entry in the /proc/mounts.
|
||||
#[instrument]
|
||||
pub fn is_mounted(mount_point: &str) -> Result<bool> {
|
||||
@@ -747,15 +840,13 @@ pub fn get_mount_fs_type_from_file(mount_file: &str, mount_point: &str) -> Resul
|
||||
return Err(anyhow!("Invalid mount point {}", mount_point));
|
||||
}
|
||||
|
||||
let file = File::open(mount_file)?;
|
||||
let reader = BufReader::new(file);
|
||||
let content = fs::read_to_string(mount_file)?;
|
||||
|
||||
let re = Regex::new(format!("device .+ mounted on {} with fstype (.+)", mount_point).as_str())?;
|
||||
|
||||
// Read the file line by line using the lines() iterator from std::io::BufRead.
|
||||
for (_index, line) in reader.lines().enumerate() {
|
||||
let line = line?;
|
||||
let capes = match re.captures(line.as_str()) {
|
||||
for (_index, line) in content.lines().enumerate() {
|
||||
let capes = match re.captures(line) {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
@@ -766,8 +857,9 @@ pub fn get_mount_fs_type_from_file(mount_file: &str, mount_point: &str) -> Resul
|
||||
}
|
||||
|
||||
Err(anyhow!(
|
||||
"failed to find FS type for mount point {}",
|
||||
mount_point
|
||||
"failed to find FS type for mount point {}, mount file content: {:?}",
|
||||
mount_point,
|
||||
content
|
||||
))
|
||||
}
|
||||
|
||||
@@ -776,7 +868,7 @@ pub fn get_cgroup_mounts(
|
||||
logger: &Logger,
|
||||
cg_path: &str,
|
||||
unified_cgroup_hierarchy: bool,
|
||||
) -> Result<Vec<InitMount>> {
|
||||
) -> Result<Vec<InitMount<'static>>> {
|
||||
// cgroup v2
|
||||
// https://github.com/kata-containers/agent/blob/8c9bbadcd448c9a67690fbe11a860aaacc69813c/agent.go#L1249
|
||||
if unified_cgroup_hierarchy {
|
||||
@@ -924,20 +1016,16 @@ fn parse_options(option_list: Vec<String>) -> HashMap<String, String> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{skip_if_not_root, skip_loop_if_not_root, skip_loop_if_root};
|
||||
use crate::test_utils::test_utils::TestUserType;
|
||||
use crate::{skip_if_not_root, skip_loop_by_user, skip_loop_if_not_root, skip_loop_if_root};
|
||||
use protobuf::RepeatedField;
|
||||
use protocols::agent::FSGroup;
|
||||
use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
enum TestUserType {
|
||||
RootOnly,
|
||||
NonRootOnly,
|
||||
Any,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mount() {
|
||||
#[derive(Debug)]
|
||||
@@ -1023,11 +1111,7 @@ mod tests {
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
if d.test_user == TestUserType::RootOnly {
|
||||
skip_loop_if_not_root!(msg);
|
||||
} else if d.test_user == TestUserType::NonRootOnly {
|
||||
skip_loop_if_root!(msg);
|
||||
}
|
||||
skip_loop_by_user!(msg, d.test_user);
|
||||
|
||||
let src: PathBuf;
|
||||
let dest: PathBuf;
|
||||
@@ -1497,6 +1581,226 @@ mod tests {
|
||||
assert!(testfile.is_file());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mount_storage() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
test_user: TestUserType,
|
||||
storage: Storage,
|
||||
error_contains: &'a str,
|
||||
|
||||
make_source_dir: bool,
|
||||
make_mount_dir: bool,
|
||||
deny_mount_permission: bool,
|
||||
}
|
||||
|
||||
impl Default for TestData<'_> {
|
||||
fn default() -> Self {
|
||||
TestData {
|
||||
test_user: TestUserType::Any,
|
||||
storage: Storage {
|
||||
mount_point: "mnt".to_string(),
|
||||
source: "src".to_string(),
|
||||
fstype: "tmpfs".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
make_source_dir: true,
|
||||
make_mount_dir: false,
|
||||
deny_mount_permission: false,
|
||||
error_contains: "",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
error_contains: "EPERM: Operation not permitted",
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
test_user: TestUserType::RootOnly,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
storage: Storage {
|
||||
mount_point: "mnt".to_string(),
|
||||
source: "src".to_string(),
|
||||
fstype: "bind".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
make_source_dir: false,
|
||||
make_mount_dir: true,
|
||||
error_contains: "Could not create mountpoint",
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
deny_mount_permission: true,
|
||||
error_contains: "Could not create mountpoint",
|
||||
..Default::default()
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
skip_loop_by_user!(msg, d.test_user);
|
||||
|
||||
let drain = slog::Discard;
|
||||
let logger = slog::Logger::root(drain, o!());
|
||||
|
||||
let tempdir = tempdir().unwrap();
|
||||
|
||||
let source = tempdir.path().join(&d.storage.source);
|
||||
let mount_point = tempdir.path().join(&d.storage.mount_point);
|
||||
|
||||
let storage = Storage {
|
||||
source: source.to_str().unwrap().to_string(),
|
||||
mount_point: mount_point.to_str().unwrap().to_string(),
|
||||
..d.storage.clone()
|
||||
};
|
||||
|
||||
if d.make_source_dir {
|
||||
fs::create_dir_all(&storage.source).unwrap();
|
||||
}
|
||||
if d.make_mount_dir {
|
||||
fs::create_dir_all(&storage.mount_point).unwrap();
|
||||
}
|
||||
|
||||
if d.deny_mount_permission {
|
||||
fs::set_permissions(
|
||||
mount_point.parent().unwrap(),
|
||||
fs::Permissions::from_mode(0o000),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let result = mount_storage(&logger, &storage);
|
||||
|
||||
// restore permissions so tempdir can be cleaned up
|
||||
if d.deny_mount_permission {
|
||||
fs::set_permissions(
|
||||
mount_point.parent().unwrap(),
|
||||
fs::Permissions::from_mode(0o755),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
if result.is_ok() {
|
||||
nix::mount::umount(&mount_point).unwrap();
|
||||
}
|
||||
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
if d.error_contains.is_empty() {
|
||||
assert!(result.is_ok(), "{}", msg);
|
||||
} else {
|
||||
assert!(result.is_err(), "{}", msg);
|
||||
let error_msg = format!("{}", result.unwrap_err());
|
||||
assert!(error_msg.contains(d.error_contains), "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mount_to_rootfs() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
test_user: TestUserType,
|
||||
src: &'a str,
|
||||
options: Vec<&'a str>,
|
||||
error_contains: &'a str,
|
||||
deny_mount_dir_permission: bool,
|
||||
// if true src will be prepended with a temporary directory
|
||||
mask_src: bool,
|
||||
}
|
||||
|
||||
impl Default for TestData<'_> {
|
||||
fn default() -> Self {
|
||||
TestData {
|
||||
test_user: TestUserType::Any,
|
||||
src: "src",
|
||||
options: vec![],
|
||||
error_contains: "",
|
||||
deny_mount_dir_permission: false,
|
||||
mask_src: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
error_contains: "EPERM: Operation not permitted",
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
src: "dev",
|
||||
mask_src: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
test_user: TestUserType::RootOnly,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
test_user: TestUserType::NonRootOnly,
|
||||
deny_mount_dir_permission: true,
|
||||
error_contains: "could not create directory",
|
||||
..Default::default()
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
skip_loop_by_user!(msg, d.test_user);
|
||||
|
||||
let drain = slog::Discard;
|
||||
let logger = slog::Logger::root(drain, o!());
|
||||
let tempdir = tempdir().unwrap();
|
||||
|
||||
let src = if d.mask_src {
|
||||
tempdir.path().join(&d.src)
|
||||
} else {
|
||||
Path::new(d.src).to_path_buf()
|
||||
};
|
||||
let dest = tempdir.path().join("mnt");
|
||||
let init_mount = InitMount {
|
||||
fstype: "tmpfs",
|
||||
src: src.to_str().unwrap(),
|
||||
dest: dest.to_str().unwrap(),
|
||||
options: d.options.clone(),
|
||||
};
|
||||
|
||||
if d.deny_mount_dir_permission {
|
||||
fs::set_permissions(dest.parent().unwrap(), fs::Permissions::from_mode(0o000))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let result = mount_to_rootfs(&logger, &init_mount);
|
||||
|
||||
// restore permissions so tempdir can be cleaned up
|
||||
if d.deny_mount_dir_permission {
|
||||
fs::set_permissions(dest.parent().unwrap(), fs::Permissions::from_mode(0o755))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
if result.is_ok() && d.mask_src {
|
||||
nix::mount::umount(&dest).unwrap();
|
||||
}
|
||||
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
if d.error_contains.is_empty() {
|
||||
assert!(result.is_ok(), "{}", msg);
|
||||
} else {
|
||||
assert!(result.is_err(), "{}", msg);
|
||||
let error_msg = format!("{}", result.unwrap_err());
|
||||
assert!(error_msg.contains(d.error_contains), "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_pagesize_and_size_from_option() {
|
||||
let expected_pagesize = 2048;
|
||||
@@ -1552,4 +1856,263 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_mount_flags_and_options() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
options_vec: Vec<&'a str>,
|
||||
result: (MsFlags, &'a str),
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
options_vec: vec![],
|
||||
result: (MsFlags::empty(), ""),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["ro"],
|
||||
result: (MsFlags::MS_RDONLY, ""),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["rw"],
|
||||
result: (MsFlags::empty(), ""),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["ro", "rw"],
|
||||
result: (MsFlags::empty(), ""),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["ro", "nodev"],
|
||||
result: (MsFlags::MS_RDONLY | MsFlags::MS_NODEV, ""),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["option1", "nodev", "option2"],
|
||||
result: (MsFlags::MS_NODEV, "option1,option2"),
|
||||
},
|
||||
TestData {
|
||||
options_vec: vec!["rbind", "", "ro"],
|
||||
result: (MsFlags::MS_BIND | MsFlags::MS_REC | MsFlags::MS_RDONLY, ""),
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let result = parse_mount_flags_and_options(d.options_vec.clone());
|
||||
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
|
||||
let expected_result = (d.result.0, d.result.1.to_owned());
|
||||
assert_eq!(expected_result, result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_set_ownership() {
|
||||
skip_if_not_root!();
|
||||
|
||||
let logger = slog::Logger::root(slog::Discard, o!());
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
mount_path: &'a str,
|
||||
fs_group: Option<FSGroup>,
|
||||
read_only: bool,
|
||||
expected_group_id: u32,
|
||||
expected_permission: u32,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
mount_path: "foo",
|
||||
fs_group: None,
|
||||
read_only: false,
|
||||
expected_group_id: 0,
|
||||
expected_permission: 0,
|
||||
},
|
||||
TestData {
|
||||
mount_path: "rw_mount",
|
||||
fs_group: Some(FSGroup {
|
||||
group_id: 3000,
|
||||
group_change_policy: FSGroupChangePolicy::Always,
|
||||
unknown_fields: Default::default(),
|
||||
cached_size: Default::default(),
|
||||
}),
|
||||
read_only: false,
|
||||
expected_group_id: 3000,
|
||||
expected_permission: RW_MASK | EXEC_MASK | MODE_SETGID,
|
||||
},
|
||||
TestData {
|
||||
mount_path: "ro_mount",
|
||||
fs_group: Some(FSGroup {
|
||||
group_id: 3000,
|
||||
group_change_policy: FSGroupChangePolicy::OnRootMismatch,
|
||||
unknown_fields: Default::default(),
|
||||
cached_size: Default::default(),
|
||||
}),
|
||||
read_only: true,
|
||||
expected_group_id: 3000,
|
||||
expected_permission: RO_MASK | EXEC_MASK | MODE_SETGID,
|
||||
},
|
||||
];
|
||||
|
||||
let tempdir = tempdir().expect("failed to create tmpdir");
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let mount_dir = tempdir.path().join(d.mount_path);
|
||||
fs::create_dir(&mount_dir)
|
||||
.unwrap_or_else(|_| panic!("{}: failed to create root directory", msg));
|
||||
|
||||
let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode();
|
||||
let mut storage_data = Storage::new();
|
||||
if d.read_only {
|
||||
storage_data.set_options(RepeatedField::from_slice(&[
|
||||
"foo".to_string(),
|
||||
"ro".to_string(),
|
||||
]));
|
||||
}
|
||||
if let Some(fs_group) = d.fs_group.clone() {
|
||||
storage_data.set_fs_group(fs_group);
|
||||
}
|
||||
storage_data.mount_point = mount_dir.clone().into_os_string().into_string().unwrap();
|
||||
|
||||
let result = set_ownership(&logger, &storage_data);
|
||||
assert!(result.is_ok());
|
||||
|
||||
assert_eq!(
|
||||
mount_dir.as_path().metadata().unwrap().gid(),
|
||||
d.expected_group_id
|
||||
);
|
||||
assert_eq!(
|
||||
mount_dir.as_path().metadata().unwrap().permissions().mode(),
|
||||
(directory_mode | d.expected_permission)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_recursive_ownership_change() {
|
||||
skip_if_not_root!();
|
||||
|
||||
const COUNT: usize = 5;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
// Directory where the recursive ownership change should be performed on
|
||||
path: &'a str,
|
||||
|
||||
// User ID for ownership change
|
||||
uid: u32,
|
||||
|
||||
// Group ID for ownership change
|
||||
gid: u32,
|
||||
|
||||
// Set when the permission should be read-only
|
||||
read_only: bool,
|
||||
|
||||
// The expected permission of all directories after ownership change
|
||||
expected_permission_directory: u32,
|
||||
|
||||
// The expected permission of all files after ownership change
|
||||
expected_permission_file: u32,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
path: "no_gid_change",
|
||||
uid: 0,
|
||||
gid: 0,
|
||||
read_only: false,
|
||||
expected_permission_directory: 0,
|
||||
expected_permission_file: 0,
|
||||
},
|
||||
TestData {
|
||||
path: "rw_gid_change",
|
||||
uid: 0,
|
||||
gid: 3000,
|
||||
read_only: false,
|
||||
expected_permission_directory: RW_MASK | EXEC_MASK | MODE_SETGID,
|
||||
expected_permission_file: RW_MASK,
|
||||
},
|
||||
TestData {
|
||||
path: "ro_gid_change",
|
||||
uid: 0,
|
||||
gid: 3000,
|
||||
read_only: true,
|
||||
expected_permission_directory: RO_MASK | EXEC_MASK | MODE_SETGID,
|
||||
expected_permission_file: RO_MASK,
|
||||
},
|
||||
];
|
||||
|
||||
let tempdir = tempdir().expect("failed to create tmpdir");
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
let mount_dir = tempdir.path().join(d.path);
|
||||
fs::create_dir(&mount_dir)
|
||||
.unwrap_or_else(|_| panic!("{}: failed to create root directory", msg));
|
||||
|
||||
let directory_mode = mount_dir.as_path().metadata().unwrap().permissions().mode();
|
||||
let mut file_mode: u32 = 0;
|
||||
|
||||
// create testing directories and files
|
||||
for n in 1..COUNT {
|
||||
let nest_dir = mount_dir.join(format!("nested{}", n));
|
||||
fs::create_dir(&nest_dir)
|
||||
.unwrap_or_else(|_| panic!("{}: failed to create nest directory", msg));
|
||||
|
||||
for f in 1..COUNT {
|
||||
let filename = nest_dir.join(format!("file{}", f));
|
||||
File::create(&filename)
|
||||
.unwrap_or_else(|_| panic!("{}: failed to create file", msg));
|
||||
file_mode = filename.as_path().metadata().unwrap().permissions().mode();
|
||||
}
|
||||
}
|
||||
|
||||
let uid = if d.uid > 0 {
|
||||
Some(Uid::from_raw(d.uid))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let gid = if d.gid > 0 {
|
||||
Some(Gid::from_raw(d.gid))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let result = recursive_ownership_change(&mount_dir, uid, gid, d.read_only);
|
||||
|
||||
assert!(result.is_ok());
|
||||
|
||||
assert_eq!(mount_dir.as_path().metadata().unwrap().gid(), d.gid);
|
||||
assert_eq!(
|
||||
mount_dir.as_path().metadata().unwrap().permissions().mode(),
|
||||
(directory_mode | d.expected_permission_directory)
|
||||
);
|
||||
|
||||
for n in 1..COUNT {
|
||||
let nest_dir = mount_dir.join(format!("nested{}", n));
|
||||
for f in 1..COUNT {
|
||||
let filename = nest_dir.join(format!("file{}", f));
|
||||
let file = Path::new(&filename);
|
||||
|
||||
assert_eq!(file.metadata().unwrap().gid(), d.gid);
|
||||
assert_eq!(
|
||||
file.metadata().unwrap().permissions().mode(),
|
||||
(file_mode | d.expected_permission_file)
|
||||
);
|
||||
}
|
||||
|
||||
let dir = Path::new(&nest_dir);
|
||||
assert_eq!(dir.metadata().unwrap().gid(), d.gid);
|
||||
assert_eq!(
|
||||
dir.metadata().unwrap().permissions().mode(),
|
||||
(directory_mode | d.expected_permission_directory)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::{future, StreamExt, TryStreamExt};
|
||||
use futures::{future, TryStreamExt};
|
||||
use ipnetwork::{IpNetwork, Ipv4Network, Ipv6Network};
|
||||
use nix::errno::Errno;
|
||||
use protobuf::RepeatedField;
|
||||
@@ -64,7 +64,7 @@ impl Handle {
|
||||
pub async fn update_interface(&mut self, iface: &Interface) -> Result<()> {
|
||||
// The reliable way to find link is using hardware address
|
||||
// as filter. However, hardware filter might not be supported
|
||||
// by netlink, we may have to dump link list and the find the
|
||||
// by netlink, we may have to dump link list and then find the
|
||||
// target link. filter using name or family is supported, but
|
||||
// we cannot use that to find target link.
|
||||
// let's try if hardware address filter works. -_-
|
||||
@@ -164,7 +164,7 @@ impl Handle {
|
||||
let request = self.handle.link().get();
|
||||
|
||||
let filtered = match filter {
|
||||
LinkFilter::Name(name) => request.set_name_filter(name.to_owned()),
|
||||
LinkFilter::Name(name) => request.match_name(name.to_owned()),
|
||||
LinkFilter::Index(index) => request.match_index(index),
|
||||
_ => request, // Post filters
|
||||
};
|
||||
@@ -178,7 +178,7 @@ impl Handle {
|
||||
.with_context(|| format!("Failed to parse MAC address: {}", addr))?;
|
||||
|
||||
// Hardware filter might not be supported by netlink,
|
||||
// we may have to dump link list and the find the target link.
|
||||
// we may have to dump link list and then find the target link.
|
||||
stream
|
||||
.try_filter(|f| {
|
||||
let result = f.nlas.iter().any(|n| match n {
|
||||
@@ -516,70 +516,24 @@ impl Handle {
|
||||
}
|
||||
|
||||
/// Adds an ARP neighbor.
|
||||
/// TODO: `rtnetlink` has no neighbours API, remove this after https://github.com/little-dude/netlink/pull/135
|
||||
async fn add_arp_neighbor(&mut self, neigh: &ARPNeighbor) -> Result<()> {
|
||||
let ip_address = neigh
|
||||
.toIPAddress
|
||||
.as_ref()
|
||||
.map(|to| to.address.as_str()) // Extract address field
|
||||
.and_then(|addr| if addr.is_empty() { None } else { Some(addr) }) // Make sure it's not empty
|
||||
.ok_or(anyhow!(nix::Error::EINVAL))?;
|
||||
.ok_or_else(|| anyhow!(nix::Error::EINVAL))?;
|
||||
|
||||
let ip = IpAddr::from_str(ip_address)
|
||||
.map_err(|e| anyhow!("Failed to parse IP {}: {:?}", ip_address, e))?;
|
||||
|
||||
// Import rtnetlink objects that make sense only for this function
|
||||
use packet::constants::{NDA_UNSPEC, NLM_F_ACK, NLM_F_CREATE, NLM_F_EXCL, NLM_F_REQUEST};
|
||||
use packet::neighbour::{NeighbourHeader, NeighbourMessage};
|
||||
use packet::nlas::neighbour::Nla;
|
||||
use packet::{NetlinkMessage, NetlinkPayload, RtnlMessage};
|
||||
use rtnetlink::Error;
|
||||
|
||||
const IFA_F_PERMANENT: u16 = 0x80; // See https://github.com/little-dude/netlink/blob/0185b2952505e271805902bf175fee6ea86c42b8/netlink-packet-route/src/rtnl/constants.rs#L770
|
||||
|
||||
let link = self.find_link(LinkFilter::Name(&neigh.device)).await?;
|
||||
|
||||
let message = NeighbourMessage {
|
||||
header: NeighbourHeader {
|
||||
family: match ip {
|
||||
IpAddr::V4(_) => packet::AF_INET,
|
||||
IpAddr::V6(_) => packet::AF_INET6,
|
||||
} as u8,
|
||||
ifindex: link.index(),
|
||||
state: if neigh.state != 0 {
|
||||
neigh.state as u16
|
||||
} else {
|
||||
IFA_F_PERMANENT
|
||||
},
|
||||
flags: neigh.flags as u8,
|
||||
ntype: NDA_UNSPEC as u8,
|
||||
},
|
||||
nlas: {
|
||||
let mut nlas = vec![Nla::Destination(match ip {
|
||||
IpAddr::V4(v4) => v4.octets().to_vec(),
|
||||
IpAddr::V6(v6) => v6.octets().to_vec(),
|
||||
})];
|
||||
|
||||
if !neigh.lladdr.is_empty() {
|
||||
nlas.push(Nla::LinkLocalAddress(
|
||||
parse_mac_address(&neigh.lladdr)?.to_vec(),
|
||||
));
|
||||
}
|
||||
|
||||
nlas
|
||||
},
|
||||
};
|
||||
|
||||
// Send request and ACK
|
||||
let mut req = NetlinkMessage::from(RtnlMessage::NewNeighbour(message));
|
||||
req.header.flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
|
||||
|
||||
let mut response = self.handle.request(req)?;
|
||||
while let Some(message) = response.next().await {
|
||||
if let NetlinkPayload::Error(err) = message.payload {
|
||||
return Err(anyhow!(Error::NetlinkError(err)));
|
||||
}
|
||||
}
|
||||
self.handle
|
||||
.neighbours()
|
||||
.add(link.index(), ip)
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -612,7 +566,7 @@ fn parse_mac_address(addr: &str) -> Result<[u8; 6]> {
|
||||
|
||||
// Parse single Mac address block
|
||||
let mut parse_next = || -> Result<u8> {
|
||||
let v = u8::from_str_radix(split.next().ok_or(anyhow!(nix::Error::EINVAL))?, 16)?;
|
||||
let v = u8::from_str_radix(split.next().ok_or_else(|| anyhow!(nix::Error::EINVAL))?, 16)?;
|
||||
Ok(v)
|
||||
};
|
||||
|
||||
@@ -950,7 +904,7 @@ mod tests {
|
||||
.expect("prepare: failed to delete neigh");
|
||||
}
|
||||
|
||||
fn prepare_env_for_test_add_one_arp_neighbor(dummy_name: &str, ip: &str) {
|
||||
fn prepare_env_for_test_add_one_arp_neighbor(dummy_name: &str, ip: &str, mac: &str) {
|
||||
clean_env_for_test_add_one_arp_neighbor(dummy_name, ip);
|
||||
// modprobe dummy
|
||||
Command::new("modprobe")
|
||||
@@ -964,6 +918,12 @@ mod tests {
|
||||
.output()
|
||||
.expect("failed to add dummy interface");
|
||||
|
||||
// ip link set dummy address 6a:92:3a:59:70:aa
|
||||
Command::new("ip")
|
||||
.args(&["link", "set", dummy_name, "address", mac])
|
||||
.output()
|
||||
.expect("failed to add dummy interface");
|
||||
|
||||
// ip addr add 192.168.0.2/16 dev dummy
|
||||
Command::new("ip")
|
||||
.args(&["addr", "add", "192.168.0.2/16", "dev", dummy_name])
|
||||
@@ -985,7 +945,7 @@ mod tests {
|
||||
let to_ip = "169.254.1.1";
|
||||
let dummy_name = "dummy_for_arp";
|
||||
|
||||
prepare_env_for_test_add_one_arp_neighbor(dummy_name, to_ip);
|
||||
prepare_env_for_test_add_one_arp_neighbor(dummy_name, to_ip, mac);
|
||||
|
||||
let mut ip_address = IPAddress::new();
|
||||
ip_address.set_address(to_ip.to_string());
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{ensure, Result};
|
||||
use nix::errno::Errno;
|
||||
use nix::fcntl::{self, OFlag};
|
||||
use nix::sys::stat::Mode;
|
||||
@@ -13,7 +13,7 @@ use tracing::instrument;
|
||||
|
||||
pub const RNGDEV: &str = "/dev/random";
|
||||
pub const RNDADDTOENTCNT: libc::c_int = 0x40045201;
|
||||
pub const RNDRESEEDRNG: libc::c_int = 0x5207;
|
||||
pub const RNDRESEEDCRNG: libc::c_int = 0x5207;
|
||||
|
||||
// Handle the differing ioctl(2) request types for different targets
|
||||
#[cfg(target_env = "musl")]
|
||||
@@ -24,6 +24,9 @@ type IoctlRequestType = libc::c_ulong;
|
||||
#[instrument]
|
||||
pub fn reseed_rng(data: &[u8]) -> Result<()> {
|
||||
let len = data.len() as libc::c_long;
|
||||
|
||||
ensure!(len > 0, "missing entropy data");
|
||||
|
||||
fs::write(RNGDEV, data)?;
|
||||
|
||||
let f = {
|
||||
@@ -41,8 +44,52 @@ pub fn reseed_rng(data: &[u8]) -> Result<()> {
|
||||
};
|
||||
Errno::result(ret).map(drop)?;
|
||||
|
||||
let ret = unsafe { libc::ioctl(f.as_raw_fd(), RNDRESEEDRNG as IoctlRequestType, 0) };
|
||||
let ret = unsafe { libc::ioctl(f.as_raw_fd(), RNDRESEEDCRNG as IoctlRequestType, 0) };
|
||||
Errno::result(ret).map(drop)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::skip_if_not_root;
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
|
||||
#[test]
|
||||
fn test_reseed_rng() {
|
||||
skip_if_not_root!();
|
||||
const POOL_SIZE: usize = 512;
|
||||
let mut f = File::open("/dev/urandom").unwrap();
|
||||
let mut seed = [0; POOL_SIZE];
|
||||
let n = f.read(&mut seed).unwrap();
|
||||
// Ensure the buffer was filled.
|
||||
assert!(n == POOL_SIZE);
|
||||
let ret = reseed_rng(&seed);
|
||||
assert!(ret.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reseed_rng_not_root() {
|
||||
const POOL_SIZE: usize = 512;
|
||||
let mut f = File::open("/dev/urandom").unwrap();
|
||||
let mut seed = [0; POOL_SIZE];
|
||||
let n = f.read(&mut seed).unwrap();
|
||||
// Ensure the buffer was filled.
|
||||
assert!(n == POOL_SIZE);
|
||||
let ret = reseed_rng(&seed);
|
||||
if nix::unistd::Uid::effective().is_root() {
|
||||
assert!(ret.is_ok());
|
||||
} else {
|
||||
assert!(ret.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reseed_rng_zero_data() {
|
||||
let seed = [];
|
||||
let ret = reseed_rng(&seed);
|
||||
assert!(ret.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
1140
src/agent/src/rpc.rs
1140
src/agent/src/rpc.rs
File diff suppressed because it is too large
Load Diff
@@ -32,6 +32,8 @@ use tokio::sync::oneshot;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::instrument;
|
||||
|
||||
pub const ERR_INVALID_CONTAINER_ID: &str = "Invalid container id";
|
||||
|
||||
type UeventWatcher = (Box<dyn UeventMatcher>, oneshot::Sender<Uevent>);
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -149,7 +151,12 @@ impl Sandbox {
|
||||
pub fn remove_sandbox_storage(&self, path: &str) -> Result<()> {
|
||||
let mounts = vec![path.to_string()];
|
||||
remove_mounts(&mounts)?;
|
||||
fs::remove_dir_all(path).context(format!("failed to remove dir {:?}", path))?;
|
||||
// "remove_dir" will fail if the mount point is backed by a read-only filesystem.
|
||||
// This is the case with the device mapper snapshotter, where we mount the block device directly
|
||||
// at the underlying sandbox path which was provided from the base RO kataShared path from the host.
|
||||
if let Err(err) = fs::remove_dir(path) {
|
||||
warn!(self.logger, "failed to remove dir {}, {:?}", path, err);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -232,7 +239,7 @@ impl Sandbox {
|
||||
pub fn find_container_process(&mut self, cid: &str, eid: &str) -> Result<&mut Process> {
|
||||
let ctr = self
|
||||
.get_container(cid)
|
||||
.ok_or_else(|| anyhow!("Invalid container id"))?;
|
||||
.ok_or_else(|| anyhow!(ERR_INVALID_CONTAINER_ID))?;
|
||||
|
||||
if eid.is_empty() {
|
||||
return ctr
|
||||
@@ -463,7 +470,7 @@ fn online_memory(logger: &Logger) -> Result<()> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Sandbox;
|
||||
use super::*;
|
||||
use crate::{mount::baremount, skip_if_not_root};
|
||||
use anyhow::{anyhow, Error};
|
||||
use nix::mount::MsFlags;
|
||||
@@ -473,6 +480,7 @@ mod tests {
|
||||
use rustjail::specconv::CreateOpts;
|
||||
use slog::Logger;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use tempfile::{tempdir, Builder, TempDir};
|
||||
@@ -562,19 +570,8 @@ mod tests {
|
||||
.remove_sandbox_storage(invalid_dir.to_str().unwrap())
|
||||
.is_err());
|
||||
|
||||
// Now, create a double mount as this guarantees the directory cannot
|
||||
// be deleted after the first umount.
|
||||
for _i in 0..2 {
|
||||
assert!(bind_mount(srcdir_path, destdir_path, &logger).is_ok());
|
||||
}
|
||||
assert!(bind_mount(srcdir_path, destdir_path, &logger).is_ok());
|
||||
|
||||
assert!(
|
||||
s.remove_sandbox_storage(destdir_path).is_err(),
|
||||
"Expect fail as deletion cannot happen due to the second mount."
|
||||
);
|
||||
|
||||
// This time it should work as the previous two calls have undone the double
|
||||
// mount.
|
||||
assert!(s.remove_sandbox_storage(destdir_path).is_ok());
|
||||
}
|
||||
|
||||
@@ -851,4 +848,259 @@ mod tests {
|
||||
let p = s.find_container_process("not-exist-cid", "");
|
||||
assert!(p.is_err(), "Expecting Error, Got {:?}", p);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_find_process() {
|
||||
let logger = slog::Logger::root(slog::Discard, o!());
|
||||
|
||||
let test_pids = [std::i32::MIN, -1, 0, 1, std::i32::MAX];
|
||||
|
||||
for test_pid in test_pids {
|
||||
let mut s = Sandbox::new(&logger).unwrap();
|
||||
let (mut linux_container, _root) = create_linuxcontainer();
|
||||
|
||||
let mut test_process = Process::new(
|
||||
&logger,
|
||||
&oci::Process::default(),
|
||||
"this_is_a_test_process",
|
||||
true,
|
||||
1,
|
||||
)
|
||||
.unwrap();
|
||||
// processes interally only have pids when manually set
|
||||
test_process.pid = test_pid;
|
||||
|
||||
linux_container.processes.insert(test_pid, test_process);
|
||||
|
||||
s.add_container(linux_container);
|
||||
|
||||
let find_result = s.find_process(test_pid);
|
||||
|
||||
// test first if it finds anything
|
||||
assert!(find_result.is_some(), "Should be able to find a process");
|
||||
|
||||
let found_process = find_result.unwrap();
|
||||
|
||||
// then test if it founds the correct process
|
||||
assert_eq!(
|
||||
found_process.pid, test_pid,
|
||||
"Should be able to find correct process"
|
||||
);
|
||||
}
|
||||
|
||||
// to test for nonexistent pids, any pid that isn't the one set
|
||||
// above should work, as linuxcontainer starts with no processes
|
||||
let mut s = Sandbox::new(&logger).unwrap();
|
||||
|
||||
let nonexistent_test_pid = 1234;
|
||||
|
||||
let find_result = s.find_process(nonexistent_test_pid);
|
||||
|
||||
assert!(
|
||||
find_result.is_none(),
|
||||
"Shouldn't find a process for non existent pid"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_online_resources() {
|
||||
#[derive(Debug, Default)]
|
||||
struct TestFile {
|
||||
name: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct TestDirectory<'a> {
|
||||
name: String,
|
||||
files: &'a [TestFile],
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
directory_autogen_name: String,
|
||||
number_autogen_directories: u32,
|
||||
|
||||
extra_directories: &'a [TestDirectory<'a>],
|
||||
pattern: String,
|
||||
to_enable: i32,
|
||||
|
||||
result: Result<i32>,
|
||||
}
|
||||
|
||||
impl Default for TestData<'_> {
|
||||
fn default() -> Self {
|
||||
TestData {
|
||||
directory_autogen_name: Default::default(),
|
||||
number_autogen_directories: Default::default(),
|
||||
extra_directories: Default::default(),
|
||||
pattern: Default::default(),
|
||||
to_enable: Default::default(),
|
||||
result: Ok(Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
// 4 well formed directories, request enabled 4,
|
||||
// correct result 4 enabled, should pass
|
||||
TestData {
|
||||
directory_autogen_name: String::from("cpu"),
|
||||
number_autogen_directories: 4,
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
to_enable: 4,
|
||||
result: Ok(4),
|
||||
..Default::default()
|
||||
},
|
||||
// 0 well formed directories, request enabled 4,
|
||||
// correct result 0 enabled, should pass
|
||||
TestData {
|
||||
number_autogen_directories: 0,
|
||||
to_enable: 4,
|
||||
result: Ok(0),
|
||||
..Default::default()
|
||||
},
|
||||
// 10 well formed directories, request enabled 4,
|
||||
// correct result 4 enabled, should pass
|
||||
TestData {
|
||||
directory_autogen_name: String::from("cpu"),
|
||||
number_autogen_directories: 10,
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
to_enable: 4,
|
||||
result: Ok(4),
|
||||
..Default::default()
|
||||
},
|
||||
// 0 well formed directories, request enabled 0,
|
||||
// correct result 0 enabled, should pass
|
||||
TestData {
|
||||
number_autogen_directories: 0,
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
to_enable: 0,
|
||||
result: Ok(0),
|
||||
..Default::default()
|
||||
},
|
||||
// 4 well formed directories, 1 malformed (no online file),
|
||||
// request enable 5, correct result 4
|
||||
TestData {
|
||||
directory_autogen_name: String::from("cpu"),
|
||||
number_autogen_directories: 4,
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
extra_directories: &[TestDirectory {
|
||||
name: String::from("cpu4"),
|
||||
files: &[],
|
||||
}],
|
||||
to_enable: 5,
|
||||
result: Ok(4),
|
||||
},
|
||||
// 3 malformed directories (no online files),
|
||||
// request enable 3, correct result 0
|
||||
TestData {
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
extra_directories: &[
|
||||
TestDirectory {
|
||||
name: String::from("cpu0"),
|
||||
files: &[],
|
||||
},
|
||||
TestDirectory {
|
||||
name: String::from("cpu1"),
|
||||
files: &[],
|
||||
},
|
||||
TestDirectory {
|
||||
name: String::from("cpu2"),
|
||||
files: &[],
|
||||
},
|
||||
],
|
||||
to_enable: 3,
|
||||
result: Ok(0),
|
||||
..Default::default()
|
||||
},
|
||||
// 1 malformed directories (online file with content "1"),
|
||||
// request enable 1, correct result 0
|
||||
TestData {
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
extra_directories: &[TestDirectory {
|
||||
name: String::from("cpu0"),
|
||||
files: &[TestFile {
|
||||
name: SYSFS_ONLINE_FILE.to_string(),
|
||||
content: String::from("1"),
|
||||
}],
|
||||
}],
|
||||
to_enable: 1,
|
||||
result: Ok(0),
|
||||
..Default::default()
|
||||
},
|
||||
// 2 well formed directories, 1 malformed (online file with content "1"),
|
||||
// request enable 3, correct result 2
|
||||
TestData {
|
||||
directory_autogen_name: String::from("cpu"),
|
||||
number_autogen_directories: 2,
|
||||
pattern: String::from(r"cpu[0-9]+"),
|
||||
extra_directories: &[TestDirectory {
|
||||
name: String::from("cpu2"),
|
||||
files: &[TestFile {
|
||||
name: SYSFS_ONLINE_FILE.to_string(),
|
||||
content: String::from("1"),
|
||||
}],
|
||||
}],
|
||||
to_enable: 3,
|
||||
result: Ok(2),
|
||||
},
|
||||
];
|
||||
|
||||
let logger = slog::Logger::root(slog::Discard, o!());
|
||||
let tmpdir = Builder::new().tempdir().unwrap();
|
||||
let tmpdir_path = tmpdir.path().to_str().unwrap();
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let current_test_dir_path = format!("{}/test_{}", tmpdir_path, i);
|
||||
fs::create_dir(¤t_test_dir_path).unwrap();
|
||||
|
||||
// create numbered directories and fill using root name
|
||||
for j in 0..d.number_autogen_directories {
|
||||
let subdir_path = format!(
|
||||
"{}/{}{}",
|
||||
current_test_dir_path, d.directory_autogen_name, j
|
||||
);
|
||||
let subfile_path = format!("{}/{}", subdir_path, SYSFS_ONLINE_FILE);
|
||||
fs::create_dir(&subdir_path).unwrap();
|
||||
let mut subfile = File::create(subfile_path).unwrap();
|
||||
subfile.write_all(b"0").unwrap();
|
||||
}
|
||||
// create extra directories and fill to specification
|
||||
for j in d.extra_directories {
|
||||
let subdir_path = format!("{}/{}", current_test_dir_path, j.name);
|
||||
fs::create_dir(&subdir_path).unwrap();
|
||||
for file in j.files {
|
||||
let subfile_path = format!("{}/{}", subdir_path, file.name);
|
||||
let mut subfile = File::create(&subfile_path).unwrap();
|
||||
subfile.write_all(file.content.as_bytes()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// run created directory structure against online_resources
|
||||
let result = online_resources(&logger, ¤t_test_dir_path, &d.pattern, d.to_enable);
|
||||
|
||||
let mut msg = format!(
|
||||
"test[{}]: {:?}, expected {}, actual {}",
|
||||
i,
|
||||
d,
|
||||
d.result.is_ok(),
|
||||
result.is_ok()
|
||||
);
|
||||
|
||||
assert_eq!(result.is_ok(), d.result.is_ok(), "{}", msg);
|
||||
|
||||
if d.result.is_ok() {
|
||||
let test_result_val = *d.result.as_ref().ok().unwrap();
|
||||
let result_val = result.ok().unwrap();
|
||||
|
||||
msg = format!(
|
||||
"test[{}]: {:?}, expected {}, actual {}",
|
||||
i, d, test_result_val, result_val
|
||||
);
|
||||
|
||||
assert_eq!(test_result_val, result_val, "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,14 @@
|
||||
#![allow(clippy::module_inception)]
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_utils {
|
||||
pub mod test_utils {
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum TestUserType {
|
||||
RootOnly,
|
||||
NonRootOnly,
|
||||
Any,
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! skip_if_root {
|
||||
() => {
|
||||
@@ -53,4 +60,40 @@ mod test_utils {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Parameters:
|
||||
//
|
||||
// 1: expected Result
|
||||
// 2: actual Result
|
||||
// 3: string used to identify the test on error
|
||||
#[macro_export]
|
||||
macro_rules! assert_result {
|
||||
($expected_result:expr, $actual_result:expr, $msg:expr) => {
|
||||
if $expected_result.is_ok() {
|
||||
let expected_value = $expected_result.as_ref().unwrap();
|
||||
let actual_value = $actual_result.unwrap();
|
||||
assert!(*expected_value == actual_value, "{}", $msg);
|
||||
} else {
|
||||
assert!($actual_result.is_err(), "{}", $msg);
|
||||
|
||||
let expected_error = $expected_result.as_ref().unwrap_err();
|
||||
let expected_error_msg = format!("{:?}", expected_error);
|
||||
|
||||
let actual_error_msg = format!("{:?}", $actual_result.unwrap_err());
|
||||
|
||||
assert!(expected_error_msg == actual_error_msg, "{}", $msg);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! skip_loop_by_user {
|
||||
($msg:expr, $user:expr) => {
|
||||
if $user == TestUserType::RootOnly {
|
||||
skip_loop_if_not_root!($msg);
|
||||
} else if $user == TestUserType::NonRootOnly {
|
||||
skip_loop_if_root!($msg);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +69,8 @@ macro_rules! trace_rpc_call {
|
||||
propagator.extract(&extract_carrier_from_ttrpc($ctx))
|
||||
});
|
||||
|
||||
info!(sl!(), "rpc call from shim to agent: {:?}", $name);
|
||||
|
||||
// generate tracing span
|
||||
let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req);
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#![allow(unknown_lints)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::os::unix::fs::MetadataExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
@@ -13,6 +14,7 @@ use std::time::SystemTime;
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use async_recursion::async_recursion;
|
||||
use nix::mount::{umount, MsFlags};
|
||||
use nix::unistd::{Gid, Uid};
|
||||
use slog::{debug, error, info, warn, Logger};
|
||||
use thiserror::Error;
|
||||
use tokio::fs;
|
||||
@@ -80,7 +82,8 @@ impl Drop for Storage {
|
||||
}
|
||||
|
||||
async fn copy(from: impl AsRef<Path>, to: impl AsRef<Path>) -> Result<()> {
|
||||
if fs::symlink_metadata(&from).await?.file_type().is_symlink() {
|
||||
let metadata = fs::symlink_metadata(&from).await?;
|
||||
if metadata.file_type().is_symlink() {
|
||||
// if source is a symlink, create new symlink with same link source. If
|
||||
// the symlink exists, remove and create new one:
|
||||
if fs::symlink_metadata(&to).await.is_ok() {
|
||||
@@ -88,8 +91,15 @@ async fn copy(from: impl AsRef<Path>, to: impl AsRef<Path>) -> Result<()> {
|
||||
}
|
||||
fs::symlink(fs::read_link(&from).await?, &to).await?;
|
||||
} else {
|
||||
fs::copy(from, to).await?;
|
||||
fs::copy(&from, &to).await?;
|
||||
}
|
||||
// preserve the source uid and gid to the destination.
|
||||
nix::unistd::chown(
|
||||
to.as_ref(),
|
||||
Some(Uid::from_raw(metadata.uid())),
|
||||
Some(Gid::from_raw(metadata.gid())),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -106,14 +116,29 @@ impl Storage {
|
||||
|
||||
async fn update_target(&self, logger: &Logger, source_path: impl AsRef<Path>) -> Result<()> {
|
||||
let source_file_path = source_path.as_ref();
|
||||
let metadata = source_file_path.symlink_metadata()?;
|
||||
|
||||
// if we are creating a directory: just create it, nothing more to do
|
||||
if source_file_path.symlink_metadata()?.file_type().is_dir() {
|
||||
if metadata.file_type().is_dir() {
|
||||
let dest_file_path = self.make_target_path(&source_file_path)?;
|
||||
|
||||
fs::create_dir_all(&dest_file_path)
|
||||
.await
|
||||
.with_context(|| format!("Unable to mkdir all for {}", dest_file_path.display()))?;
|
||||
// set the directory permissions to match the source directory permissions
|
||||
fs::set_permissions(&dest_file_path, metadata.permissions())
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Unable to set permissions for {}", dest_file_path.display())
|
||||
})?;
|
||||
// preserve the source directory uid and gid to the destination.
|
||||
nix::unistd::chown(
|
||||
&dest_file_path,
|
||||
Some(Uid::from_raw(metadata.uid())),
|
||||
Some(Gid::from_raw(metadata.gid())),
|
||||
)
|
||||
.with_context(|| format!("Unable to set ownership for {}", dest_file_path.display()))?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -504,6 +529,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::mount::is_mounted;
|
||||
use crate::skip_if_not_root;
|
||||
use nix::unistd::{Gid, Uid};
|
||||
use std::fs;
|
||||
use std::thread;
|
||||
|
||||
@@ -895,20 +921,28 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_copy() {
|
||||
skip_if_not_root!();
|
||||
|
||||
// prepare tmp src/destination
|
||||
let source_dir = tempfile::tempdir().unwrap();
|
||||
let dest_dir = tempfile::tempdir().unwrap();
|
||||
let uid = Uid::from_raw(10);
|
||||
let gid = Gid::from_raw(200);
|
||||
|
||||
// verify copy of a regular file
|
||||
let src_file = source_dir.path().join("file.txt");
|
||||
let dst_file = dest_dir.path().join("file.txt");
|
||||
fs::write(&src_file, "foo").unwrap();
|
||||
nix::unistd::chown(&src_file, Some(uid), Some(gid)).unwrap();
|
||||
|
||||
copy(&src_file, &dst_file).await.unwrap();
|
||||
// verify destination:
|
||||
assert!(!fs::symlink_metadata(dst_file)
|
||||
assert!(!fs::symlink_metadata(&dst_file)
|
||||
.unwrap()
|
||||
.file_type()
|
||||
.is_symlink());
|
||||
assert_eq!(fs::metadata(&dst_file).unwrap().uid(), uid.as_raw());
|
||||
assert_eq!(fs::metadata(&dst_file).unwrap().gid(), gid.as_raw());
|
||||
|
||||
// verify copy of a symlink
|
||||
let src_symlink_file = source_dir.path().join("symlink_file.txt");
|
||||
@@ -916,7 +950,7 @@ mod tests {
|
||||
tokio::fs::symlink(&src_file, &src_symlink_file)
|
||||
.await
|
||||
.unwrap();
|
||||
copy(src_symlink_file, &dst_symlink_file).await.unwrap();
|
||||
copy(&src_symlink_file, &dst_symlink_file).await.unwrap();
|
||||
// verify destination:
|
||||
assert!(fs::symlink_metadata(&dst_symlink_file)
|
||||
.unwrap()
|
||||
@@ -924,6 +958,8 @@ mod tests {
|
||||
.is_symlink());
|
||||
assert_eq!(fs::read_link(&dst_symlink_file).unwrap(), src_file);
|
||||
assert_eq!(fs::read_to_string(&dst_symlink_file).unwrap(), "foo");
|
||||
assert_ne!(fs::metadata(&dst_symlink_file).unwrap().uid(), uid.as_raw());
|
||||
assert_ne!(fs::metadata(&dst_symlink_file).unwrap().gid(), gid.as_raw());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -1069,6 +1105,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn watch_directory() {
|
||||
skip_if_not_root!();
|
||||
|
||||
// Prepare source directory:
|
||||
// ./tmp/1.txt
|
||||
// ./tmp/A/B/2.txt
|
||||
@@ -1079,7 +1117,9 @@ mod tests {
|
||||
|
||||
// A/C is an empty directory
|
||||
let empty_dir = "A/C";
|
||||
fs::create_dir_all(source_dir.path().join(empty_dir)).unwrap();
|
||||
let path = source_dir.path().join(empty_dir);
|
||||
fs::create_dir_all(&path).unwrap();
|
||||
nix::unistd::chown(&path, Some(Uid::from_raw(10)), Some(Gid::from_raw(200))).unwrap();
|
||||
|
||||
// delay 20 ms between writes to files in order to ensure filesystem timestamps are unique
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
@@ -1123,7 +1163,9 @@ mod tests {
|
||||
|
||||
// create another empty directory A/C/D
|
||||
let empty_dir = "A/C/D";
|
||||
fs::create_dir_all(source_dir.path().join(empty_dir)).unwrap();
|
||||
let path = source_dir.path().join(empty_dir);
|
||||
fs::create_dir_all(&path).unwrap();
|
||||
nix::unistd::chown(&path, Some(Uid::from_raw(10)), Some(Gid::from_raw(200))).unwrap();
|
||||
assert_eq!(entry.scan(&logger).await.unwrap(), 1);
|
||||
assert!(dest_dir.path().join(empty_dir).exists());
|
||||
}
|
||||
|
||||
3
src/dragonball/.gitignore
vendored
Normal file
3
src/dragonball/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
target
|
||||
Cargo.lock
|
||||
.idea
|
||||
65
src/dragonball/Cargo.toml
Normal file
65
src/dragonball/Cargo.toml
Normal file
@@ -0,0 +1,65 @@
|
||||
[package]
|
||||
name = "dragonball"
|
||||
version = "0.1.0"
|
||||
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
description = "A secure sandbox for Kata Containers"
|
||||
keywords = ["kata-containers", "sandbox", "vmm", "dragonball"]
|
||||
homepage = "https://katacontainers.io/"
|
||||
repository = "https://github.com/kata-containers/kata-containers.git"
|
||||
license = "Apache-2.0"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
arc-swap = "1.5.0"
|
||||
bytes = "1.1.0"
|
||||
dbs-address-space = "0.1.0"
|
||||
dbs-allocator = "0.1.0"
|
||||
dbs-arch = "0.1.0"
|
||||
dbs-boot = "0.2.0"
|
||||
dbs-device = "0.1.0"
|
||||
dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] }
|
||||
dbs-legacy-devices = "0.1.0"
|
||||
dbs-upcall = { version = "0.1.0", optional = true }
|
||||
dbs-utils = "0.1.0"
|
||||
dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] }
|
||||
kvm-bindings = "0.5.0"
|
||||
kvm-ioctls = "0.11.0"
|
||||
lazy_static = "1.2"
|
||||
libc = "0.2.39"
|
||||
linux-loader = "0.4.0"
|
||||
log = "0.4.14"
|
||||
nix = "0.23.1"
|
||||
seccompiler = "0.2.0"
|
||||
serde = "1.0.27"
|
||||
serde_derive = "1.0.27"
|
||||
serde_json = "1.0.9"
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.4.0"
|
||||
thiserror = "1"
|
||||
vmm-sys-util = "0.9.0"
|
||||
virtio-queue = { version = "0.1.0", optional = true }
|
||||
vm-memory = { version = "0.7.0", features = ["backend-mmap"] }
|
||||
|
||||
[dev-dependencies]
|
||||
slog-term = "2.9.0"
|
||||
slog-async = "2.7.0"
|
||||
|
||||
[features]
|
||||
acpi = []
|
||||
atomic-guest-memory = []
|
||||
hotplug = ["virtio-vsock"]
|
||||
virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"]
|
||||
virtio-blk = ["dbs-virtio-devices/virtio-blk", "virtio-queue"]
|
||||
virtio-net = ["dbs-virtio-devices/virtio-net", "virtio-queue"]
|
||||
# virtio-fs only work on atomic-guest-memory
|
||||
virtio-fs = ["dbs-virtio-devices/virtio-fs", "virtio-queue", "atomic-guest-memory"]
|
||||
|
||||
[patch.'crates-io']
|
||||
dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-boot = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
dbs-arch = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
|
||||
1
src/dragonball/LICENSE
Symbolic link
1
src/dragonball/LICENSE
Symbolic link
@@ -0,0 +1 @@
|
||||
../../LICENSE
|
||||
29
src/dragonball/Makefile
Normal file
29
src/dragonball/Makefile
Normal file
@@ -0,0 +1,29 @@
|
||||
# Copyright (c) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||
# Copyright (c) 2019-2022 Ant Group. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
default: build
|
||||
|
||||
build:
|
||||
# FIXME: This line will be removed when we solve the vm-memory dependency problem in Dragonball Sandbox
|
||||
cargo update -p vm-memory:0.8.0 --precise 0.7.0
|
||||
cargo build --all-features
|
||||
|
||||
check: clippy format
|
||||
|
||||
clippy:
|
||||
@echo "INFO: cargo clippy..."
|
||||
cargo clippy --all-targets --all-features \
|
||||
-- \
|
||||
-D warnings
|
||||
|
||||
format:
|
||||
@echo "INFO: cargo fmt..."
|
||||
cargo fmt -- --check
|
||||
|
||||
clean:
|
||||
cargo clean
|
||||
|
||||
test:
|
||||
@echo "INFO: testing dragonball for development build"
|
||||
cargo test --all-features -- --nocapture
|
||||
40
src/dragonball/README.md
Normal file
40
src/dragonball/README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Introduction
|
||||
`Dragonball Sandbox` is a light-weight virtual machine manager (VMM) based on Linux Kernel-based Virtual Machine (KVM),
|
||||
which is optimized for container workloads with:
|
||||
- container image management and acceleration service
|
||||
- flexible and high-performance virtual device drivers
|
||||
- low CPU and memory overhead
|
||||
- minimal startup time
|
||||
- optimized concurrent startup speed
|
||||
|
||||
`Dragonball Sandbox` aims to provide a simple solution for the Kata Containers community. It is integrated into Kata 3.0
|
||||
runtime as a built-in VMM and gives users an out-of-the-box Kata Containers experience without complex environment setup
|
||||
and configuration process.
|
||||
|
||||
# Getting Started
|
||||
[TODO](https://github.com/kata-containers/kata-containers/issues/4302)
|
||||
|
||||
# Documentation
|
||||
|
||||
Device: [Device Document](docs/device.md)
|
||||
vCPU: [vCPU Document](docs/vcpu.md)
|
||||
API: [API Document](docs/api.md)
|
||||
|
||||
Currently, the documents are still actively adding.
|
||||
You could see the [official documentation](docs/) page for more details.
|
||||
|
||||
# Supported Architectures
|
||||
- x86-64
|
||||
- aarch64
|
||||
|
||||
# Supported Kernel
|
||||
[TODO](https://github.com/kata-containers/kata-containers/issues/4303)
|
||||
|
||||
# Acknowledgement
|
||||
Part of the code is based on the [Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) project, [`crosvm`](https://github.com/google/crosvm) project and [Firecracker](https://github.com/firecracker-microvm/firecracker) project. They are all rust written virtual machine managers with advantages on safety and security.
|
||||
|
||||
`Dragonball sandbox` is designed to be a VMM that is customized for Kata Containers and we will focus on optimizing container workloads for Kata ecosystem. The focus on the Kata community is what differentiates us from other rust written virtual machines.
|
||||
|
||||
# License
|
||||
|
||||
`Dragonball` is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0.
|
||||
27
src/dragonball/THIRD-PARTY
Normal file
27
src/dragonball/THIRD-PARTY
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following disclaimer
|
||||
// in the documentation and/or other materials provided with the
|
||||
// distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
27
src/dragonball/docs/api.md
Normal file
27
src/dragonball/docs/api.md
Normal file
@@ -0,0 +1,27 @@
|
||||
# API
|
||||
|
||||
We provide plenty API for Kata runtime to interact with `Dragonball` virtual machine manager.
|
||||
This document provides the introduction for each of them.
|
||||
|
||||
## `ConfigureBootSource`
|
||||
Configure the boot source of the VM using `BootSourceConfig`. This action can only be called before the VM has booted.
|
||||
|
||||
### Boot Source Config
|
||||
1. `kernel_path`: Path of the kernel image. `Dragonball` only supports compressed kernel image for now.
|
||||
2. `initrd_path`: Path of the initrd (could be None)
|
||||
3. `boot_args`: Boot arguments passed to the kernel (could be None)
|
||||
|
||||
## `SetVmConfiguration`
|
||||
Set virtual machine configuration using `VmConfigInfo` to initialize VM.
|
||||
|
||||
### VM Config Info
|
||||
1. `vcpu_count`: Number of vCPU to start. Currently we only support up to 255 vCPUs.
|
||||
2. `max_vcpu_count`: Max number of vCPU can be added through CPU hotplug.
|
||||
3. `cpu_pm`: CPU power management.
|
||||
4. `cpu_topology`: CPU topology information (including `threads_per_core`, `cores_per_die`, `dies_per_socket` and `sockets`).
|
||||
5. `vpmu_feature`: `vPMU` feature level.
|
||||
6. `mem_type`: Memory type that can be either `hugetlbfs` or `shmem`, default is `shmem`.
|
||||
7. `mem_file_path` : Memory file path.
|
||||
8. `mem_size_mib`: The memory size in MiB. The maximum memory size is 1TB.
|
||||
9. `serial_path`: Optional sock path.
|
||||
|
||||
20
src/dragonball/docs/device.md
Normal file
20
src/dragonball/docs/device.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Device
|
||||
|
||||
## Device Manager
|
||||
|
||||
Currently we have following device manager:
|
||||
| Name | Description |
|
||||
| --- | --- |
|
||||
| [address space manager](../src/address_space_manager.rs) | abstracts virtual machine's physical management and provide mapping for guest virtual memory and MMIO ranges of emulated virtual devices, pass-through devices and vCPU |
|
||||
| [config manager](../src/config_manager.rs) | provides abstractions for configuration information |
|
||||
| [console manager](../src/device_manager/console_manager.rs) | provides management for all console devices |
|
||||
| [resource manager](../src/resource_manager.rs) |provides resource management for `legacy_irq_pool`, `msi_irq_pool`, `pio_pool`, `mmio_pool`, `mem_pool`, `kvm_mem_slot_pool` with builder `ResourceManagerBuilder` |
|
||||
| [VSOCK device manager](../src/device_manager/vsock_dev_mgr.rs) | provides configuration info for `VIRTIO-VSOCK` and management for all VSOCK devices |
|
||||
|
||||
|
||||
## Device supported
|
||||
`VIRTIO-VSOCK`
|
||||
`i8042`
|
||||
`COM1`
|
||||
`COM2`
|
||||
|
||||
42
src/dragonball/docs/vcpu.md
Normal file
42
src/dragonball/docs/vcpu.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# vCPU
|
||||
|
||||
## vCPU Manager
|
||||
The vCPU manager is to manage all vCPU related actions, we will dive into some of the important structure members in this doc.
|
||||
|
||||
For now, aarch64 vCPU support is still under development, we'll introduce it when we merge `runtime-rs` to the master branch. (issue: #4445)
|
||||
|
||||
### vCPU config
|
||||
`VcpuConfig` is used to configure guest overall CPU info.
|
||||
|
||||
`boot_vcpu_count` is used to define the initial vCPU number.
|
||||
|
||||
`max_vcpu_count` is used to define the maximum vCPU number and it's used for the upper boundary for CPU hotplug feature
|
||||
|
||||
`thread_per_core`, `cores_per_die`, `dies_per_socket` and `socket` are used to define CPU topology.
|
||||
|
||||
`vpmu_feature` is used to define `vPMU` feature level.
|
||||
If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default).
|
||||
If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions).
|
||||
If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported
|
||||
|
||||
## vCPU State
|
||||
|
||||
There are four states for vCPU state machine: `running`, `paused`, `waiting_exit`, `exited`. There is a state machine to maintain the task flow.
|
||||
|
||||
When the vCPU is created, it'll turn to `paused` state. After vCPU resource is ready at VMM, it'll send a `Resume` event to the vCPU thread, and then vCPU state will change to `running`.
|
||||
|
||||
During the `running` state, VMM will catch vCPU exit and execute different logic according to the exit reason.
|
||||
|
||||
If the VMM catch some exit reasons that it cannot handle, the state will change to `waiting_exit` and VMM will stop the virtual machine.
|
||||
When the state switches to `waiting_exit`, an exit event will be sent to vCPU `exit_evt`, event manager will detect the change in `exit_evt` and set VMM `exit_evt_flag` as 1. A thread serving for VMM event loop will check `exit_evt_flag` and if the flag is 1, it'll stop the VMM.
|
||||
|
||||
When the VMM is stopped / destroyed, the state will change to `exited`.
|
||||
|
||||
## vCPU Hot plug
|
||||
Since `Dragonball Sandbox` doesn't support virtualization of ACPI system, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and Guest in order to trigger vCPU hotplug.
|
||||
|
||||
To use `upcall`, kernel patches are needed, you can get the patches from [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page, and we'll provide a ready-to-use guest kernel binary for you to try.
|
||||
|
||||
vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid.
|
||||
|
||||
|
||||
892
src/dragonball/src/address_space_manager.rs
Normal file
892
src/dragonball/src/address_space_manager.rs
Normal file
@@ -0,0 +1,892 @@
|
||||
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
//! Address space abstraction to manage virtual machine's physical address space.
|
||||
//!
|
||||
//! The AddressSpace abstraction is introduced to manage virtual machine's physical address space.
|
||||
//! The regions in virtual machine's physical address space may be used to:
|
||||
//! 1) map guest virtual memory
|
||||
//! 2) map MMIO ranges for emulated virtual devices, such as virtio-fs DAX window.
|
||||
//! 3) map MMIO ranges for pass-through devices, such as PCI device BARs.
|
||||
//! 4) map MMIO ranges for to vCPU, such as local APIC.
|
||||
//! 5) not used/available
|
||||
//!
|
||||
//! A related abstraction, vm_memory::GuestMemory, is used to access guest virtual memory only.
|
||||
//! In other words, AddressSpace is the resource owner, and GuestMemory is an accessor for guest
|
||||
//! virtual memory.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
|
||||
use dbs_address_space::{
|
||||
AddressSpace, AddressSpaceError, AddressSpaceLayout, AddressSpaceRegion,
|
||||
AddressSpaceRegionType, NumaNode, NumaNodeInfo, MPOL_MF_MOVE, MPOL_PREFERRED,
|
||||
};
|
||||
use dbs_allocator::Constraint;
|
||||
use kvm_bindings::kvm_userspace_memory_region;
|
||||
use kvm_ioctls::VmFd;
|
||||
use log::{debug, error, info, warn};
|
||||
use nix::sys::mman;
|
||||
use nix::unistd::dup;
|
||||
#[cfg(feature = "atomic-guest-memory")]
|
||||
use vm_memory::atomic::GuestMemoryAtomic;
|
||||
use vm_memory::{
|
||||
Address, FileOffset, GuestAddress, GuestAddressSpace, GuestMemoryMmap, GuestMemoryRegion,
|
||||
GuestRegionMmap, GuestUsize, MemoryRegionAddress, MmapRegion,
|
||||
};
|
||||
|
||||
use crate::resource_manager::ResourceManager;
|
||||
use crate::vm::NumaRegionInfo;
|
||||
|
||||
#[cfg(not(feature = "atomic-guest-memory"))]
|
||||
/// Concrete GuestAddressSpace type used by the VMM.
|
||||
pub type GuestAddressSpaceImpl = Arc<GuestMemoryMmap>;
|
||||
|
||||
#[cfg(feature = "atomic-guest-memory")]
|
||||
/// Concrete GuestAddressSpace type used by the VMM.
|
||||
pub type GuestAddressSpaceImpl = GuestMemoryAtomic<GuestMemoryMmap>;
|
||||
|
||||
/// Concrete GuestMemory type used by the VMM.
|
||||
pub type GuestMemoryImpl = <Arc<vm_memory::GuestMemoryMmap> as GuestAddressSpace>::M;
|
||||
/// Concrete GuestRegion type used by the VMM.
|
||||
pub type GuestRegionImpl = GuestRegionMmap;
|
||||
|
||||
// Maximum number of working threads for memory pre-allocation.
|
||||
const MAX_PRE_ALLOC_THREAD: u64 = 16;
|
||||
|
||||
// Control the actual number of pre-allocating threads. After several performance tests, we decide to use one thread to do pre-allocating for every 4G memory.
|
||||
const PRE_ALLOC_GRANULARITY: u64 = 32;
|
||||
|
||||
// We don't have plan to support mainframe computer and only focus on PC servers.
|
||||
// 64 as max nodes should be enough for now.
|
||||
const MAX_NODE: u32 = 64;
|
||||
|
||||
// We will split the memory region if it conflicts with the MMIO hole.
|
||||
// But if the space below the MMIO hole is smaller than the MINIMAL_SPLIT_SPACE, we won't split the memory region in order to enhance performance.
|
||||
const MINIMAL_SPLIT_SPACE: u64 = 128 << 20;
|
||||
|
||||
/// Errors associated with virtual machine address space management.
#[derive(Debug, thiserror::Error)]
pub enum AddressManagerError {
    /// Invalid address space operation.
    #[error("invalid address space operation")]
    InvalidOperation,

    /// Invalid address range.
    #[error("invalid address space region (0x{0:x}, 0x{1:x})")]
    InvalidAddressRange(u64, GuestUsize),

    /// No available mem address.
    #[error("no available mem address")]
    NoAvailableMemAddress,

    /// No available kvm slots.
    #[error("no available kvm slots")]
    NoAvailableKvmSlot,

    /// Address manager failed to create memfd to map anonymous memory.
    #[error("address manager failed to create memfd to map anonymous memory")]
    CreateMemFd(#[source] nix::Error),

    /// Address manager failed to open memory file.
    #[error("address manager failed to open memory file")]
    OpenFile(#[source] std::io::Error),

    /// Memory file provided is invalid due to empty file path, non-existent file path and other
    /// possible mistakes.
    #[error("memory file provided to address manager {0} is invalid")]
    FileInvalid(String),

    /// Memory file provided is invalid due to empty memory type.
    #[error("memory type provided to address manager {0} is invalid")]
    TypeInvalid(String),

    /// Failed to set size for memory file.
    #[error("address manager failed to set size for memory file")]
    SetFileSize(#[source] std::io::Error),

    /// Failed to unlink memory file.
    #[error("address manager failed to unlink memory file")]
    UnlinkFile(#[source] nix::Error),

    /// Failed to duplicate fd of memory file.
    #[error("address manager failed to duplicate memory file descriptor")]
    DupFd(#[source] nix::Error),

    /// Failure in accessing the memory located at some address.
    #[error("address manager failed to access guest memory located at 0x{0:x}")]
    AccessGuestMemory(u64, #[source] vm_memory::mmap::Error),

    /// Failed to create GuestMemory.
    #[error("address manager failed to create guest memory object")]
    CreateGuestMemory(#[source] vm_memory::Error),

    /// Failure in initializing guest memory.
    #[error("address manager failed to initialize guest memory")]
    GuestMemoryNotInitialized,

    /// Failed to mmap() guest memory.
    #[error("address manager failed to mmap() guest memory into current process")]
    MmapGuestMemory(#[source] vm_memory::mmap::MmapRegionError),

    /// Failed to set KVM memory slot.
    #[error("address manager failed to configure KVM memory slot")]
    KvmSetMemorySlot(#[source] kvm_ioctls::Error),

    /// Failed to set madvise on AddressSpaceRegion.
    #[error("address manager failed to set madvice() on guest memory region")]
    Madvise(#[source] nix::Error),

    /// Failed to join the pre-allocation worker threads.
    #[error("address manager failed to join threads")]
    JoinFail,

    /// Failed to create Address Space Region.
    #[error("address manager failed to create Address Space Region {0}")]
    CreateAddressSpaceRegion(#[source] AddressSpaceError),
}
|
||||
|
||||
/// Specialized `Result` type for address space manager operations.
type Result<T> = std::result::Result<T, AddressManagerError>;
|
||||
|
||||
/// Parameters to configure address space creation operations.
pub struct AddressSpaceMgrBuilder<'a> {
    // Backing memory type (e.g. "shmem", "hugeshmem"); must be non-empty, checked in `new()`.
    mem_type: &'a str,
    // Base path of the backing memory file; may be empty for memfd-style backends.
    mem_file: &'a str,
    // Monotonic counter used by `get_next_mem_file()` to generate numbered file suffixes.
    mem_index: u32,
    // Whether `get_next_mem_file()` appends `mem_index` to `mem_file`.
    mem_suffix: bool,
    // Whether to pre-fault guest memory at startup (see `toggle_prealloc`).
    mem_prealloc: bool,
    // Flag for KVM dirty page logging (see `toggle_dirty_page_logging`).
    dirty_page_logging: bool,
    // KVM VM handle used to register memory slots; when `None`, slot registration
    // with KVM is skipped (see `map_to_kvm`).
    vmfd: Option<Arc<VmFd>>,
}
|
||||
|
||||
impl<'a> AddressSpaceMgrBuilder<'a> {
|
||||
/// Create a new [`AddressSpaceMgrBuilder`] object.
|
||||
pub fn new(mem_type: &'a str, mem_file: &'a str) -> Result<Self> {
|
||||
if mem_type.is_empty() {
|
||||
return Err(AddressManagerError::TypeInvalid(mem_type.to_string()));
|
||||
}
|
||||
Ok(AddressSpaceMgrBuilder {
|
||||
mem_type,
|
||||
mem_file,
|
||||
mem_index: 0,
|
||||
mem_suffix: true,
|
||||
mem_prealloc: false,
|
||||
dirty_page_logging: false,
|
||||
vmfd: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Enable/disable adding numbered suffix to memory file path.
|
||||
/// This feature could be useful to generate hugetlbfs files with number suffix. (e.g. shmem0, shmem1)
|
||||
pub fn toggle_file_suffix(&mut self, enabled: bool) {
|
||||
self.mem_suffix = enabled;
|
||||
}
|
||||
|
||||
/// Enable/disable memory pre-allocation.
|
||||
/// Enable this feature could improve performance stability at the start of workload by avoiding page fault.
|
||||
/// Disable this feature may influence performance stability but the cpu resource consumption and start-up time will decrease.
|
||||
pub fn toggle_prealloc(&mut self, prealloc: bool) {
|
||||
self.mem_prealloc = prealloc;
|
||||
}
|
||||
|
||||
/// Enable/disable KVM dirty page logging.
|
||||
pub fn toggle_dirty_page_logging(&mut self, logging: bool) {
|
||||
self.dirty_page_logging = logging;
|
||||
}
|
||||
|
||||
/// Set KVM [`VmFd`] handle to configure memory slots.
|
||||
pub fn set_kvm_vm_fd(&mut self, vmfd: Arc<VmFd>) -> Option<Arc<VmFd>> {
|
||||
let mut existing_vmfd = None;
|
||||
if self.vmfd.is_some() {
|
||||
existing_vmfd = self.vmfd.clone();
|
||||
}
|
||||
self.vmfd = Some(vmfd);
|
||||
existing_vmfd
|
||||
}
|
||||
|
||||
/// Build a ['AddressSpaceMgr'] using the configured parameters.
|
||||
pub fn build(
|
||||
self,
|
||||
res_mgr: &ResourceManager,
|
||||
numa_region_infos: &[NumaRegionInfo],
|
||||
) -> Result<AddressSpaceMgr> {
|
||||
let mut mgr = AddressSpaceMgr::default();
|
||||
mgr.create_address_space(res_mgr, numa_region_infos, self)?;
|
||||
Ok(mgr)
|
||||
}
|
||||
|
||||
fn get_next_mem_file(&mut self) -> String {
|
||||
if self.mem_suffix {
|
||||
let path = format!("{}{}", self.mem_file, self.mem_index);
|
||||
self.mem_index += 1;
|
||||
path
|
||||
} else {
|
||||
self.mem_file.to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct to manage virtual machine's physical address space.
pub struct AddressSpaceMgr {
    // Composed address space; `None` until `create_address_space()` succeeds.
    address_space: Option<AddressSpace>,
    // Default guest memory object used to access the VM's guest memory.
    vm_as: Option<GuestAddressSpaceImpl>,
    // Maps each region's guest base address to its KVM memory slot number.
    base_to_slot: Arc<Mutex<HashMap<u64, u32>>>,
    // Join handles of the background page-touching pre-allocation threads.
    prealloc_handlers: Vec<thread::JoinHandle<()>>,
    // Set to true to ask the pre-allocation threads to stop early (see `wait_prealloc`).
    prealloc_exit: Arc<AtomicBool>,
    // Guest NUMA topology: guest node id -> memory ranges and vcpu ids.
    numa_nodes: BTreeMap<u32, NumaNode>,
}
|
||||
|
||||
impl AddressSpaceMgr {
|
||||
/// Query address space manager is initialized or not.
///
/// The manager counts as initialized once `create_address_space()` has installed
/// the underlying address space object.
pub fn is_initialized(&self) -> bool {
    self.address_space.is_some()
}
|
||||
|
||||
/// Gets address space.
///
// NOTE(review): duplicates `get_address_space()` below; consider consolidating.
pub fn address_space(&self) -> Option<&AddressSpace> {
    self.address_space.as_ref()
}
|
||||
|
||||
/// Create the address space for a virtual machine.
///
/// This method is designed to be called when starting up a virtual machine instead of at
/// runtime, so it's expected the virtual machine will be tore down and no strict error recover.
pub fn create_address_space(
    &mut self,
    res_mgr: &ResourceManager,
    numa_region_infos: &[NumaRegionInfo],
    mut param: AddressSpaceMgrBuilder,
) -> Result<()> {
    let mut regions = Vec::new();
    // Guest RAM is laid out upwards from GUEST_MEM_START, one region after another.
    let mut start_addr = dbs_boot::layout::GUEST_MEM_START;

    // Create address space regions.
    for info in numa_region_infos.iter() {
        info!("numa_region_info {:?}", info);
        // convert size_in_mib to bytes; checked_shl guards against overflow
        let size = info
            .size
            .checked_shl(20)
            .ok_or_else(|| AddressManagerError::InvalidOperation)?;

        // Guest memory does not intersect with the MMIO hole.
        // TODO: make it work for ARM (issue #4307)
        if start_addr > dbs_boot::layout::MMIO_LOW_END
            || start_addr + size <= dbs_boot::layout::MMIO_LOW_START
        {
            let region = self.create_region(start_addr, size, info, &mut param)?;
            regions.push(region);
            start_addr = start_addr
                .checked_add(size)
                .ok_or_else(|| AddressManagerError::InvalidOperation)?;
        } else {
            // Add guest memory below the MMIO hole, avoid splitting the memory region
            // if the available address region is smaller than MINIMAL_SPLIT_SPACE.
            let mut below_size = dbs_boot::layout::MMIO_LOW_START
                .checked_sub(start_addr)
                .ok_or_else(|| AddressManagerError::InvalidOperation)?;
            if below_size < (MINIMAL_SPLIT_SPACE) {
                // Too small to be worth a separate region; fold it into the part above.
                below_size = 0;
            } else {
                let region = self.create_region(start_addr, below_size, info, &mut param)?;
                regions.push(region);
            }

            // Add guest memory above the MMIO hole
            let above_start = dbs_boot::layout::MMIO_LOW_END + 1;
            let above_size = size
                .checked_sub(below_size)
                .ok_or_else(|| AddressManagerError::InvalidOperation)?;
            let region = self.create_region(above_start, above_size, info, &mut param)?;
            regions.push(region);
            start_addr = above_start
                .checked_add(above_size)
                .ok_or_else(|| AddressManagerError::InvalidOperation)?;
        }
    }

    // Create GuestMemory object
    let mut vm_memory = GuestMemoryMmap::new();
    for reg in regions.iter() {
        // Allocate used guest memory addresses.
        // These addresses are statically allocated, resource allocation/update should not fail.
        let constraint = Constraint::new(reg.len())
            .min(reg.start_addr().raw_value())
            .max(reg.last_addr().raw_value());
        let _key = res_mgr
            .allocate_mem_address(&constraint)
            .ok_or(AddressManagerError::NoAvailableMemAddress)?;
        let mmap_reg = self.create_mmap_region(reg.clone())?;

        vm_memory = vm_memory
            .insert_region(mmap_reg.clone())
            .map_err(AddressManagerError::CreateGuestMemory)?;
        // Register the region with KVM so the guest can actually use it.
        self.map_to_kvm(res_mgr, &param, reg, mmap_reg)?;
    }

    #[cfg(feature = "atomic-guest-memory")]
    {
        self.vm_as = Some(AddressSpace::convert_into_vm_as(vm_memory));
    }
    #[cfg(not(feature = "atomic-guest-memory"))]
    {
        self.vm_as = Some(Arc::new(vm_memory));
    }

    let layout = AddressSpaceLayout::new(
        *dbs_boot::layout::GUEST_PHYS_END,
        dbs_boot::layout::GUEST_MEM_START,
        *dbs_boot::layout::GUEST_MEM_END,
    );
    self.address_space = Some(AddressSpace::from_regions(regions, layout));

    Ok(())
}
|
||||
|
||||
// size unit: Byte
//
// Create one AddressSpaceRegion backed by the next memory file taken from `param`,
// record it in the guest NUMA topology, and return it wrapped in an Arc.
fn create_region(
    &mut self,
    start_addr: u64,
    size_bytes: u64,
    info: &NumaRegionInfo,
    param: &mut AddressSpaceMgrBuilder,
) -> Result<Arc<AddressSpaceRegion>> {
    // Each region gets its own backing file path (possibly with a numeric suffix).
    let mem_file_path = param.get_next_mem_file();
    let region = AddressSpaceRegion::create_default_memory_region(
        GuestAddress(start_addr),
        size_bytes,
        info.host_numa_node_id,
        param.mem_type,
        &mem_file_path,
        param.mem_prealloc,
        false,
    )
    .map_err(AddressManagerError::CreateAddressSpaceRegion)?;
    let region = Arc::new(region);

    // Track the region under its guest NUMA node (defaulting to node 0).
    self.insert_into_numa_nodes(
        &region,
        info.guest_numa_node_id.unwrap_or(0),
        &info.vcpu_ids,
    );
    info!(
        "create new region: guest addr 0x{:x}-0x{:x} size {}",
        start_addr,
        start_addr + size_bytes,
        size_bytes
    );

    Ok(region)
}
|
||||
|
||||
// Register one guest memory region with KVM (GPA <-> HVA mapping) and remember
// which KVM slot backs the region's base address.
fn map_to_kvm(
    &mut self,
    res_mgr: &ResourceManager,
    param: &AddressSpaceMgrBuilder,
    reg: &Arc<AddressSpaceRegion>,
    mmap_reg: Arc<GuestRegionImpl>,
) -> Result<()> {
    // Build mapping between GPA <-> HVA, by adding kvm memory slot.
    let slot = res_mgr
        .allocate_kvm_mem_slot(1, None)
        .ok_or(AddressManagerError::NoAvailableKvmSlot)?;

    // Without a VmFd the slot is only reserved and recorded, not programmed into KVM.
    if let Some(vmfd) = param.vmfd.as_ref() {
        let host_addr = mmap_reg
            .get_host_address(MemoryRegionAddress(0))
            .map_err(|_e| AddressManagerError::InvalidOperation)?;
        let flags = 0u32;

        let mem_region = kvm_userspace_memory_region {
            slot: slot as u32,
            guest_phys_addr: reg.start_addr().raw_value(),
            memory_size: reg.len() as u64,
            userspace_addr: host_addr as u64,
            flags,
        };

        info!(
            "VM: guest memory region {:x} starts at {:x?}",
            reg.start_addr().raw_value(),
            host_addr
        );
        // Safe because the guest regions are guaranteed not to overlap.
        unsafe { vmfd.set_user_memory_region(mem_region) }
            .map_err(AddressManagerError::KvmSetMemorySlot)?;
    }

    self.base_to_slot
        .lock()
        .unwrap()
        .insert(reg.start_addr().raw_value(), slot as u32);

    Ok(())
}
|
||||
|
||||
/// Mmap the address space region into current process.
pub fn create_mmap_region(
    &mut self,
    region: Arc<AddressSpaceRegion>,
) -> Result<Arc<GuestRegionImpl>> {
    // Special check for 32bit host with 64bit virtual machines.
    if region.len() > usize::MAX as u64 {
        return Err(AddressManagerError::InvalidAddressRange(
            region.start_addr().raw_value(),
            region.len(),
        ));
    }
    // The device MMIO regions may not be backed by memory files, so refuse to mmap them.
    if region.region_type() == AddressSpaceRegionType::DeviceMemory {
        return Err(AddressManagerError::InvalidOperation);
    }

    // The GuestRegionMmap/MmapRegion will take ownership of the FileOffset object,
    // so we have to duplicate the fd here. It's really a dirty design.
    let file_offset = match region.file_offset().as_ref() {
        Some(fo) => {
            let fd = dup(fo.file().as_raw_fd()).map_err(AddressManagerError::DupFd)?;
            // Safe because we have just duplicated the raw fd.
            let file = unsafe { File::from_raw_fd(fd) };
            let file_offset = FileOffset::new(file, fo.start());
            Some(file_offset)
        }
        None => None,
    };
    let perm_flags = if (region.perm_flags() & libc::MAP_POPULATE) != 0 && region.is_hugepage()
    {
        // mmap(MAP_POPULATE) conflicts with madvise(MADV_HUGEPAGE) because mmap(MAP_POPULATE)
        // will pre-fault in all memory with normal pages before madvise(MADV_HUGEPAGE) gets
        // called. So remove the MAP_POPULATE flag and memory will be faulted in by working
        // threads.
        region.perm_flags() & (!libc::MAP_POPULATE)
    } else {
        region.perm_flags()
    };
    let mmap_reg = MmapRegion::build(
        file_offset,
        region.len() as usize,
        libc::PROT_READ | libc::PROT_WRITE,
        perm_flags,
    )
    .map_err(AddressManagerError::MmapGuestMemory)?;

    // Post-mmap tuning: fork behavior, NUMA placement, THP + pre-allocation.
    if region.is_anonpage() {
        self.configure_anon_mem(&mmap_reg)?;
    }
    if let Some(node_id) = region.host_numa_node_id() {
        self.configure_numa(&mmap_reg, node_id)?;
    }
    if region.is_hugepage() {
        self.configure_thp_and_prealloc(&region, &mmap_reg)?;
    }

    let reg = GuestRegionImpl::new(mmap_reg, region.start_addr())
        .map_err(AddressManagerError::CreateGuestMemory)?;
    Ok(Arc::new(reg))
}
|
||||
|
||||
// Apply MADV_DONTFORK to anonymous guest memory so the mapping is not made
// available to child processes after a fork.
fn configure_anon_mem(&self, mmap_reg: &MmapRegion) -> Result<()> {
    // Safe because the pointer/length describe a mapping we have just created
    // in create_mmap_region() and still own.
    unsafe {
        mman::madvise(
            mmap_reg.as_ptr() as *mut libc::c_void,
            mmap_reg.size(),
            mman::MmapAdvise::MADV_DONTFORK,
        )
    }
    .map_err(AddressManagerError::Madvise)
}
|
||||
|
||||
fn configure_numa(&self, mmap_reg: &MmapRegion, node_id: u32) -> Result<()> {
|
||||
let nodemask = 1_u64
|
||||
.checked_shl(node_id)
|
||||
.ok_or_else(|| AddressManagerError::InvalidOperation)?;
|
||||
let res = unsafe {
|
||||
libc::syscall(
|
||||
libc::SYS_mbind,
|
||||
mmap_reg.as_ptr() as *mut libc::c_void,
|
||||
mmap_reg.size(),
|
||||
MPOL_PREFERRED,
|
||||
&nodemask as *const u64,
|
||||
MAX_NODE,
|
||||
MPOL_MF_MOVE,
|
||||
)
|
||||
};
|
||||
if res < 0 {
|
||||
warn!(
|
||||
"failed to mbind memory to host_numa_node_id {}: this may affect performance",
|
||||
node_id
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// We set Transparent Huge Page (THP) through madvise to increase performance.
// In order to reduce the impact of page fault on performance, we start several threads
// (up to MAX_PRE_ALLOC_THREAD) to touch every 4k page of the memory region to manually do
// memory pre-allocation.
// The reason why we don't use mmap to enable THP and pre-allocation is that THP setting
// won't take effect in this operation (tested in kernel 4.9).
fn configure_thp_and_prealloc(
    &mut self,
    region: &Arc<AddressSpaceRegion>,
    mmap_reg: &MmapRegion,
) -> Result<()> {
    debug!(
        "Setting MADV_HUGEPAGE on AddressSpaceRegion addr {:x?} len {:x?}",
        mmap_reg.as_ptr(),
        mmap_reg.size()
    );

    // Safe because we just create the MmapRegion
    unsafe {
        mman::madvise(
            mmap_reg.as_ptr() as *mut libc::c_void,
            mmap_reg.size(),
            mman::MmapAdvise::MADV_HUGEPAGE,
        )
    }
    .map_err(AddressManagerError::Madvise)?;

    // MAP_POPULATE was stripped from the mmap flags in create_mmap_region(); its presence
    // in the region's perm_flags signals that pre-faulting was requested.
    if region.perm_flags() & libc::MAP_POPULATE > 0 {
        // Touch every 4k page to trigger allocation. The step is 4K instead of 2M to ensure
        // pre-allocation when running out of huge pages.
        const PAGE_SIZE: u64 = 4096;
        const PAGE_SHIFT: u32 = 12;
        let addr = mmap_reg.as_ptr() as u64;
        // Here we use >> PAGE_SHIFT to calculate how many 4K pages in the memory region.
        let npage = (mmap_reg.size() as u64) >> PAGE_SHIFT;

        // One thread per 4 GiB (PRE_ALLOC_GRANULARITY is a shift amount), capped at
        // MAX_PRE_ALLOC_THREAD.
        let mut touch_thread = ((mmap_reg.size() as u64) >> PRE_ALLOC_GRANULARITY) + 1;
        if touch_thread > MAX_PRE_ALLOC_THREAD {
            touch_thread = MAX_PRE_ALLOC_THREAD;
        }

        // Split the pages evenly; the last thread also takes the remainder.
        let per_npage = npage / touch_thread;
        for n in 0..touch_thread {
            let start_npage = per_npage * n;
            let end_npage = if n == (touch_thread - 1) {
                npage
            } else {
                per_npage * (n + 1)
            };
            let mut per_addr = addr + (start_npage * PAGE_SIZE);
            // Shared flag so wait_prealloc(stop=true) can abort the touching early.
            let should_stop = self.prealloc_exit.clone();

            let handler = thread::Builder::new()
                .name("PreallocThread".to_string())
                .spawn(move || {
                    info!("PreallocThread start start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
                          start_npage, end_npage, per_addr, touch_thread );
                    for _ in start_npage..end_npage {
                        if should_stop.load(Ordering::Acquire) {
                            info!("PreallocThread stop start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
                                  start_npage, end_npage, per_addr, touch_thread);
                            break;
                        }

                        // Reading from a THP page may be served by the zero page, so only
                        // write operation could ensure THP memory allocation. So use
                        // the compare_exchange(old_val, old_val) trick to trigger allocation.
                        let addr_ptr = per_addr as *mut u8;
                        let read_byte = unsafe { std::ptr::read_volatile(addr_ptr) };
                        let atomic_u8 : &AtomicU8 = unsafe {&*(addr_ptr as *mut AtomicU8)};
                        let _ = atomic_u8.compare_exchange(read_byte, read_byte, Ordering::SeqCst, Ordering::SeqCst);
                        per_addr += PAGE_SIZE;
                    }

                    info!("PreallocThread done start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
                          start_npage, end_npage, per_addr, touch_thread );
                });

            // Spawn failure is tolerated: pre-allocation is an optimization, not a requirement.
            match handler {
                Err(e) => error!(
                    "Failed to create working thread for async pre-allocation, {:?}. This may affect performance stability at the start of the workload.",
                    e
                ),
                Ok(hdl) => self.prealloc_handlers.push(hdl),
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Get the address space object.
///
// NOTE(review): same body as `address_space()` above; kept for API compatibility.
pub fn get_address_space(&self) -> Option<&AddressSpace> {
    self.address_space.as_ref()
}
|
||||
|
||||
/// Get the default guest memory object, which will be used to access virtual machine's default
/// guest memory.
///
/// Returns `None` until `create_address_space()` has run.
pub fn get_vm_as(&self) -> Option<&GuestAddressSpaceImpl> {
    self.vm_as.as_ref()
}
|
||||
|
||||
/// Get the base to slot map (guest region base address -> KVM memory slot number).
///
/// Returns a clone of the shared `Arc`, so callers observe later updates too.
pub fn get_base_to_slot_map(&self) -> Arc<Mutex<HashMap<u64, u32>>> {
    self.base_to_slot.clone()
}
|
||||
|
||||
/// Get numa nodes infos from address space manager.
pub fn get_numa_nodes(&self) -> &BTreeMap<u32, NumaNode> {
    &self.numa_nodes
}
|
||||
|
||||
/// Add cpu and memory numa informations to the `numa_nodes` BTreeMap.
///
/// Creates the guest node entry on first use, then records the region's memory
/// range and associates the given vcpu ids with the node.
fn insert_into_numa_nodes(
    &mut self,
    region: &Arc<AddressSpaceRegion>,
    guest_numa_node_id: u32,
    vcpu_ids: &[u32],
) {
    let node = self
        .numa_nodes
        .entry(guest_numa_node_id)
        .or_insert_with(NumaNode::new);
    node.add_info(&NumaNodeInfo {
        base: region.start_addr(),
        size: region.len(),
    });
    node.add_vcpu_ids(vcpu_ids);
}
|
||||
|
||||
/// Get address space layout from address space manager.
///
/// # Errors
/// Returns [`AddressManagerError::GuestMemoryNotInitialized`] if called before
/// `create_address_space()` succeeded.
pub fn get_layout(&self) -> Result<AddressSpaceLayout> {
    self.address_space
        .as_ref()
        .map(|v| v.layout())
        .ok_or(AddressManagerError::GuestMemoryNotInitialized)
}
|
||||
|
||||
/// Wait for the pre-allocation working threads to finish work.
|
||||
///
|
||||
/// Force all working threads to exit if `stop` is true.
|
||||
pub fn wait_prealloc(&mut self, stop: bool) -> Result<()> {
|
||||
if stop {
|
||||
self.prealloc_exit.store(true, Ordering::Release);
|
||||
}
|
||||
while let Some(handlers) = self.prealloc_handlers.pop() {
|
||||
if let Err(e) = handlers.join() {
|
||||
error!("wait_prealloc join fail {:?}", e);
|
||||
return Err(AddressManagerError::JoinFail);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AddressSpaceMgr {
    /// Create a new empty AddressSpaceMgr.
    fn default() -> Self {
        AddressSpaceMgr {
            // No address space or guest memory until create_address_space() runs.
            address_space: None,
            vm_as: None,
            base_to_slot: Arc::new(Mutex::new(HashMap::new())),
            prealloc_handlers: Vec::new(),
            // Cleared flag: pre-allocation threads run until asked to stop.
            prealloc_exit: Arc::new(AtomicBool::new(false)),
            numa_nodes: BTreeMap::new(),
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for the address space manager: region layout, NUMA bookkeeping,
    //! builder behavior, pre-allocation, and fd-leak checks.
    use dbs_boot::layout::GUEST_MEM_START;
    use std::ops::Deref;

    use vm_memory::{Bytes, GuestAddressSpace, GuestMemory, GuestMemoryRegion};
    use vmm_sys_util::tempfile::TempFile;

    use super::*;

    #[test]
    fn test_create_address_space() {
        // Small VM below the MMIO hole: expect exactly one memory region.
        let res_mgr = ResourceManager::new(None);
        let mem_size = 128 << 20;
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: vec![1, 2],
        }];
        let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
        let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        let vm_as = as_mgr.get_vm_as().unwrap();
        let guard = vm_as.memory();
        let gmem = guard.deref();
        assert_eq!(gmem.num_regions(), 1);

        let reg = gmem
            .find_region(GuestAddress(GUEST_MEM_START + mem_size - 1))
            .unwrap();
        assert_eq!(reg.start_addr(), GuestAddress(GUEST_MEM_START));
        assert_eq!(reg.len(), mem_size);
        assert!(gmem
            .find_region(GuestAddress(GUEST_MEM_START + mem_size))
            .is_none());
        assert!(reg.file_offset().is_some());

        let buf = [0x1u8, 0x2u8, 0x3u8, 0x4u8, 0x5u8];
        gmem.write_slice(&buf, GuestAddress(GUEST_MEM_START))
            .unwrap();

        // Update middle of mapped memory region
        let mut val = 0xa5u8;
        gmem.write_obj(val, GuestAddress(GUEST_MEM_START + 0x1))
            .unwrap();
        val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x1)).unwrap();
        assert_eq!(val, 0xa5);
        val = gmem.read_obj(GuestAddress(GUEST_MEM_START)).unwrap();
        assert_eq!(val, 1);
        val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x2)).unwrap();
        assert_eq!(val, 3);
        val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x5)).unwrap();
        assert_eq!(val, 0);

        // Read ahead of mapped memory region
        assert!(gmem
            .read_obj::<u8>(GuestAddress(GUEST_MEM_START + mem_size))
            .is_err());

        // Large VM crossing the MMIO hole: x86_64 splits into two regions.
        let res_mgr = ResourceManager::new(None);
        let mem_size = dbs_boot::layout::MMIO_LOW_START + (1 << 30);
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: vec![1, 2],
        }];
        let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
        let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        let vm_as = as_mgr.get_vm_as().unwrap();
        let guard = vm_as.memory();
        let gmem = guard.deref();
        #[cfg(target_arch = "x86_64")]
        assert_eq!(gmem.num_regions(), 2);
        #[cfg(target_arch = "aarch64")]
        assert_eq!(gmem.num_regions(), 1);

        // Test dropping GuestMemoryMmap object releases all resources.
        for _ in 0..10000 {
            let res_mgr = ResourceManager::new(None);
            let mem_size = 1 << 20;
            let numa_region_infos = vec![NumaRegionInfo {
                size: mem_size >> 20,
                host_numa_node_id: None,
                guest_numa_node_id: Some(0),
                vcpu_ids: vec![1, 2],
            }];
            let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
            let _as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        }
        let file = TempFile::new().unwrap().into_file();
        let fd = file.as_raw_fd();
        // fd should be small enough if there's no leaking of fds.
        assert!(fd < 1000);
    }

    #[test]
    fn test_address_space_mgr_get_boundary() {
        // get_layout() should report exactly the layout built from dbs_boot constants.
        let layout = AddressSpaceLayout::new(
            *dbs_boot::layout::GUEST_PHYS_END,
            dbs_boot::layout::GUEST_MEM_START,
            *dbs_boot::layout::GUEST_MEM_END,
        );
        let res_mgr = ResourceManager::new(None);
        let mem_size = 128 << 20;
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: vec![1, 2],
        }];
        let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
        let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        assert_eq!(as_mgr.get_layout().unwrap(), layout);
    }

    #[test]
    fn test_address_space_mgr_get_numa_nodes() {
        // A single region on guest node 0 should produce one matching NumaNode entry.
        let res_mgr = ResourceManager::new(None);
        let mem_size = 128 << 20;
        let cpu_vec = vec![1, 2];
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: cpu_vec.clone(),
        }];
        let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
        let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        let mut numa_node = NumaNode::new();
        numa_node.add_info(&NumaNodeInfo {
            base: GuestAddress(GUEST_MEM_START),
            size: mem_size,
        });
        numa_node.add_vcpu_ids(&cpu_vec);

        assert_eq!(*as_mgr.get_numa_nodes().get(&0).unwrap(), numa_node);
    }

    #[test]
    fn test_address_space_mgr_async_prealloc() {
        // hugeshmem + prealloc spawns page-touching threads; wait (without forcing stop)
        // must join them all cleanly.
        let res_mgr = ResourceManager::new(None);
        let mem_size = 2 << 20;
        let cpu_vec = vec![1, 2];
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: cpu_vec,
        }];
        let mut builder = AddressSpaceMgrBuilder::new("hugeshmem", "").unwrap();
        builder.toggle_prealloc(true);
        let mut as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        as_mgr.wait_prealloc(false).unwrap();
    }

    #[test]
    fn test_address_space_mgr_builder() {
        let mut builder = AddressSpaceMgrBuilder::new("shmem", "/tmp/shmem").unwrap();

        assert_eq!(builder.mem_type, "shmem");
        assert_eq!(builder.mem_file, "/tmp/shmem");
        assert_eq!(builder.mem_index, 0);
        assert!(builder.mem_suffix);
        assert!(!builder.mem_prealloc);
        assert!(!builder.dirty_page_logging);
        assert!(builder.vmfd.is_none());

        // Suffixing enabled: the index is appended and incremented.
        assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem0");
        assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem1");
        assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem2");
        assert_eq!(builder.mem_index, 3);

        // Suffixing disabled: the base path is returned unchanged and the index frozen.
        builder.toggle_file_suffix(false);
        assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem");
        assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem");
        assert_eq!(builder.mem_index, 3);

        builder.toggle_prealloc(true);
        builder.toggle_dirty_page_logging(true);
        assert!(builder.mem_prealloc);
        assert!(builder.dirty_page_logging);
    }

    #[test]
    fn test_configure_invalid_numa() {
        // node id u32::MAX makes the nodemask shift overflow -> InvalidOperation.
        let res_mgr = ResourceManager::new(None);
        let mem_size = 128 << 20;
        let numa_region_infos = vec![NumaRegionInfo {
            size: mem_size >> 20,
            host_numa_node_id: None,
            guest_numa_node_id: Some(0),
            vcpu_ids: vec![1, 2],
        }];
        let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
        let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
        let mmap_reg = MmapRegion::new(8).unwrap();

        assert!(as_mgr.configure_numa(&mmap_reg, u32::MAX).is_err());
    }
}
|
||||
6
src/dragonball/src/api/mod.rs
Normal file
6
src/dragonball/src/api/mod.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! API related data structures to configure the vmm.

/// Version 1 of the VMM configuration API.
pub mod v1;
|
||||
55
src/dragonball/src/api/v1/boot_source.rs
Normal file
55
src/dragonball/src/api/v1/boot_source.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
/// Default guest kernel command line:
/// - `reboot=k` shutdown the guest on reboot, instead of well... rebooting;
/// - `panic=1` on panic, reboot after 1 second;
/// - `pci=off` do not scan for PCI devices (saves boot time);
/// - `nomodules` disable loadable kernel module support;
/// - `8250.nr_uarts=0` disable 8250 serial interface;
/// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (saves boot time);
/// - `i8042.nomux` do not probe i8042 for a multiplexing controller (saves boot time);
/// - `i8042.nopnp` do not use ACPIPnP to discover KBD/AUX controllers (saves boot time);
/// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (saves boot time).
pub const DEFAULT_KERNEL_CMDLINE: &str = "reboot=k panic=1 pci=off nomodules 8250.nr_uarts=0 \
    i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd";
|
||||
|
||||
/// Strongly typed data structure used to configure the boot source of the microvm.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, Default)]
#[serde(deny_unknown_fields)]
pub struct BootSourceConfig {
    /// Path of the kernel image.
    /// We only support uncompressed kernel for Dragonball.
    pub kernel_path: String,
    /// Path of the initrd, if there is one.
    /// Note: the rootfs is configured via `BlockDeviceConfigInfo`, not here.
    pub initrd_path: Option<String>,
    /// The boot arguments to pass to the kernel.
    // Omitted from serialized output when unset so round-trips stay minimal.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub boot_args: Option<String>,
}
|
||||
|
||||
/// Errors associated with actions on `BootSourceConfig`.
#[derive(Debug, thiserror::Error)]
pub enum BootSourceConfigError {
    /// The kernel file cannot be opened.
    #[error(
        "the kernel file cannot be opened due to invalid kernel path or invalid permissions: {0}"
    )]
    InvalidKernelPath(#[source] std::io::Error),

    /// The initrd file cannot be opened.
    #[error("the initrd file cannot be opened due to invalid path or invalid permissions: {0}")]
    InvalidInitrdPath(#[source] std::io::Error),

    /// The kernel command line is invalid.
    #[error("the kernel command line is invalid: {0}")]
    InvalidKernelCommandLine(#[source] linux_loader::cmdline::Error),

    /// The boot source cannot be updated post boot.
    #[error("the update operation is not allowed after boot")]
    UpdateNotAllowedPostBoot,
}
|
||||
88
src/dragonball/src/api/v1/instance_info.rs
Normal file
88
src/dragonball/src/api/v1/instance_info.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
/// The microvm state.
|
||||
///
|
||||
/// When Dragonball starts, the instance state is Uninitialized. Once start_microvm method is
|
||||
/// called, the state goes from Uninitialized to Starting. The state is changed to Running until
|
||||
/// the start_microvm method ends. Halting and Halted are currently unsupported.
|
||||
#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)]
|
||||
pub enum InstanceState {
|
||||
/// Microvm is not initialized.
|
||||
Uninitialized,
|
||||
/// Microvm is starting.
|
||||
Starting,
|
||||
/// Microvm is running.
|
||||
Running,
|
||||
/// Microvm is Paused.
|
||||
Paused,
|
||||
/// Microvm received a halt instruction.
|
||||
Halting,
|
||||
/// Microvm is halted.
|
||||
Halted,
|
||||
/// Microvm exit instead of process exit.
|
||||
Exited(i32),
|
||||
}
|
||||
|
||||
/// The state of async actions
|
||||
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
|
||||
pub enum AsyncState {
|
||||
/// Uninitialized
|
||||
Uninitialized,
|
||||
/// Success
|
||||
Success,
|
||||
/// Failure
|
||||
Failure,
|
||||
}
|
||||
|
||||
/// The strongly typed that contains general information about the microVM.
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct InstanceInfo {
|
||||
/// The ID of the microVM.
|
||||
pub id: String,
|
||||
/// The state of the microVM.
|
||||
pub state: InstanceState,
|
||||
/// The version of the VMM that runs the microVM.
|
||||
pub vmm_version: String,
|
||||
/// The pid of the current VMM process.
|
||||
pub pid: u32,
|
||||
/// The state of async actions.
|
||||
pub async_state: AsyncState,
|
||||
/// List of tids of vcpu threads (vcpu index, tid)
|
||||
pub tids: Vec<(u8, u32)>,
|
||||
/// Last instance downtime
|
||||
pub last_instance_downtime: u64,
|
||||
}
|
||||
|
||||
impl InstanceInfo {
|
||||
/// create instance info object with given id, version, and platform type
|
||||
pub fn new(id: String, vmm_version: String) -> Self {
|
||||
InstanceInfo {
|
||||
id,
|
||||
state: InstanceState::Uninitialized,
|
||||
vmm_version,
|
||||
pid: std::process::id(),
|
||||
async_state: AsyncState::Uninitialized,
|
||||
tids: Vec::new(),
|
||||
last_instance_downtime: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for InstanceInfo {
|
||||
fn default() -> Self {
|
||||
InstanceInfo {
|
||||
id: String::from(""),
|
||||
state: InstanceState::Uninitialized,
|
||||
vmm_version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
pid: std::process::id(),
|
||||
async_state: AsyncState::Uninitialized,
|
||||
tids: Vec::new(),
|
||||
last_instance_downtime: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
86
src/dragonball/src/api/v1/machine_config.rs
Normal file
86
src/dragonball/src/api/v1/machine_config.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/// We only support this number of vcpus for now. Mostly because we have set all vcpu related metrics as u8
|
||||
/// and breaking u8 will take extra efforts.
|
||||
pub const MAX_SUPPORTED_VCPUS: u8 = 254;
|
||||
|
||||
/// Memory hotplug value should have alignment in this size (unit: MiB)
|
||||
pub const MEMORY_HOTPLUG_ALIGHMENT: u8 = 64;
|
||||
|
||||
/// Errors associated with configuring the microVM.
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
pub enum VmConfigError {
|
||||
/// Cannot update the configuration of the microvm post boot.
|
||||
#[error("update operation is not allowed after boot")]
|
||||
UpdateNotAllowedPostBoot,
|
||||
|
||||
/// The max vcpu count is invalid.
|
||||
#[error("the vCPU number shouldn't large than {}", MAX_SUPPORTED_VCPUS)]
|
||||
VcpuCountExceedsMaximum,
|
||||
|
||||
/// The vcpu count is invalid. When hyperthreading is enabled, the `cpu_count` must be either
|
||||
/// 1 or an even number.
|
||||
#[error(
|
||||
"the vCPU number '{0}' can only be 1 or an even number when hyperthreading is enabled"
|
||||
)]
|
||||
InvalidVcpuCount(u8),
|
||||
|
||||
/// The threads_per_core is invalid. It should be either 1 or 2.
|
||||
#[error("the threads_per_core number '{0}' can only be 1 or 2")]
|
||||
InvalidThreadsPerCore(u8),
|
||||
|
||||
/// The cores_per_die is invalid. It should be larger than 0.
|
||||
#[error("the cores_per_die number '{0}' can only be larger than 0")]
|
||||
InvalidCoresPerDie(u8),
|
||||
|
||||
/// The dies_per_socket is invalid. It should be larger than 0.
|
||||
#[error("the dies_per_socket number '{0}' can only be larger than 0")]
|
||||
InvalidDiesPerSocket(u8),
|
||||
|
||||
/// The socket number is invalid. It should be either 1 or 2.
|
||||
#[error("the socket number '{0}' can only be 1 or 2")]
|
||||
InvalidSocket(u8),
|
||||
|
||||
/// max vcpu count inferred from cpu topology(threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger or equal to vcpu_count
|
||||
#[error("the max vcpu count inferred from cpu topology '{0}' (threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger or equal to vcpu_count")]
|
||||
InvalidCpuTopology(u8),
|
||||
|
||||
/// The max vcpu count is invalid.
|
||||
#[error(
|
||||
"the max vCPU number '{0}' shouldn't less than vCPU count and can only be 1 or an even number when hyperthreading is enabled"
|
||||
)]
|
||||
InvalidMaxVcpuCount(u8),
|
||||
|
||||
/// The memory size is invalid. The memory can only be an unsigned integer.
|
||||
#[error("the memory size 0x{0:x}MiB is invalid")]
|
||||
InvalidMemorySize(usize),
|
||||
|
||||
/// The hotplug memory size is invalid. The memory can only be an unsigned integer.
|
||||
#[error(
|
||||
"the hotplug memory size '{0}' (MiB) is invalid, must be multiple of {}",
|
||||
MEMORY_HOTPLUG_ALIGHMENT
|
||||
)]
|
||||
InvalidHotplugMemorySize(usize),
|
||||
|
||||
/// The memory type is invalid.
|
||||
#[error("the memory type '{0}' is invalid")]
|
||||
InvalidMemType(String),
|
||||
|
||||
/// The memory file path is invalid.
|
||||
#[error("the memory file path is invalid")]
|
||||
InvalidMemFilePath(String),
|
||||
|
||||
/// NUMA region memory size is invalid
|
||||
#[error("Total size of memory in NUMA regions: {0}, should matches memory size in config")]
|
||||
InvalidNumaRegionMemorySize(usize),
|
||||
|
||||
/// NUMA region vCPU count is invalid
|
||||
#[error("Total counts of vCPUs in NUMA regions: {0}, should matches max vcpu count in config")]
|
||||
InvalidNumaRegionCpuCount(u16),
|
||||
|
||||
/// NUMA region vCPU count is invalid
|
||||
#[error("Max id of vCPUs in NUMA regions: {0}, should matches max vcpu count in config")]
|
||||
InvalidNumaRegionCpuMaxId(u16),
|
||||
}
|
||||
19
src/dragonball/src/api/v1/mod.rs
Normal file
19
src/dragonball/src/api/v1/mod.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
//! API Version 1 related data structures to configure the vmm.
|
||||
|
||||
mod vmm_action;
|
||||
pub use self::vmm_action::*;
|
||||
|
||||
/// Wrapper for configuring the microVM boot source.
|
||||
mod boot_source;
|
||||
pub use self::boot_source::{BootSourceConfig, BootSourceConfigError, DEFAULT_KERNEL_CMDLINE};
|
||||
|
||||
/// Wrapper over the microVM general information.
|
||||
mod instance_info;
|
||||
pub use self::instance_info::{InstanceInfo, InstanceState};
|
||||
|
||||
/// Wrapper for configuring the memory and CPU of the microVM.
|
||||
mod machine_config;
|
||||
pub use self::machine_config::{VmConfigError, MAX_SUPPORTED_VCPUS};
|
||||
636
src/dragonball/src/api/v1/vmm_action.rs
Normal file
636
src/dragonball/src/api/v1/vmm_action.rs
Normal file
@@ -0,0 +1,636 @@
|
||||
// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the THIRD-PARTY file.
|
||||
|
||||
use std::fs::File;
|
||||
use std::sync::mpsc::{Receiver, Sender, TryRecvError};
|
||||
|
||||
use log::{debug, error, info, warn};
|
||||
|
||||
use crate::error::{Result, StartMicroVmError, StopMicrovmError};
|
||||
use crate::event_manager::EventManager;
|
||||
use crate::vm::{CpuTopology, KernelConfigInfo, VmConfigInfo};
|
||||
use crate::vmm::Vmm;
|
||||
|
||||
use self::VmConfigError::*;
|
||||
use self::VmmActionError::MachineConfig;
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
pub use crate::device_manager::blk_dev_mgr::{
|
||||
BlockDeviceConfigInfo, BlockDeviceConfigUpdateInfo, BlockDeviceError, BlockDeviceMgr,
|
||||
};
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
pub use crate::device_manager::fs_dev_mgr::{
|
||||
FsDeviceConfigInfo, FsDeviceConfigUpdateInfo, FsDeviceError, FsDeviceMgr, FsMountConfigInfo,
|
||||
};
|
||||
#[cfg(feature = "virtio-net")]
|
||||
pub use crate::device_manager::virtio_net_dev_mgr::{
|
||||
VirtioNetDeviceConfigInfo, VirtioNetDeviceConfigUpdateInfo, VirtioNetDeviceError,
|
||||
VirtioNetDeviceMgr,
|
||||
};
|
||||
#[cfg(feature = "virtio-vsock")]
|
||||
pub use crate::device_manager::vsock_dev_mgr::{VsockDeviceConfigInfo, VsockDeviceError};
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Wrapper for all errors associated with VMM actions.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum VmmActionError {
|
||||
/// Invalid virtual machine instance ID.
|
||||
#[error("the virtual machine instance ID is invalid")]
|
||||
InvalidVMID,
|
||||
|
||||
/// Failed to hotplug, due to Upcall not ready.
|
||||
#[error("Upcall not ready, can't hotplug device.")]
|
||||
UpcallNotReady,
|
||||
|
||||
/// The action `ConfigureBootSource` failed either because of bad user input or an internal
|
||||
/// error.
|
||||
#[error("failed to configure boot source for VM: {0}")]
|
||||
BootSource(#[source] BootSourceConfigError),
|
||||
|
||||
/// The action `StartMicroVm` failed either because of bad user input or an internal error.
|
||||
#[error("failed to boot the VM: {0}")]
|
||||
StartMicroVm(#[source] StartMicroVmError),
|
||||
|
||||
/// The action `StopMicroVm` failed either because of bad user input or an internal error.
|
||||
#[error("failed to shutdown the VM: {0}")]
|
||||
StopMicrovm(#[source] StopMicrovmError),
|
||||
|
||||
/// One of the actions `GetVmConfiguration` or `SetVmConfiguration` failed either because of bad
|
||||
/// input or an internal error.
|
||||
#[error("failed to set configuration for the VM: {0}")]
|
||||
MachineConfig(#[source] VmConfigError),
|
||||
|
||||
#[cfg(feature = "virtio-vsock")]
|
||||
/// The action `InsertVsockDevice` failed either because of bad user input or an internal error.
|
||||
#[error("failed to add virtio-vsock device: {0}")]
|
||||
Vsock(#[source] VsockDeviceError),
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
/// Block device related errors.
|
||||
#[error("virtio-blk device error: {0}")]
|
||||
Block(#[source] BlockDeviceError),
|
||||
|
||||
#[cfg(feature = "virtio-net")]
|
||||
/// Net device related errors.
|
||||
#[error("virtio-net device error: {0}")]
|
||||
VirtioNet(#[source] VirtioNetDeviceError),
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
/// The action `InsertFsDevice` failed either because of bad user input or an internal error.
|
||||
#[error("virtio-fs device: {0}")]
|
||||
FsDevice(#[source] FsDeviceError),
|
||||
}
|
||||
|
||||
/// This enum represents the public interface of the VMM. Each action contains various
|
||||
/// bits of information (ids, paths, etc.).
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum VmmAction {
|
||||
/// Configure the boot source of the microVM using `BootSourceConfig`.
|
||||
/// This action can only be called before the microVM has booted.
|
||||
ConfigureBootSource(BootSourceConfig),
|
||||
|
||||
/// Launch the microVM. This action can only be called before the microVM has booted.
|
||||
StartMicroVm,
|
||||
|
||||
/// Shutdown the vmicroVM. This action can only be called after the microVM has booted.
|
||||
/// When vmm is used as the crate by the other process, which is need to
|
||||
/// shutdown the vcpu threads and destory all of the object.
|
||||
ShutdownMicroVm,
|
||||
|
||||
/// Get the configuration of the microVM.
|
||||
GetVmConfiguration,
|
||||
|
||||
/// Set the microVM configuration (memory & vcpu) using `VmConfig` as input. This
|
||||
/// action can only be called before the microVM has booted.
|
||||
SetVmConfiguration(VmConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-vsock")]
|
||||
/// Add a new vsock device or update one that already exists using the
|
||||
/// `VsockDeviceConfig` as input. This action can only be called before the microVM has
|
||||
/// booted. The response is sent using the `OutcomeSender`.
|
||||
InsertVsockDevice(VsockDeviceConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
/// Add a new block device or update one that already exists using the `BlockDeviceConfig` as
|
||||
/// input. This action can only be called before the microVM has booted.
|
||||
InsertBlockDevice(BlockDeviceConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
/// Remove a new block device for according to given drive_id
|
||||
RemoveBlockDevice(String),
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
/// Update a block device, after microVM start. Currently, the only updatable properties
|
||||
/// are the RX and TX rate limiters.
|
||||
UpdateBlockDevice(BlockDeviceConfigUpdateInfo),
|
||||
|
||||
#[cfg(feature = "virtio-net")]
|
||||
/// Add a new network interface config or update one that already exists using the
|
||||
/// `NetworkInterfaceConfig` as input. This action can only be called before the microVM has
|
||||
/// booted. The response is sent using the `OutcomeSender`.
|
||||
InsertNetworkDevice(VirtioNetDeviceConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-net")]
|
||||
/// Update a network interface, after microVM start. Currently, the only updatable properties
|
||||
/// are the RX and TX rate limiters.
|
||||
UpdateNetworkInterface(VirtioNetDeviceConfigUpdateInfo),
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
/// Add a new shared fs device or update one that already exists using the
|
||||
/// `FsDeviceConfig` as input. This action can only be called before the microVM has
|
||||
/// booted.
|
||||
InsertFsDevice(FsDeviceConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
/// Attach a new virtiofs Backend fs or detach an existing virtiofs Backend fs using the
|
||||
/// `FsMountConfig` as input. This action can only be called _after_ the microVM has
|
||||
/// booted.
|
||||
ManipulateFsBackendFs(FsMountConfigInfo),
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
/// Update fs rate limiter, after microVM start.
|
||||
UpdateFsDevice(FsDeviceConfigUpdateInfo),
|
||||
}
|
||||
|
||||
/// The enum represents the response sent by the VMM in case of success. The response is either
|
||||
/// empty, when no data needs to be sent, or an internal VMM structure.
|
||||
#[derive(Debug)]
|
||||
pub enum VmmData {
|
||||
/// No data is sent on the channel.
|
||||
Empty,
|
||||
/// The microVM configuration represented by `VmConfigInfo`.
|
||||
MachineConfiguration(Box<VmConfigInfo>),
|
||||
}
|
||||
|
||||
/// Request data type used to communicate between the API and the VMM.
|
||||
pub type VmmRequest = Box<VmmAction>;
|
||||
|
||||
/// Data type used to communicate between the API and the VMM.
|
||||
pub type VmmRequestResult = std::result::Result<VmmData, VmmActionError>;
|
||||
|
||||
/// Response data type used to communicate between the API and the VMM.
|
||||
pub type VmmResponse = Box<VmmRequestResult>;
|
||||
|
||||
/// VMM Service to handle requests from the API server.
|
||||
///
|
||||
/// There are two levels of API servers as below:
|
||||
/// API client <--> VMM API Server <--> VMM Core
|
||||
pub struct VmmService {
|
||||
from_api: Receiver<VmmRequest>,
|
||||
to_api: Sender<VmmResponse>,
|
||||
machine_config: VmConfigInfo,
|
||||
}
|
||||
|
||||
impl VmmService {
|
||||
/// Create a new VMM API server instance.
|
||||
pub fn new(from_api: Receiver<VmmRequest>, to_api: Sender<VmmResponse>) -> Self {
|
||||
VmmService {
|
||||
from_api,
|
||||
to_api,
|
||||
machine_config: VmConfigInfo::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle requests from the HTTP API Server and send back replies.
|
||||
pub fn run_vmm_action(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> Result<()> {
|
||||
let request = match self.from_api.try_recv() {
|
||||
Ok(t) => *t,
|
||||
Err(TryRecvError::Empty) => {
|
||||
warn!("Got a spurious notification from api thread");
|
||||
return Ok(());
|
||||
}
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
panic!("The channel's sending half was disconnected. Cannot receive data.");
|
||||
}
|
||||
};
|
||||
debug!("receive vmm action: {:?}", request);
|
||||
|
||||
let response = match request {
|
||||
VmmAction::ConfigureBootSource(boot_source_body) => {
|
||||
self.configure_boot_source(vmm, boot_source_body)
|
||||
}
|
||||
VmmAction::StartMicroVm => self.start_microvm(vmm, event_mgr),
|
||||
VmmAction::ShutdownMicroVm => self.shutdown_microvm(vmm),
|
||||
VmmAction::GetVmConfiguration => Ok(VmmData::MachineConfiguration(Box::new(
|
||||
self.machine_config.clone(),
|
||||
))),
|
||||
VmmAction::SetVmConfiguration(machine_config) => {
|
||||
self.set_vm_configuration(vmm, machine_config)
|
||||
}
|
||||
#[cfg(feature = "virtio-vsock")]
|
||||
VmmAction::InsertVsockDevice(vsock_cfg) => self.add_vsock_device(vmm, vsock_cfg),
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
VmmAction::InsertBlockDevice(block_device_config) => {
|
||||
self.add_block_device(vmm, event_mgr, block_device_config)
|
||||
}
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
VmmAction::UpdateBlockDevice(blk_update) => {
|
||||
self.update_blk_rate_limiters(vmm, blk_update)
|
||||
}
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
VmmAction::RemoveBlockDevice(drive_id) => {
|
||||
self.remove_block_device(vmm, event_mgr, &drive_id)
|
||||
}
|
||||
#[cfg(feature = "virtio-net")]
|
||||
VmmAction::InsertNetworkDevice(virtio_net_cfg) => {
|
||||
self.add_virtio_net_device(vmm, event_mgr, virtio_net_cfg)
|
||||
}
|
||||
#[cfg(feature = "virtio-net")]
|
||||
VmmAction::UpdateNetworkInterface(netif_update) => {
|
||||
self.update_net_rate_limiters(vmm, netif_update)
|
||||
}
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
VmmAction::InsertFsDevice(fs_cfg) => self.add_fs_device(vmm, fs_cfg),
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
VmmAction::ManipulateFsBackendFs(fs_mount_cfg) => {
|
||||
self.manipulate_fs_backend_fs(vmm, fs_mount_cfg)
|
||||
}
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
VmmAction::UpdateFsDevice(fs_update_cfg) => {
|
||||
self.update_fs_rate_limiters(vmm, fs_update_cfg)
|
||||
}
|
||||
};
|
||||
|
||||
debug!("send vmm response: {:?}", response);
|
||||
self.send_response(response)
|
||||
}
|
||||
|
||||
fn send_response(&self, result: VmmRequestResult) -> Result<()> {
|
||||
self.to_api
|
||||
.send(Box::new(result))
|
||||
.map_err(|_| ())
|
||||
.expect("vmm: one-shot API result channel has been closed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn configure_boot_source(
|
||||
&self,
|
||||
vmm: &mut Vmm,
|
||||
boot_source_config: BootSourceConfig,
|
||||
) -> VmmRequestResult {
|
||||
use super::BootSourceConfigError::{
|
||||
InvalidInitrdPath, InvalidKernelCommandLine, InvalidKernelPath,
|
||||
UpdateNotAllowedPostBoot,
|
||||
};
|
||||
use super::VmmActionError::BootSource;
|
||||
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
if vm.is_vm_initialized() {
|
||||
return Err(BootSource(UpdateNotAllowedPostBoot));
|
||||
}
|
||||
|
||||
let kernel_file = File::open(&boot_source_config.kernel_path)
|
||||
.map_err(|e| BootSource(InvalidKernelPath(e)))?;
|
||||
|
||||
let initrd_file = match boot_source_config.initrd_path {
|
||||
None => None,
|
||||
Some(ref path) => Some(File::open(path).map_err(|e| BootSource(InvalidInitrdPath(e)))?),
|
||||
};
|
||||
|
||||
let mut cmdline = linux_loader::cmdline::Cmdline::new(dbs_boot::layout::CMDLINE_MAX_SIZE);
|
||||
let boot_args = boot_source_config
|
||||
.boot_args
|
||||
.clone()
|
||||
.unwrap_or_else(|| String::from(DEFAULT_KERNEL_CMDLINE));
|
||||
cmdline
|
||||
.insert_str(boot_args)
|
||||
.map_err(|e| BootSource(InvalidKernelCommandLine(e)))?;
|
||||
|
||||
let kernel_config = KernelConfigInfo::new(kernel_file, initrd_file, cmdline);
|
||||
vm.set_kernel_config(kernel_config);
|
||||
|
||||
Ok(VmmData::Empty)
|
||||
}
|
||||
|
||||
fn start_microvm(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> VmmRequestResult {
|
||||
use self::StartMicroVmError::MicroVMAlreadyRunning;
|
||||
use self::VmmActionError::StartMicroVm;
|
||||
|
||||
let vmm_seccomp_filter = vmm.vmm_seccomp_filter();
|
||||
let vcpu_seccomp_filter = vmm.vcpu_seccomp_filter();
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
if vm.is_vm_initialized() {
|
||||
return Err(StartMicroVm(MicroVMAlreadyRunning));
|
||||
}
|
||||
|
||||
vm.start_microvm(event_mgr, vmm_seccomp_filter, vcpu_seccomp_filter)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(StartMicroVm)
|
||||
}
|
||||
|
||||
fn shutdown_microvm(&mut self, vmm: &mut Vmm) -> VmmRequestResult {
|
||||
vmm.event_ctx.exit_evt_triggered = true;
|
||||
|
||||
Ok(VmmData::Empty)
|
||||
}
|
||||
|
||||
/// Set virtual machine configuration.
|
||||
pub fn set_vm_configuration(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
machine_config: VmConfigInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
if vm.is_vm_initialized() {
|
||||
return Err(MachineConfig(UpdateNotAllowedPostBoot));
|
||||
}
|
||||
|
||||
// If the check is successful, set it up together.
|
||||
let mut config = vm.vm_config().clone();
|
||||
if config.vcpu_count != machine_config.vcpu_count {
|
||||
let vcpu_count = machine_config.vcpu_count;
|
||||
// Check that the vcpu_count value is >=1.
|
||||
if vcpu_count == 0 {
|
||||
return Err(MachineConfig(InvalidVcpuCount(vcpu_count)));
|
||||
}
|
||||
config.vcpu_count = vcpu_count;
|
||||
}
|
||||
|
||||
if config.cpu_topology != machine_config.cpu_topology {
|
||||
let cpu_topology = &machine_config.cpu_topology;
|
||||
config.cpu_topology = handle_cpu_topology(cpu_topology, config.vcpu_count)?.clone();
|
||||
} else {
|
||||
// the same default
|
||||
let mut default_cpu_topology = CpuTopology {
|
||||
threads_per_core: 1,
|
||||
cores_per_die: config.vcpu_count,
|
||||
dies_per_socket: 1,
|
||||
sockets: 1,
|
||||
};
|
||||
if machine_config.max_vcpu_count > config.vcpu_count {
|
||||
default_cpu_topology.cores_per_die = machine_config.max_vcpu_count;
|
||||
}
|
||||
config.cpu_topology = default_cpu_topology;
|
||||
}
|
||||
let cpu_topology = &config.cpu_topology;
|
||||
let max_vcpu_from_topo = cpu_topology.threads_per_core
|
||||
* cpu_topology.cores_per_die
|
||||
* cpu_topology.dies_per_socket
|
||||
* cpu_topology.sockets;
|
||||
// If the max_vcpu_count inferred by cpu_topology is not equal to
|
||||
// max_vcpu_count, max_vcpu_count will be changed. currently, max vcpu size
|
||||
// is used when cpu_topology is not defined and help define the cores_per_die
|
||||
// for the default cpu topology.
|
||||
let mut max_vcpu_count = machine_config.max_vcpu_count;
|
||||
if max_vcpu_count < config.vcpu_count {
|
||||
return Err(MachineConfig(InvalidMaxVcpuCount(max_vcpu_count)));
|
||||
}
|
||||
if max_vcpu_from_topo != max_vcpu_count {
|
||||
max_vcpu_count = max_vcpu_from_topo;
|
||||
info!("Since max_vcpu_count is not equal to cpu topo information, we have changed the max vcpu count to {}", max_vcpu_from_topo);
|
||||
}
|
||||
config.max_vcpu_count = max_vcpu_count;
|
||||
|
||||
config.cpu_pm = machine_config.cpu_pm;
|
||||
config.mem_type = machine_config.mem_type;
|
||||
|
||||
let mem_size_mib_value = machine_config.mem_size_mib;
|
||||
// Support 1TB memory at most, 2MB aligned for huge page.
|
||||
if mem_size_mib_value == 0 || mem_size_mib_value > 0x10_0000 || mem_size_mib_value % 2 != 0
|
||||
{
|
||||
return Err(MachineConfig(InvalidMemorySize(mem_size_mib_value)));
|
||||
}
|
||||
config.mem_size_mib = mem_size_mib_value;
|
||||
|
||||
config.mem_file_path = machine_config.mem_file_path.clone();
|
||||
|
||||
if config.mem_type == "hugetlbfs" && config.mem_file_path.is_empty() {
|
||||
return Err(MachineConfig(InvalidMemFilePath("".to_owned())));
|
||||
}
|
||||
config.vpmu_feature = machine_config.vpmu_feature;
|
||||
|
||||
let vm_id = vm.shared_info().read().unwrap().id.clone();
|
||||
let serial_path = match machine_config.serial_path {
|
||||
Some(value) => value,
|
||||
None => {
|
||||
if config.serial_path.is_none() {
|
||||
String::from("/run/dragonball/") + &vm_id + "_com1"
|
||||
} else {
|
||||
// Safe to unwrap() because we have checked it has a value.
|
||||
config.serial_path.as_ref().unwrap().clone()
|
||||
}
|
||||
}
|
||||
};
|
||||
config.serial_path = Some(serial_path);
|
||||
|
||||
vm.set_vm_config(config.clone());
|
||||
self.machine_config = config;
|
||||
|
||||
Ok(VmmData::Empty)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-vsock")]
|
||||
fn add_vsock_device(&self, vmm: &mut Vmm, config: VsockDeviceConfigInfo) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
if vm.is_vm_initialized() {
|
||||
return Err(VmmActionError::Vsock(
|
||||
VsockDeviceError::UpdateNotAllowedPostBoot,
|
||||
));
|
||||
}
|
||||
|
||||
// VMADDR_CID_ANY (-1U) means any address for binding;
|
||||
// VMADDR_CID_HYPERVISOR (0) is reserved for services built into the hypervisor;
|
||||
// VMADDR_CID_RESERVED (1) must not be used;
|
||||
// VMADDR_CID_HOST (2) is the well-known address of the host.
|
||||
if config.guest_cid <= 2 {
|
||||
return Err(VmmActionError::Vsock(VsockDeviceError::GuestCIDInvalid(
|
||||
config.guest_cid,
|
||||
)));
|
||||
}
|
||||
|
||||
info!("add_vsock_device: {:?}", config);
|
||||
let ctx = vm.create_device_op_context(None).map_err(|e| {
|
||||
info!("create device op context error: {:?}", e);
|
||||
VmmActionError::Vsock(VsockDeviceError::UpdateNotAllowedPostBoot)
|
||||
})?;
|
||||
|
||||
vm.device_manager_mut()
|
||||
.vsock_manager
|
||||
.insert_device(ctx, config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::Vsock)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
// Only call this function as part of the API.
|
||||
// If the drive_id does not exist, a new Block Device Config is added to the list.
|
||||
fn add_block_device(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
event_mgr: &mut EventManager,
|
||||
config: BlockDeviceConfigInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
let ctx = vm
|
||||
.create_device_op_context(Some(event_mgr.epoll_manager()))
|
||||
.map_err(|e| {
|
||||
if let StartMicroVmError::UpcallNotReady = e {
|
||||
return VmmActionError::UpcallNotReady;
|
||||
}
|
||||
VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot)
|
||||
})?;
|
||||
|
||||
BlockDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::Block)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
/// Updates configuration for an emulated net device as described in `config`.
|
||||
fn update_blk_rate_limiters(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
config: BlockDeviceConfigUpdateInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
|
||||
BlockDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::Block)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-blk")]
|
||||
// Remove the device
|
||||
fn remove_block_device(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
event_mgr: &mut EventManager,
|
||||
drive_id: &str,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
let ctx = vm
|
||||
.create_device_op_context(Some(event_mgr.epoll_manager()))
|
||||
.map_err(|_| VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot))?;
|
||||
|
||||
BlockDeviceMgr::remove_device(vm.device_manager_mut(), ctx, drive_id)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::Block)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-net")]
|
||||
fn add_virtio_net_device(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
event_mgr: &mut EventManager,
|
||||
config: VirtioNetDeviceConfigInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
let ctx = vm
|
||||
.create_device_op_context(Some(event_mgr.epoll_manager()))
|
||||
.map_err(|e| {
|
||||
if let StartMicroVmError::MicroVMAlreadyRunning = e {
|
||||
VmmActionError::VirtioNet(VirtioNetDeviceError::UpdateNotAllowedPostBoot)
|
||||
} else if let StartMicroVmError::UpcallNotReady = e {
|
||||
VmmActionError::UpcallNotReady
|
||||
} else {
|
||||
VmmActionError::StartMicroVm(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
VirtioNetDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::VirtioNet)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-net")]
|
||||
fn update_net_rate_limiters(
|
||||
&mut self,
|
||||
vmm: &mut Vmm,
|
||||
config: VirtioNetDeviceConfigUpdateInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
|
||||
VirtioNetDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::VirtioNet)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
fn add_fs_device(&mut self, vmm: &mut Vmm, config: FsDeviceConfigInfo) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
let hotplug = vm.is_vm_initialized();
|
||||
if !cfg!(feature = "hotplug") && hotplug {
|
||||
return Err(VmmActionError::FsDevice(
|
||||
FsDeviceError::UpdateNotAllowedPostBoot,
|
||||
));
|
||||
}
|
||||
|
||||
let ctx = vm.create_device_op_context(None).map_err(|e| {
|
||||
info!("create device op context error: {:?}", e);
|
||||
VmmActionError::FsDevice(FsDeviceError::UpdateNotAllowedPostBoot)
|
||||
})?;
|
||||
FsDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::FsDevice)
|
||||
}
|
||||
|
||||
#[cfg(feature = "virtio-fs")]
|
||||
fn manipulate_fs_backend_fs(
|
||||
&self,
|
||||
vmm: &mut Vmm,
|
||||
config: FsMountConfigInfo,
|
||||
) -> VmmRequestResult {
|
||||
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
|
||||
|
||||
if !vm.is_vm_initialized() {
|
||||
return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning));
|
||||
}
|
||||
|
||||
FsDeviceMgr::manipulate_backend_fs(vm.device_manager_mut(), config)
|
||||
.map(|_| VmmData::Empty)
|
||||
.map_err(VmmActionError::FsDevice)
|
||||
}
|
||||
|
||||
/// Update the rate limiters of an existing virtio-fs device.
///
/// Only valid while the VM is running: returns
/// `FsDeviceError::MicroVMNotRunning` when the VM is not initialized, or
/// `VmmActionError::InvalidVMID` when no VM instance exists.
#[cfg(feature = "virtio-fs")]
fn update_fs_rate_limiters(
    &self,
    vmm: &mut Vmm,
    config: FsDeviceConfigUpdateInfo,
) -> VmmRequestResult {
    let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;

    if !vm.is_vm_initialized() {
        return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning));
    }

    FsDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
        .map(|_| VmmData::Empty)
        .map_err(VmmActionError::FsDevice)
}
|
||||
}
|
||||
|
||||
/// Validate a user-supplied CPU topology against `vcpu_count` and VMM limits.
///
/// Checks that `threads_per_core` is in `[1, 2]`, that the total vcpu count
/// implied by the topology (sockets * dies_per_socket * cores_per_die *
/// threads_per_core) neither overflows nor exceeds `MAX_SUPPORTED_VCPUS`,
/// and that it covers at least `vcpu_count` vcpus.
/// On success returns the topology unchanged.
fn handle_cpu_topology(
    cpu_topology: &CpuTopology,
    vcpu_count: u8,
) -> std::result::Result<&CpuTopology, VmmActionError> {
    // Check if dies_per_socket, cores_per_die, threads_per_core and socket number is valid
    if cpu_topology.threads_per_core < 1 || cpu_topology.threads_per_core > 2 {
        return Err(MachineConfig(InvalidThreadsPerCore(
            cpu_topology.threads_per_core,
        )));
    }
    // checked_mul guards against arithmetic overflow while multiplying the
    // four topology dimensions; any overflow is reported as "too many vcpus".
    let vcpu_count_from_topo = cpu_topology
        .sockets
        .checked_mul(cpu_topology.dies_per_socket)
        .ok_or(MachineConfig(VcpuCountExceedsMaximum))?
        .checked_mul(cpu_topology.cores_per_die)
        .ok_or(MachineConfig(VcpuCountExceedsMaximum))?
        .checked_mul(cpu_topology.threads_per_core)
        .ok_or(MachineConfig(VcpuCountExceedsMaximum))?;
    if vcpu_count_from_topo > MAX_SUPPORTED_VCPUS {
        return Err(MachineConfig(VcpuCountExceedsMaximum));
    }
    if vcpu_count_from_topo < vcpu_count {
        // The topology must provide at least as many vcpus as requested.
        return Err(MachineConfig(InvalidCpuTopology(vcpu_count_from_topo)));
    }

    Ok(cpu_topology)
}
|
||||
760
src/dragonball/src/config_manager.rs
Normal file
760
src/dragonball/src/config_manager.rs
Normal file
@@ -0,0 +1,760 @@
|
||||
// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::convert::TryInto;
|
||||
use std::io;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::sync::Arc;
|
||||
|
||||
use dbs_device::DeviceIo;
|
||||
use dbs_utils::rate_limiter::{RateLimiter, TokenBucket};
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
/// Get bucket update for rate limiter.
///
/// Expands to a `dbs_utils::rate_limiter::BucketUpdate` computed from the
/// optional rate-limiter config field `$rate_limiter` of `$self`:
/// - `Update(bucket)` when the config is present and yields a valid bucket,
/// - `Disabled` when the config is present but no bucket can be built,
/// - `None` when no rate-limiter config was supplied (leave limiter as-is).
#[macro_export]
macro_rules! get_bucket_update {
    ($self:ident, $rate_limiter: ident, $metric: ident) => {{
        match &$self.$rate_limiter {
            Some(rl_cfg) => {
                let tb_cfg = &rl_cfg.$metric;
                dbs_utils::rate_limiter::RateLimiter::make_bucket(
                    tb_cfg.size,
                    tb_cfg.one_time_burst,
                    tb_cfg.refill_time,
                )
                // Updated active rate-limiter.
                .map(dbs_utils::rate_limiter::BucketUpdate::Update)
                // Updated/deactivated rate-limiter
                .unwrap_or(dbs_utils::rate_limiter::BucketUpdate::Disabled)
            }
            // No update to the rate-limiter.
            None => dbs_utils::rate_limiter::BucketUpdate::None,
        }
    }};
}
|
||||
|
||||
/// Trait for generic configuration information.
///
/// Implemented by device/config description types so that they can be stored
/// in [`ConfigInfos`] / [`DeviceConfigInfos`] collections, which deduplicate
/// by `id()` and validate with `check_conflicts()`.
pub trait ConfigItem {
    /// Related errors.
    type Err;

    /// Get the unique identifier of the configuration item.
    fn id(&self) -> &str;

    /// Check whether current configuration item conflicts with another one.
    ///
    /// Returns `Ok(())` when the two items can coexist, `Err` otherwise.
    fn check_conflicts(&self, other: &Self) -> std::result::Result<(), Self::Err>;
}
|
||||
|
||||
/// Struct to manage a group of configuration items.
///
/// Items are kept in insertion order; uniqueness and pairwise validity are
/// enforced by [`ConfigItem::id`] and [`ConfigItem::check_conflicts`] at
/// insert/update time.
#[derive(Debug, Default, Deserialize, PartialEq, Serialize)]
pub struct ConfigInfos<T>
where
    T: ConfigItem + Clone,
{
    // Flat vector: lookups are linear scans, which is fine for the small
    // number of devices a VM typically carries.
    configs: Vec<T>,
}
|
||||
|
||||
impl<T> ConfigInfos<T>
|
||||
where
|
||||
T: ConfigItem + Clone + Default,
|
||||
{
|
||||
/// Constructor
|
||||
pub fn new() -> Self {
|
||||
ConfigInfos::default()
|
||||
}
|
||||
|
||||
/// Insert a configuration item in the group.
|
||||
pub fn insert(&mut self, config: T) -> std::result::Result<(), T::Err> {
|
||||
for item in self.configs.iter() {
|
||||
config.check_conflicts(item)?;
|
||||
}
|
||||
self.configs.push(config);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update a configuration item in the group.
|
||||
pub fn update(&mut self, config: T, err: T::Err) -> std::result::Result<(), T::Err> {
|
||||
match self.get_index_by_id(&config) {
|
||||
None => Err(err),
|
||||
Some(index) => {
|
||||
for (idx, item) in self.configs.iter().enumerate() {
|
||||
if idx != index {
|
||||
config.check_conflicts(item)?;
|
||||
}
|
||||
}
|
||||
self.configs[index] = config;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert or update a configuration item in the group.
|
||||
pub fn insert_or_update(&mut self, config: T) -> std::result::Result<(), T::Err> {
|
||||
match self.get_index_by_id(&config) {
|
||||
None => {
|
||||
for item in self.configs.iter() {
|
||||
config.check_conflicts(item)?;
|
||||
}
|
||||
|
||||
self.configs.push(config)
|
||||
}
|
||||
Some(index) => {
|
||||
for (idx, item) in self.configs.iter().enumerate() {
|
||||
if idx != index {
|
||||
config.check_conflicts(item)?;
|
||||
}
|
||||
}
|
||||
self.configs[index] = config;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove the matching configuration entry.
|
||||
pub fn remove(&mut self, config: &T) -> Option<T> {
|
||||
if let Some(index) = self.get_index_by_id(config) {
|
||||
Some(self.configs.remove(index))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an immutable iterator over the config items
|
||||
pub fn iter(&self) -> ::std::slice::Iter<T> {
|
||||
self.configs.iter()
|
||||
}
|
||||
|
||||
/// Get the configuration entry with matching ID.
|
||||
pub fn get_by_id(&self, item: &T) -> Option<&T> {
|
||||
let id = item.id();
|
||||
|
||||
self.configs.iter().rfind(|cfg| cfg.id() == id)
|
||||
}
|
||||
|
||||
fn get_index_by_id(&self, item: &T) -> Option<usize> {
|
||||
let id = item.id();
|
||||
self.configs.iter().position(|cfg| cfg.id() == id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Clone for ConfigInfos<T>
|
||||
where
|
||||
T: ConfigItem + Clone,
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
ConfigInfos {
|
||||
configs: self.configs.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct to maintain configuration information for a device.
///
/// Couples a device's configuration with the runtime device object, which is
/// `None` until the device has actually been created/attached.
pub struct DeviceConfigInfo<T>
where
    T: ConfigItem + Clone,
{
    /// Configuration information for the device object.
    pub config: T,
    /// The associated device object.
    pub device: Option<Arc<dyn DeviceIo>>,
}
|
||||
|
||||
impl<T> DeviceConfigInfo<T>
where
    T: ConfigItem + Clone,
{
    /// Create a new instance of [`DeviceConfigInfo`] with no device attached.
    pub fn new(config: T) -> Self {
        DeviceConfigInfo {
            config,
            device: None,
        }
    }

    /// Create a new instance of [`DeviceConfigInfo`] with optional device.
    pub fn new_with_device(config: T, device: Option<Arc<dyn DeviceIo>>) -> Self {
        DeviceConfigInfo { config, device }
    }

    /// Set the device object associated with the configuration.
    pub fn set_device(&mut self, device: Arc<dyn DeviceIo>) {
        self.device = Some(device);
    }
}
|
||||
|
||||
impl<T> Clone for DeviceConfigInfo<T>
|
||||
where
|
||||
T: ConfigItem + Clone,
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
DeviceConfigInfo::new_with_device(self.config.clone(), self.device.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct to maintain configuration information for a group of devices.
///
/// A thin wrapper over `Vec<DeviceConfigInfo<T>>` offering id-based
/// insert-or-update semantics plus index-based access.
pub struct DeviceConfigInfos<T>
where
    T: ConfigItem + Clone,
{
    info_list: Vec<DeviceConfigInfo<T>>,
}
|
||||
|
||||
impl<T> Default for DeviceConfigInfos<T>
where
    T: ConfigItem + Clone,
{
    /// An empty device-configuration group.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
impl<T> DeviceConfigInfos<T>
|
||||
where
|
||||
T: ConfigItem + Clone,
|
||||
{
|
||||
/// Create a new instance of ['DeviceConfigInfos'].
|
||||
pub fn new() -> Self {
|
||||
DeviceConfigInfos {
|
||||
info_list: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert or update configuration information for a device.
|
||||
pub fn insert_or_update(&mut self, config: &T) -> std::result::Result<usize, T::Err> {
|
||||
let device_info = DeviceConfigInfo::new(config.clone());
|
||||
Ok(match self.get_index_by_id(config) {
|
||||
Some(index) => {
|
||||
for (idx, info) in self.info_list.iter().enumerate() {
|
||||
if idx != index {
|
||||
info.config.check_conflicts(config)?;
|
||||
}
|
||||
}
|
||||
self.info_list[index] = device_info;
|
||||
index
|
||||
}
|
||||
None => {
|
||||
for info in self.info_list.iter() {
|
||||
info.config.check_conflicts(config)?;
|
||||
}
|
||||
self.info_list.push(device_info);
|
||||
self.info_list.len() - 1
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Remove a device configuration information object.
|
||||
pub fn remove(&mut self, index: usize) -> Option<DeviceConfigInfo<T>> {
|
||||
if self.info_list.len() > index {
|
||||
Some(self.info_list.remove(index))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Get number of device configuration information objects.
|
||||
pub fn len(&self) -> usize {
|
||||
self.info_list.len()
|
||||
}
|
||||
|
||||
/// Returns true if the device configuration information objects is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.info_list.len() == 0
|
||||
}
|
||||
|
||||
/// Add a device configuration information object at the tail.
|
||||
pub fn push(&mut self, info: DeviceConfigInfo<T>) {
|
||||
self.info_list.push(info);
|
||||
}
|
||||
|
||||
/// Iterator for configuration information objects.
|
||||
pub fn iter(&self) -> std::slice::Iter<DeviceConfigInfo<T>> {
|
||||
self.info_list.iter()
|
||||
}
|
||||
|
||||
/// Mutable iterator for configuration information objects.
|
||||
pub fn iter_mut(&mut self) -> std::slice::IterMut<DeviceConfigInfo<T>> {
|
||||
self.info_list.iter_mut()
|
||||
}
|
||||
|
||||
fn get_index_by_id(&self, config: &T) -> Option<usize> {
|
||||
self.info_list
|
||||
.iter()
|
||||
.position(|info| info.config.id().eq(config.id()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<usize> for DeviceConfigInfos<T>
where
    T: ConfigItem + Clone,
{
    type Output = DeviceConfigInfo<T>;
    /// Indexed access to the underlying list; panics when out of bounds,
    /// like `Vec` indexing.
    fn index(&self, idx: usize) -> &Self::Output {
        &self.info_list[idx]
    }
}
|
||||
|
||||
impl<T> IndexMut<usize> for DeviceConfigInfos<T>
where
    T: ConfigItem + Clone,
{
    /// Mutable indexed access; panics when out of bounds, like `Vec`.
    fn index_mut(&mut self, idx: usize) -> &mut Self::Output {
        &mut self.info_list[idx]
    }
}
|
||||
|
||||
impl<T> Clone for DeviceConfigInfos<T>
|
||||
where
|
||||
T: ConfigItem + Clone,
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
DeviceConfigInfos {
|
||||
info_list: self.info_list.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration information for RateLimiter token bucket.
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
pub struct TokenBucketConfigInfo {
    /// The size for the token bucket. A TokenBucket of `size` total capacity will take `refill_time`
    /// milliseconds to go from zero tokens to total capacity.
    pub size: u64,
    /// Number of free initial tokens, that can be consumed at no cost.
    pub one_time_burst: u64,
    /// Complete refill time in milliseconds.
    pub refill_time: u64,
}
|
||||
|
||||
impl TokenBucketConfigInfo {
    /// Scale the bucket down to 1/n of its capacity and burst allowance
    /// (integer division). `refill_time` is a duration, not a capacity, so
    /// it is left untouched. A zero `n` is ignored to avoid division by zero.
    fn resize(&mut self, n: u64) {
        if n != 0 {
            self.size /= n;
            self.one_time_burst /= n;
        }
    }
}
|
||||
|
||||
impl From<TokenBucketConfigInfo> for TokenBucket {
    /// Owned conversion; delegates to the by-reference implementation.
    fn from(t: TokenBucketConfigInfo) -> TokenBucket {
        (&t).into()
    }
}
|
||||
|
||||
impl From<&TokenBucketConfigInfo> for TokenBucket {
    /// Build a `TokenBucket` from (size, one_time_burst, refill_time).
    fn from(t: &TokenBucketConfigInfo) -> TokenBucket {
        TokenBucket::new(t.size, t.one_time_burst, t.refill_time)
    }
}
|
||||
|
||||
/// Configuration information for RateLimiter objects.
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
pub struct RateLimiterConfigInfo {
    /// Data used to initialize the RateLimiter::bandwidth bucket.
    pub bandwidth: TokenBucketConfigInfo,
    /// Data used to initialize the RateLimiter::ops bucket.
    pub ops: TokenBucketConfigInfo,
}
|
||||
|
||||
impl RateLimiterConfigInfo {
    /// Update the bandwidth budget configuration.
    pub fn update_bandwidth(&mut self, new_config: TokenBucketConfigInfo) {
        self.bandwidth = new_config;
    }

    /// Update the ops budget configuration.
    pub fn update_ops(&mut self, new_config: TokenBucketConfigInfo) {
        self.ops = new_config;
    }

    /// Resize the limiter to 1/n of its budget — both the bandwidth and ops
    /// buckets are scaled (e.g. to split one budget across n queues).
    pub fn resize(&mut self, n: u64) {
        self.bandwidth.resize(n);
        self.ops.resize(n);
    }
}
|
||||
|
||||
impl TryInto<RateLimiter> for &RateLimiterConfigInfo {
    type Error = io::Error;

    /// Build a `RateLimiter` from this configuration, propagating any
    /// `io::Error` reported by `RateLimiter::new`.
    fn try_into(self) -> Result<RateLimiter, Self::Error> {
        RateLimiter::new(
            self.bandwidth.size,
            self.bandwidth.one_time_burst,
            self.bandwidth.refill_time,
            self.ops.size,
            self.ops.one_time_burst,
            self.ops.refill_time,
        )
    }
}
|
||||
|
||||
impl TryInto<RateLimiter> for RateLimiterConfigInfo {
|
||||
type Error = io::Error;
|
||||
|
||||
fn try_into(self) -> Result<RateLimiter, Self::Error> {
|
||||
RateLimiter::new(
|
||||
self.bandwidth.size,
|
||||
self.bandwidth.one_time_burst,
|
||||
self.bandwidth.refill_time,
|
||||
self.ops.size,
|
||||
self.ops.one_time_burst,
|
||||
self.ops.refill_time,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Error type used by the dummy config item below.
    #[derive(Debug, thiserror::Error)]
    pub enum DummyError {
        #[error("configuration entry exists")]
        Exist,
    }

    // Minimal ConfigItem implementation for exercising the collections:
    // two items conflict when either their id OR their content matches.
    #[derive(Clone, Debug, Default)]
    pub struct DummyConfigInfo {
        id: String,
        content: String,
    }

    impl ConfigItem for DummyConfigInfo {
        type Err = DummyError;

        fn id(&self) -> &str {
            &self.id
        }

        fn check_conflicts(&self, other: &Self) -> Result<(), DummyError> {
            if self.id == other.id || self.content == other.content {
                Err(DummyError::Exist)
            } else {
                Ok(())
            }
        }
    }

    type DummyConfigInfos = ConfigInfos<DummyConfigInfo>;

    #[test]
    fn test_insert_config_info() {
        let mut configs = DummyConfigInfos::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert(config1).unwrap();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");

        // Test case: cannot insert new item with the same id.
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.insert(config2).unwrap_err();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");

        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert(config3).unwrap();
        assert_eq!(configs.configs.len(), 2);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");
        assert_eq!(configs.configs[1].id, "2");
        assert_eq!(configs.configs[1].content, "c");

        // Test case: cannot insert new item with the same content.
        let config4 = DummyConfigInfo {
            id: "3".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert(config4).unwrap_err();
        assert_eq!(configs.configs.len(), 2);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");
        assert_eq!(configs.configs[1].id, "2");
        assert_eq!(configs.configs[1].content, "c");
    }

    #[test]
    fn test_update_config_info() {
        let mut configs = DummyConfigInfos::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert(config1).unwrap();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");

        // Test case: succeed to update an existing entry
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.update(config2, DummyError::Exist).unwrap();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");

        // Test case: cannot update a non-existing entry
        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.update(config3, DummyError::Exist).unwrap_err();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");

        // Test case: cannot update an entry with conflicting content
        let config4 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert(config4).unwrap();
        let config5 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "c".to_owned(),
        };
        configs.update(config5, DummyError::Exist).unwrap_err();
    }

    #[test]
    fn test_insert_or_update_config_info() {
        let mut configs = DummyConfigInfos::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert_or_update(config1).unwrap();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "a");

        // Test case: succeed to update an existing entry
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.insert_or_update(config2.clone()).unwrap();
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");

        // Add a second entry
        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(config3.clone()).unwrap();
        assert_eq!(configs.configs.len(), 2);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");
        assert_eq!(configs.configs[1].id, "2");
        assert_eq!(configs.configs[1].content, "c");

        // Lookup the first entry
        let config4 = configs
            .get_by_id(&DummyConfigInfo {
                id: "1".to_owned(),
                content: "b".to_owned(),
            })
            .unwrap();
        assert_eq!(config4.id, config2.id);
        assert_eq!(config4.content, config2.content);

        // Lookup the second entry
        let config5 = configs
            .get_by_id(&DummyConfigInfo {
                id: "2".to_owned(),
                content: "c".to_owned(),
            })
            .unwrap();
        assert_eq!(config5.id, config3.id);
        assert_eq!(config5.content, config3.content);

        // Test case: can't insert an entry with conflicting content
        let config6 = DummyConfigInfo {
            id: "3".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(config6).unwrap_err();
        assert_eq!(configs.configs.len(), 2);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");
        assert_eq!(configs.configs[1].id, "2");
        assert_eq!(configs.configs[1].content, "c");
    }

    #[test]
    fn test_remove_config_info() {
        let mut configs = DummyConfigInfos::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert_or_update(config1).unwrap();
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.insert_or_update(config2.clone()).unwrap();
        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(config3.clone()).unwrap();
        assert_eq!(configs.configs.len(), 2);
        assert_eq!(configs.configs[0].id, "1");
        assert_eq!(configs.configs[0].content, "b");
        assert_eq!(configs.configs[1].id, "2");
        assert_eq!(configs.configs[1].content, "c");

        // Removal matches on id only: "content" is ignored by remove().
        let config4 = configs
            .remove(&DummyConfigInfo {
                id: "1".to_owned(),
                content: "no value".to_owned(),
            })
            .unwrap();
        assert_eq!(config4.id, config2.id);
        assert_eq!(config4.content, config2.content);
        assert_eq!(configs.configs.len(), 1);
        assert_eq!(configs.configs[0].id, "2");
        assert_eq!(configs.configs[0].content, "c");

        let config5 = configs
            .remove(&DummyConfigInfo {
                id: "2".to_owned(),
                content: "no value".to_owned(),
            })
            .unwrap();
        assert_eq!(config5.id, config3.id);
        assert_eq!(config5.content, config3.content);
        assert_eq!(configs.configs.len(), 0);
    }

    type DummyDeviceInfoList = DeviceConfigInfos<DummyConfigInfo>;

    #[test]
    fn test_insert_or_update_device_info() {
        let mut configs = DummyDeviceInfoList::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert_or_update(&config1).unwrap();
        assert_eq!(configs.len(), 1);
        assert_eq!(configs[0].config.id, "1");
        assert_eq!(configs[0].config.content, "a");

        // Test case: succeed to update an existing entry
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.insert_or_update(&config2).unwrap();
        assert_eq!(configs.len(), 1);
        assert_eq!(configs[0].config.id, "1");
        assert_eq!(configs[0].config.content, "b");

        // Add a second entry
        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(&config3).unwrap();
        assert_eq!(configs.len(), 2);
        assert_eq!(configs[0].config.id, "1");
        assert_eq!(configs[0].config.content, "b");
        assert_eq!(configs[1].config.id, "2");
        assert_eq!(configs[1].config.content, "c");

        // Lookup the first entry
        let config4_id = configs
            .get_index_by_id(&DummyConfigInfo {
                id: "1".to_owned(),
                content: "b".to_owned(),
            })
            .unwrap();
        let config4 = &configs[config4_id].config;
        assert_eq!(config4.id, config2.id);
        assert_eq!(config4.content, config2.content);

        // Lookup the second entry
        let config5_id = configs
            .get_index_by_id(&DummyConfigInfo {
                id: "2".to_owned(),
                content: "c".to_owned(),
            })
            .unwrap();
        let config5 = &configs[config5_id].config;
        assert_eq!(config5.id, config3.id);
        assert_eq!(config5.content, config3.content);

        // Test case: can't insert an entry with conflicting content
        let config6 = DummyConfigInfo {
            id: "3".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(&config6).unwrap_err();
        assert_eq!(configs.len(), 2);
        assert_eq!(configs[0].config.id, "1");
        assert_eq!(configs[0].config.content, "b");
        assert_eq!(configs[1].config.id, "2");
        assert_eq!(configs[1].config.content, "c");
    }

    #[test]
    fn test_remove_device_info() {
        let mut configs = DummyDeviceInfoList::new();

        let config1 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "a".to_owned(),
        };
        configs.insert_or_update(&config1).unwrap();
        let config2 = DummyConfigInfo {
            id: "1".to_owned(),
            content: "b".to_owned(),
        };
        configs.insert_or_update(&config2).unwrap();
        let config3 = DummyConfigInfo {
            id: "2".to_owned(),
            content: "c".to_owned(),
        };
        configs.insert_or_update(&config3).unwrap();
        assert_eq!(configs.len(), 2);
        assert_eq!(configs[0].config.id, "1");
        assert_eq!(configs[0].config.content, "b");
        assert_eq!(configs[1].config.id, "2");
        assert_eq!(configs[1].config.content, "c");

        // Device-info removal is positional, not id-based.
        let config4 = configs.remove(0).unwrap().config;
        assert_eq!(config4.id, config2.id);
        assert_eq!(config4.content, config2.content);
        assert_eq!(configs.len(), 1);
        assert_eq!(configs[0].config.id, "2");
        assert_eq!(configs[0].config.content, "c");

        let config5 = configs.remove(0).unwrap().config;
        assert_eq!(config5.id, config3.id);
        assert_eq!(config5.content, config3.content);
        assert_eq!(configs.len(), 0);
    }
}
|
||||
773
src/dragonball/src/device_manager/blk_dev_mgr.rs
Normal file
773
src/dragonball/src/device_manager/blk_dev_mgr.rs
Normal file
@@ -0,0 +1,773 @@
|
||||
// Copyright 2020-2022 Alibaba, Inc. or its affiliates. All Rights Reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the THIRD-PARTY file.
|
||||
|
||||
//! Device manager for virtio-blk and vhost-user-blk devices.
|
||||
use std::collections::{vec_deque, VecDeque};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::OpenOptions;
|
||||
use std::os::unix::fs::OpenOptionsExt;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use dbs_virtio_devices as virtio;
|
||||
use dbs_virtio_devices::block::{aio::Aio, io_uring::IoUring, Block, LocalFile, Ufile};
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
use crate::address_space_manager::GuestAddressSpaceImpl;
|
||||
use crate::config_manager::{ConfigItem, DeviceConfigInfo, RateLimiterConfigInfo};
|
||||
use crate::device_manager::blk_dev_mgr::BlockDeviceError::InvalidDeviceId;
|
||||
use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext};
|
||||
use crate::get_bucket_update;
|
||||
use crate::vm::KernelConfigInfo;
|
||||
|
||||
use super::DbsMmioV2Device;
|
||||
|
||||
// The flag of whether to use the shared irq.
|
||||
const USE_SHARED_IRQ: bool = true;
|
||||
// The flag of whether to use the generic irq.
|
||||
const USE_GENERIC_IRQ: bool = true;
|
||||
|
||||
// Wrapper over slog::info! that tags every record with the
// "block_manager" subsystem key so log lines are attributable.
macro_rules! info(
    ($l:expr, $($args:tt)+) => {
        slog::info!($l, $($args)+; slog::o!("subsystem" => "block_manager"))
    };
);
|
||||
|
||||
// Wrapper over slog::error! that tags every record with the
// "block_manager" subsystem key so log lines are attributable.
macro_rules! error(
    ($l:expr, $($args:tt)+) => {
        slog::error!($l, $($args)+; slog::o!("subsystem" => "block_manager"))
    };
);
|
||||
|
||||
/// Default queue size for VirtIo block devices.
|
||||
pub const QUEUE_SIZE: u16 = 128;
|
||||
|
||||
/// Errors associated with the operations allowed on a drive.
#[derive(Debug, thiserror::Error)]
pub enum BlockDeviceError {
    /// Invalid VM instance ID.
    #[error("invalid VM instance id")]
    InvalidVMID,

    /// The block device path is invalid.
    #[error("invalid block device path '{0}'")]
    InvalidBlockDevicePath(PathBuf),

    /// The block device type is invalid.
    #[error("invalid block device type")]
    InvalidBlockDeviceType,

    /// The block device path was already used for a different drive.
    #[error("block device path '{0}' already exists")]
    BlockDevicePathAlreadyExists(PathBuf),

    /// The device id doesn't exist.
    #[error("invalid block device id '{0}'")]
    InvalidDeviceId(String),

    /// Cannot perform the requested operation after booting the microVM.
    #[error("block device does not support runtime update")]
    UpdateNotAllowedPostBoot,

    /// A root block device was already added.
    #[error("could not add multiple virtual machine root devices")]
    RootBlockDeviceAlreadyAdded,

    /// Failed to send patch message to block epoll handler.
    // NOTE(review): variant name has a typo ("Hander" vs "Handler"); renaming
    // would break downstream matches, so it is documented instead of fixed.
    #[error("could not send patch message to the block epoll handler")]
    BlockEpollHanderSendFail,

    /// Failure from device manager,
    #[error("device manager errors: {0}")]
    DeviceManager(#[from] DeviceMgrError),

    /// Failure from virtio subsystem.
    #[error(transparent)]
    Virtio(virtio::Error),

    /// Unable to seek the block device backing file due to invalid permissions or
    /// the file was deleted/corrupted.
    #[error("cannot create block device: {0}")]
    CreateBlockDevice(#[source] virtio::Error),

    /// Cannot open the block device backing file.
    #[error("cannot open the block device backing file: {0}")]
    OpenBlockDevice(#[source] std::io::Error),

    /// Cannot initialize a MMIO Block Device or add a device to the MMIO Bus.
    #[error("failure while registering block device: {0}")]
    RegisterBlockDevice(#[source] DeviceMgrError),
}
|
||||
|
||||
/// Type of low level storage device/protocol for virtio-blk devices.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub enum BlockDeviceType {
    /// Unknown low level device type.
    Unknown,
    /// Vhost-user-blk based low level device.
    /// SPOOL is a reliable NVMe virtualization system for the cloud environment.
    /// You could learn more SPOOL here: https://www.usenix.org/conference/atc20/presentation/xue
    Spool,
    /// Local disk/file based low level device.
    RawBlock,
}
|
||||
|
||||
impl BlockDeviceType {
|
||||
/// Get type of low level storage device/protocol by parsing `path`.
|
||||
pub fn get_type(path: &str) -> BlockDeviceType {
|
||||
// SPOOL path should be started with "spool", e.g. "spool:/device1"
|
||||
if path.starts_with("spool:/") {
|
||||
BlockDeviceType::Spool
|
||||
} else {
|
||||
BlockDeviceType::RawBlock
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Runtime-update request for a block device: identifies the drive and
/// optionally carries a new rate-limiter configuration.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct BlockDeviceConfigUpdateInfo {
    /// Unique identifier of the drive.
    pub drive_id: String,
    /// Rate Limiter for I/O operations.
    pub rate_limiter: Option<RateLimiterConfigInfo>,
}
|
||||
|
||||
impl BlockDeviceConfigUpdateInfo {
    /// Provides a `BucketUpdate` description for the bandwidth rate limiter.
    pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate {
        get_bucket_update!(self, rate_limiter, bandwidth)
    }
    /// Provides a `BucketUpdate` description for the ops rate limiter.
    pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate {
        get_bucket_update!(self, rate_limiter, ops)
    }
}
|
||||
|
||||
/// Configuration information for a block device.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct BlockDeviceConfigInfo {
    /// Unique identifier of the drive.
    pub drive_id: String,
    /// Type of low level storage/protocol.
    pub device_type: BlockDeviceType,
    /// Path of the drive.
    pub path_on_host: PathBuf,
    /// If set to true, it makes the current device the root block device.
    /// Setting this flag to true will mount the block device in the
    /// guest under /dev/vda unless the part_uuid is present.
    pub is_root_device: bool,
    /// Part-UUID. Represents the unique id of the boot partition of this device.
    /// It is optional and it will be used only if the `is_root_device` field is true.
    pub part_uuid: Option<String>,
    /// If set to true, the drive is opened in read-only mode. Otherwise, the
    /// drive is opened as read-write.
    pub is_read_only: bool,
    /// If set to false, the drive is opened with buffered I/O mode. Otherwise, the
    /// drive is opened with direct I/O mode.
    pub is_direct: bool,
    /// Don't close `path_on_host` file when dropping the device.
    pub no_drop: bool,
    /// Block device multi-queue
    pub num_queues: usize,
    /// Virtio queue size. Size: byte
    pub queue_size: u16,
    /// Rate Limiter for I/O operations.
    pub rate_limiter: Option<RateLimiterConfigInfo>,
    /// Use shared irq
    pub use_shared_irq: Option<bool>,
    /// Use generic irq
    pub use_generic_irq: Option<bool>,
}
|
||||
|
||||
impl std::default::Default for BlockDeviceConfigInfo {
    /// Default config: an anonymous, non-root, read-write raw block device
    /// with a single queue of depth 256, direct I/O, no rate limiting and no
    /// IRQ overrides.
    fn default() -> Self {
        Self {
            drive_id: String::default(),
            device_type: BlockDeviceType::RawBlock,
            path_on_host: PathBuf::default(),
            is_root_device: false,
            part_uuid: None,
            is_read_only: false,
            is_direct: Self::default_direct(),
            no_drop: Self::default_no_drop(),
            num_queues: Self::default_num_queues(),
            queue_size: 256,
            rate_limiter: None,
            use_shared_irq: None,
            use_generic_irq: None,
        }
    }
}
|
||||
|
||||
impl BlockDeviceConfigInfo {
|
||||
/// Get default queue numbers
|
||||
pub fn default_num_queues() -> usize {
|
||||
1
|
||||
}
|
||||
|
||||
/// Get default value of is_direct switch
|
||||
pub fn default_direct() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// Get default value of no_drop switch
|
||||
pub fn default_no_drop() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Get type of low level storage/protocol.
|
||||
pub fn device_type(&self) -> BlockDeviceType {
|
||||
self.device_type
|
||||
}
|
||||
|
||||
/// Returns a reference to `path_on_host`.
|
||||
pub fn path_on_host(&self) -> &PathBuf {
|
||||
&self.path_on_host
|
||||
}
|
||||
|
||||
/// Returns a reference to the part_uuid.
|
||||
pub fn get_part_uuid(&self) -> Option<&String> {
|
||||
self.part_uuid.as_ref()
|
||||
}
|
||||
|
||||
/// Checks whether the drive had read only permissions.
|
||||
pub fn is_read_only(&self) -> bool {
|
||||
self.is_read_only
|
||||
}
|
||||
|
||||
/// Checks whether the drive uses direct I/O
|
||||
pub fn is_direct(&self) -> bool {
|
||||
self.is_direct
|
||||
}
|
||||
|
||||
/// Get number and size of queues supported.
|
||||
pub fn queue_sizes(&self) -> Vec<u16> {
|
||||
(0..self.num_queues)
|
||||
.map(|_| self.queue_size)
|
||||
.collect::<Vec<u16>>()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConfigItem for BlockDeviceConfigInfo {
    type Err = BlockDeviceError;

    /// A block device config is identified by its drive id.
    fn id(&self) -> &str {
        &self.drive_id
    }

    /// Two distinct drives must not share the same host path. A config with
    /// the same drive_id never conflicts — that case is an in-place update.
    fn check_conflicts(&self, other: &Self) -> Result<(), BlockDeviceError> {
        if self.drive_id == other.drive_id {
            Ok(())
        } else if self.path_on_host == other.path_on_host {
            Err(BlockDeviceError::BlockDevicePathAlreadyExists(
                self.path_on_host.clone(),
            ))
        } else {
            Ok(())
        }
    }
}
|
||||
|
||||
impl std::fmt::Debug for BlockDeviceInfo {
    // Manual impl: only the config is printed; the device handle is omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self.config)
    }
}
|
||||
|
||||
/// Block Device Info
pub type BlockDeviceInfo = DeviceConfigInfo<BlockDeviceConfigInfo>;

/// Wrapper for the collection that holds all the Block Devices Configs
//#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
#[derive(Clone)]
pub struct BlockDeviceMgr {
    /// A list of `BlockDeviceInfo` objects. The root device, if any, is kept
    /// at the front of the list.
    info_list: VecDeque<BlockDeviceInfo>,
    // Whether a root block device has been configured.
    has_root_block: bool,
    // Whether the root device is identified by a partition UUID.
    has_part_uuid_root: bool,
    // Whether the root device is read-only.
    read_only_root: bool,
    // Partition UUID of the root device, if one was supplied.
    part_uuid: Option<String>,
    // Default for devices that do not set `use_shared_irq` themselves.
    use_shared_irq: bool,
}
|
||||
|
||||
impl BlockDeviceMgr {
|
||||
/// returns a front-to-back iterator.
|
||||
pub fn iter(&self) -> vec_deque::Iter<BlockDeviceInfo> {
|
||||
self.info_list.iter()
|
||||
}
|
||||
|
||||
/// Checks whether any of the added BlockDevice is the root.
|
||||
pub fn has_root_block_device(&self) -> bool {
|
||||
self.has_root_block
|
||||
}
|
||||
|
||||
/// Checks whether the root device is configured using a part UUID.
|
||||
pub fn has_part_uuid_root(&self) -> bool {
|
||||
self.has_part_uuid_root
|
||||
}
|
||||
|
||||
/// Checks whether the root device has read-only permisssions.
|
||||
pub fn is_read_only_root(&self) -> bool {
|
||||
self.read_only_root
|
||||
}
|
||||
|
||||
/// Gets the index of the device with the specified `drive_id` if it exists in the list.
|
||||
pub fn get_index_of_drive_id(&self, id: &str) -> Option<usize> {
|
||||
self.info_list
|
||||
.iter()
|
||||
.position(|info| info.config.id().eq(id))
|
||||
}
|
||||
|
||||
/// Gets the 'BlockDeviceConfigInfo' of the device with the specified `drive_id` if it exists in the list.
|
||||
pub fn get_config_of_drive_id(&self, drive_id: &str) -> Option<BlockDeviceConfigInfo> {
|
||||
match self.get_index_of_drive_id(drive_id) {
|
||||
Some(index) => {
|
||||
let config = self.info_list.get(index).unwrap().config.clone();
|
||||
Some(config)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Inserts `block_device_config` in the block device configuration list.
/// If an entry with the same id already exists, it will attempt to update
/// the existing entry.
/// Inserting a secondary root block device will fail.
pub fn insert_device(
    device_mgr: &mut DeviceManager,
    mut ctx: DeviceOpContext,
    config: BlockDeviceConfigInfo,
) -> std::result::Result<(), BlockDeviceError> {
    // Post-boot insertion is only possible with the "hotplug" feature compiled in.
    if !cfg!(feature = "hotplug") && ctx.is_hotplug {
        return Err(BlockDeviceError::UpdateNotAllowedPostBoot);
    }

    let mgr = &mut device_mgr.block_manager;

    // If the id of the drive already exists in the list, the operation is update.
    match mgr.get_index_of_drive_id(config.id()) {
        Some(index) => {
            // No support for runtime update yet.
            if ctx.is_hotplug {
                Err(BlockDeviceError::BlockDevicePathAlreadyExists(
                    config.path_on_host.clone(),
                ))
            } else {
                // The new config may only clash with *other* entries
                // (same host path); skip the entry being replaced.
                for (idx, info) in mgr.info_list.iter().enumerate() {
                    if idx != index {
                        info.config.check_conflicts(&config)?;
                    }
                }
                mgr.update(index, config)
            }
        }
        None => {
            for info in mgr.info_list.iter() {
                info.config.check_conflicts(&config)?;
            }
            let index = mgr.create(config.clone())?;
            // At boot time only the config is recorded; the device itself
            // is created later by `attach_devices()`.
            if !ctx.is_hotplug {
                return Ok(());
            }

            // Hot-plug path: instantiate and register the device now.
            match config.device_type {
                BlockDeviceType::RawBlock => {
                    let device = Self::create_blk_device(&config, &mut ctx)
                        .map_err(BlockDeviceError::Virtio)?;
                    let dev = DeviceManager::create_mmio_virtio_device(
                        device,
                        &mut ctx,
                        config.use_shared_irq.unwrap_or(mgr.use_shared_irq),
                        config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
                    )
                    .map_err(BlockDeviceError::DeviceManager)?;
                    mgr.update_device_by_index(index, Arc::clone(&dev))?;
                    // live-upgrade need save/restore device from info.device.
                    mgr.info_list[index].set_device(dev.clone());
                    // On failure, roll back the half-inserted device before
                    // reporting the error.
                    // NOTE(review): the `unwrap()` on the rollback panics if
                    // `remove_device` itself fails — confirm this is intended.
                    ctx.insert_hotplug_mmio_device(&dev, None).map_err(|e| {
                        let logger = ctx.logger().new(slog::o!());
                        BlockDeviceMgr::remove_device(device_mgr, ctx, &config.drive_id)
                            .unwrap();
                        error!(
                            logger,
                            "failed to hot-add virtio block device {}, {:?}",
                            &config.drive_id,
                            e
                        );
                        BlockDeviceError::DeviceManager(e)
                    })
                }
                // Only RawBlock devices support hotplug here.
                _ => Err(BlockDeviceError::InvalidBlockDeviceType),
            }
        }
    }
}
|
||||
|
||||
/// Attaches all block devices from the BlockDevicesConfig.
///
/// Called at boot time: every previously recorded config gets a live
/// virtio-blk MMIO device; non-RawBlock entries are rejected.
pub fn attach_devices(
    &mut self,
    ctx: &mut DeviceOpContext,
) -> std::result::Result<(), BlockDeviceError> {
    for info in self.info_list.iter_mut() {
        match info.config.device_type {
            BlockDeviceType::RawBlock => {
                info!(
                    ctx.logger(),
                    "attach virtio-blk device, drive_id {}, path {}",
                    info.config.drive_id,
                    info.config.path_on_host.to_str().unwrap_or("<unknown>")
                );
                let device = Self::create_blk_device(&info.config, ctx)
                    .map_err(BlockDeviceError::Virtio)?;
                let device = DeviceManager::create_mmio_virtio_device(
                    device,
                    ctx,
                    // Per-device IRQ settings fall back to manager/global defaults.
                    info.config.use_shared_irq.unwrap_or(self.use_shared_irq),
                    info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
                )
                .map_err(BlockDeviceError::RegisterBlockDevice)?;
                info.device = Some(device);
            }
            // Any other device type cannot be attached here.
            _ => {
                return Err(BlockDeviceError::OpenBlockDevice(
                    std::io::Error::from_raw_os_error(libc::EINVAL),
                ));
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Removes all virtio-blk devices
pub fn remove_devices(&mut self, ctx: &mut DeviceOpContext) -> Result<(), DeviceMgrError> {
    // Pop from the back so the root device (kept at the front) goes last.
    while let Some(mut info) = self.info_list.pop_back() {
        info!(ctx.logger(), "remove drive {}", info.config.drive_id);
        if let Some(device) = info.device.take() {
            DeviceManager::destroy_mmio_virtio_device(device, ctx)?;
        }
    }

    Ok(())
}

// Remove the entry with `drive_id` from the list, returning it if present.
fn remove(&mut self, drive_id: &str) -> Option<BlockDeviceInfo> {
    match self.get_index_of_drive_id(drive_id) {
        Some(index) => self.info_list.remove(index),
        None => None,
    }
}

/// remove a block device, it basically is the inverse operation of `insert_device`
pub fn remove_device(
    dev_mgr: &mut DeviceManager,
    mut ctx: DeviceOpContext,
    drive_id: &str,
) -> std::result::Result<(), BlockDeviceError> {
    // Removal at runtime is only possible with the "hotplug" feature.
    if !cfg!(feature = "hotplug") {
        return Err(BlockDeviceError::UpdateNotAllowedPostBoot);
    }

    let mgr = &mut dev_mgr.block_manager;
    match mgr.remove(drive_id) {
        Some(mut info) => {
            info!(ctx.logger(), "remove drive {}", info.config.drive_id);
            // Destroy the live device if one was attached.
            if let Some(device) = info.device.take() {
                DeviceManager::destroy_mmio_virtio_device(device, &mut ctx)
                    .map_err(BlockDeviceError::DeviceManager)?;
            }
        }
        None => return Err(BlockDeviceError::InvalidDeviceId(drive_id.to_owned())),
    }

    Ok(())
}
|
||||
|
||||
// Build the virtio block backend for `cfg`: one backing file per queue,
// driven by io_uring when the host supports it, otherwise by Aio.
fn create_blk_device(
    cfg: &BlockDeviceConfigInfo,
    ctx: &mut DeviceOpContext,
) -> std::result::Result<Box<Block<GuestAddressSpaceImpl>>, virtio::Error> {
    let epoll_mgr = ctx.epoll_mgr.clone().ok_or(virtio::Error::InvalidInput)?;

    let mut block_files: Vec<Box<dyn Ufile>> = vec![];

    match cfg.device_type {
        BlockDeviceType::RawBlock => {
            // O_DIRECT bypasses the host page cache when direct I/O is requested.
            let custom_flags = if cfg.is_direct() {
                info!(
                    ctx.logger(),
                    "Open block device \"{}\" in direct mode.",
                    cfg.path_on_host().display()
                );
                libc::O_DIRECT
            } else {
                info!(
                    ctx.logger(),
                    "Open block device \"{}\" in buffer mode.",
                    cfg.path_on_host().display(),
                );
                0
            };
            let io_uring_supported = IoUring::is_supported();
            // Each queue gets its own file descriptor and its own io engine.
            for i in 0..cfg.num_queues {
                let queue_size = cfg.queue_sizes()[i] as u32;
                let file = OpenOptions::new()
                    .read(true)
                    .custom_flags(custom_flags)
                    .write(!cfg.is_read_only())
                    .open(cfg.path_on_host())?;
                info!(ctx.logger(), "Queue {}: block file opened", i);

                if io_uring_supported {
                    info!(
                        ctx.logger(),
                        "Queue {}: Using io_uring Raw disk file, queue size {}.", i, queue_size
                    );
                    let io_engine = IoUring::new(file.as_raw_fd(), queue_size)?;
                    block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?));
                } else {
                    info!(
                        ctx.logger(),
                        "Queue {}: Since io_uring_supported is not enabled, change to default support of Aio Raw disk file, queue size {}", i, queue_size
                    );
                    let io_engine = Aio::new(file.as_raw_fd(), queue_size)?;
                    block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?));
                }
            }
        }
        _ => {
            error!(
                ctx.logger(),
                "invalid block device type: {:?}", cfg.device_type
            );
            return Err(virtio::Error::InvalidInput);
        }
    };

    // One rate limiter per queue, each cloned from the config and resized
    // so the total budget is split across the queues.
    // NOTE(review): `try_into().unwrap()` panics on an invalid limiter
    // config — confirm configs are validated before they reach this point.
    let mut limiters = vec![];
    for _i in 0..cfg.num_queues {
        if let Some(limiter) = cfg.rate_limiter.clone().map(|mut v| {
            v.resize(cfg.num_queues as u64);
            v.try_into().unwrap()
        }) {
            limiters.push(limiter);
        }
    }

    Ok(Box::new(Block::new(
        block_files,
        cfg.is_read_only,
        Arc::new(cfg.queue_sizes()),
        epoll_mgr,
        limiters,
    )?))
}
|
||||
|
||||
/// Generated guest kernel commandline related to root block device.
pub fn generate_kernel_boot_args(
    &self,
    kernel_config: &mut KernelConfigInfo,
) -> std::result::Result<(), DeviceMgrError> {
    // Respect user configuration if kernel_cmdline contains "root=",
    // special attention for the case when kernel command line starting with "root=xxx".
    // The prepended space makes " root=" also match a "root=" at position 0.
    let old_kernel_cmdline = format!(" {}", kernel_config.kernel_cmdline().as_str());
    if !old_kernel_cmdline.contains(" root=") && self.has_root_block {
        let cmdline = kernel_config.kernel_cmdline_mut();
        if let Some(ref uuid) = self.part_uuid {
            // NOTE(review): mainline Linux expects "root=PARTUUID=<uuid>";
            // confirm the guest kernel understands the "PART_UUID=" spelling.
            cmdline
                .insert("root", &format!("PART_UUID={}", uuid))
                .map_err(DeviceMgrError::Cmdline)?;
        } else {
            // No partition UUID: the root device is mounted as /dev/vda.
            cmdline
                .insert("root", "/dev/vda")
                .map_err(DeviceMgrError::Cmdline)?;
        }
        if self.read_only_root {
            // A read-only root contradicts an explicit "rw" on the cmdline.
            if old_kernel_cmdline.contains(" rw") {
                return Err(DeviceMgrError::InvalidOperation);
            }
            cmdline.insert_str("ro").map_err(DeviceMgrError::Cmdline)?;
        }
    }

    Ok(())
}
|
||||
|
||||
/// insert a block device's config. return index on success.
fn create(
    &mut self,
    block_device_config: BlockDeviceConfigInfo,
) -> std::result::Result<usize, BlockDeviceError> {
    // RawBlock devices must have an existing backing file.
    self.check_data_file_present(&block_device_config)?;
    // Reject duplicate host paths.
    if self
        .get_index_of_drive_path(&block_device_config.path_on_host)
        .is_some()
    {
        return Err(BlockDeviceError::BlockDevicePathAlreadyExists(
            block_device_config.path_on_host,
        ));
    }

    // check whether the Device Config belongs to a root device
    // we need to satisfy the condition by which a VMM can only have one root device
    if block_device_config.is_root_device {
        if self.has_root_block {
            return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded);
        } else {
            self.has_root_block = true;
            self.read_only_root = block_device_config.is_read_only;
            self.has_part_uuid_root = block_device_config.part_uuid.is_some();
            self.part_uuid = block_device_config.part_uuid.clone();
            // Root Device should be the first in the list whether or not PART_UUID is specified
            // in order to avoid bugs in case of switching from part_uuid boot scenarios to
            // /dev/vda boot type.
            self.info_list
                .push_front(BlockDeviceInfo::new(block_device_config));
            Ok(0)
        }
    } else {
        // Non-root devices are appended; their index is the new last slot.
        self.info_list
            .push_back(BlockDeviceInfo::new(block_device_config));
        Ok(self.info_list.len() - 1)
    }
}
|
||||
|
||||
/// Updates a Block Device Config. The update fails if it would result in two
/// root block devices.
fn update(
    &mut self,
    mut index: usize,
    new_config: BlockDeviceConfigInfo,
) -> std::result::Result<(), BlockDeviceError> {
    // Check if the path exists
    self.check_data_file_present(&new_config)?;
    // The new path may only collide with the entry being updated itself.
    if let Some(idx) = self.get_index_of_drive_path(&new_config.path_on_host) {
        if idx != index {
            return Err(BlockDeviceError::BlockDevicePathAlreadyExists(
                new_config.path_on_host.clone(),
            ));
        }
    }

    if self.info_list.get(index).is_none() {
        return Err(InvalidDeviceId(index.to_string()));
    }
    // Check if the root block device is being updated.
    if self.info_list[index].config.is_root_device {
        // Updating the current root: it may also stop being root here.
        self.has_root_block = new_config.is_root_device;
        self.read_only_root = new_config.is_root_device && new_config.is_read_only;
        self.has_part_uuid_root = new_config.part_uuid.is_some();
        self.part_uuid = new_config.part_uuid.clone();
    } else if new_config.is_root_device {
        // Check if a second root block device is being added.
        if self.has_root_block {
            return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded);
        } else {
            // One of the non-root blocks is becoming root.
            self.has_root_block = true;
            self.read_only_root = new_config.is_read_only;
            self.has_part_uuid_root = new_config.part_uuid.is_some();
            self.part_uuid = new_config.part_uuid.clone();

            // Make sure the root device is on the first position.
            self.info_list.swap(0, index);
            // Block config to be updated has moved to first position.
            index = 0;
        }
    }
    // Update the config.
    self.info_list[index].config = new_config;

    Ok(())
}
|
||||
|
||||
fn check_data_file_present(
|
||||
&self,
|
||||
block_device_config: &BlockDeviceConfigInfo,
|
||||
) -> std::result::Result<(), BlockDeviceError> {
|
||||
if block_device_config.device_type == BlockDeviceType::RawBlock
|
||||
&& !block_device_config.path_on_host.exists()
|
||||
{
|
||||
Err(BlockDeviceError::InvalidBlockDevicePath(
|
||||
block_device_config.path_on_host.clone(),
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_index_of_drive_path(&self, drive_path: &Path) -> Option<usize> {
|
||||
self.info_list
|
||||
.iter()
|
||||
.position(|info| info.config.path_on_host.eq(drive_path))
|
||||
}
|
||||
|
||||
/// update device information in `info_list`. The caller of this method is
/// `insert_device` when hotplug is true.
pub fn update_device_by_index(
    &mut self,
    index: usize,
    device: Arc<DbsMmioV2Device>,
) -> Result<(), BlockDeviceError> {
    if let Some(info) = self.info_list.get_mut(index) {
        info.device = Some(device);
        return Ok(());
    }

    // Index out of range: report an (anonymous) invalid device id.
    Err(BlockDeviceError::InvalidDeviceId("".to_owned()))
}
|
||||
|
||||
/// Update the ratelimiter settings of a virtio blk device.
pub fn update_device_ratelimiters(
    device_mgr: &mut DeviceManager,
    new_cfg: BlockDeviceConfigUpdateInfo,
) -> std::result::Result<(), BlockDeviceError> {
    let mgr = &mut device_mgr.block_manager;
    match mgr.get_index_of_drive_id(&new_cfg.drive_id) {
        Some(index) => {
            // Record the new limits in the stored config first.
            let config = &mut mgr.info_list[index].config;
            config.rate_limiter = new_cfg.rate_limiter.clone();
            let device = mgr.info_list[index]
                .device
                .as_mut()
                .ok_or_else(|| BlockDeviceError::InvalidDeviceId("".to_owned()))?;
            // Then push the update down to the live device: the stored handle
            // must be an MMIO device wrapping a virtio Block for the patch to
            // be delivered; otherwise this silently succeeds.
            if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
                let guard = mmio_dev.state();
                let inner_dev = guard.get_inner_device();
                if let Some(blk_dev) = inner_dev
                    .as_any()
                    .downcast_ref::<virtio::block::Block<GuestAddressSpaceImpl>>()
                {
                    return blk_dev
                        .set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops())
                        .map(|_p| ())
                        .map_err(|_e| BlockDeviceError::BlockEpollHanderSendFail);
                }
            }
            Ok(())
        }
        None => Err(BlockDeviceError::InvalidDeviceId(new_cfg.drive_id)),
    }
}
|
||||
}
|
||||
|
||||
impl Default for BlockDeviceMgr {
|
||||
/// Constructor for the BlockDeviceMgr. It initializes an empty LinkedList.
|
||||
fn default() -> BlockDeviceMgr {
|
||||
BlockDeviceMgr {
|
||||
info_list: VecDeque::<BlockDeviceInfo>::new(),
|
||||
has_root_block: false,
|
||||
has_part_uuid_root: false,
|
||||
read_only_root: false,
|
||||
part_uuid: None,
|
||||
use_shared_irq: USE_SHARED_IRQ,
|
||||
}
|
||||
}
|
||||
}
|
||||
--- New file: src/dragonball/src/device_manager/console_manager.rs (440 lines) ---
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the THIRD-PARTY file.
|
||||
|
||||
//! Virtual machine console device manager.
|
||||
//!
|
||||
//! A virtual console is composed of two parts: a frontend in the virtual machine and a backend in the
|
||||
//! host OS. A frontend may be serial port, virtio-console etc, a backend may be stdio or Unix
|
||||
//! domain socket. The manager connects the frontend with the backend.
|
||||
use std::io::{self, Read};
|
||||
use std::os::unix::net::{UnixListener, UnixStream};
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use dbs_legacy_devices::{ConsoleHandler, SerialDevice};
|
||||
use dbs_utils::epoll_manager::{
|
||||
EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId,
|
||||
};
|
||||
use vmm_sys_util::terminal::Terminal;
|
||||
|
||||
use super::{DeviceMgrError, Result};
|
||||
|
||||
// Epoll slot for the serial port's Unix listener socket (new connections).
const EPOLL_EVENT_SERIAL: u32 = 0;
// Epoll slot for data arriving on an accepted serial connection.
const EPOLL_EVENT_SERIAL_DATA: u32 = 1;
// Epoll slot for data arriving on the host's stdin.
const EPOLL_EVENT_STDIN: u32 = 2;
// Maximal backend throughput for every data transaction.
const MAX_BACKEND_THROUGHPUT: usize = 64;
|
||||
|
||||
/// Errors related to Console manager operations.
#[derive(Debug, thiserror::Error)]
pub enum ConsoleManagerError {
    /// Cannot create unix domain socket for serial port
    #[error("cannot create socket for serial console")]
    CreateSerialSock(#[source] std::io::Error),

    /// An operation on the epoll instance failed due to resource exhaustion or bad configuration.
    #[error("failure while managing epoll event for console fd")]
    EpollMgr(#[source] dbs_utils::epoll_manager::Error),

    /// Cannot set mode for terminal (raw mode on create, canonical mode on reset).
    #[error("failure while setting attribute for terminal")]
    StdinHandle(#[source] vmm_sys_util::errno::Error),
}
|
||||
|
||||
// The host side of a console: either the process's stdin or a Unix domain
// socket identified by its filesystem path.
enum Backend {
    StdinHandle(std::io::Stdin),
    SockPath(String),
}
|
||||
|
||||
/// Console manager to manage frontend and backend console devices.
pub struct ConsoleManager {
    // Epoll manager the console event handler is registered with.
    epoll_mgr: EpollManager,
    logger: slog::Logger,
    // Id of the registered epoll subscriber, once a backend is created.
    subscriber_id: Option<SubscriberId>,
    // Active backend (stdio or Unix socket), if any.
    backend: Option<Backend>,
}
|
||||
|
||||
impl ConsoleManager {
|
||||
/// Create a console manager instance.
pub fn new(epoll_mgr: EpollManager, logger: &slog::Logger) -> Self {
    let logger = logger.new(slog::o!("subsystem" => "console_manager"));
    ConsoleManager {
        epoll_mgr,
        logger,
        subscriber_id: Default::default(),
        backend: None,
    }
}

/// Create a console backend device by using stdio streams.
///
/// Switches the host terminal to raw mode; `reset_console()` restores
/// canonical mode.
pub fn create_stdio_console(&mut self, device: Arc<Mutex<SerialDevice>>) -> Result<()> {
    let stdin_handle = std::io::stdin();
    stdin_handle
        .lock()
        .set_raw_mode()
        .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?;

    let handler = ConsoleEpollHandler::new(device, Some(stdin_handle), None, &self.logger);
    self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler)));
    self.backend = Some(Backend::StdinHandle(std::io::stdin()));

    Ok(())
}

/// Create a console backend device by using Unix Domain socket.
pub fn create_socket_console(
    &mut self,
    device: Arc<Mutex<SerialDevice>>,
    sock_path: String,
) -> Result<()> {
    let sock_listener = Self::bind_domain_socket(&sock_path).map_err(|e| {
        DeviceMgrError::ConsoleManager(ConsoleManagerError::CreateSerialSock(e))
    })?;
    let handler = ConsoleEpollHandler::new(device, None, Some(sock_listener), &self.logger);

    self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler)));
    self.backend = Some(Backend::SockPath(sock_path));

    Ok(())
}

/// Reset the host side terminal to canonical mode.
///
/// Only a stdio backend changes the terminal mode, so only that case
/// needs a reset; socket backends are a no-op.
pub fn reset_console(&self) -> Result<()> {
    if let Some(Backend::StdinHandle(stdin_handle)) = self.backend.as_ref() {
        stdin_handle
            .lock()
            .set_canon_mode()
            .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?;
    }

    Ok(())
}
|
||||
|
||||
/// Bind a Unix domain socket listener at `serial_path`.
///
/// A socket file left behind by a previous run must be removed first or the
/// bind fails with `AddrInUse`. A stale socket is NOT a regular file, so the
/// check uses `Path::exists()` rather than `is_file()` (which is false for
/// socket-type files and would leave the stale socket in place).
fn bind_domain_socket(serial_path: &str) -> std::result::Result<UnixListener, std::io::Error> {
    let path = Path::new(serial_path);
    if path.exists() {
        // Best-effort cleanup; if removal fails, bind reports the real error.
        let _ = std::fs::remove_file(serial_path);
    }

    UnixListener::bind(path)
}
|
||||
}
|
||||
|
||||
// Epoll event handler connecting a serial device frontend with a stdin or
// Unix-socket backend.
struct ConsoleEpollHandler {
    device: Arc<Mutex<SerialDevice>>,
    // Present for stdio-backed consoles.
    stdin_handle: Option<std::io::Stdin>,
    // Present for socket-backed consoles.
    sock_listener: Option<UnixListener>,
    // The single currently accepted connection, if any.
    sock_conn: Option<UnixStream>,
    logger: slog::Logger,
}
|
||||
|
||||
impl ConsoleEpollHandler {
|
||||
// Build a handler for either backend kind; exactly one of `stdin_handle`
// and `sock_listener` is expected to be `Some`.
fn new(
    device: Arc<Mutex<SerialDevice>>,
    stdin_handle: Option<std::io::Stdin>,
    sock_listener: Option<UnixListener>,
    logger: &slog::Logger,
) -> Self {
    ConsoleEpollHandler {
        device,
        stdin_handle,
        sock_listener,
        sock_conn: None,
        logger: logger.new(slog::o!("subsystem" => "console_manager")),
    }
}

// Handle a new connection on the Unix listener. Only one connection is
// served at a time; additional connections are accepted and dropped.
fn uds_listener_accept(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
    if self.sock_conn.is_some() {
        slog::warn!(self.logger,
            "UDS for serial port 1 already exists, reject the new connection";
            "subsystem" => "console_mgr",
        );
        // Accept and immediately drop the extra connection.
        let _ = self.sock_listener.as_mut().unwrap().accept();
    } else {
        // Safe to unwrap() because this event is only registered when a
        // sock_listener is present (see init()).
        let (conn_sock, _) = self.sock_listener.as_ref().unwrap().accept()?;
        let events = Events::with_data(&conn_sock, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
        if let Err(e) = ops.add(events) {
            slog::error!(self.logger,
                "failed to register epoll event for serial, {:?}", e;
                "subsystem" => "console_mgr",
            );
            return Err(std::io::Error::last_os_error());
        }

        // One clone feeds the serial device output; the original is kept
        // for reading guest-bound input.
        let conn_sock_copy = conn_sock.try_clone()?;
        // Do not expect a poisoned lock.
        self.device
            .lock()
            .unwrap()
            .set_output_stream(Some(Box::new(conn_sock_copy)));

        self.sock_conn = Some(conn_sock);
    }

    Ok(())
}

// Read pending bytes from the accepted socket connection and feed them to
// the serial device. On EOF or error, the connection is dropped and its
// epoll registration removed.
fn uds_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
    let mut should_drop = true;

    if let Some(conn_sock) = self.sock_conn.as_mut() {
        let mut out = [0u8; MAX_BACKEND_THROUGHPUT];
        match conn_sock.read(&mut out[..]) {
            Ok(0) => {
                // Zero-length read means EOF. Remove this conn sock.
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .set_output_stream(None);
            }
            Ok(count) => {
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .raw_input(&out[..count])?;
                should_drop = false;
            }
            Err(e) => {
                slog::warn!(self.logger,
                    "error while reading serial conn sock: {:?}", e;
                    "subsystem" => "console_mgr"
                );
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .set_output_stream(None);
            }
        }
    }

    if should_drop {
        // NOTE(review): this assert panics if the event fires while no
        // connection is present — confirm that cannot race with removal.
        assert!(self.sock_conn.is_some());
        // Safe to unwrap() because self.sock_conn is Some().
        let sock_conn = self.sock_conn.take().unwrap();
        let events = Events::with_data(&sock_conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
        if let Err(e) = ops.remove(events) {
            slog::error!(self.logger,
                "failed deregister epoll event for UDS, {:?}", e;
                "subsystem" => "console_mgr"
            );
        }
    }

    Ok(())
}

// Read pending bytes from host stdin and feed them to the serial device.
// On EOF or error, the stdin epoll registration is removed.
fn stdio_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
    let mut should_drop = true;

    if let Some(handle) = self.stdin_handle.as_ref() {
        let mut out = [0u8; MAX_BACKEND_THROUGHPUT];
        let stdin_lock = handle.lock();
        match stdin_lock.read_raw(&mut out[..]) {
            Ok(0) => {
                // Zero-length read indicates EOF. Remove from pollables.
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .set_output_stream(None);
            }
            Ok(count) => {
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .raw_input(&out[..count])?;
                should_drop = false;
            }
            Err(e) => {
                slog::warn!(self.logger,
                    "error while reading stdin: {:?}", e;
                    "subsystem" => "console_mgr"
                );
                self.device
                    .lock()
                    .expect("console: poisoned console lock")
                    .set_output_stream(None);
            }
        }
    }

    if should_drop {
        let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN);
        if let Err(e) = ops.remove(events) {
            slog::error!(self.logger,
                "failed to deregister epoll event for stdin, {:?}", e;
                "subsystem" => "console_mgr"
            );
        }
    }

    Ok(())
}
|
||||
}
|
||||
|
||||
impl MutEventSubscriber for ConsoleEpollHandler {
    // Dispatch an epoll event by its slot number to the matching handler.
    fn process(&mut self, events: Events, ops: &mut EventOps) {
        slog::trace!(self.logger, "ConsoleEpollHandler::process()");
        let slot = events.data();
        match slot {
            EPOLL_EVENT_SERIAL => {
                if let Err(e) = self.uds_listener_accept(ops) {
                    slog::warn!(self.logger, "failed to accept incoming connection, {:?}", e);
                }
            }
            EPOLL_EVENT_SERIAL_DATA => {
                if let Err(e) = self.uds_read_in(ops) {
                    slog::warn!(self.logger, "failed to read data from UDS, {:?}", e);
                }
            }
            EPOLL_EVENT_STDIN => {
                if let Err(e) = self.stdio_read_in(ops) {
                    slog::warn!(self.logger, "failed to read data from stdin, {:?}", e);
                }
            }
            _ => slog::error!(self.logger, "unknown epoll slot number {}", slot),
        }
    }

    // Register the fds present on this handler (stdin, Unix listener, and an
    // already-accepted connection, if any) with their epoll slots.
    // Registration failures are logged but not propagated.
    fn init(&mut self, ops: &mut EventOps) {
        slog::trace!(self.logger, "ConsoleEpollHandler::init()");

        if self.stdin_handle.is_some() {
            slog::info!(self.logger, "ConsoleEpollHandler: stdin handler");
            let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN);
            if let Err(e) = ops.add(events) {
                slog::error!(
                    self.logger,
                    "failed to register epoll event for stdin, {:?}",
                    e
                );
            }
        }
        if let Some(sock) = self.sock_listener.as_ref() {
            slog::info!(self.logger, "ConsoleEpollHandler: sock listener");
            let events = Events::with_data(sock, EPOLL_EVENT_SERIAL, EventSet::IN);
            if let Err(e) = ops.add(events) {
                slog::error!(
                    self.logger,
                    "failed to register epoll event for UDS listener, {:?}",
                    e
                );
            }
        }

        if let Some(conn) = self.sock_conn.as_ref() {
            slog::info!(self.logger, "ConsoleEpollHandler: sock connection");
            let events = Events::with_data(conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
            if let Err(e) = ops.add(events) {
                slog::error!(
                    self.logger,
                    "failed to register epoll event for UDS connection, {:?}",
                    e
                );
            }
        }
    }
}
|
||||
|
||||
/// Writer to process guest kernel dmesg.
///
/// Implements `io::Write`; bytes written are reassembled into complete
/// lines and forwarded to the logger (see the `io::Write` impl below).
pub struct DmesgWriter {
    // Accumulates a partial line between write() calls until a '\n'
    // completes it.
    buf: BytesMut,
    // Destination logger; tagged with `"subsystem" => "dmesg"` by `new()`.
    logger: slog::Logger,
}
|
||||
|
||||
impl DmesgWriter {
|
||||
/// Creates a new instance.
|
||||
pub fn new(logger: &slog::Logger) -> Self {
|
||||
Self {
|
||||
buf: BytesMut::with_capacity(1024),
|
||||
logger: logger.new(slog::o!("subsystem" => "dmesg")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for DmesgWriter {
    /// Example hexdump of guest dmesg output:
    ///
    /// 0000000 [ 0 . 0 3 4 9 1 6 ] R
    /// 5b 20 20 20 20 30 2e 30 33 34 39 31 36 5d 20 52
    /// 0000020 u n / s b i n / i n i t a s
    /// 75 6e 20 2f 73 62 69 6e 2f 69 6e 69 74 20 61 73
    /// 0000040 i n i t p r o c e s s \r \n [
    ///
    /// dmesg messages end a line with "\r\n". When redirecting a message to
    /// the logger, the trailing "\r\n" is stripped (via `trim_end`).
    ///
    /// Input is split on '\n'; complete lines (possibly prefixed by bytes
    /// buffered from a previous call) are emitted to the logger, while a
    /// trailing partial line is kept in `self.buf` for the next write.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Splitting on '\n' yields an empty segment wherever a newline
        // terminated the preceding segment, and a final (possibly empty)
        // segment after the last newline.
        let arr: Vec<&[u8]> = buf.split(|c| *c == b'\n').collect();
        let count = arr.len();

        for (i, sub) in arr.iter().enumerate() {
            if sub.is_empty() {
                // A newline immediately followed the previous segment: flush
                // whatever is buffered as one complete line.
                if !self.buf.is_empty() {
                    slog::info!(
                        self.logger,
                        "{}",
                        String::from_utf8_lossy(self.buf.as_ref()).trim_end()
                    );
                    self.buf.clear();
                }
            } else if sub.len() < buf.len() && i < count - 1 {
                // A non-final segment is a complete line: log it together
                // with any buffered prefix from an earlier write.
                // NOTE(review): the `sub.len() < buf.len()` clause looks
                // redundant — whenever count > 1, every segment is strictly
                // shorter than `buf` — confirm before simplifying.
                slog::info!(
                    self.logger,
                    "{}{}",
                    String::from_utf8_lossy(self.buf.as_ref()).trim_end(),
                    String::from_utf8_lossy(sub).trim_end(),
                );
                self.buf.clear();
            } else {
                // Final segment with no trailing newline yet: buffer it.
                self.buf.put_slice(sub);
            }
        }

        // All input is always consumed; partial lines live in self.buf.
        Ok(buf.len())
    }

    /// No-op: lines are only emitted when a terminating '\n' arrives.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use slog::Drain;
    use std::io::Write;

    /// Build a terminal-backed async logger for exercising the writer.
    fn create_logger() -> slog::Logger {
        let decorator = slog_term::TermDecorator::new().build();
        let formatted = slog_term::FullFormat::new(decorator).build().fuse();
        let async_drain = slog_async::Async::new(formatted).build().fuse();
        slog::Logger::root(async_drain, slog::o!())
    }

    #[test]
    fn test_dmesg_writer() {
        let mut writer = DmesgWriter {
            logger: create_logger(),
            buf: Default::default(),
        };

        writer.flush().unwrap();

        // Feed empty input, bare newlines, and lines split across multiple
        // writes; none of these should return an error.
        for chunk in [
            &b""[..],
            b"\n",
            b"\n\n",
            b"\n\n\n",
            b"12\n23\n34\n56",
            b"78",
            b"90\n",
        ]
        .iter()
        {
            writer.write_all(chunk).unwrap();
        }

        writer.flush().unwrap();
    }

    // TODO: add unit tests for console manager
}
|
||||
528
src/dragonball/src/device_manager/fs_dev_mgr.rs
Normal file
528
src/dragonball/src/device_manager/fs_dev_mgr.rs
Normal file
@@ -0,0 +1,528 @@
|
||||
// Copyright 2020-2022 Alibaba Cloud. All Rights Reserved.
|
||||
// Copyright 2019 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::convert::TryInto;
|
||||
|
||||
use dbs_utils::epoll_manager::EpollManager;
|
||||
use dbs_virtio_devices::{self as virtio, Error as VirtIoError};
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
use slog::{error, info};
|
||||
|
||||
use crate::address_space_manager::GuestAddressSpaceImpl;
|
||||
use crate::config_manager::{
|
||||
ConfigItem, DeviceConfigInfo, DeviceConfigInfos, RateLimiterConfigInfo,
|
||||
};
|
||||
use crate::device_manager::{
|
||||
DbsMmioV2Device, DeviceManager, DeviceMgrError, DeviceOpContext, DeviceVirtioRegionHandler,
|
||||
};
|
||||
use crate::get_bucket_update;
|
||||
|
||||
use super::DbsVirtioDevice;
|
||||
|
||||
// Default for whether MMIO devices created here share a single irq
// (used when a device config leaves `use_shared_irq` unset).
const USE_SHARED_IRQ: bool = true;
// Default for whether to use the generic irq
// (used when a device config leaves `use_generic_irq` unset).
const USE_GENERIC_IRQ: bool = true;
// Default DAX cache window size: 2 GiB, since this is a typical VM memory size.
const DEFAULT_CACHE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
// Mode string selecting the vhost-user shared-fs implementation.
const VHOSTUSER_FS_MODE: &str = "vhostuser";
// Mode string selecting the virtio-fs implementation (the only mode
// `create_fs_device` currently builds).
const VIRTIO_FS_MODE: &str = "virtio";
|
||||
|
||||
/// Errors associated with `FsDeviceConfig`.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum FsDeviceError {
|
||||
/// Invalid fs, "virtio" or "vhostuser" is allowed.
|
||||
#[error("the fs type is invalid, virtio or vhostuser is allowed")]
|
||||
InvalidFs,
|
||||
|
||||
/// Cannot access address space.
|
||||
#[error("Cannot access address space.")]
|
||||
AddressSpaceNotInitialized,
|
||||
|
||||
/// Cannot convert RateLimterConfigInfo into RateLimiter.
|
||||
#[error("failure while converting RateLimterConfigInfo into RateLimiter: {0}")]
|
||||
RateLimterConfigInfoTryInto(#[source] std::io::Error),
|
||||
|
||||
/// The fs device tag was already used for a different fs.
|
||||
#[error("VirtioFs device tag {0} already exists")]
|
||||
FsDeviceTagAlreadyExists(String),
|
||||
|
||||
/// The fs device path was already used for a different fs.
|
||||
#[error("VirtioFs device tag {0} already exists")]
|
||||
FsDevicePathAlreadyExists(String),
|
||||
|
||||
/// The update is not allowed after booting the microvm.
|
||||
#[error("update operation is not allowed after boot")]
|
||||
UpdateNotAllowedPostBoot,
|
||||
|
||||
/// The attachbackendfs operation fails.
|
||||
#[error("Fs device attach a backend fs failed")]
|
||||
AttachBackendFailed(String),
|
||||
|
||||
/// attach backend fs must be done when vm is running.
|
||||
#[error("vm is not running when attaching a backend fs")]
|
||||
MicroVMNotRunning,
|
||||
|
||||
/// The mount tag doesn't exist.
|
||||
#[error("fs tag'{0}' doesn't exist")]
|
||||
TagNotExists(String),
|
||||
|
||||
/// Failed to send patch message to VirtioFs epoll handler.
|
||||
#[error("could not send patch message to the VirtioFs epoll handler")]
|
||||
VirtioFsEpollHanderSendFail,
|
||||
|
||||
/// Creating a shared-fs device fails (if the vhost-user socket cannot be open.)
|
||||
#[error("cannot create shared-fs device: {0}")]
|
||||
CreateFsDevice(#[source] VirtIoError),
|
||||
|
||||
/// Cannot initialize a shared-fs device or add a device to the MMIO Bus.
|
||||
#[error("failure while registering shared-fs device: {0}")]
|
||||
RegisterFsDevice(#[source] DeviceMgrError),
|
||||
|
||||
/// The device manager errors.
|
||||
#[error("DeviceManager error: {0}")]
|
||||
DeviceManager(#[source] DeviceMgrError),
|
||||
}
|
||||
|
||||
/// Configuration information for a vhost-user-fs device.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct FsDeviceConfigInfo {
    /// vhost-user socket path.
    pub sock_path: String,
    /// virtiofs mount tag name used inside the guest.
    /// used as the device name during mount.
    pub tag: String,
    /// Number of virtqueues to use.
    pub num_queues: usize,
    /// Size of each virtqueue.
    // NOTE(review): the original comment said "Unit: byte", but virtqueue
    // sizes are conventionally descriptor counts — confirm with the
    // VirtioFs::new contract before relying on either interpretation.
    pub queue_size: u16,
    /// DAX cache window size (bytes; defaults to `DEFAULT_CACHE_SIZE`).
    pub cache_size: u64,
    /// Number of thread pool workers.
    pub thread_pool_size: u16,
    /// The caching policy the file system should use (auto, always or never).
    /// This cache policy is set for virtio-fs, visit https://gitlab.com/virtio-fs/virtiofsd to get further information.
    pub cache_policy: String,
    /// Enable writeback cache.
    pub writeback_cache: bool,
    /// Enable no_open or not.
    pub no_open: bool,
    /// Enable xattr or not.
    pub xattr: bool,
    /// Drop CAP_SYS_RESOURCE or not.
    pub drop_sys_resource: bool,
    /// Device mode: "virtio" or "vhostuser" (see `VIRTIO_FS_MODE` /
    /// `VHOSTUSER_FS_MODE`).
    pub mode: String,
    /// Enable kill_priv_v2 or not.
    pub fuse_killpriv_v2: bool,
    /// Enable no_readdir or not.
    pub no_readdir: bool,
    /// Rate Limiter for I/O operations.
    pub rate_limiter: Option<RateLimiterConfigInfo>,
    /// Use shared irq; `None` falls back to the manager-wide default.
    pub use_shared_irq: Option<bool>,
    /// Use generic irq; `None` falls back to `USE_GENERIC_IRQ`.
    pub use_generic_irq: Option<bool>,
}
|
||||
|
||||
impl std::default::Default for FsDeviceConfigInfo {
    // Defaults describe a virtio-mode device: one 1024-entry queue, a 2 GiB
    // DAX window, no socket path/tag (callers must set those), and a
    // default-constructed rate limiter.
    fn default() -> Self {
        Self {
            sock_path: String::default(),
            tag: String::default(),
            num_queues: 1,
            queue_size: 1024,
            cache_size: DEFAULT_CACHE_SIZE,
            thread_pool_size: 0,
            cache_policy: Self::default_cache_policy(),
            writeback_cache: Self::default_writeback_cache(),
            no_open: Self::default_no_open(),
            fuse_killpriv_v2: Self::default_fuse_killpriv_v2(),
            no_readdir: Self::default_no_readdir(),
            xattr: Self::default_xattr(),
            drop_sys_resource: Self::default_drop_sys_resource(),
            mode: Self::default_fs_mode(),
            rate_limiter: Some(RateLimiterConfigInfo::default()),
            use_shared_irq: None,
            use_generic_irq: None,
        }
    }
}
|
||||
|
||||
impl FsDeviceConfigInfo {
|
||||
/// The default mode is set to 'virtio' for 'virtio-fs' device.
|
||||
pub fn default_fs_mode() -> String {
|
||||
String::from(VIRTIO_FS_MODE)
|
||||
}
|
||||
|
||||
/// The default cache policy
|
||||
pub fn default_cache_policy() -> String {
|
||||
"always".to_string()
|
||||
}
|
||||
|
||||
/// The default setting of writeback cache
|
||||
pub fn default_writeback_cache() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// The default setting of no_open
|
||||
pub fn default_no_open() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// The default setting of killpriv_v2
|
||||
pub fn default_fuse_killpriv_v2() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// The default setting of xattr
|
||||
pub fn default_xattr() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// The default setting of drop_sys_resource
|
||||
pub fn default_drop_sys_resource() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// The default setting of no_readdir
|
||||
pub fn default_no_readdir() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// The default setting of rate limiter
|
||||
pub fn default_fs_rate_limiter() -> Option<RateLimiterConfigInfo> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration information for a runtime update of a virtio-fs device
/// (currently only the rate limiter can be updated; the device is located
/// by tag).
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct FsDeviceConfigUpdateInfo {
    /// virtiofs mount tag name used inside the guest.
    /// used as the device name during mount.
    pub tag: String,
    /// Rate Limiter for I/O operations; `None` leaves limits unchanged
    /// (see the `bytes()`/`ops()` accessors).
    pub rate_limiter: Option<RateLimiterConfigInfo>,
}
|
||||
|
||||
impl FsDeviceConfigUpdateInfo {
    /// Provides a `BucketUpdate` description for the bandwidth rate limiter.
    // Expansion depends on `self.rate_limiter` — see crate::get_bucket_update.
    pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate {
        get_bucket_update!(self, rate_limiter, bandwidth)
    }
    /// Provides a `BucketUpdate` description for the ops rate limiter.
    pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate {
        get_bucket_update!(self, rate_limiter, ops)
    }
}
|
||||
|
||||
impl ConfigItem for FsDeviceConfigInfo {
|
||||
type Err = FsDeviceError;
|
||||
|
||||
fn id(&self) -> &str {
|
||||
&self.tag
|
||||
}
|
||||
|
||||
fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> {
|
||||
if self.tag == other.tag {
|
||||
Err(FsDeviceError::FsDeviceTagAlreadyExists(self.tag.clone()))
|
||||
} else if self.mode.as_str() == VHOSTUSER_FS_MODE && self.sock_path == other.sock_path {
|
||||
Err(FsDeviceError::FsDevicePathAlreadyExists(
|
||||
self.sock_path.clone(),
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration information of manipulating backend fs for a virtiofs device.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct FsMountConfigInfo {
    /// Mount operation to perform: "mount", "update", or "umount"
    /// (interpreted by `VirtioFs::manipulate_backend_fs`).
    pub ops: String,
    /// The backend fs type to mount.
    pub fstype: Option<String>,
    /// the source file/directory the backend fs points to
    pub source: Option<String>,
    /// where the backend fs gets mounted
    pub mountpoint: String,
    /// backend fs config content in json format
    pub config: Option<String>,
    /// virtiofs mount tag name used inside the guest.
    /// used as the device name during mount; selects which device to act on.
    pub tag: String,
    /// Path to file that contains file lists that should be prefetched by rafs
    pub prefetch_list_path: Option<String>,
    /// What size file supports dax
    pub dax_threshold_size_kb: Option<u64>,
}
|
||||
|
||||
// Per-device record pairing an FsDeviceConfigInfo with its attached device.
pub(crate) type FsDeviceInfo = DeviceConfigInfo<FsDeviceConfigInfo>;

impl ConfigItem for FsDeviceInfo {
    type Err = FsDeviceError;
    // Records are identified by the guest-visible mount tag.
    fn id(&self) -> &str {
        &self.config.tag
    }

    // NOTE(review): unlike FsDeviceConfigInfo::check_conflicts, the sock_path
    // comparison here is NOT gated on vhost-user mode, so two virtio-mode
    // devices with identical (e.g. empty default) sock paths would conflict —
    // confirm whether this asymmetry is intended.
    fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> {
        if self.config.tag == other.config.tag {
            Err(FsDeviceError::FsDeviceTagAlreadyExists(
                self.config.tag.clone(),
            ))
        } else if self.config.sock_path == other.config.sock_path {
            Err(FsDeviceError::FsDevicePathAlreadyExists(
                self.config.sock_path.clone(),
            ))
        } else {
            Ok(())
        }
    }
}
|
||||
|
||||
/// Wrapper for the collection that holds all the Fs Devices Configs
pub struct FsDeviceMgr {
    /// A list of `FsDeviceConfig` objects.
    pub(crate) info_list: DeviceConfigInfos<FsDeviceConfigInfo>,
    // Manager-wide default for irq sharing, used when a device config
    // leaves `use_shared_irq` unset (see `attach_devices`).
    pub(crate) use_shared_irq: bool,
}
|
||||
|
||||
impl FsDeviceMgr {
|
||||
/// Inserts `fs_cfg` in the shared-fs device configuration list.
|
||||
pub fn insert_device(
|
||||
device_mgr: &mut DeviceManager,
|
||||
ctx: DeviceOpContext,
|
||||
fs_cfg: FsDeviceConfigInfo,
|
||||
) -> std::result::Result<(), FsDeviceError> {
|
||||
// It's too complicated to manage life cycle of shared-fs service process for hotplug.
|
||||
if ctx.is_hotplug {
|
||||
error!(
|
||||
ctx.logger(),
|
||||
"no support of shared-fs device hotplug";
|
||||
"subsystem" => "shared-fs",
|
||||
"tag" => &fs_cfg.tag,
|
||||
);
|
||||
return Err(FsDeviceError::UpdateNotAllowedPostBoot);
|
||||
}
|
||||
|
||||
info!(
|
||||
ctx.logger(),
|
||||
"add shared-fs device configuration";
|
||||
"subsystem" => "shared-fs",
|
||||
"tag" => &fs_cfg.tag,
|
||||
);
|
||||
device_mgr
|
||||
.fs_manager
|
||||
.lock()
|
||||
.unwrap()
|
||||
.info_list
|
||||
.insert_or_update(&fs_cfg)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Attaches all vhost-user-fs devices from the FsDevicesConfig.
|
||||
pub fn attach_devices(
|
||||
&mut self,
|
||||
ctx: &mut DeviceOpContext,
|
||||
) -> std::result::Result<(), FsDeviceError> {
|
||||
let epoll_mgr = ctx
|
||||
.epoll_mgr
|
||||
.clone()
|
||||
.ok_or(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput))?;
|
||||
|
||||
for info in self.info_list.iter_mut() {
|
||||
let device = Self::create_fs_device(&info.config, ctx, epoll_mgr.clone())?;
|
||||
let mmio_device = DeviceManager::create_mmio_virtio_device(
|
||||
device,
|
||||
ctx,
|
||||
info.config.use_shared_irq.unwrap_or(self.use_shared_irq),
|
||||
info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
|
||||
)
|
||||
.map_err(FsDeviceError::RegisterFsDevice)?;
|
||||
|
||||
info.set_device(mmio_device);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_fs_device(
|
||||
config: &FsDeviceConfigInfo,
|
||||
ctx: &mut DeviceOpContext,
|
||||
epoll_mgr: EpollManager,
|
||||
) -> std::result::Result<DbsVirtioDevice, FsDeviceError> {
|
||||
match &config.mode as &str {
|
||||
VIRTIO_FS_MODE => Self::attach_virtio_fs_devices(config, ctx, epoll_mgr),
|
||||
_ => Err(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput)),
|
||||
}
|
||||
}
|
||||
|
||||
fn attach_virtio_fs_devices(
|
||||
config: &FsDeviceConfigInfo,
|
||||
ctx: &mut DeviceOpContext,
|
||||
epoll_mgr: EpollManager,
|
||||
) -> std::result::Result<DbsVirtioDevice, FsDeviceError> {
|
||||
info!(
|
||||
ctx.logger(),
|
||||
"add virtio-fs device configuration";
|
||||
"subsystem" => "virito-fs",
|
||||
"tag" => &config.tag,
|
||||
"dax_window_size" => &config.cache_size,
|
||||
);
|
||||
|
||||
let limiter = if let Some(rlc) = config.rate_limiter.clone() {
|
||||
Some(
|
||||
rlc.try_into()
|
||||
.map_err(FsDeviceError::RateLimterConfigInfoTryInto)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let vm_as = ctx.get_vm_as().map_err(|e| {
|
||||
error!(ctx.logger(), "virtio-fs get vm_as error: {:?}", e;
|
||||
"subsystem" => "virito-fs");
|
||||
FsDeviceError::DeviceManager(e)
|
||||
})?;
|
||||
let address_space = match ctx.address_space.as_ref() {
|
||||
Some(address_space) => address_space.clone(),
|
||||
None => {
|
||||
error!(ctx.logger(), "virtio-fs get address_space error"; "subsystem" => "virito-fs");
|
||||
return Err(FsDeviceError::AddressSpaceNotInitialized);
|
||||
}
|
||||
};
|
||||
let handler = DeviceVirtioRegionHandler {
|
||||
vm_as,
|
||||
address_space,
|
||||
};
|
||||
|
||||
let device = Box::new(
|
||||
virtio::fs::VirtioFs::new(
|
||||
&config.tag,
|
||||
config.num_queues,
|
||||
config.queue_size,
|
||||
config.cache_size,
|
||||
&config.cache_policy,
|
||||
config.thread_pool_size,
|
||||
config.writeback_cache,
|
||||
config.no_open,
|
||||
config.fuse_killpriv_v2,
|
||||
config.xattr,
|
||||
config.drop_sys_resource,
|
||||
config.no_readdir,
|
||||
Box::new(handler),
|
||||
epoll_mgr,
|
||||
limiter,
|
||||
)
|
||||
.map_err(FsDeviceError::CreateFsDevice)?,
|
||||
);
|
||||
|
||||
Ok(device)
|
||||
}
|
||||
|
||||
/// Attach a backend fs to a VirtioFs device or detach a backend
|
||||
/// fs from a Virtiofs device
|
||||
pub fn manipulate_backend_fs(
|
||||
device_mgr: &mut DeviceManager,
|
||||
config: FsMountConfigInfo,
|
||||
) -> std::result::Result<(), FsDeviceError> {
|
||||
let mut found = false;
|
||||
|
||||
let mgr = &mut device_mgr.fs_manager.lock().unwrap();
|
||||
for info in mgr
|
||||
.info_list
|
||||
.iter()
|
||||
.filter(|info| info.config.tag.as_str() == config.tag.as_str())
|
||||
{
|
||||
found = true;
|
||||
if let Some(device) = info.device.as_ref() {
|
||||
if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
|
||||
let mut guard = mmio_dev.state();
|
||||
let inner_dev = guard.get_inner_device_mut();
|
||||
if let Some(virtio_fs_dev) = inner_dev
|
||||
.as_any_mut()
|
||||
.downcast_mut::<virtio::fs::VirtioFs<GuestAddressSpaceImpl>>()
|
||||
{
|
||||
return virtio_fs_dev
|
||||
.manipulate_backend_fs(
|
||||
config.source,
|
||||
config.fstype,
|
||||
&config.mountpoint,
|
||||
config.config,
|
||||
&config.ops,
|
||||
config.prefetch_list_path,
|
||||
config.dax_threshold_size_kb,
|
||||
)
|
||||
.map(|_p| ())
|
||||
.map_err(|e| FsDeviceError::AttachBackendFailed(e.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
Err(FsDeviceError::AttachBackendFailed(
|
||||
"fs tag not found".to_string(),
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the index of the device with the specified `tag` if it exists in the list.
|
||||
pub fn get_index_of_tag(&self, tag: &str) -> Option<usize> {
|
||||
self.info_list
|
||||
.iter()
|
||||
.position(|info| info.config.id().eq(tag))
|
||||
}
|
||||
|
||||
/// Update the ratelimiter settings of a virtio fs device.
|
||||
pub fn update_device_ratelimiters(
|
||||
device_mgr: &mut DeviceManager,
|
||||
new_cfg: FsDeviceConfigUpdateInfo,
|
||||
) -> std::result::Result<(), FsDeviceError> {
|
||||
let mgr = &mut device_mgr.fs_manager.lock().unwrap();
|
||||
match mgr.get_index_of_tag(&new_cfg.tag) {
|
||||
Some(index) => {
|
||||
let config = &mut mgr.info_list[index].config;
|
||||
config.rate_limiter = new_cfg.rate_limiter.clone();
|
||||
let device = mgr.info_list[index]
|
||||
.device
|
||||
.as_mut()
|
||||
.ok_or_else(|| FsDeviceError::TagNotExists("".to_owned()))?;
|
||||
|
||||
if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
|
||||
let guard = mmio_dev.state();
|
||||
let inner_dev = guard.get_inner_device();
|
||||
if let Some(fs_dev) = inner_dev
|
||||
.as_any()
|
||||
.downcast_ref::<virtio::fs::VirtioFs<GuestAddressSpaceImpl>>()
|
||||
{
|
||||
return fs_dev
|
||||
.set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops())
|
||||
.map(|_p| ())
|
||||
.map_err(|_e| FsDeviceError::VirtioFsEpollHanderSendFail);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
None => Err(FsDeviceError::TagNotExists(new_cfg.tag)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for FsDeviceMgr {
|
||||
/// Create a new `FsDeviceMgr` object..
|
||||
fn default() -> Self {
|
||||
FsDeviceMgr {
|
||||
info_list: DeviceConfigInfos::new(),
|
||||
use_shared_irq: USE_SHARED_IRQ,
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user