From 8065bb615a2e7687e723ae0b1436ff2663ec3f61 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 11:42:46 -0600 Subject: [PATCH 1/6] rootfs-builder: delete dnf and rpm data dnf and rpm data are not needed in the final rootfs; removing them saves 2MB of disk space Signed-off-by: Julio Montes --- rootfs-builder/rootfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rootfs-builder/rootfs.sh b/rootfs-builder/rootfs.sh index 653276e1b..a1b10eb61 100755 --- a/rootfs-builder/rootfs.sh +++ b/rootfs-builder/rootfs.sh @@ -362,7 +362,7 @@ mkdir -p ${ROOTFS_DIR} build_rootfs ${ROOTFS_DIR} pushd "${ROOTFS_DIR}" >> /dev/null if [ "$PWD" != "/" ] ; then - rm -rf ./var/cache/dnf/ + rm -rf ./var/cache/ ./var/lib fi popd >> /dev/null From 71ccc0a6eab62b0f9df52913adfe17b17e2ae76b Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 10:08:14 -0600 Subject: [PATCH 2/6] image-builder: remove -s option and IMG_SIZE envar The guest kernel needs 64 bytes of DRAM per 4K page of emulated PMEM, hence the image size should be as small as possible to reduce the container's memory footprint. The image size is recalculated automatically if it's too small to contain the rootfs. 
Signed-off-by: Julio Montes --- Makefile | 5 ++--- image-builder/image_builder.sh | 14 +------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 93a57d13a..52c451cb9 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,6 @@ ROOTFS_BUILDER := $(MK_DIR)/rootfs-builder/rootfs.sh INITRD_BUILDER := $(MK_DIR)/initrd-builder/initrd_builder.sh IMAGE_BUILDER := $(MK_DIR)/image-builder/image_builder.sh -IMG_SIZE = 500 AGENT_INIT ?= no DISTRO ?= centos ROOTFS_BUILD_DEST := $(PWD) @@ -42,7 +41,7 @@ image-%: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img .PRECIOUS: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img $(IMAGES_BUILD_DEST)/kata-containers-image-%.img: rootfs-% @echo Creating image based on $^ - $(IMAGE_BUILDER) -s $(IMG_SIZE) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs + $(IMAGE_BUILDER) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs initrd-%: $(IMAGES_BUILD_DEST)/kata-containers-initrd-%.img @ # DONT remove. This is not cancellation rule. @@ -63,7 +62,7 @@ image: $(DISTRO_IMAGE) $(DISTRO_IMAGE): $(DISTRO_ROOTFS_MARKER) @echo Creating image based on "$(DISTRO_ROOTFS)" - $(IMAGE_BUILDER) -s "$(IMG_SIZE)" "$(DISTRO_ROOTFS)" + $(IMAGE_BUILDER) "$(DISTRO_ROOTFS)" .PHONY: initrd initrd: $(DISTRO_INITRD) diff --git a/image-builder/image_builder.sh b/image-builder/image_builder.sh index 21791e366..017b91d61 100755 --- a/image-builder/image_builder.sh +++ b/image-builder/image_builder.sh @@ -17,6 +17,7 @@ source "$lib_file" [ "$(id -u)" -eq 0 ] || die "$0: must be run as root" IMAGE="${IMAGE:-kata-containers.img}" +IMG_SIZE=128 AGENT_BIN=${AGENT_BIN:-kata-agent} AGENT_INIT=${AGENT_INIT:-no} @@ -27,13 +28,10 @@ usage() Usage: ${script_name} [options] This script will create a Kata Containers image file of an adequate size based on the directory. - The size of the image can be also be specified manually - by '-s' flag. 
Options: -h Show this help -o path to generate image file ENV: IMAGE - -s Image size in MB ENV: IMG_SIZE -r Free space of the root partition in MB ENV: ROOT_FREE_SPACE Extra environment variables: @@ -67,16 +65,6 @@ do h) usage ;; o) IMAGE="${OPTARG}" ;; r) ROOT_FREE_SPACE="${OPTARG}" ;; - s) { - IMG_SIZE=${OPTARG} - if [ ${IMG_SIZE} -le 0 ]; then - die "Image size has to be greater than 0 MB." - fi - if [ ${IMG_SIZE} -gt ${MAX_IMG_SIZE_MB} ]; then - die "Image size should not be greater than ${MAX_IMG_SIZE_MB} MB." - fi - } - ;; f) FS_TYPE="${OPTARG}" ;; esac done From dc5bc078253f9762b5ada810ea682818d0bfb1a2 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 10:25:53 -0600 Subject: [PATCH 3/6] image-builder: fix mem boundary recalculation $/${} is unnecessary on arithmetic variables. [SC2004] Signed-off-by: Julio Montes --- image-builder/image_builder.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image-builder/image_builder.sh b/image-builder/image_builder.sh index 017b91d61..42fd08236 100755 --- a/image-builder/image_builder.sh +++ b/image-builder/image_builder.sh @@ -252,7 +252,7 @@ create_rootfs_disk() # of disk creation by adding 5% in the inital assumed value $ROOTFS_SIZE if [ $ROOTFS_SIZE -gt $AVAIL_DISK ]; then # Increase the size but remain aligned to the original MEM_BOUNDARY_MB, which is stored in $ORIG_MEM_BOUNDARY_MB - MEM_BOUNDARY_MB=$(($MEM_BOUNDARY_MB+$ORIG_MEM_BOUNDARY_MB)) + MEM_BOUNDARY_MB=$((MEM_BOUNDARY_MB+ORIG_MEM_BOUNDARY_MB)) OLD_IMG_SIZE=${IMG_SIZE} unset IMG_SIZE unmount From 7620066c8a182a02fe2bd5fe1c01a6cc0859033d Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 10:28:03 -0600 Subject: [PATCH 4/6] image-builder: sync rootfs data after copying it into the image Rootfs data must be sync'd after copying it into the image to avoid data corruption Signed-off-by: Julio Montes --- image-builder/image_builder.sh | 1 + 1 file changed, 1 insertion(+) diff --git 
a/image-builder/image_builder.sh b/image-builder/image_builder.sh index 42fd08236..0177830b5 100755 --- a/image-builder/image_builder.sh +++ b/image-builder/image_builder.sh @@ -267,6 +267,7 @@ create_rootfs_disk info "rootfs size ${ROOTFS_SIZE} MB" info "Copying content from rootfs to root partition" cp -a "${ROOTFS}"/* ${MOUNT_DIR} +sync OK "rootfs copied" unmount From cbe5642b9d018c100c87a1186def22d1907ce633 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 10:35:06 -0600 Subject: [PATCH 5/6] image-builder: add gcc as dependency to generate the image gcc is required to build the binary in charge of filling out the device namespace information (metadata) into the kata containers image. Signed-off-by: Julio Montes --- image-builder/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image-builder/Dockerfile b/image-builder/Dockerfile index af8674dfd..0f10f8c8d 100644 --- a/image-builder/Dockerfile +++ b/image-builder/Dockerfile @@ -7,4 +7,4 @@ From fedora:latest RUN [ -n "$http_proxy" ] && sed -i '$ a proxy='$http_proxy /etc/dnf/dnf.conf ; true -RUN dnf install -y qemu-img parted gdisk e2fsprogs +RUN dnf install -y qemu-img parted gdisk e2fsprogs gcc From 726f798ff795ef4a8300201cab8d83e83c1496a5 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 8 Mar 2019 10:40:24 -0600 Subject: [PATCH 6/6] image-builder: fill out device namespace information into kata image The new NVDIMM driver implementation (kernel >= 4.16) needs to know the device namespace information to map pages; this metadata is read from the nvdimm namespace at 4k offset. 
fixes #235 Signed-off-by: Julio Montes --- .gitignore | 1 + image-builder/image_builder.sh | 34 ++++++- image-builder/nsdax.gpl.c | 171 +++++++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 image-builder/nsdax.gpl.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..f177a5587 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +image-builder/nsdax diff --git a/image-builder/image_builder.sh b/image-builder/image_builder.sh index 0177830b5..d66b4b9cf 100755 --- a/image-builder/image_builder.sh +++ b/image-builder/image_builder.sh @@ -20,6 +20,8 @@ IMAGE="${IMAGE:-kata-containers.img}" IMG_SIZE=128 AGENT_BIN=${AGENT_BIN:-kata-agent} AGENT_INIT=${AGENT_INIT:-no} +IMG_HEADER_SZ=2 +IMG_HEADER_SZ_B=$((IMG_HEADER_SZ*1024*1024)) usage() { @@ -140,6 +142,16 @@ align_memory() warning "image size '$IMG_SIZE' is not aligned to memory boundary '$MEM_BOUNDARY_MB', aligning it" IMG_SIZE=$(($IMG_SIZE + $MEM_BOUNDARY_MB - $remaining)) fi + + # To support: + # * memory hotplug: the image size MUST BE aligned to MEM_BOUNDARY_MB (128 or 1024 MB) + # * DAX [1]: NVDIMM driver reads the device namespace information from nvdimm namespace (4K offset). + # The namespace information is saved in the first 2MB of the image. 
+ # * DAX huge pages [2]: 2MB alignment + # + # [1] - nd_pfn_validate(): https://github.com/torvalds/linux/blob/master/drivers/nvdimm/pfn_devs.c + # [2] - https://nvdimm.wiki.kernel.org/2mib_fs_dax + IMG_SIZE=$((IMG_SIZE-IMG_HEADER_SZ)) } # Calculate image size based on the rootfs @@ -211,8 +223,10 @@ create_rootfs_disk() # The partition is the rootfs content info "Creating partitions" - parted "${IMAGE}" --script "mklabel gpt" \ - "mkpart ${FS_TYPE} 1M -1M" + parted -s -a optimal "${IMAGE}" \ + mklabel gpt -- \ + mkpart primary "${FS_TYPE}" 1M -1M \ + print OK "Partitions created" # Get the loop device bound to the image file (requires /dev mounted in the @@ -275,4 +289,20 @@ unmount fsck.ext4 -D -y "${DEVICE}p1" detach +info "Set device namespace information (metadata)" +# Fill out namespace information +tmp_img="$(mktemp)" +chmod 0644 "${tmp_img}" +# metadata header +dd if=/dev/zero of="${tmp_img}" bs="${IMG_HEADER_SZ}M" count=1 +# append image data (rootfs) +dd if="${IMAGE}" of="${tmp_img}" oflag=append conv=notrunc +# copy final image +mv "${tmp_img}" "${IMAGE}" +# Set metadata header +# Issue: https://github.com/kata-containers/osbuilder/issues/240 +gcc -O2 "${script_dir}/nsdax.gpl.c" -o "${script_dir}/nsdax" +"${script_dir}/nsdax" "${IMAGE}" "${IMG_HEADER_SZ_B}" "${IMG_HEADER_SZ_B}" +sync + info "Image created. Virtual size: ${IMG_SIZE}MB." diff --git a/image-builder/nsdax.gpl.c b/image-builder/nsdax.gpl.c new file mode 100644 index 000000000..333f7804b --- /dev/null +++ b/image-builder/nsdax.gpl.c @@ -0,0 +1,171 @@ +/* + * Copyright(c) 2013-2019 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define __KERNEL__ +#include +#include + +/* + Next types, definitions and functions were copied from kernel 4.19.24 source + code, specifically from nvdimm driver +*/ + +#define PFN_SIG_LEN 16 +#define PFN_SIG "NVDIMM_PFN_INFO" +#define SZ_4K 0x00001000 + +typedef __u16 u16; +typedef __u8 u8; +typedef __u64 u64; +typedef __u32 u32; + +enum nd_pfn_mode { + PFN_MODE_NONE, + PFN_MODE_RAM, + PFN_MODE_PMEM, +}; + +struct nd_pfn_sb { + u8 signature[PFN_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le64 dataoff; /* relative to namespace_base + start_pad */ + __le64 npfns; + __le32 mode; + /* minor-version-1 additions for section alignment */ + __le32 start_pad; + __le32 end_trunc; + /* minor-version-2 record the base alignment of the mapping */ + __le32 align; + u8 padding[4000]; + __le64 checksum; +}; + +struct nd_gen_sb { + char reserved[SZ_4K - 8]; + __le64 checksum; +}; + + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? __le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} + + +/* + * nd_sb_checksum: compute checksum for a generic info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). 
+ */ +u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +{ + u64 sum; + __le64 sum_save; + + sum_save = nd_gen_sb->checksum; + nd_gen_sb->checksum = 0; + sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); + nd_gen_sb->checksum = sum_save; + return sum; +} + + +void show_usage(const char* name) { + printf("Usage: %s IMAGE_FILE DATA_OFFSET ALIGNMENT\n", name); + printf("DATA_OFFSET and ALIGNMENT must be in bytes\n"); +} + +int main(int argc, char *argv[]) { + if (argc != 4) { + show_usage(argv[0]); + return -1; + } + + const char* img_path = argv[1]; + + char *ptr = NULL; + const long int data_offset = strtol(argv[2], &ptr, 10); + if (ptr == argv[2]) { + fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[2]); + show_usage(argv[0]); + return -1; + } + + ptr = NULL; + const long int alignment = strtol(argv[3], &ptr, 10); + if (ptr == argv[3]) { + fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[3]); + show_usage(argv[0]); + return -1; + } + + printf("Opening file '%s'\n", img_path); + int fd = open(img_path, O_WRONLY); + if (fd == -1) { + perror("open:"); + return -1; + } + + struct nd_pfn_sb sb = { 0 }; + + snprintf((char*)sb.signature, PFN_SIG_LEN, PFN_SIG); + sb.mode = PFN_MODE_RAM; + sb.align = alignment; + sb.dataoff = data_offset; + sb.version_minor = 2; + + // checksum must be calculated at the end + sb.checksum = nd_sb_checksum((struct nd_gen_sb*) &sb); + + // NVDIMM driver: SZ_4K is the namespace-relative starting offset + int ret = lseek(fd, SZ_4K, SEEK_SET); + if (ret == -1) { + perror("lseek: "); + close(fd); + return -1; + } + + printf("Writing metadata\n"); + ret = write(fd, &sb, sizeof(sb)); + if (ret == -1) { + perror("write: "); + } + + close(fd); + printf("OK!\n"); + + return 0; +}