diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..f177a5587 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +image-builder/nsdax diff --git a/Makefile b/Makefile index 93a57d13a..52c451cb9 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,6 @@ ROOTFS_BUILDER := $(MK_DIR)/rootfs-builder/rootfs.sh INITRD_BUILDER := $(MK_DIR)/initrd-builder/initrd_builder.sh IMAGE_BUILDER := $(MK_DIR)/image-builder/image_builder.sh -IMG_SIZE = 500 AGENT_INIT ?= no DISTRO ?= centos ROOTFS_BUILD_DEST := $(PWD) @@ -42,7 +41,7 @@ image-%: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img .PRECIOUS: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img $(IMAGES_BUILD_DEST)/kata-containers-image-%.img: rootfs-% @echo Creating image based on $^ - $(IMAGE_BUILDER) -s $(IMG_SIZE) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs + $(IMAGE_BUILDER) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs initrd-%: $(IMAGES_BUILD_DEST)/kata-containers-initrd-%.img @ # DONT remove. This is not cancellation rule. @@ -63,7 +62,7 @@ image: $(DISTRO_IMAGE) $(DISTRO_IMAGE): $(DISTRO_ROOTFS_MARKER) @echo Creating image based on "$(DISTRO_ROOTFS)" - $(IMAGE_BUILDER) -s "$(IMG_SIZE)" "$(DISTRO_ROOTFS)" + $(IMAGE_BUILDER) "$(DISTRO_ROOTFS)" .PHONY: initrd initrd: $(DISTRO_INITRD) diff --git a/image-builder/Dockerfile b/image-builder/Dockerfile index af8674dfd..0f10f8c8d 100644 --- a/image-builder/Dockerfile +++ b/image-builder/Dockerfile @@ -7,4 +7,4 @@ From fedora:latest RUN [ -n "$http_proxy" ] && sed -i '$ a proxy='$http_proxy /etc/dnf/dnf.conf ; true -RUN dnf install -y qemu-img parted gdisk e2fsprogs +RUN dnf install -y qemu-img parted gdisk e2fsprogs gcc diff --git a/image-builder/image_builder.sh b/image-builder/image_builder.sh index 21791e366..d66b4b9cf 100755 --- a/image-builder/image_builder.sh +++ b/image-builder/image_builder.sh @@ -17,8 +17,11 @@ source "$lib_file" [ "$(id -u)" -eq 0 ] || die "$0: must be run as root" IMAGE="${IMAGE:-kata-containers.img}" +IMG_SIZE=128 AGENT_BIN=${AGENT_BIN:-kata-agent} 
AGENT_INIT=${AGENT_INIT:-no} +IMG_HEADER_SZ=2 +IMG_HEADER_SZ_B=$((IMG_HEADER_SZ*1024*1024)) usage() { @@ -27,13 +30,10 @@ usage() Usage: ${script_name} [options] This script will create a Kata Containers image file of an adequate size based on the directory. - The size of the image can be also be specified manually - by '-s' flag. Options: -h Show this help -o path to generate image file ENV: IMAGE - -s Image size in MB ENV: IMG_SIZE -r Free space of the root partition in MB ENV: ROOT_FREE_SPACE Extra environment variables: @@ -67,16 +67,6 @@ do h) usage ;; o) IMAGE="${OPTARG}" ;; r) ROOT_FREE_SPACE="${OPTARG}" ;; - s) { - IMG_SIZE=${OPTARG} - if [ ${IMG_SIZE} -le 0 ]; then - die "Image size has to be greater than 0 MB." - fi - if [ ${IMG_SIZE} -gt ${MAX_IMG_SIZE_MB} ]; then - die "Image size should not be greater than ${MAX_IMG_SIZE_MB} MB." - fi - } - ;; f) FS_TYPE="${OPTARG}" ;; esac done @@ -152,6 +142,16 @@ align_memory() warning "image size '$IMG_SIZE' is not aligned to memory boundary '$MEM_BOUNDARY_MB', aligning it" IMG_SIZE=$(($IMG_SIZE + $MEM_BOUNDARY_MB - $remaining)) fi + + # To support: + # * memory hotplug: the image size MUST BE aligned to MEM_BOUNDARY_MB (128 or 1024 MB) + # * DAX [1]: NVDIMM driver reads the device namespace information from the nvdimm namespace (4K offset). + # The namespace information is saved in the first 2MB of the image.
+ # * DAX huge pages [2]: 2MB alignment + # + # [1] - nd_pfn_validate(): https://github.com/torvalds/linux/blob/master/drivers/nvdimm/pfn_devs.c + # [2] - https://nvdimm.wiki.kernel.org/2mib_fs_dax + IMG_SIZE=$((IMG_SIZE-IMG_HEADER_SZ)) } # Calculate image size based on the rootfs @@ -223,8 +223,10 @@ create_rootfs_disk() # The partition is the rootfs content info "Creating partitions" - parted "${IMAGE}" --script "mklabel gpt" \ - "mkpart ${FS_TYPE} 1M -1M" + parted -s -a optimal "${IMAGE}" \ + mklabel gpt -- \ + mkpart primary "${FS_TYPE}" 1M -1M \ + print OK "Partitions created" # Get the loop device bound to the image file (requires /dev mounted in the @@ -264,7 +266,7 @@ create_rootfs_disk() # of disk creation by adding 5% in the inital assumed value $ROOTFS_SIZE if [ $ROOTFS_SIZE -gt $AVAIL_DISK ]; then # Increase the size but remain aligned to the original MEM_BOUNDARY_MB, which is stored in $ORIG_MEM_BOUNDARY_MB - MEM_BOUNDARY_MB=$(($MEM_BOUNDARY_MB+$ORIG_MEM_BOUNDARY_MB)) + MEM_BOUNDARY_MB=$((MEM_BOUNDARY_MB+ORIG_MEM_BOUNDARY_MB)) OLD_IMG_SIZE=${IMG_SIZE} unset IMG_SIZE unmount @@ -279,6 +281,7 @@ create_rootfs_disk info "rootfs size ${ROOTFS_SIZE} MB" info "Copying content from rootfs to root partition" cp -a "${ROOTFS}"/* ${MOUNT_DIR} +sync OK "rootfs copied" unmount @@ -286,4 +289,20 @@ unmount fsck.ext4 -D -y "${DEVICE}p1" detach +info "Set device namespace information (metadata)" +# Fill out namespace information +tmp_img="$(mktemp)" +chmod 0644 "${tmp_img}" +# metadata header +dd if=/dev/zero of="${tmp_img}" bs="${IMG_HEADER_SZ}M" count=1 +# append image data (rootfs) +dd if="${IMAGE}" of="${tmp_img}" oflag=append conv=notrunc +# copy final image +mv "${tmp_img}" "${IMAGE}" +# Set metadata header +# Issue: https://github.com/kata-containers/osbuilder/issues/240 +gcc -O2 "${script_dir}/nsdax.gpl.c" -o "${script_dir}/nsdax" +"${script_dir}/nsdax" "${IMAGE}" "${IMG_HEADER_SZ_B}" "${IMG_HEADER_SZ_B}" +sync + info "Image created.
Virtual size: ${IMG_SIZE}MB." diff --git a/image-builder/nsdax.gpl.c b/image-builder/nsdax.gpl.c new file mode 100644 index 000000000..333f7804b --- /dev/null +++ b/image-builder/nsdax.gpl.c @@ -0,0 +1,171 @@ +/* + * Copyright(c) 2013-2019 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define __KERNEL__ +#include +#include + +/* + Next types, definitions and functions were copied from kernel 4.19.24 source + code, specifically from nvdimm driver +*/ + +#define PFN_SIG_LEN 16 +#define PFN_SIG "NVDIMM_PFN_INFO" +#define SZ_4K 0x00001000 + +typedef __u16 u16; +typedef __u8 u8; +typedef __u64 u64; +typedef __u32 u32; + +enum nd_pfn_mode { + PFN_MODE_NONE, + PFN_MODE_RAM, + PFN_MODE_PMEM, +}; + +struct nd_pfn_sb { + u8 signature[PFN_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le64 dataoff; /* relative to namespace_base + start_pad */ + __le64 npfns; + __le32 mode; + /* minor-version-1 additions for section alignment */ + __le32 start_pad; + __le32 end_trunc; + /* minor-version-2 record the base alignment of the mapping */ + __le32 align; + u8 padding[4000]; + __le64 checksum; +}; + +struct nd_gen_sb { + char reserved[SZ_4K - 8]; + __le64 checksum; +}; + + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? 
__le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} + + +/* + * nd_sb_checksum: compute checksum for a generic info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). + */ +u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +{ + u64 sum; + __le64 sum_save; + + sum_save = nd_gen_sb->checksum; + nd_gen_sb->checksum = 0; + sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); + nd_gen_sb->checksum = sum_save; + return sum; +} + + +void show_usage(const char* name) { + printf("Usage: %s IMAGE_FILE DATA_OFFSET ALIGNMENT\n", name); + printf("DATA_OFFSET and ALIGNMENT must be in bytes\n"); +} + +int main(int argc, char *argv[]) { /* writes the pfn info block at offset SZ_4K of IMAGE_FILE; returns 0 on success, -1 on any failure */ + if (argc != 4) { + show_usage(argv[0]); + return -1; + } + + const char* img_path = argv[1]; + + char *ptr = NULL; + const long int data_offset = strtol(argv[2], &ptr, 10); + if (ptr == argv[2] || *ptr != '\0' || data_offset < 0) { /* reject empty input, trailing garbage and negative values */ + fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[2]); + show_usage(argv[0]); + return -1; + } + + ptr = NULL; + const long int alignment = strtol(argv[3], &ptr, 10); + if (ptr == argv[3] || *ptr != '\0' || alignment < 0) { /* same validation as DATA_OFFSET */ + fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[3]); + show_usage(argv[0]); + return -1; + } + + printf("Opening file '%s'\n", img_path); + int fd = open(img_path, O_WRONLY); + if (fd == -1) { + perror("open:"); + return -1; + } + + struct nd_pfn_sb sb = { 0 }; + + snprintf((char*)sb.signature, PFN_SIG_LEN, PFN_SIG); + sb.mode = __cpu_to_le32(PFN_MODE_RAM); /* struct fields are declared __le*: store them little-endian on every host */ + sb.align = __cpu_to_le32(alignment); + sb.dataoff = __cpu_to_le64(data_offset); + sb.version_minor = __cpu_to_le16(2); + + // checksum must be calculated at the end + sb.checksum = __cpu_to_le64(nd_sb_checksum((struct nd_gen_sb*) &sb)); + + // NVDIMM driver: SZ_4K is the namespace-relative starting offset + int ret = lseek(fd, SZ_4K, SEEK_SET); + if (ret == -1) { + perror("lseek: "); + close(fd); + return -1; + } + + printf("Writing metadata\n"); + ret = write(fd, &sb, sizeof(sb)); + if (ret == -1) { + perror("write: "); + close(fd); + return -1; /* never report success when the metadata was not written */ + } + close(fd); + printf("OK!\n"); + return 0; +}
diff --git a/rootfs-builder/rootfs.sh b/rootfs-builder/rootfs.sh index 653276e1b..a1b10eb61 100755 --- a/rootfs-builder/rootfs.sh +++ b/rootfs-builder/rootfs.sh @@ -362,7 +362,7 @@ mkdir -p ${ROOTFS_DIR} build_rootfs ${ROOTFS_DIR} pushd "${ROOTFS_DIR}" >> /dev/null if [ "$PWD" != "/" ] ; then - rm -rf ./var/cache/dnf/ + rm -rf ./var/cache/ ./var/lib fi popd >> /dev/null