From 726f798ff795ef4a8300201cab8d83e83c1496a5 Mon Sep 17 00:00:00 2001
From: Julio Montes
Date: Fri, 8 Mar 2019 10:40:24 -0600
Subject: [PATCH] image-builder: fill out device namespace information into kata image

The new NVDIMM driver implementation (kernel >= 4.16) needs to know the device
namespace information to map pages; this metadata is read from the nvdimm
namespace at 4k offset.

fixes #235

Signed-off-by: Julio Montes
---
 .gitignore                     |   1 +
 image-builder/image_builder.sh |  34 ++++++-
 image-builder/nsdax.gpl.c      | 171 +++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 image-builder/nsdax.gpl.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..f177a5587f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+image-builder/nsdax
diff --git a/image-builder/image_builder.sh b/image-builder/image_builder.sh
index 0177830b50..d66b4b9cf3 100755
--- a/image-builder/image_builder.sh
+++ b/image-builder/image_builder.sh
@@ -20,6 +20,8 @@ IMAGE="${IMAGE:-kata-containers.img}"
 IMG_SIZE=128
 AGENT_BIN=${AGENT_BIN:-kata-agent}
 AGENT_INIT=${AGENT_INIT:-no}
+IMG_HEADER_SZ=2
+IMG_HEADER_SZ_B=$((IMG_HEADER_SZ*1024*1024))
 
 usage()
 {
@@ -140,6 +142,16 @@ align_memory()
 		warning "image size '$IMG_SIZE' is not aligned to memory boundary '$MEM_BOUNDARY_MB', aligning it"
 		IMG_SIZE=$(($IMG_SIZE + $MEM_BOUNDARY_MB - $remaining))
 	fi
+
+	# To support:
+	# * memory hotplug: the image size MUST BE aligned to MEM_BOUNDARY_MB (128 or 1024 MB)
+	# * DAX: NVDIMM driver reads the device namespace information from nvdimm namespace (4K offset) [1].
+	#        The namespace information is saved in the first 2MB of the image.
+	# * DAX huge pages [2]: 2MB alignment
+	#
+	# [1] - nd_pfn_validate(): https://github.com/torvalds/linux/blob/master/drivers/nvdimm/pfn_devs.c
+	# [2] - https://nvdimm.wiki.kernel.org/2mib_fs_dax
+	IMG_SIZE=$((IMG_SIZE-IMG_HEADER_SZ))
 }
 
 # Calculate image size based on the rootfs
@@ -211,8 +223,10 @@ create_rootfs_disk()
 	# The partition is the rootfs content
 
 	info "Creating partitions"
-	parted "${IMAGE}" --script "mklabel gpt" \
-	"mkpart ${FS_TYPE} 1M -1M"
+	parted -s -a optimal "${IMAGE}" \
+		mklabel gpt -- \
+		mkpart primary "${FS_TYPE}" 1M -1M \
+		print
 	OK "Partitions created"
 
 	# Get the loop device bound to the image file (requires /dev mounted in the
@@ -275,4 +289,20 @@ unmount
 fsck.ext4 -D -y "${DEVICE}p1"
 detach
 
+info "Set device namespace information (metadata)"
+# Fill out namespace information
+tmp_img="$(mktemp)"
+chmod 0644 "${tmp_img}"
+# metadata header
+dd if=/dev/zero of="${tmp_img}" bs="${IMG_HEADER_SZ}M" count=1
+# append image data (rootfs)
+dd if="${IMAGE}" of="${tmp_img}" oflag=append conv=notrunc
+# copy final image
+mv "${tmp_img}" "${IMAGE}"
+# Set metadata header
+# Issue: https://github.com/kata-containers/osbuilder/issues/240
+gcc -O2 "${script_dir}/nsdax.gpl.c" -o "${script_dir}/nsdax"
+"${script_dir}/nsdax" "${IMAGE}" "${IMG_HEADER_SZ_B}" "${IMG_HEADER_SZ_B}"
+sync
+
 info "Image created. Virtual size: ${IMG_SIZE}MB."
diff --git a/image-builder/nsdax.gpl.c b/image-builder/nsdax.gpl.c
new file mode 100644
index 0000000000..333f7804b8
--- /dev/null
+++ b/image-builder/nsdax.gpl.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright(c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#define __KERNEL__
+#include <linux/types.h>
+#include <linux/byteorder/little_endian.h>
+
+/*
+ * The following types, definitions and functions were copied from the
+ * kernel 4.19.24 source code, specifically from the nvdimm driver.
+ */
+
+#define PFN_SIG_LEN 16
+#define PFN_SIG "NVDIMM_PFN_INFO"
+#define SZ_4K 0x00001000
+
+typedef __u16 u16;
+typedef __u8 u8;
+typedef __u64 u64;
+typedef __u32 u32;
+
+enum nd_pfn_mode {
+	PFN_MODE_NONE,
+	PFN_MODE_RAM,
+	PFN_MODE_PMEM,
+};
+
+/*
+ * Namespace information block written into the image. All multi-byte
+ * fields are little-endian and the whole structure is exactly SZ_4K
+ * bytes, with the checksum stored in its last eight bytes.
+ */
+struct nd_pfn_sb {
+	u8 signature[PFN_SIG_LEN];
+	u8 uuid[16];
+	u8 parent_uuid[16];
+	__le32 flags;
+	__le16 version_major;
+	__le16 version_minor;
+	__le64 dataoff; /* relative to namespace_base + start_pad */
+	__le64 npfns;
+	__le32 mode;
+	/* minor-version-1 additions for section alignment */
+	__le32 start_pad;
+	__le32 end_trunc;
+	/* minor-version-2 record the base alignment of the mapping */
+	__le32 align;
+	u8 padding[4000];
+	__le64 checksum;
+};
+
+/*
+ * Generic view of a SZ_4K info block: opaque payload followed by the
+ * checksum in the last eight bytes.
+ */
+struct nd_gen_sb {
+	char reserved[SZ_4K - 8];
+	__le64 checksum;
+};
+
+
+/*
+ * nd_fletcher64: checksum a buffer of 32-bit words
+ *
+ * addr: start of the buffer
+ * len:  buffer length in bytes; only whole 32-bit words are summed, so
+ *       up to three trailing bytes are ignored
+ * le:   when true every word is converted from little-endian to CPU byte
+ *       order before it is added
+ *
+ * Returns the running sum of the words in the low 32 bits and the sum of
+ * the intermediate running sums in the high 32 bits.
+ */
+u64 nd_fletcher64(void *addr, size_t len, bool le)
+{
+	u32 *buf = addr;
+	u32 lo32 = 0;
+	u64 hi32 = 0;
+	size_t i;
+
+	for (i = 0; i < len / sizeof(u32); i++) {
+		lo32 += le ? __le32_to_cpu((__le32) buf[i]) : buf[i];
+		hi32 += lo32;
+	}
+
+	return hi32 << 32 | lo32;
+}
+
+
+/*
+ * nd_sb_checksum: compute checksum for a generic info block
+ *
+ * Returns a fletcher64 checksum of everything in the given info block
+ * except the last field (since that's where the checksum lives).
+ */
+u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
+{
+	u64 sum;
+	__le64 sum_save;
+
+	/* The sum is taken with the checksum field zeroed, then the field is restored */
+	sum_save = nd_gen_sb->checksum;
+	nd_gen_sb->checksum = 0;
+	sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1);
+	nd_gen_sb->checksum = sum_save;
+	return sum;
+}
+
+
+/* Print the command line synopsis to stdout; name is the program name. */
+void show_usage(const char *name)
+{
+	printf("Usage: %s IMAGE_FILE DATA_OFFSET ALIGNMENT\n", name);
+	printf("DATA_OFFSET and ALIGNMENT must be in bytes\n");
+}
+
+/*
+ * Parse a non-negative decimal byte count.
+ *
+ * Returns 0 and stores the value in *out on success. Returns -1 when text
+ * has no digits, has trailing characters after the number, or is negative.
+ */
+static int parse_bytes(const char *text, long int *out)
+{
+	char *end = NULL;
+	const long int value = strtol(text, &end, 10);
+
+	if (end == text || *end != '\0' || value < 0) {
+		return -1;
+	}
+
+	*out = value;
+	return 0;
+}
+
+/*
+ * Usage: nsdax IMAGE_FILE DATA_OFFSET ALIGNMENT
+ *
+ * Builds a struct nd_pfn_sb (signature, RAM mode, minor version 2, the given
+ * data offset and alignment, plus its checksum) and writes it into the
+ * existing IMAGE_FILE at offset SZ_4K.
+ *
+ * Returns EXIT_SUCCESS when the whole block was written and the file closed
+ * cleanly, EXIT_FAILURE otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 4) {
+		show_usage(argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	const char *img_path = argv[1];
+
+	long int data_offset = 0;
+	if (parse_bytes(argv[2], &data_offset) != 0) {
+		fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[2]);
+		show_usage(argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	long int alignment = 0;
+	if (parse_bytes(argv[3], &alignment) != 0) {
+		fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[3]);
+		show_usage(argv[0]);
+		return EXIT_FAILURE;
+	}
+	/* The align field is 32 bits wide: refuse values it would truncate */
+	if ((unsigned long)alignment > 0xffffffffUL) {
+		fprintf(stderr, "ALIGNMENT '%s' does not fit in 32 bits\n", argv[3]);
+		show_usage(argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	printf("Opening file '%s'\n", img_path);
+	int fd = open(img_path, O_WRONLY);
+	if (fd == -1) {
+		perror("open");
+		return EXIT_FAILURE;
+	}
+
+	struct nd_pfn_sb sb = { 0 };
+
+	/* Every multi-byte field of the block is stored little-endian */
+	snprintf((char *)sb.signature, sizeof(sb.signature), "%s", PFN_SIG);
+	sb.mode = __cpu_to_le32(PFN_MODE_RAM);
+	sb.align = __cpu_to_le32((u32)alignment);
+	sb.dataoff = __cpu_to_le64((u64)data_offset);
+	sb.version_minor = __cpu_to_le16(2);
+
+	// checksum must be calculated at the end
+	sb.checksum = __cpu_to_le64(nd_sb_checksum((struct nd_gen_sb *)&sb));
+
+	int status = EXIT_FAILURE;
+	ssize_t written = 0;
+
+	// NVDIMM driver: SZ_4K is the namespace-relative starting offset
+	if (lseek(fd, SZ_4K, SEEK_SET) == (off_t)-1) {
+		perror("lseek");
+		goto out;
+	}
+
+	printf("Writing metadata\n");
+	written = write(fd, &sb, sizeof(sb));
+	if (written == -1) {
+		perror("write");
+		goto out;
+	}
+	/* A partial block would leave the image with a corrupt header */
+	if ((size_t)written != sizeof(sb)) {
+		fprintf(stderr, "write: short write (%zd of %zu bytes)\n",
+			written, sizeof(sb));
+		goto out;
+	}
+
+	status = EXIT_SUCCESS;
+out:
+	/* close() can report a deferred write error, so it counts as a failure */
+	if (close(fd) == -1) {
+		perror("close");
+		status = EXIT_FAILURE;
+	}
+
+	if (status == EXIT_SUCCESS) {
+		printf("OK!\n");
+	}
+
+	return status;
+}