mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-07-05 19:47:53 +00:00
Merge pull request #236 from devimc/topic/fixDAX
image-builder: fill out device namespace information into kata image
This commit is contained in:
commit
ecd072430f
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
image-builder/nsdax
|
5
Makefile
5
Makefile
@ -9,7 +9,6 @@ ROOTFS_BUILDER := $(MK_DIR)/rootfs-builder/rootfs.sh
|
|||||||
INITRD_BUILDER := $(MK_DIR)/initrd-builder/initrd_builder.sh
|
INITRD_BUILDER := $(MK_DIR)/initrd-builder/initrd_builder.sh
|
||||||
IMAGE_BUILDER := $(MK_DIR)/image-builder/image_builder.sh
|
IMAGE_BUILDER := $(MK_DIR)/image-builder/image_builder.sh
|
||||||
|
|
||||||
IMG_SIZE = 500
|
|
||||||
AGENT_INIT ?= no
|
AGENT_INIT ?= no
|
||||||
DISTRO ?= centos
|
DISTRO ?= centos
|
||||||
ROOTFS_BUILD_DEST := $(PWD)
|
ROOTFS_BUILD_DEST := $(PWD)
|
||||||
@ -42,7 +41,7 @@ image-%: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img
|
|||||||
.PRECIOUS: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img
|
.PRECIOUS: $(IMAGES_BUILD_DEST)/kata-containers-image-%.img
|
||||||
$(IMAGES_BUILD_DEST)/kata-containers-image-%.img: rootfs-%
|
$(IMAGES_BUILD_DEST)/kata-containers-image-%.img: rootfs-%
|
||||||
@echo Creating image based on $^
|
@echo Creating image based on $^
|
||||||
$(IMAGE_BUILDER) -s $(IMG_SIZE) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs
|
$(IMAGE_BUILDER) -o $@ $(ROOTFS_BUILD_DEST)/$*_rootfs
|
||||||
|
|
||||||
initrd-%: $(IMAGES_BUILD_DEST)/kata-containers-initrd-%.img
|
initrd-%: $(IMAGES_BUILD_DEST)/kata-containers-initrd-%.img
|
||||||
@ # DONT remove. This is not cancellation rule.
|
@ # DONT remove. This is not cancellation rule.
|
||||||
@ -63,7 +62,7 @@ image: $(DISTRO_IMAGE)
|
|||||||
|
|
||||||
$(DISTRO_IMAGE): $(DISTRO_ROOTFS_MARKER)
|
$(DISTRO_IMAGE): $(DISTRO_ROOTFS_MARKER)
|
||||||
@echo Creating image based on "$(DISTRO_ROOTFS)"
|
@echo Creating image based on "$(DISTRO_ROOTFS)"
|
||||||
$(IMAGE_BUILDER) -s "$(IMG_SIZE)" "$(DISTRO_ROOTFS)"
|
$(IMAGE_BUILDER) "$(DISTRO_ROOTFS)"
|
||||||
|
|
||||||
.PHONY: initrd
|
.PHONY: initrd
|
||||||
initrd: $(DISTRO_INITRD)
|
initrd: $(DISTRO_INITRD)
|
||||||
|
@ -7,4 +7,4 @@ From fedora:latest
|
|||||||
|
|
||||||
RUN [ -n "$http_proxy" ] && sed -i '$ a proxy='$http_proxy /etc/dnf/dnf.conf ; true
|
RUN [ -n "$http_proxy" ] && sed -i '$ a proxy='$http_proxy /etc/dnf/dnf.conf ; true
|
||||||
|
|
||||||
RUN dnf install -y qemu-img parted gdisk e2fsprogs
|
RUN dnf install -y qemu-img parted gdisk e2fsprogs gcc
|
||||||
|
@ -17,8 +17,11 @@ source "$lib_file"
|
|||||||
[ "$(id -u)" -eq 0 ] || die "$0: must be run as root"
|
[ "$(id -u)" -eq 0 ] || die "$0: must be run as root"
|
||||||
|
|
||||||
IMAGE="${IMAGE:-kata-containers.img}"
|
IMAGE="${IMAGE:-kata-containers.img}"
|
||||||
|
IMG_SIZE=128
|
||||||
AGENT_BIN=${AGENT_BIN:-kata-agent}
|
AGENT_BIN=${AGENT_BIN:-kata-agent}
|
||||||
AGENT_INIT=${AGENT_INIT:-no}
|
AGENT_INIT=${AGENT_INIT:-no}
|
||||||
|
IMG_HEADER_SZ=2
|
||||||
|
IMG_HEADER_SZ_B=$((IMG_HEADER_SZ*1024*1024))
|
||||||
|
|
||||||
usage()
|
usage()
|
||||||
{
|
{
|
||||||
@ -27,13 +30,10 @@ usage()
|
|||||||
Usage: ${script_name} [options] <rootfs-dir>
|
Usage: ${script_name} [options] <rootfs-dir>
|
||||||
This script will create a Kata Containers image file of
|
This script will create a Kata Containers image file of
|
||||||
an adequate size based on the <rootfs-dir> directory.
|
an adequate size based on the <rootfs-dir> directory.
|
||||||
The size of the image can be also be specified manually
|
|
||||||
by '-s' flag.
|
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
-h Show this help
|
-h Show this help
|
||||||
-o path to generate image file ENV: IMAGE
|
-o path to generate image file ENV: IMAGE
|
||||||
-s Image size in MB ENV: IMG_SIZE
|
|
||||||
-r Free space of the root partition in MB ENV: ROOT_FREE_SPACE
|
-r Free space of the root partition in MB ENV: ROOT_FREE_SPACE
|
||||||
|
|
||||||
Extra environment variables:
|
Extra environment variables:
|
||||||
@ -67,16 +67,6 @@ do
|
|||||||
h) usage ;;
|
h) usage ;;
|
||||||
o) IMAGE="${OPTARG}" ;;
|
o) IMAGE="${OPTARG}" ;;
|
||||||
r) ROOT_FREE_SPACE="${OPTARG}" ;;
|
r) ROOT_FREE_SPACE="${OPTARG}" ;;
|
||||||
s) {
|
|
||||||
IMG_SIZE=${OPTARG}
|
|
||||||
if [ ${IMG_SIZE} -le 0 ]; then
|
|
||||||
die "Image size has to be greater than 0 MB."
|
|
||||||
fi
|
|
||||||
if [ ${IMG_SIZE} -gt ${MAX_IMG_SIZE_MB} ]; then
|
|
||||||
die "Image size should not be greater than ${MAX_IMG_SIZE_MB} MB."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
;;
|
|
||||||
f) FS_TYPE="${OPTARG}" ;;
|
f) FS_TYPE="${OPTARG}" ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@ -152,6 +142,16 @@ align_memory()
|
|||||||
warning "image size '$IMG_SIZE' is not aligned to memory boundary '$MEM_BOUNDARY_MB', aligning it"
|
warning "image size '$IMG_SIZE' is not aligned to memory boundary '$MEM_BOUNDARY_MB', aligning it"
|
||||||
IMG_SIZE=$(($IMG_SIZE + $MEM_BOUNDARY_MB - $remaining))
|
IMG_SIZE=$(($IMG_SIZE + $MEM_BOUNDARY_MB - $remaining))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# To support:
|
||||||
|
# * memory hotplug: the image size MUST BE aligned to MEM_BOUNDARY_MB (128 or 1024 MB)
|
||||||
|
# * DAX: NVDIMM driver reads the device namespace information from nvdimm namespace (4K offset).
|
||||||
|
# The namespace information is saved in the first 2MB of the image.
|
||||||
|
# * DAX huge pages [2]: 2MB alignment
|
||||||
|
#
|
||||||
|
# [1] - nd_pfn_validate(): https://github.com/torvalds/linux/blob/master/drivers/nvdimm/pfn_devs.c
|
||||||
|
# [2] - https://nvdimm.wiki.kernel.org/2mib_fs_dax
|
||||||
|
IMG_SIZE=$((IMG_SIZE-IMG_HEADER_SZ))
|
||||||
}
|
}
|
||||||
|
|
||||||
# Calculate image size based on the rootfs
|
# Calculate image size based on the rootfs
|
||||||
@ -223,8 +223,10 @@ create_rootfs_disk()
|
|||||||
# The partition is the rootfs content
|
# The partition is the rootfs content
|
||||||
|
|
||||||
info "Creating partitions"
|
info "Creating partitions"
|
||||||
parted "${IMAGE}" --script "mklabel gpt" \
|
parted -s -a optimal "${IMAGE}" \
|
||||||
"mkpart ${FS_TYPE} 1M -1M"
|
mklabel gpt -- \
|
||||||
|
mkpart primary "${FS_TYPE}" 1M -1M \
|
||||||
|
print
|
||||||
OK "Partitions created"
|
OK "Partitions created"
|
||||||
|
|
||||||
# Get the loop device bound to the image file (requires /dev mounted in the
|
# Get the loop device bound to the image file (requires /dev mounted in the
|
||||||
@ -264,7 +266,7 @@ create_rootfs_disk()
|
|||||||
# of disk creation by adding 5% in the initial assumed value $ROOTFS_SIZE
|
# of disk creation by adding 5% in the initial assumed value $ROOTFS_SIZE
|
||||||
if [ $ROOTFS_SIZE -gt $AVAIL_DISK ]; then
|
if [ $ROOTFS_SIZE -gt $AVAIL_DISK ]; then
|
||||||
# Increase the size but remain aligned to the original MEM_BOUNDARY_MB, which is stored in $ORIG_MEM_BOUNDARY_MB
|
# Increase the size but remain aligned to the original MEM_BOUNDARY_MB, which is stored in $ORIG_MEM_BOUNDARY_MB
|
||||||
MEM_BOUNDARY_MB=$(($MEM_BOUNDARY_MB+$ORIG_MEM_BOUNDARY_MB))
|
MEM_BOUNDARY_MB=$((MEM_BOUNDARY_MB+ORIG_MEM_BOUNDARY_MB))
|
||||||
OLD_IMG_SIZE=${IMG_SIZE}
|
OLD_IMG_SIZE=${IMG_SIZE}
|
||||||
unset IMG_SIZE
|
unset IMG_SIZE
|
||||||
unmount
|
unmount
|
||||||
@ -279,6 +281,7 @@ create_rootfs_disk
|
|||||||
info "rootfs size ${ROOTFS_SIZE} MB"
|
info "rootfs size ${ROOTFS_SIZE} MB"
|
||||||
info "Copying content from rootfs to root partition"
|
info "Copying content from rootfs to root partition"
|
||||||
cp -a "${ROOTFS}"/* ${MOUNT_DIR}
|
cp -a "${ROOTFS}"/* ${MOUNT_DIR}
|
||||||
|
sync
|
||||||
OK "rootfs copied"
|
OK "rootfs copied"
|
||||||
|
|
||||||
unmount
|
unmount
|
||||||
@ -286,4 +289,20 @@ unmount
|
|||||||
fsck.ext4 -D -y "${DEVICE}p1"
|
fsck.ext4 -D -y "${DEVICE}p1"
|
||||||
detach
|
detach
|
||||||
|
|
||||||
|
info "Set device namespace information (metadata)"
|
||||||
|
# Fill out namespace information
|
||||||
|
tmp_img="$(mktemp)"
|
||||||
|
chmod 0644 "${tmp_img}"
|
||||||
|
# metadata header
|
||||||
|
dd if=/dev/zero of="${tmp_img}" bs="${IMG_HEADER_SZ}M" count=1
|
||||||
|
# append image data (rootfs)
|
||||||
|
dd if="${IMAGE}" of="${tmp_img}" oflag=append conv=notrunc
|
||||||
|
# copy final image
|
||||||
|
mv "${tmp_img}" "${IMAGE}"
|
||||||
|
# Set metadata header
|
||||||
|
# Issue: https://github.com/kata-containers/osbuilder/issues/240
|
||||||
|
gcc -O2 "${script_dir}/nsdax.gpl.c" -o "${script_dir}/nsdax"
|
||||||
|
"${script_dir}/nsdax" "${IMAGE}" "${IMG_HEADER_SZ_B}" "${IMG_HEADER_SZ_B}"
|
||||||
|
sync
|
||||||
|
|
||||||
info "Image created. Virtual size: ${IMG_SIZE}MB."
|
info "Image created. Virtual size: ${IMG_SIZE}MB."
|
||||||
|
171
image-builder/nsdax.gpl.c
Normal file
171
image-builder/nsdax.gpl.c
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
/*
|
||||||
|
* Copyright(c) 2013-2019 Intel Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of version 2 of the GNU General Public License as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but
|
||||||
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* General Public License for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#define __KERNEL__
|
||||||
|
#include <linux/types.h>
|
||||||
|
#include <linux/byteorder/little_endian.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
Next types, definitions and functions were copied from kernel 4.19.24 source
|
||||||
|
code, specifically from nvdimm driver
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define PFN_SIG_LEN 16
|
||||||
|
#define PFN_SIG "NVDIMM_PFN_INFO"
|
||||||
|
#define SZ_4K 0x00001000
|
||||||
|
|
||||||
|
typedef __u16 u16;
|
||||||
|
typedef __u8 u8;
|
||||||
|
typedef __u64 u64;
|
||||||
|
typedef __u32 u32;
|
||||||
|
|
||||||
|
/*
 * Values stored in the 'mode' field of struct nd_pfn_sb.
 * This tool only ever writes PFN_MODE_RAM (see main()).
 */
enum nd_pfn_mode {
|
||||||
|
PFN_MODE_NONE,
|
||||||
|
PFN_MODE_RAM,
|
||||||
|
PFN_MODE_PMEM,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Namespace info block written into the image by main() at byte offset
 * SZ_4K. The members add up to 4096 bytes (SZ_4K), the same size as
 * struct nd_gen_sb, which is the type main() casts this block to when
 * computing the checksum. The member order and sizes therefore must not
 * be changed.
 */
struct nd_pfn_sb {
|
||||||
|
u8 signature[PFN_SIG_LEN];
|
||||||
|
u8 uuid[16];
|
||||||
|
u8 parent_uuid[16];
|
||||||
|
__le32 flags;
|
||||||
|
__le16 version_major;
|
||||||
|
__le16 version_minor;
|
||||||
|
__le64 dataoff; /* relative to namespace_base + start_pad */
|
||||||
|
__le64 npfns;
|
||||||
|
__le32 mode;
|
||||||
|
/* minor-version-1 additions for section alignment */
|
||||||
|
__le32 start_pad;
|
||||||
|
||||||
|
__le32 end_trunc;
|
||||||
|
/* minor-version-2 record the base alignment of the mapping */
|
||||||
|
__le32 align;
|
||||||
|
u8 padding[4000];
|
||||||
|
__le64 checksum;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Generic view of a 4K info block: SZ_4K - 8 opaque bytes followed by the
 * 64-bit checksum in the last 8 bytes. nd_sb_checksum() operates on this
 * type so that it does not need to know the concrete block layout.
 */
struct nd_gen_sb {
|
||||||
|
char reserved[SZ_4K - 8];
|
||||||
|
__le64 checksum;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
u64 nd_fletcher64(void *addr, size_t len, bool le)
|
||||||
|
{
|
||||||
|
u32 *buf = addr;
|
||||||
|
u32 lo32 = 0;
|
||||||
|
u64 hi32 = 0;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < len / sizeof(u32); i++) {
|
||||||
|
lo32 += le ? __le32_to_cpu((__le32) buf[i]) : buf[i];
|
||||||
|
hi32 += lo32;
|
||||||
|
}
|
||||||
|
|
||||||
|
return hi32 << 32 | lo32;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* nd_sb_checksum: compute checksum for a generic info block
|
||||||
|
*
|
||||||
|
* Returns a fletcher64 checksum of everything in the given info block
|
||||||
|
* except the last field (since that's where the checksum lives).
|
||||||
|
*/
|
||||||
|
u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
|
||||||
|
{
|
||||||
|
u64 sum;
|
||||||
|
__le64 sum_save;
|
||||||
|
|
||||||
|
sum_save = nd_gen_sb->checksum;
|
||||||
|
nd_gen_sb->checksum = 0;
|
||||||
|
sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1);
|
||||||
|
nd_gen_sb->checksum = sum_save;
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * show_usage: print the command-line synopsis to stdout.
 *
 * name: program name to show in the synopsis (normally argv[0]).
 */
void show_usage(const char* name) {
	printf("Usage: %s IMAGE_FILE DATA_OFFSET ALIGNMENT\n", name);
	fputs("DATA_OFFSET and ALIGNMENT must be in bytes\n", stdout);
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
if (argc != 4) {
|
||||||
|
show_usage(argv[0]);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* img_path = argv[1];
|
||||||
|
|
||||||
|
char *ptr = NULL;
|
||||||
|
const long int data_offset = strtol(argv[2], &ptr, 10);
|
||||||
|
if (ptr == argv[2]) {
|
||||||
|
fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[2]);
|
||||||
|
show_usage(argv[0]);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ptr = NULL;
|
||||||
|
const long int alignment = strtol(argv[3], &ptr, 10);
|
||||||
|
if (ptr == argv[3]) {
|
||||||
|
fprintf(stderr, "Couldn't convert string '%s' to int\n", argv[3]);
|
||||||
|
show_usage(argv[0]);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Opening file '%s'\n", img_path);
|
||||||
|
int fd = open(img_path, O_WRONLY);
|
||||||
|
if (fd == -1) {
|
||||||
|
perror("open:");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct nd_pfn_sb sb = { 0 };
|
||||||
|
|
||||||
|
snprintf((char*)sb.signature, PFN_SIG_LEN, PFN_SIG);
|
||||||
|
sb.mode = PFN_MODE_RAM;
|
||||||
|
sb.align = alignment;
|
||||||
|
sb.dataoff = data_offset;
|
||||||
|
sb.version_minor = 2;
|
||||||
|
|
||||||
|
// checksum must be calculated at the end
|
||||||
|
sb.checksum = nd_sb_checksum((struct nd_gen_sb*) &sb);
|
||||||
|
|
||||||
|
// NVDIMM driver: SZ_4K is the namespace-relative starting offset
|
||||||
|
int ret = lseek(fd, SZ_4K, SEEK_SET);
|
||||||
|
if (ret == -1) {
|
||||||
|
perror("lseek: ");
|
||||||
|
close(fd);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Writing metadata\n");
|
||||||
|
ret = write(fd, &sb, sizeof(sb));
|
||||||
|
if (ret == -1) {
|
||||||
|
perror("write: ");
|
||||||
|
}
|
||||||
|
|
||||||
|
close(fd);
|
||||||
|
printf("OK!\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -362,7 +362,7 @@ mkdir -p ${ROOTFS_DIR}
|
|||||||
build_rootfs ${ROOTFS_DIR}
|
build_rootfs ${ROOTFS_DIR}
|
||||||
pushd "${ROOTFS_DIR}" >> /dev/null
|
pushd "${ROOTFS_DIR}" >> /dev/null
|
||||||
if [ "$PWD" != "/" ] ; then
|
if [ "$PWD" != "/" ] ; then
|
||||||
rm -rf ./var/cache/dnf/
|
rm -rf ./var/cache/ ./var/lib
|
||||||
fi
|
fi
|
||||||
popd >> /dev/null
|
popd >> /dev/null
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user