projects: add shiftfs project

Signed-off-by: Tycho Andersen <tycho@docker.com>
This commit is contained in:
Tycho Andersen 2017-06-13 11:07:41 -06:00
parent fd00f19c59
commit d29b2a909c
9 changed files with 5082 additions and 0 deletions

View File

@ -23,6 +23,8 @@ If you want to create a project, please submit a pull request to create a new di
- [kernel-config](kernel-config/) an experiment on how to manage kernel config
- [IMA-namespace](ima-namespace/) patches for supporting per-mount-namespace
IMA policies
- [shiftfs](shiftfs/) is a filesystem for mapping mountpoints across user
namespaces
## Current projects not yet documented
- VMWare support (VMWare)

View File

@ -0,0 +1,73 @@
# Build stage: compiles the patched kernel inside the pinned kernel-compile image.
FROM linuxkit/kernel-compile:1b396c221af673757703258159ddc8539843b02b@sha256:6b32d205bfc6407568324337b707d195d027328dbfec554428ea93e7b0a8299b AS kernel-build
# KERNEL_VERSION: full upstream version to download, e.g. 4.11.4
ARG KERNEL_VERSION
# KERNEL_SERIES: selects kernel_config-<series> and patches-<series>, e.g. 4.11.x
ARG KERNEL_SERIES
# DEBUG: any non-empty value merges kernel_config.debug and also exports vmlinux
ARG DEBUG
ENV KERNEL_SOURCE=https://www.kernel.org/pub/linux/kernel/v4.x/linux-${KERNEL_VERSION}.tar.xz
# Download the source tarball and unpack it to /linux
RUN curl -fsSL -o linux-${KERNEL_VERSION}.tar.xz ${KERNEL_SOURCE}
RUN cat linux-${KERNEL_VERSION}.tar.xz | tar --absolute-names -xJ && mv /linux-${KERNEL_VERSION} /linux
# The series config replaces the x86_64 defconfig so the "make defconfig" step below starts from it
COPY kernel_config-${KERNEL_SERIES} /linux/arch/x86/configs/x86_64_defconfig
COPY kernel_config.debug /linux/debug_config
# Debug builds: turn off panic-on-oops and append the debug options to the config
RUN if [ -n "${DEBUG}" ]; then \
sed -i 's/CONFIG_PANIC_ON_OOPS=y/# CONFIG_PANIC_ON_OOPS is not set/' /linux/arch/x86/configs/x86_64_defconfig; \
cat /linux/debug_config >> /linux/arch/x86/configs/x86_64_defconfig; \
fi
# Apply local patches
COPY patches-${KERNEL_SERIES} /patches
WORKDIR /linux
# "set -e" makes the build stop at the first patch that fails to apply
RUN set -e && for patch in /patches/*.patch; do \
echo "Applying $patch"; \
patch -p1 < "$patch"; \
done
# Everything placed in /out is copied into the final image
RUN mkdir /out
# Kernel
# (vmlinux is only exported for DEBUG builds; "|| true" keeps non-debug builds from failing)
# NOTE(review): -fno-pie presumably works around a PIE-by-default compiler - confirm
RUN make defconfig && \
make oldconfig && \
make -j "$(getconf _NPROCESSORS_ONLN)" KCFLAGS="-fno-pie" && \
cp arch/x86_64/boot/bzImage /out/kernel && \
cp System.map /out && \
([ -n "${DEBUG}" ] && cp vmlinux /out || true)
# Modules
# (the build/source links are replaced by one link to /usr/src/linux-headers-<version>)
RUN make INSTALL_MOD_PATH=/tmp/kernel-modules modules_install && \
( DVER=$(basename $(find /tmp/kernel-modules/lib/modules/ -mindepth 1 -maxdepth 1)) && \
cd /tmp/kernel-modules/lib/modules/$DVER && \
rm build source && \
ln -s /usr/src/linux-headers-$DVER build ) && \
( cd /tmp/kernel-modules && tar cf /out/kernel.tar lib )
# Headers (userspace API)
RUN mkdir -p /tmp/kernel-headers/usr && \
make INSTALL_HDR_PATH=/tmp/kernel-headers/usr headers_install && \
( cd /tmp/kernel-headers && tar cf /out/kernel-headers.tar usr )
# Headers (kernel development)
# (.config, Module.symvers and the build-system files, laid out under usr/src/linux-headers-<version>)
RUN DVER=$(basename $(find /tmp/kernel-modules/lib/modules/ -mindepth 1 -maxdepth 1)) && \
dir=/tmp/usr/src/linux-headers-$DVER && \
mkdir -p $dir && \
cp /linux/.config $dir && \
cp /linux/Module.symvers $dir && \
find . -path './include/*' -prune -o \
-path './arch/*/include' -prune -o \
-path './scripts/*' -prune -o \
-type f \( -name 'Makefile*' -o -name 'Kconfig*' -o -name 'Kbuild*' -o \
-name '*.lds' -o -name '*.pl' -o -name '*.sh' \) | \
tar cf - -T - | (cd $dir; tar xf -) && \
( cd /tmp && tar cf /out/kernel-dev.tar usr/src )
# Record where the sources came from. The URL is passed as a %s argument so that
# a '%' or '\' in it can never be interpreted as part of the printf format.
RUN printf "KERNEL_SOURCE=%s\n" "${KERNEL_SOURCE}" > /out/kernel-source-info
# Final image: starts from scratch and contains only the artefacts collected in /out
FROM scratch
ENTRYPOINT []
CMD []
WORKDIR /
COPY --from=kernel-build /out/* /

66
projects/shiftfs/Makefile Normal file
View File

@ -0,0 +1,66 @@
# This builds the supported LinuxKit kernels. Kernels are wrapped up
# in a minimal toybox container, which contains the bzImage, a tar
# ball with modules and the kernel source.
#
# Each kernel is pushed to hub twice, once as
# $(ORG)/$(IMAGE):<kernel>.<major>.<minor>-<hash> and once as
# $(ORG)/$(IMAGE):<kernel>.<major>.x. The <hash> is the git tree hash
# of the current directory. The build will only rebuild the kernel
# image if the git tree hash changed.
# Git tree hash of this directory. Override to force build
HASH?=$(shell git ls-tree HEAD -- ../$(notdir $(CURDIR)) | awk '{print $$3}')
# Name and Org on Hub
ORG?=linuxkitprojects
IMAGE:=kernel-shiftfs
# None of the aggregate targets creates a file, so all of them are phony.
# "build" is listed too: otherwise a stray file named "build" would make
# "make build" a silent no-op.
.PHONY: build check tag push sign
# Targets:
# build: builds all kernels
# push: pushes all tagged kernel images to hub
# sign: sign and push all kernel images to hub
build:
push:
sign:
# A template for defining kernel build
# Arguments:
# $1: Full kernel version, e.g., 4.9.22
# $2: Kernel "series", e.g., 4.9.x
# $3: Build a debug kernel (used as suffix for image)
# This defines targets like:
# build_4.9.x, push_4.9.x and sign_4.9.x and adds them as dependencies
# to the global targets
# Set $3 to "_dbg", to build debug kernels. This defines targets like
# build_4.9.x_dbg and adds "_dbg" to the hub image name.
#
# Every recipe starts with "docker pull" of the image tagged with the tree
# hash and only runs the part after "||" when that pull fails, i.e. the
# image is built (or pushed/signed) only if it is not on hub yet.
# $3 is also handed to the Dockerfile as the DEBUG build argument.
define kernel
build_$(2)$(3): Dockerfile Makefile $(wildcard patches-$(2)/*) kernel_config-$(2) kernel_config.debug
docker pull $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) || \
docker build \
--build-arg KERNEL_VERSION=$(1) \
--build-arg KERNEL_SERIES=$(2) \
--build-arg DEBUG=$(3) \
--no-cache -t $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) .
push_$(2)$(3): build_$(2)$(3)
docker pull $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) || \
(docker push $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) && \
docker tag $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) $(ORG)/$(IMAGE):$(2)$(3) && \
docker push $(ORG)/$(IMAGE):$(2)$(3))
sign_$(2)$(3): build_$(2)$(3)
DOCKER_CONTENT_TRUST=1 docker pull $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) || \
(DOCKER_CONTENT_TRUST=1 docker push $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) && \
docker tag $(ORG)/$(IMAGE):$(1)$(3)-$(HASH) $(ORG)/$(IMAGE):$(2)$(3) && \
DOCKER_CONTENT_TRUST=1 docker push $(ORG)/$(IMAGE):$(2)$(3))
build: build_$(2)$(3)
push: push_$(2)$(3)
sign: sign_$(2)$(3)
endef
#
# Build Targets
# Debug targets only for latest stable and LTS stable
#
$(eval $(call kernel,4.11.4,4.11.x))

View File

@ -0,0 +1,30 @@
## shiftfs
Shiftfs is a virtual filesystem for mapping mountpoints across user namespaces.
The idea is that it would be useful for dockerds spawning containers: they can
keep filesystems on the host disk in terms of real root, but mount the
container roots via shiftfs, allowing containers to share a particular
filesystem with different uid maps, while not having to uidshift every file on
disk (and thus destroying some of the sharing properties).
The version included here is the v2 version of shiftfs, using the superblock's
user namespace instead of mountopts to figure out mappings. Thus, an extra step
of "marking" mounts is needed. For example:
# mkdir source
# touch source/foo # a root owned file
# mount -t shiftfs -o mark source source
# chmod 777 source
Now, let's make a user namespace:
# setuid 1000 unshare -rm
# cat /proc/self/uidmap
0 1000 1
# mkdir dest
# mount -t shiftfs source dest
# stat dest/foo | grep Uid
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
And thanks to the magic of shiftfs, the file is root owned in the user
namespace.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,26 @@
## LinuxKit DEBUG OPTIONS ##
CONFIG_LOCKDEP=y
CONFIG_FRAME_POINTER=y
CONFIG_LOCKUP_DETECTOR=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEBUG_TIMEKEEPING=y
CONFIG_DEBUG_RT_MUTEXES=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DEBUG_LIST=y
CONFIG_DEBUG_NOTIFIERS=y
CONFIG_PROVE_RCU=y
CONFIG_RCU_TRACE=y
CONFIG_KGDB=y
CONFIG_KGDB_SERIAL_CONSOLE=y
CONFIG_KGDBOC=y
CONFIG_DEBUG_RODATA_TEST=y
CONFIG_DEBUG_WX=y

View File

@ -0,0 +1,929 @@
From bec86f3997034944e349e947808dc1766f79767d Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Fri, 14 Apr 2017 14:22:01 -0600
Subject: [PATCH 1/2] shiftfs: uid/gid shifting filesystem (s_user_ns version)
This allows any subtree to be uid/gid shifted and bound elsewhere. It
does this by operating similarly to overlayfs. Its primary use is for
shifting the underlying uids of filesystems used to support
unprivileged (uid shifted) containers. The usual use case here is
that the container is operating with an uid shifted unprivileged root
but sometimes needs to make use of or work with a filesystem image
that has root at real uid 0.
The mechanism is to allow any subordinate mount namespace to mount a
shiftfs filesystem (by marking it FS_USERNS_MOUNT) but only allowing
it to mount marked subtrees (using the -o mark option as root). Once
mounted, the subtree is mapped via the super block user namespace so
that the interior ids of the mounting user namespace are the ids
written to the filesystem.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
v1 - based on original shiftfs with uid mappings now done via s_user_ns
v2 - fix revalidation of dentries
add inode aliasing
---
fs/Kconfig | 8 +
fs/Makefile | 1 +
fs/shiftfs.c | 847 +++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/magic.h | 2 +
4 files changed, 858 insertions(+)
diff --git a/fs/Kconfig b/fs/Kconfig
index b0e42b6a96b9..a66dff8d4256 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -106,6 +106,14 @@ source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"
+config SHIFT_FS
+ tristate "UID/GID shifting overlay filesystem for containers"
+ help
+ This filesystem can overlay any mounted filesystem and shift
+ the uid/gid the files appear at. The idea is that
+ unprivileged containers can use this to mount root volumes
+ using this technique.
+
menu "Caches"
source "fs/fscache/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..2aa3ad47a286 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -128,3 +128,4 @@ obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-$(CONFIG_SHIFT_FS) += shiftfs.o
diff --git a/fs/shiftfs.c b/fs/shiftfs.c
new file mode 100644
index 000000000000..ea8ac57b3ce1
--- /dev/null
+++ b/fs/shiftfs.c
@@ -0,0 +1,847 @@
+#include <linux/cred.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/uidgid.h>
+#include <linux/xattr.h>
+/* Per-superblock state, stored in sb->s_fs_info by shiftfs_fill_super(). */
+struct shiftfs_super_info {
+ struct vfsmount *mnt; /* underlying mount; mntput() in shiftfs_put_super() */
+ struct user_namespace *userns; /* installed as cred->user_ns while shifted */
+ bool mark; /* true when mounted with "-o mark" */
+};
+
+static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
+ struct dentry *dentry);
+/* Token ids that match_token() yields for the option table below. */
+enum {
+ OPT_MARK,
+ OPT_LAST, /* table terminator, not a real option */
+};
+
+/* global filesystem options */
+static const match_table_t tokens = {
+ { OPT_MARK, "mark" },
+ { OPT_LAST, NULL }
+};
+/* Build a cred copy whose fsuid/fsgid are mapped through sb->s_user_ns. */
+static const struct cred *shiftfs_get_up_creds(struct super_block *sb)
+{
+ struct shiftfs_super_info *ssi = sb->s_fs_info;
+ struct cred *cred = prepare_creds();
+
+ if (!cred)
+ return NULL;
+ /* the id as seen inside s_user_ns is reused as the raw kernel id */
+ cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
+ cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
+ put_user_ns(cred->user_ns);
+ cred->user_ns = get_user_ns(ssi->userns);
+ /* the caller owns the returned reference */
+ return cred;
+}
+/* Switch to shifted creds; returns the previous creds, or NULL on failure. */
+static const struct cred *shiftfs_new_creds(const struct cred **newcred,
+ struct super_block *sb)
+{
+ const struct cred *cred = shiftfs_get_up_creds(sb);
+ /* hand the new creds back so shiftfs_old_creds() can drop them */
+ *newcred = cred;
+
+ if (cred)
+ cred = override_creds(cred);
+ else
+ printk(KERN_ERR "shiftfs: Credential override failed: no memory\n");
+ /* NOTE(review): on failure nothing is overridden and callers carry on */
+ return cred;
+}
+/* Undo shiftfs_new_creds(): restore oldcred and drop the shifted creds. */
+static void shiftfs_old_creds(const struct cred *oldcred,
+ const struct cred **newcred)
+{
+ if (!*newcred) /* the override never happened */
+ return;
+
+ revert_creds(oldcred);
+ put_cred(*newcred);
+}
+/* Parse the comma separated option string into ssi; -EINVAL if unknown. */
+static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+
+ ssi->mark = false;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ /* skip empty fields such as the one in "a,,b" */
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case OPT_MARK:
+ ssi->mark = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+/* Release the reference on the underlying dentry held in d_fsdata. */
+static void shiftfs_d_release(struct dentry *dentry)
+{
+ struct dentry *real = dentry->d_fsdata;
+
+ dput(real);
+}
+/* Return the underlying dentry, asking it for its own real dentry if set. */
+static struct dentry *shiftfs_d_real(struct dentry *dentry,
+ const struct inode *inode,
+ unsigned int flags)
+{
+ struct dentry *real = dentry->d_fsdata;
+ /* the underlying filesystem provides d_real itself: delegate */
+ if (unlikely(real->d_flags & DCACHE_OP_REAL))
+ return real->d_op->d_real(real, real->d_inode, flags);
+
+ return real;
+}
+/* Invalid (0) if the underlying dentry is unhashed, else ask underlying. */
+static int shiftfs_d_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *real = dentry->d_fsdata;
+
+ if (d_unhashed(real))
+ return 0;
+ /* underlying fs has no weak revalidate op: treat as valid */
+ if (!(real->d_flags & DCACHE_OP_WEAK_REVALIDATE))
+ return 1;
+
+ return real->d_op->d_weak_revalidate(real, flags);
+}
+/* Check our dentry against the underlying one; 0 forces a fresh lookup. */
+static int shiftfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *real = dentry->d_fsdata;
+ struct inode *reali = d_inode(real), *inode = d_inode(dentry);
+ int ret;
+
+ if (d_unhashed(real))
+ return 0;
+
+ /*
+ * inode state of underlying changed from positive to negative
+ * or vice versa; force a lookup to update our view
+ */
+ if (d_is_negative(real) != d_is_negative(dentry))
+ return 0;
+
+ /*
+ * non dir link count is > 1 and our inode is currently not in
+ * the inode hash => need to drop and reget our dentry to make
+ * sure we're aliasing it correctly.
+ */
+ if (reali &&!S_ISDIR(reali->i_mode) && reali->i_nlink > 1 &&
+ (!inode || inode_unhashed(inode)))
+ return 0;
+ /* underlying fs has no revalidate op: treat as valid */
+ if (!(real->d_flags & DCACHE_OP_REVALIDATE))
+ return 1;
+
+ ret = real->d_op->d_revalidate(real, flags);
+ /* underlying dentry is stale: invalidate it too, unless in RCU walk */
+ if (ret == 0 && !(flags & LOOKUP_RCU))
+ d_invalidate(real);
+
+ return ret;
+}
+/* Dentry operations; installed as sb->s_d_op in shiftfs_fill_super(). */
+static const struct dentry_operations shiftfs_dentry_ops = {
+ .d_release = shiftfs_d_release,
+ .d_real = shiftfs_d_real,
+ .d_revalidate = shiftfs_d_revalidate,
+ .d_weak_revalidate = shiftfs_d_weak_revalidate,
+};
+/* Forward readlink to the underlying inode; -EINVAL if it has no such op. */
+static int shiftfs_readlink(struct dentry *dentry, char __user *data,
+ int flags)
+{
+ struct dentry *real = dentry->d_fsdata;
+ const struct inode_operations *iop = real->d_inode->i_op;
+
+ if (iop->readlink)
+ return iop->readlink(real, data, flags);
+
+ return -EINVAL;
+}
+/* Forward get_link to the underlying inode; a NULL dentry gets -ECHILD. */
+static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ if (dentry) {
+ struct dentry *real = dentry->d_fsdata;
+ struct inode *reali = real->d_inode;
+ const struct inode_operations *iop = reali->i_op;
+ const char *res = ERR_PTR(-EPERM); /* result if no get_link op */
+
+ if (iop->get_link)
+ res = iop->get_link(real, reali, done);
+
+ return res;
+ } else {
+ /* RCU lookup not supported */
+ return ERR_PTR(-ECHILD);
+ }
+}
+/* Set an xattr on the underlying dentry using shifted credentials. */
+static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct dentry *real = dentry->d_fsdata;
+ int err; /* always assigned by vfs_setxattr() below */
+ const struct cred *oldcred, *newcred;
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ err = vfs_setxattr(real, name, value, size, flags);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ return err;
+}
+/* xattr handler .get: vfs_getxattr() on the underlying dentry, shifted. */
+static int shiftfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ struct dentry *real = dentry->d_fsdata;
+ int err;
+ const struct cred *oldcred, *newcred;
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ err = vfs_getxattr(real, name, value, size);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ return err;
+}
+/* List the xattrs of the underlying dentry using shifted credentials. */
+static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
+ size_t size)
+{
+ struct dentry *real = dentry->d_fsdata;
+ ssize_t err; /* vfs_listxattr() returns ssize_t; do not narrow it */
+ const struct cred *oldcred, *newcred;
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ err = vfs_listxattr(real, list, size);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ return err;
+}
+/* Remove an xattr from the underlying dentry using shifted credentials. */
+static int shiftfs_removexattr(struct dentry *dentry, const char *name)
+{
+ struct dentry *real = dentry->d_fsdata;
+ int err;
+ const struct cred *oldcred, *newcred;
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ err = vfs_removexattr(real, name);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ return err;
+}
+/* xattr handler .set: a NULL value removes the attribute, else sets it. */
+static int shiftfs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ if (!value)
+ return shiftfs_removexattr(dentry, name);
+ return shiftfs_setxattr(dentry, inode, name, value, size, flags);
+}
+/* Bind a shiftfs inode to the underlying dentry (no-op for NULL dentry). */
+static void shiftfs_fill_inode(struct inode *inode, struct dentry *dentry)
+{
+ struct inode *reali;
+
+ if (!dentry)
+ return;
+
+ reali = dentry->d_inode;
+ /* underlying inode cannot be followed as a link: flag ours likewise */
+ if (!reali->i_op->get_link)
+ inode->i_opflags |= IOP_NOFOLLOW;
+ /* reuse the underlying mapping; i_private is stored without dget() */
+ inode->i_mapping = reali->i_mapping;
+ inode->i_private = dentry;
+}
+/* Create a file, directory, symlink or hard link in the underlying dir. */
+static int shiftfs_make_object(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const char *symlink,
+ struct dentry *hardlink, bool excl)
+{
+ struct dentry *real = dir->i_private, *new = dentry->d_fsdata,
+ *realhardlink = NULL;
+ struct inode *reali = real->d_inode, *newi;
+ const struct inode_operations *iop = reali->i_op;
+ int err;
+ const struct cred *oldcred, *newcred;
+ bool op_ok = false;
+
+ if (hardlink) {
+ realhardlink = hardlink->d_fsdata;
+ op_ok = iop->link;
+ } else {
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ op_ok = iop->mkdir;
+ break;
+ case S_IFREG:
+ op_ok = iop->create;
+ break;
+ case S_IFLNK:
+ op_ok = iop->symlink;
+ }
+ }
+ if (!op_ok)
+ return -EINVAL;
+
+ /* realhardlink is NULL unless linking, which yields a fresh inode */
+ newi = shiftfs_new_inode(dentry->d_sb, mode, realhardlink);
+ if (!newi)
+ return -ENOMEM;
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+
+ inode_lock_nested(reali, I_MUTEX_PARENT);
+
+ err = -EINVAL; /* shut gcc up about uninit var */
+ if (hardlink) {
+ err = vfs_link(realhardlink, reali, new, NULL);
+ } else {
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ err = vfs_mkdir(reali, new, mode);
+ break;
+ case S_IFREG:
+ err = vfs_create(reali, new, mode, excl);
+ break;
+ case S_IFLNK:
+ err = vfs_symlink(reali, new, symlink);
+ }
+ }
+
+ shiftfs_old_creds(oldcred, &newcred);
+
+ if (err)
+ goto out_iput;
+
+ if (!hardlink)
+ shiftfs_fill_inode(newi, new);
+ else if (inode_unhashed(newi) && !S_ISDIR(newi->i_mode))
+ /*
+ * although dentry and hardlink now each point to
+ * newi, the link count was 1 when they were created,
+ * so insert into the inode cache now that the link
+ * count has gone above one.
+ */
+ __insert_inode_hash(newi, (unsigned long)d_inode(new));
+
+ d_instantiate(dentry, newi);
+
+ /* dentry owns newi now, so the common exit must not iput() it */
+ newi = NULL;
+
+ out_iput:
+ /* 'new' stays referenced by dentry->d_fsdata; d_release drops it */
+ iput(newi);
+ inode_unlock(reali);
+
+ return err;
+}
+/* .create: make a regular file through shiftfs_make_object(). */
+static int shiftfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ mode |= S_IFREG;
+
+ return shiftfs_make_object(dir, dentry, mode, NULL, NULL, excl);
+}
+/* .mkdir: make a directory through shiftfs_make_object(). */
+static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ mode |= S_IFDIR;
+
+ return shiftfs_make_object(dir, dentry, mode, NULL, NULL, false);
+}
+/* .link: hard link 'hardlink' as 'dentry'; mode is passed as 0. */
+static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
+ struct dentry *dentry)
+{
+ return shiftfs_make_object(dir, dentry, 0, NULL, hardlink, false);
+}
+/* .symlink: make a symlink whose target is the string 'symlink'. */
+static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symlink)
+{
+ return shiftfs_make_object(dir, dentry, S_IFLNK, symlink, NULL, false);
+}
+/* Remove an entry from the underlying dir: vfs_rmdir() or vfs_unlink(). */
+static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
+{
+ struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
+ struct inode *reali = real->d_inode;
+ int err;
+ const struct cred *oldcred, *newcred;
+ /* the underlying parent directory stays locked across the removal */
+ inode_lock_nested(reali, I_MUTEX_PARENT);
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+
+ if (rmdir)
+ err = vfs_rmdir(reali, new);
+ else
+ err = vfs_unlink(reali, new, NULL);
+
+ shiftfs_old_creds(oldcred, &newcred);
+ inode_unlock(reali);
+
+ return err;
+}
+/* .unlink: shiftfs_rm() in unlink mode. */
+static int shiftfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return shiftfs_rm(dir, dentry, false);
+}
+/* .rmdir: shiftfs_rm() in rmdir mode. */
+static int shiftfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return shiftfs_rm(dir, dentry, true);
+}
+/* .rename: vfs_rename() on the underlying dentries with shifted creds. */
+static int shiftfs_rename(struct inode *olddir, struct dentry *old,
+ struct inode *newdir, struct dentry *new,
+ unsigned int flags)
+{
+ struct dentry *rodd = olddir->i_private, *rndd = newdir->i_private,
+ *realold = old->d_fsdata,
+ *realnew = new->d_fsdata, *trap;
+ struct inode *realolddir = rodd->d_inode, *realnewdir = rndd->d_inode;
+ int err = -EINVAL;
+ const struct cred *oldcred, *newcred;
+
+ trap = lock_rename(rndd, rodd);
+ /* either end being the dentry lock_rename() returned: fail, -EINVAL */
+ if (trap == realold || trap == realnew)
+ goto out_unlock;
+
+ oldcred = shiftfs_new_creds(&newcred, old->d_sb);
+
+ err = vfs_rename(realolddir, realold, realnewdir,
+ realnew, NULL, flags);
+
+ shiftfs_old_creds(oldcred, &newcred);
+
+ out_unlock:
+ unlock_rename(rndd, rodd);
+
+ return err;
+}
+/* .lookup: find the name in the underlying directory and wrap the result. */
+static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *real = dir->i_private, *new;
+ struct inode *reali = real->d_inode, *newi;
+ const struct cred *oldcred, *newcred;
+
+ inode_lock(reali);
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
+ shiftfs_old_creds(oldcred, &newcred);
+ inode_unlock(reali);
+
+ if (IS_ERR(new))
+ return new;
+
+ newi = NULL;
+ if (!new->d_inode)
+ goto out;
+
+ newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
+ if (!newi) {
+ dput(new);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ out:
+ /* publish 'new' only on success so d_release never re-drops it */
+ dentry->d_fsdata = new;
+ return d_splice_alias(newi, dentry);
+}
+/* .permission: check access on the underlying inode with shifted creds. */
+static int shiftfs_permission(struct inode *inode, int mask)
+{
+ struct dentry *real = inode->i_private;
+ struct inode *reali = real->d_inode;
+ const struct inode_operations *iop = reali->i_op;
+ int err;
+ const struct cred *oldcred, *newcred;
+ /* non-blocking callers are told to retry in blocking mode */
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
+ if (iop->permission)
+ err = iop->permission(reali, mask);
+ else
+ err = generic_permission(reali, mask);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ return err;
+}
+/* .setattr: apply attr to the underlying inode with uid/gid shifted. */
+static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct dentry *real = dentry->d_fsdata;
+ struct inode *reali = real->d_inode;
+ const struct inode_operations *iop = reali->i_op;
+ struct iattr newattr = *attr;
+ const struct cred *oldcred, *newcred;
+ struct super_block *sb = dentry->d_sb;
+ int err;
+ /* the underlying fs gets the ids as they appear inside s_user_ns */
+ newattr.ia_uid = KUIDT_INIT(from_kuid(sb->s_user_ns, attr->ia_uid));
+ newattr.ia_gid = KGIDT_INIT(from_kgid(sb->s_user_ns, attr->ia_gid));
+
+ oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+ inode_lock(reali);
+ if (iop->setattr)
+ err = iop->setattr(real, &newattr);
+ else
+ err = simple_setattr(real, &newattr);
+ inode_unlock(reali);
+ shiftfs_old_creds(oldcred, &newcred);
+
+ if (err)
+ return err;
+
+ /* all OK, reflect the change on our inode */
+ setattr_copy(d_inode(dentry), attr);
+ return 0;
+}
+
+static int shiftfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct dentry *real = inode->i_private;
+ struct inode *reali = real->d_inode;
+ const struct inode_operations *iop = reali->i_op;
+ int err = 0;
+
+ mnt = dentry->d_sb->s_fs_info;
+
+ if (iop->getattr)
+ err = iop->getattr(mnt, real, stat);
+ else
+ generic_fillattr(reali, stat);
+
+ if (err)
+ return err;
+ /* NOTE(review): s_fs_info holds a shiftfs_super_info, not a vfsmount */
+ /* raw underlying ids are reinterpreted as ids inside s_user_ns */
+ stat->uid = make_kuid(inode->i_sb->s_user_ns, __kuid_val(stat->uid));
+ stat->gid = make_kgid(inode->i_sb->s_user_ns, __kgid_val(stat->gid));
+ return 0;
+}
+/* Inode operations; assigned to every inode in shiftfs_new_inode(). */
+static const struct inode_operations shiftfs_inode_ops = {
+ .lookup = shiftfs_lookup,
+ .getattr = shiftfs_getattr,
+ .setattr = shiftfs_setattr,
+ .permission = shiftfs_permission,
+ .mkdir = shiftfs_mkdir,
+ .symlink = shiftfs_symlink,
+ .get_link = shiftfs_get_link,
+ .readlink = shiftfs_readlink,
+ .unlink = shiftfs_unlink,
+ .rmdir = shiftfs_rmdir,
+ .rename = shiftfs_rename,
+ .link = shiftfs_link,
+ .create = shiftfs_create,
+ .mknod = NULL, /* no special files currently */
+ .listxattr = shiftfs_listxattr,
+};
+/* iget5_locked() test callback: match on the same underlying inode. */
+static int shiftfs_test(struct inode *inode, void *data)
+{
+ struct dentry *d1 = inode->i_private, *d2 = data;
+ struct inode *i1 = d_inode(d1), *i2 = d_inode(d2);
+
+ return i1 && i1 == i2;
+}
+/* iget5_locked() set callback: bind the new inode to the given dentry. */
+static int shiftfs_set(struct inode *inode, void *data)
+{
+ struct dentry *dentry = data;
+
+ shiftfs_fill_inode(inode, dentry);
+
+ return 0;
+}
+/* Get a shiftfs inode for 'dentry' (may be NULL); returns NULL on failure. */
+static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
+ struct dentry *dentry)
+{
+ struct inode *inode;
+ struct inode *reali = dentry ? d_inode(dentry): NULL;
+ bool use_inode_hash = false;
+
+ /*
+ * Here we hash the inode only if the underlying link count is
+ * greater than one and it's not a directory (meaning the hash
+ * contains all items that might be aliases). We keep this
+ * accurate by checking the underlying link count on
+ * revalidation and forcing a new lookup if the underlying
+ * link count is raised.
+ *
+ * Note: if the link count drops again, we don't remove the
+ * inode from the hash, so the hash contains all inodes that
+ * may be aliases plus a few others.
+ */
+ if (reali)
+ use_inode_hash = ACCESS_ONCE(reali->i_nlink) > 1 &&
+ !S_ISDIR(reali->i_mode);
+
+ if (use_inode_hash) {
+ inode = iget5_locked(sb, (unsigned long)reali, shiftfs_test,
+ shiftfs_set, dentry);
+ if (inode && !(inode->i_state & I_NEW))
+ return inode; /* already set up by an earlier lookup */
+ } else {
+ inode = new_inode(sb);
+ }
+
+ if (!inode)
+ return NULL;
+
+ /*
+ * our inode is completely vestigial. All lookups, getattr
+ * and permission checks are done on the underlying inode, so
+ * what the user sees is entirely from the underlying inode.
+ */
+ mode &= S_IFMT;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+ inode->i_op = &shiftfs_inode_ops;
+ /* hashed inodes were already filled by shiftfs_set() */
+ if (use_inode_hash)
+ unlock_new_inode(inode);
+ else
+ shiftfs_fill_inode(inode, dentry);
+
+ return inode;
+}
+/* .show_options: emit "mark" for mounts created with -o mark. */
+static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct shiftfs_super_info *ssi = sb->s_fs_info;
+
+ if (ssi->mark)
+ seq_show_option(m, "mark", NULL);
+
+ return 0;
+}
+/* .statfs: report the underlying fs statistics under our own magic. */
+static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct shiftfs_super_info *ssi = sb->s_fs_info;
+ struct dentry *root = sb->s_root;
+ struct dentry *realroot = root->d_fsdata;
+ struct path realpath = { .mnt = ssi->mnt, .dentry = realroot };
+ int err;
+
+ err = vfs_statfs(&realpath, buf);
+ if (err)
+ return err;
+ /* only f_type is replaced; the rest comes from the underlying fs */
+ buf->f_type = sb->s_magic;
+
+ return 0;
+}
+/* .put_super: drop the mount and userns references, then free the ssi. */
+static void shiftfs_put_super(struct super_block *sb)
+{
+ struct shiftfs_super_info *ssi = sb->s_fs_info;
+
+ mntput(ssi->mnt);
+ put_user_ns(ssi->userns);
+ kfree(ssi);
+}
+/* The only xattr handler; its prefix is the empty string. */
+static const struct xattr_handler shiftfs_xattr_handler = {
+ .prefix = "",
+ .get = shiftfs_xattr_get,
+ .set = shiftfs_xattr_set,
+};
+/* NULL-terminated handler list installed as sb->s_xattr; file-local. */
+static const struct xattr_handler *shiftfs_xattr_handlers[] = {
+ &shiftfs_xattr_handler,
+ NULL
+};
+/* Superblock operations; installed as sb->s_op in shiftfs_fill_super(). */
+static const struct super_operations shiftfs_super_ops = {
+ .put_super = shiftfs_put_super,
+ .show_options = shiftfs_show_options,
+ .statfs = shiftfs_statfs,
+};
+/* Bundle handed from shiftfs_mount() to shiftfs_fill_super(). */
+struct shiftfs_data {
+ void *data; /* mount option string, fed to shiftfs_parse_options() */
+ const char *path; /* dev_name of the mount: the source path */
+};
+/* Set up a shiftfs superblock: a "mark" mount or a shifted mount of one. */
+static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
+ int silent)
+{
+ struct shiftfs_data *data = raw_data;
+ char *name = kstrdup(data->path, GFP_KERNEL);
+ int err = -ENOMEM;
+ struct shiftfs_super_info *ssi = NULL;
+ struct path path;
+ struct dentry *dentry;
+
+ if (!name)
+ goto out;
+
+ ssi = kzalloc(sizeof(*ssi), GFP_KERNEL);
+ if (!ssi)
+ goto out;
+
+ err = shiftfs_parse_options(ssi, data->data);
+ if (err)
+ goto out;
+ err = -EPERM; /* result of the two capability checks below */
+
+ /* to mark a mount point, must be real root */
+ if (ssi->mark && !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ /* else to mount a mark, must be userns admin */
+ if (!ssi->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ goto out;
+
+ err = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (err)
+ goto out;
+
+ err = -EPERM;
+
+ if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
+ err = -ENOTDIR;
+ goto out_put;
+ }
+
+ sb->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
+ err = -EINVAL;
+ goto out_put;
+ }
+
+ if (ssi->mark) {
+ /*
+ * this part is visible unshifted, so make sure no
+ * executables that could be used to give suid
+ * privileges
+ */
+ sb->s_iflags |= SB_I_NOEXEC; /* keep flags that are already set */
+ ssi->mnt = path.mnt;
+ dentry = path.dentry;
+ } else {
+ struct shiftfs_super_info *mp_ssi;
+
+ /*
+ * this leg executes if we're admin capable in
+ * the namespace, so be very careful
+ */
+ if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
+ goto out_put;
+ mp_ssi = path.dentry->d_sb->s_fs_info;
+ if (!mp_ssi->mark)
+ goto out_put;
+ ssi->mnt = mntget(mp_ssi->mnt);
+ dentry = dget(path.dentry->d_fsdata);
+ path_put(&path);
+ }
+ ssi->userns = get_user_ns(dentry->d_sb->s_user_ns);
+ sb->s_fs_info = ssi;
+ sb->s_magic = SHIFTFS_MAGIC;
+ sb->s_op = &shiftfs_super_ops;
+ sb->s_xattr = shiftfs_xattr_handlers;
+ sb->s_d_op = &shiftfs_dentry_ops;
+ sb->s_root = d_make_root(shiftfs_new_inode(sb, S_IFDIR, dentry));
+ sb->s_root->d_fsdata = dentry; /* NOTE(review): s_root NULL on OOM? */
+ kfree(name); /* the path copy was only needed for kern_path() */
+ return 0;
+
+ out_put:
+ path_put(&path);
+ out:
+ kfree(name);
+ kfree(ssi);
+ return err;
+}
+/* .mount: pass the option string and source path to shiftfs_fill_super(). */
+static struct dentry *shiftfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct shiftfs_data d = { data, dev_name };
+
+ return mount_nodev(fs_type, flags, &d, shiftfs_fill_super);
+}
+/* Filesystem type registered as "shiftfs" by shiftfs_init(). */
+static struct file_system_type shiftfs_type = {
+ .owner = THIS_MODULE,
+ .name = "shiftfs",
+ .mount = shiftfs_mount,
+ .kill_sb = kill_anon_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+/* Module entry point: register the filesystem type. */
+static int __init shiftfs_init(void)
+{
+ return register_filesystem(&shiftfs_type);
+}
+/* Module exit point: unregister the filesystem type. */
+static void __exit shiftfs_exit(void)
+{
+ unregister_filesystem(&shiftfs_type);
+}
+
+MODULE_ALIAS_FS("shiftfs");
+MODULE_AUTHOR("James Bottomley");
+MODULE_DESCRIPTION("uid/gid shifting bind filesystem");
+MODULE_LICENSE("GPL v2");
+module_init(shiftfs_init)
+module_exit(shiftfs_exit)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index e230af2e6855..a2fdb01a1a4e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -85,4 +85,6 @@
#define BALLOON_KVM_MAGIC 0x13661366
#define ZSMALLOC_MAGIC 0x58295829
+#define SHIFTFS_MAGIC 0x6a656a62
+
#endif /* __LINUX_MAGIC_H__ */
--
2.11.0

View File

@ -0,0 +1,44 @@
From b0eb5c2b15df95ddec67436766f613aa7dd031be Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@docker.com>
Date: Fri, 14 Apr 2017 15:37:31 -0600
Subject: [PATCH 2/2] shiftfs: update to compile with a528d35e8bfcc
Signed-off-by: Tycho Andersen <tycho@docker.com>
---
fs/shiftfs.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/fs/shiftfs.c b/fs/shiftfs.c
index ea8ac57b3ce1..fbe336ca0aa1 100644
--- a/fs/shiftfs.c
+++ b/fs/shiftfs.c
@@ -545,19 +545,21 @@ static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-static int shiftfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
+static int shiftfs_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(path->dentry);
struct dentry *real = inode->i_private;
struct inode *reali = real->d_inode;
const struct inode_operations *iop = reali->i_op;
+ struct path realpath; /* underlying (mount, dentry) pair to query */
int err = 0;
- mnt = dentry->d_sb->s_fs_info;
+ realpath.mnt = ((struct shiftfs_super_info *)inode->i_sb->s_fs_info)->mnt;
+ realpath.dentry = real; /* s_fs_info is the ssi, not a vfsmount */
if (iop->getattr)
- err = iop->getattr(mnt, real, stat);
+ err = iop->getattr(&realpath, stat, request_mask, flags);
else
generic_fillattr(reali, stat);
--
2.11.0

View File

@ -0,0 +1,55 @@
# LinuxKit image definition that boots the shiftfs-enabled kernel.
kernel:
  image: "linuxkitprojects/kernel-shiftfs:4.11.4-881a041fc14bd95814cf140b5e98d97dd65160b5"
  cmdline: "console=ttyS0 console=tty0 page_poison=1"
init:
  - "linuxkit/init:2599bcd5013ce5962aa155ee8929c26160de13bd"
  - "linuxkit/runc:3a4e6cbf15470f62501b019b55e1caac5ee7689f"
  - "linuxkit/containerd:b50181bc6e0084e5fcd6b6ad3cf433c4f66cae5a"
  - "linuxkit/ca-certificates:75cf419fb58770884c3464eb687ec8dfc704169d"
onboot:
  - name: sysctl
    image: "linuxkit/sysctl:3aa6bc663c2849ef239be7d941d3eaf3e6fcc018"
  - name: binfmt
    image: "linuxkit/binfmt:8ac5535f57f0c6f5fe88317b9d22a7677093c765"
  - name: dhcpcd
    image: "linuxkit/dhcpcd:7d2b8aaaf20c24ad7d11a5ea2ea5b4a80dc966f1"
    command: ["/sbin/dhcpcd", "--nobackground", "-f", "/dhcpcd.conf", "-1"]
services:
  - name: getty
    image: "linuxkit/getty:ef9d667af71089326419fb08e9cc9d567cf15748"
    env:
      - INSECURE=true
  - name: rngd
    image: "linuxkit/rngd:1fa4de44c961bb5075647181891a3e7e7ba51c31"
  - name: nginx
    image: "nginx:alpine"
    capabilities:
      - CAP_NET_BIND_SERVICE
      - CAP_CHOWN
      - CAP_SETUID
      - CAP_SETGID
      - CAP_DAC_OVERRIDE
files:
  # containerd configuration written into the image (TOML)
  - path: etc/containerd/config.toml
    contents: |
      state = "/run/containerd"
      root = "/var/lib/containerd"
      snapshotter = "overlay"
      subreaper = false
      [grpc]
      address = "/run/containerd/containerd.sock"
      uid = 0
      gid = 0
      [debug]
      address = "/run/containerd/debug.sock"
      level = "info"
      [metrics]
      address = ":13337"
trust:
  org:
    - linuxkit
  image:
    - "nginx:alpine"