switch to vndr

vndr is almost exactly the same as our good old hack/vendor.sh, except that it's cleaner and it allows re-vendoring just one dependency when needed (which we do a lot for containers/image). Signed-off-by: Antonio Murdaca <runcom@redhat.com>
2025-09-21 09:57:19 +00:00 · 2017-01-09 16:38:21 +01:00
parent bbc0c69624
commit fefeeb4c70
119 changed files with 4340 additions and 40236 deletions
--- a/vendor/github.com/opencontainers/runc/NOTICE
+++ b/vendor/github.com/opencontainers/runc/NOTICE
@@ -0,0 +1,17 @@
+runc
+
+Copyright 2012-2015 Docker, Inc.
+
+This product includes software developed at Docker, Inc. (http://www.docker.com).
+
+The following is courtesy of our legal counsel:
+
+
+Use and transfer of Docker may be subject to certain restrictions by the
+United States and other governments.  
+It is your responsibility to ensure that your use and/or transfer does not
+violate applicable laws. 
+
+For more information, please see http://www.bis.doc.gov
+
+See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
@@ -0,0 +1,32 @@
+/*
+ * Fallback definitions for the CLONE_NEW* namespace flags. Each flag is only
+ * defined here when <sched.h> did not already provide it, so this header is
+ * safe to include regardless of what the system headers expose.
+ */
+#ifndef NSENTER_NAMESPACE_H
+#define NSENTER_NAMESPACE_H
+
+/* _GNU_SOURCE is set (if not already) before <sched.h> is pulled in. */
+#ifndef _GNU_SOURCE
+#	define _GNU_SOURCE
+#endif
+#include <sched.h>
+
+/* All of these are taken from include/uapi/linux/sched.h */
+#ifndef CLONE_NEWNS
+#	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
+#endif
+#ifndef CLONE_NEWCGROUP
+#	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
+#endif
+#ifndef CLONE_NEWUTS
+#	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
+#endif
+#ifndef CLONE_NEWIPC
+#	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
+#endif
+#ifndef CLONE_NEWUSER
+#	define CLONE_NEWUSER 0x10000000 /* New user namespace */
+#endif
+#ifndef CLONE_NEWPID
+#	define CLONE_NEWPID 0x20000000 /* New pid namespace */
+#endif
+#ifndef CLONE_NEWNET
+#	define CLONE_NEWNET 0x40000000 /* New network namespace */
+#endif
+
+#endif /* NSENTER_NAMESPACE_H */
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
@@ -0,0 +1,12 @@
+// +build linux,!gccgo
+
+package nsenter
+
+/*
+#cgo CFLAGS: -Wall
+extern void nsexec();
+void __attribute__((constructor)) init(void) {
+	nsexec();
+}
+*/
+import "C"
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
@@ -0,0 +1,25 @@
+// +build linux,gccgo
+
+package nsenter
+
+/*
+#cgo CFLAGS: -Wall
+extern void nsexec();
+void __attribute__((constructor)) init(void) {
+	nsexec();
+}
+*/
+import "C"
+
+// AlwaysFalse is here to stay false
+// (and be exported so the compiler doesn't optimize out its reference)
+var AlwaysFalse bool
+
+// init keeps a reference to the C constructor alive. The call is guarded by
+// AlwaysFalse, so it only runs if that exported variable is set to true.
+func init() {
+	if !AlwaysFalse {
+		return
+	}
+	// Referencing the C init() from a branch guarded by a no-op test makes
+	// sure the compiler links in the C function.
+	// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
+	C.init()
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
@@ -0,0 +1,5 @@
+// +build !linux !cgo
+
+package nsenter
+
+import "C"
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
@@ -0,0 +1,754 @@
+#define _GNU_SOURCE
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <linux/limits.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+
+/* Get all of the CLONE_NEW* flags. */
+#include "namespace.h"
+
+/*
+ * Synchronisation values. nsexec() exchanges these as raw `enum sync_t`
+ * values over its sync socketpair: the *_PLS values are requests sent by a
+ * child stage, the *_ACK values are the parent's replies.
+ */
+enum sync_t {
+	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
+	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
+	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
+	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
+	SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */
+
+	/* XXX: This doesn't help with segfaults and other such issues. */
+	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
+};
+
+/* longjmp() arguments. */
+#define JUMP_PARENT 0x00
+#define JUMP_CHILD  0xA0
+#define JUMP_INIT   0xA1
+
+/* JSON buffer. */
+#define JSON_MAX 4096
+
+/*
+ * Argument block handed to clone() by clone_parent(): it carries both the
+ * child's stack and the jump target child_func() will longjmp() to.
+ *
+ * Assume the stack grows down, so arguments should be above it.
+ */
+struct clone_t {
+	/*
+	 * Reserve some space for clone() to locate arguments
+	 * and retcode in this place
+	 */
+	char stack[4096] __attribute__ ((aligned(16)));
+	/* Zero-length marker placed right after stack[]; passed to clone() as the child stack pointer. */
+	char stack_ptr[0];
+
+	/* There's two children. This is used to execute the different code. */
+	jmp_buf *env;
+	/* Value passed to longjmp() (one of the JUMP_* constants). */
+	int jmpval;
+};
+
+/*
+ * Bootstrap configuration decoded by nl_parse(). The pointer members are not
+ * separate allocations: they point into `data`, which is released by
+ * nl_free().
+ */
+struct nlconfig_t {
+	char *data;		/* malloc()ed netlink payload backing the pointers below */
+	uint32_t cloneflags;	/* CLONE_FLAGS_ATTR: flags later passed to unshare() */
+	char *uidmap;		/* UIDMAP_ATTR payload, written to /proc/<pid>/uid_map */
+	size_t uidmap_len;
+	char *gidmap;		/* GIDMAP_ATTR payload, written to /proc/<pid>/gid_map */
+	size_t gidmap_len;
+	char *namespaces;	/* NS_PATHS_ATTR payload, parsed by join_namespaces() */
+	size_t namespaces_len;
+	uint8_t is_setgroup;	/* SETGROUP_ATTR: non-zero enables setgroups "allow" */
+};
+
+/*
+ * List of netlink message types sent to us as part of bootstrapping the init.
+ * These constants are defined in libcontainer/message_linux.go.
+ */
+#define INIT_MSG		62000
+#define CLONE_FLAGS_ATTR	27281
+#define NS_PATHS_ATTR		27282
+#define UIDMAP_ATTR		27283
+#define GIDMAP_ATTR		27284
+#define SETGROUP_ATTR		27285
+
+/*
+ * Use the raw syscall for versions of glibc which don't include a function for
+ * it, namely (glibc 2.12).
+ *
+ * The shim below is only compiled for glibc 2.x older than 2.14, and the
+ * build fails outright if no syscall number for setns is available.
+ */
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
+#	define _GNU_SOURCE
+#	include "syscall.h"
+#	if !defined(SYS_setns) && defined(__NR_setns)
+#		define SYS_setns __NR_setns
+#	endif
+
+#ifndef SYS_setns
+#	error "setns(2) syscall not supported by glibc version"
+#endif
+
+/* Thin wrapper that forwards fd and nstype to the setns syscall. */
+int setns(int fd, int nstype)
+{
+	return syscall(SYS_setns, fd, nstype);
+}
+#endif
+
+/* XXX: This is ugly. */
+static int syncfd = -1;
+
+/*
+ * bail(fmt, ...): report a fatal error and terminate the process.
+ *
+ * The message is printed to stderr prefixed with "nsenter: " and followed by
+ * the errno description (%m). If a `syncfd` in scope is >= 0, SYNC_ERR and
+ * then the exit code are written to it so the peer can mirror the failure.
+ * The exit code is __COUNTER__ + 1, so it is non-zero and differs between
+ * expansion sites. Note that `syncfd` is resolved lexically: a local variable
+ * of that name at the call site takes precedence over the file-level one.
+ */
+/* TODO(cyphar): Fix this so it correctly deals with syncT. */
+#define bail(fmt, ...)								\
+	do {									\
+		int ret = __COUNTER__ + 1;					\
+		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
+		if (syncfd >= 0) {						\
+			enum sync_t s = SYNC_ERR;				\
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
+				fprintf(stderr, "nsenter: failed: write(s)");	\
+			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
+				fprintf(stderr, "nsenter: failed: write(ret)");	\
+		}								\
+		exit(ret);							\
+	} while(0)
+
+/*
+ * Writes data_len bytes from data to the file named by the printf-style
+ * pathfmt and its arguments. The file is opened O_RDWR and must already exist.
+ *
+ * Returns 0 on success and -1 on failure. On failure errno is left as set by
+ * the call that failed (open/write), so callers may inspect it -- e.g.
+ * update_setgroups() checks for ENOENT.
+ */
+static int write_file(char *data, size_t data_len, char *pathfmt, ...)
+{
+	int fd, len, ret = 0, saved_errno;
+	ssize_t written;
+	char path[PATH_MAX];
+	va_list ap;
+
+	va_start(ap, pathfmt);
+	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
+	va_end(ap);
+	if (len < 0)
+		return -1;
+	/* vsnprintf() truncates silently; never act on a cut-off path. */
+	if (len >= PATH_MAX) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+
+	/*
+	 * Return straight away when open(2) fails: there is nothing to close,
+	 * and close(-1) would overwrite errno with EBADF.
+	 */
+	fd = open(path, O_RDWR);
+	if (fd < 0)
+		return -1;
+
+	written = write(fd, data, data_len);
+	if (written < 0 || (size_t)written != data_len)
+		ret = -1;
+
+	/* Keep the errno of a failed write(2) intact across close(2). */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	return ret;
+}
+
+/* setgroups policy selector accepted by update_setgroups(). */
+enum policy_t {
+	SETGROUPS_DEFAULT = 0,	/* leave /proc/<pid>/setgroups untouched */
+	SETGROUPS_ALLOW,	/* write "allow" */
+	SETGROUPS_DENY,		/* write "deny" */
+};
+
+/*
+ * Writes the requested policy ("allow" or "deny") to /proc/<pid>/setgroups.
+ * SETGROUPS_DEFAULT, and any value that is not a known policy, leaves the
+ * file untouched. Bails on any write failure other than ENOENT.
+ *
+ * This *must* be called before we touch gid_map.
+ */
+static void update_setgroups(int pid, enum policy_t setgroup)
+{
+	char *policy = NULL;
+
+	switch (setgroup) {
+	case SETGROUPS_ALLOW:
+		policy = "allow";
+		break;
+	case SETGROUPS_DENY:
+		policy = "deny";
+		break;
+	case SETGROUPS_DEFAULT:
+	default:
+		/* Nothing to do; also keeps `policy` from being used unset. */
+		return;
+	}
+
+	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
+		/*
+		 * If the kernel is too old to support /proc/pid/setgroups,
+		 * open(2) or write(2) will return ENOENT. This is fine.
+		 */
+		if (errno != ENOENT)
+			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
+	}
+}
+
+/*
+ * Writes the given mapping to /proc/<pid>/uid_map and bails if the write
+ * fails. A NULL or empty mapping is silently skipped.
+ */
+static void update_uidmap(int pid, char *map, int map_len)
+{
+	if (map != NULL && map_len > 0) {
+		int rc = write_file(map, map_len, "/proc/%d/uid_map", pid);
+
+		if (rc < 0)
+			bail("failed to update /proc/%d/uid_map", pid);
+	}
+}
+
+/*
+ * Writes the given mapping to /proc/<pid>/gid_map and bails if the write
+ * fails. A NULL or empty mapping is silently skipped.
+ */
+static void update_gidmap(int pid, char *map, int map_len)
+{
+	if (map != NULL && map_len > 0) {
+		int rc = write_file(map, map_len, "/proc/%d/gid_map", pid);
+
+		if (rc < 0)
+			bail("failed to update /proc/%d/gid_map", pid);
+	}
+}
+
+/*
+ * A dummy function that just jumps to the given jumpval.
+ *
+ * `arg` is the struct clone_t passed to clone() by clone_parent(). The
+ * function never returns normally: it longjmp()s to ca->env with ca->jmpval.
+ */
+static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg)
+{
+	struct clone_t *ca = (struct clone_t *)arg;
+	longjmp(*ca->env, ca->jmpval);
+}
+
+/*
+ * Creates a new process with clone(CLONE_PARENT | SIGCHLD) that starts in
+ * child_func() and from there longjmp()s to `env` with `jmpval`.
+ *
+ * The child runs on the stack embedded in the local struct clone_t;
+ * ca.stack_ptr (the end of that buffer) is passed as its stack pointer.
+ * Returns whatever clone() returns; callers treat a negative value as failure.
+ */
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval)
+{
+	struct clone_t ca = {
+		.env    = env,
+		.jmpval = jmpval,
+	};
+
+	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+}
+
+/*
+ * Reads the init pipe file descriptor number from the
+ * _LIBCONTAINER_INITPIPE environment variable. That pipe carries the
+ * bootstrap data and is later used to report the new pid back.
+ *
+ * Returns -1 when the variable is unset or empty, and bails when its value
+ * has trailing non-numeric characters.
+ */
+static int initpipe(void)
+{
+	char *tail;
+	char *value = getenv("_LIBCONTAINER_INITPIPE");
+	int fd;
+
+	if (value == NULL || value[0] == '\0')
+		return -1;
+
+	fd = strtol(value, &tail, 10);
+	if (tail[0] != '\0')
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
+
+	return fd;
+}
+
+/*
+ * Maps a namespace name ("cgroup", "ipc", "mnt", "net", "pid", "user",
+ * "uts") to its CLONE_NEW* flag. Unrecognised names map to 0.
+ */
+static int nsflag(char *name)
+{
+	static const struct {
+		const char *name;
+		int flag;
+	} known[] = {
+		{ "cgroup", CLONE_NEWCGROUP },
+		{ "ipc",    CLONE_NEWIPC },
+		{ "mnt",    CLONE_NEWNS },
+		{ "net",    CLONE_NEWNET },
+		{ "pid",    CLONE_NEWPID },
+		{ "user",   CLONE_NEWUSER },
+		{ "uts",    CLONE_NEWUTS },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (strcmp(name, known[i].name) == 0)
+			return known[i].flag;
+	}
+
+	/* If we don't recognise a name, fallback to 0. */
+	return 0;
+}
+
+/*
+ * Reads a host-endian uint32_t from the first four bytes of buf.
+ * memcpy() is used instead of a pointer cast so the load is valid for any
+ * alignment of buf and does not violate strict aliasing.
+ */
+static uint32_t readint32(char *buf)
+{
+	uint32_t value;
+
+	memcpy(&value, buf, sizeof(value));
+	return value;
+}
+
+/* Returns the first byte of buf as an unsigned 8-bit value. */
+static uint8_t readint8(char *buf)
+{
+	return (uint8_t) buf[0];
+}
+
+/*
+ * Reads one INIT_MSG netlink message from fd and decodes its attributes into
+ * config. The payload is malloc()ed and stored in config->data; the
+ * namespaces/uidmap/gidmap pointers point into that buffer, so it must stay
+ * alive until nl_free() is called.
+ *
+ * Any malformed input (short read, wrong message type, unknown attribute,
+ * attribute lengths that do not fit the payload) is fatal via bail().
+ */
+static void nl_parse(int fd, struct nlconfig_t *config)
+{
+	ssize_t len;
+	size_t size;
+	struct nlmsghdr hdr;
+	char *data, *current, *end;
+
+	/* Retrieve the netlink header. */
+	len = read(fd, &hdr, NLMSG_HDRLEN);
+	if (len != NLMSG_HDRLEN)
+		bail("invalid netlink header length %zd", len);
+
+	if (hdr.nlmsg_type == NLMSG_ERROR)
+		bail("failed to read netlink message");
+
+	if (hdr.nlmsg_type != INIT_MSG)
+		bail("unexpected msg type %d", hdr.nlmsg_type);
+
+	/* A total length below the header size would make the payload size wrap. */
+	if (hdr.nlmsg_len < (uint32_t)NLMSG_HDRLEN)
+		bail("invalid netlink message length %u", hdr.nlmsg_len);
+
+	/* Retrieve data. */
+	size = NLMSG_PAYLOAD(&hdr, 0);
+	current = data = malloc(size);
+	if (!data)
+		bail("failed to allocate %zu bytes of memory for nl_payload", size);
+
+	len = read(fd, data, size);
+	if (len < 0 || (size_t)len != size)
+		bail("failed to read netlink payload, %zd != %zu", len, size);
+
+	/* Parse the netlink payload. */
+	config->data = data;
+	end = data + size;
+	while (current < end) {
+		struct nlattr *nlattr = (struct nlattr *)current;
+		size_t payload_len;
+
+		/* The attribute header itself must fit in what is left. */
+		if ((size_t)(end - current) < (size_t)NLA_HDRLEN ||
+		    nlattr->nla_len < NLA_HDRLEN)
+			bail("malformed netlink attribute header");
+		payload_len = nlattr->nla_len - NLA_HDRLEN;
+
+		/* Advance to payload. */
+		current += NLA_HDRLEN;
+		if (payload_len > (size_t)(end - current))
+			bail("netlink attribute %d overruns the message", nlattr->nla_type);
+
+		/* Handle payload. */
+		switch (nlattr->nla_type) {
+		case CLONE_FLAGS_ATTR:
+			if (payload_len < sizeof(uint32_t))
+				bail("clone flags attribute too short");
+			config->cloneflags = readint32(current);
+			break;
+		case NS_PATHS_ATTR:
+			config->namespaces = current;
+			config->namespaces_len = payload_len;
+			break;
+		case UIDMAP_ATTR:
+			config->uidmap = current;
+			config->uidmap_len = payload_len;
+			break;
+		case GIDMAP_ATTR:
+			config->gidmap = current;
+			config->gidmap_len = payload_len;
+			break;
+		case SETGROUP_ATTR:
+			if (payload_len < sizeof(uint8_t))
+				bail("setgroup attribute too short");
+			config->is_setgroup = readint8(current);
+			break;
+		default:
+			bail("unknown netlink message type %d", nlattr->nla_type);
+		}
+
+		/* Attributes are padded to NLA_ALIGNTO; the loop bound stops a padded overshoot. */
+		current += NLA_ALIGN(payload_len);
+	}
+}
+
+/* Releases the payload buffer that nl_parse() stored in config->data. */
+void nl_free(struct nlconfig_t *config)
+{
+	char *payload = config->data;
+
+	free(payload);
+}
+
+/*
+ * Joins every namespace listed in nslist, a comma-separated list of
+ * "<name>:<path>" entries (for example "net:/proc/1/ns/net"). nslist is
+ * modified in place while it is tokenised.
+ *
+ * All paths are opened before the first setns(2) call, then the namespaces
+ * are joined in the order given. Any failure is fatal via bail().
+ */
+void join_namespaces(char *nslist)
+{
+	int num = 0, i;
+	char *saveptr = NULL;
+	char *namespace = strtok_r(nslist, ",", &saveptr);
+	struct namespace_t {
+		int fd;
+		int ns;
+		char path[PATH_MAX];
+	} *namespaces = NULL;
+
+	if (!namespace || !strlen(namespace) || !strlen(nslist))
+		bail("ns paths are empty");
+
+	/*
+	 * We have to open the file descriptors first, since after
+	 * we join the mnt namespace we might no longer be able to
+	 * access the paths.
+	 */
+	do {
+		int fd;
+		char *path;
+		struct namespace_t *ns;
+
+		/* Resize the namespace array. */
+		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		if (!namespaces)
+			bail("failed to reallocate namespace array");
+		ns = &namespaces[num - 1];
+
+		/* Split 'ns:path'. */
+		path = strchr(namespace, ':');
+		if (!path)
+			bail("failed to parse %s", namespace);
+		*path++ = '\0';
+
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			bail("failed to open %s", path);
+
+		ns->fd = fd;
+		ns->ns = nsflag(namespace);
+		/* strncpy() does not terminate on truncation; do it explicitly. */
+		strncpy(ns->path, path, PATH_MAX - 1);
+		ns->path[PATH_MAX - 1] = '\0';
+	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+
+	/*
+	 * The ordering in which we join namespaces is important. We should
+	 * always join the user namespace *first*. This is all guaranteed
+	 * from the container_linux.go side of this, so we're just going to
+	 * follow the order given to us.
+	 */
+
+	for (i = 0; i < num; i++) {
+		/* Use a pointer: the element embeds a PATH_MAX-sized buffer. */
+		struct namespace_t *ns = &namespaces[i];
+
+		if (setns(ns->fd, ns->ns) < 0)
+			bail("failed to setns to %s", ns->path);
+
+		close(ns->fd);
+	}
+
+	free(namespaces);
+}
+
+/*
+ * Entry point invoked from the cgo constructor before the Go runtime starts.
+ *
+ * When no init pipe is configured this returns immediately. Otherwise it
+ * reads the bootstrap configuration from the pipe and runs a three-stage
+ * process setup (see the long comment below); only the final stage
+ * (JUMP_INIT) returns to the caller, the other two exit().
+ */
+void nsexec(void)
+{
+	int pipenum;
+	jmp_buf env;
+	int syncpipe[2];
+	struct nlconfig_t config = {0};
+
+	/*
+	 * If we don't have an init pipe, just return to the go routine.
+	 * We'll only get an init pipe for start or exec.
+	 */
+	pipenum = initpipe();
+	if (pipenum == -1)
+		return;
+
+	/* Parse all of the netlink configuration. */
+	nl_parse(pipenum, &config);
+
+	/* Pipe so we can tell the child when we've finished setting up. */
+	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
+		bail("failed to setup sync pipe between parent and child");
+
+	/* TODO: Currently we aren't dealing with child deaths properly. */
+
+	/*
+	 * Okay, so this is quite annoying.
+	 *
+	 * In order for this unsharing code to be more extensible we need to split
+	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
+	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
+	 * separately, but because of SELinux issues we cannot really do that. But
+	 * we cannot just dump the namespace flags into clone(...) because several
+	 * usecases (such as rootless containers) require more granularity around
+	 * the namespace setup. In addition, some older kernels had issues where
+	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
+	 * handle this while also dealing with SELinux so we choose SELinux support
+	 * over broken kernel support).
+	 *
+	 * However, if we unshare(2) the user namespace *before* we clone(2), then
+	 * all hell breaks loose.
+	 *
+	 * The parent no longer has permissions to do many things (unshare(2) drops
+	 * all capabilities in your old namespace), and the container cannot be set
+	 * up to have more than one {uid,gid} mapping. This is obviously less than
+	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
+	 *
+	 * Unfortunately, it's not as simple as that. We have to fork to enter the
+	 * PID namespace (the PID namespace only applies to children). Since we'll
+	 * have to double-fork, this clone_parent() call won't be able to get the
+	 * PID of the _actual_ init process (without doing more synchronisation than
+	 * I can deal with at the moment). So we'll just get the parent to send it
+	 * for us, the only job of this process is to update
+	 * /proc/pid/{setgroups,uid_map,gid_map}.
+	 *
+	 * And as a result of the above, we also need to setns(2) in the first child
+	 * because if we join a PID namespace in the topmost parent then our child
+	 * will be in that namespace (and it will not be able to give us a PID value
+	 * that makes sense without resorting to sending things with cmsg).
+	 *
+	 * This also deals with an older issue caused by dumping cloneflags into
+	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
+	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
+	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
+	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
+	 * aware, the last mainline kernel which had this bug was Linux 3.12.
+	 * However, we cannot comment on which kernels the broken patch was
+	 * backported to.
+	 *
+	 * -- Aleksa "what has my life come to?" Sarai
+	 */
+
+	switch (setjmp(env)) {
+	/*
+	 * Stage 0: We're in the parent. Our job is just to create a new child
+	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+	 *          gid_map. That process will go on to create a new process, then
+	 *          it will send us its PID which we will send to the bootstrap
+	 *          process.
+	 */
+	case JUMP_PARENT: {
+			int len, ready = 0;
+			pid_t child;
+			char buf[JSON_MAX];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
+
+			/* Start the process of getting a container. */
+			child = clone_parent(&env, JUMP_CHILD);
+			if (child < 0)
+				bail("unable to fork: child_func");
+
+			/*
+			 * State machine for synchronisation with the children.
+			 *
+			 * The parent only returns once both the child and the
+			 * grandchild are ready, so that it can receive every error
+			 * code the children may generate.
+			 */
+			while (ready < 2) {
+				enum sync_t s;
+
+				/* This doesn't need to be global, we're in the parent. */
+				int syncfd = syncpipe[1];
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with child: next state");
+
+				switch (s) {
+				case SYNC_ERR: {
+						/* We have to mirror the error code of the child. */
+						int ret;
+
+						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+							bail("failed to sync with child: read(error code)");
+
+						exit(ret);
+					}
+					break;
+				case SYNC_USERMAP_PLS:
+					/* Enable setgroups(2) if we've been asked to. */
+					if (config.is_setgroup)
+						update_setgroups(child, SETGROUPS_ALLOW);
+
+					/* Set up mappings. */
+					update_uidmap(child, config.uidmap, config.uidmap_len);
+					update_gidmap(child, config.gidmap, config.gidmap_len);
+
+					s = SYNC_USERMAP_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						kill(child, SIGKILL);
+						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+					}
+					break;
+				case SYNC_USERMAP_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
+					break;
+				case SYNC_RECVPID_PLS: {
+						pid_t old = child;
+
+						/* Get the init_func pid. */
+						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
+							kill(old, SIGKILL);
+							bail("failed to sync with child: read(childpid)");
+						}
+
+						/* Send ACK. */
+						s = SYNC_RECVPID_ACK;
+						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+							kill(old, SIGKILL);
+							kill(child, SIGKILL);
+							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
+						}
+					}
+
+					ready++;
+					break;
+				case SYNC_RECVPID_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
+					break;
+				case SYNC_CHILD_READY:
+					ready++;
+					break;
+				default:
+					bail("unexpected sync value");
+					break;
+				}
+			}
+
+			/* Send the init_func pid back to our parent. */
+			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
+			if (len < 0) {
+				kill(child, SIGKILL);
+				bail("unable to generate JSON for child pid");
+			}
+			if (write(pipenum, buf, len) != len) {
+				kill(child, SIGKILL);
+				bail("unable to send child pid to bootstrapper");
+			}
+
+			exit(0);
+		}
+
+	/*
+	 * Stage 1: We're in the first child process. Our job is to join any
+	 *          provided namespaces in the netlink payload and unshare all
+	 *          of the requested namespaces. If we've been asked to
+	 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
+	 *          our user mappings for us. Then, we create a new child
+	 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
+	 *          child's PID to our parent (stage 0).
+	 */
+	case JUMP_CHILD: {
+			pid_t child;
+			enum sync_t s;
+
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
+
+			/*
+			 * We need to setns first. We cannot do this earlier (in stage 0)
+			 * because of the fact that we forked to get here (the PID of
+			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * using cmsg(3) but that's just annoying.
+			 */
+			if (config.namespaces)
+				join_namespaces(config.namespaces);
+
+			/*
+			 * Unshare all of the namespaces. Now, it should be noted that this
+			 * ordering might break in the future (especially with rootless
+			 * containers). But for now, it's not possible to split this into
+			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
+			 *
+			 * Note that we don't merge this with clone() because there were
+			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
+			 * was broken, so we'll just do it the long way anyway.
+			 */
+			if (unshare(config.cloneflags) < 0)
+				bail("failed to unshare namespaces");
+
+			/*
+			 * Deal with user namespaces first. They are quite special, as they
+			 * affect our ability to unshare other namespaces and are used as
+			 * context for privilege checks.
+			 */
+			if (config.cloneflags & CLONE_NEWUSER) {
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal our parent to hook us up.
+				 */
+
+				s = SYNC_USERMAP_PLS;
+				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
+
+				/* ... wait for mapping ... */
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
+				if (s != SYNC_USERMAP_ACK)
+					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
+			}
+
+			/*
+			 * TODO: What about non-namespace clone flags that we're dropping here?
+			 *
+			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
+			 * change the PID namespace of the calling process, because doing so
+			 * would change the caller's idea of its own PID (as reported by getpid()),
+			 * which would break many applications and libraries, so we must fork
+			 * to actually enter the new PID namespace.
+			 */
+			child = clone_parent(&env, JUMP_INIT);
+			if (child < 0)
+				bail("unable to fork: init_func");
+
+			/* Send the child to our parent, which knows what it's doing. */
+			s = SYNC_RECVPID_PLS;
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
+			}
+			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(childpid)");
+			}
+
+			/* ... wait for parent to get the pid ... */
+
+			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
+			}
+			if (s != SYNC_RECVPID_ACK) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
+			}
+
+			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			exit(0);
+		}
+
+	/*
+	 * Stage 2: We're the final child process, and the only process that will
+	 *          actually return to the Go runtime. Our job is to just do the
+	 *          final cleanup steps and then return to the Go runtime to allow
+	 *          init_linux.go to run.
+	 */
+	case JUMP_INIT: {
+			/*
+			 * We're inside the final child now, having jumped here via
+			 * clone_parent(&env, JUMP_INIT) in stage 1.
+			 */
+			enum sync_t s;
+
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
+
+			if (setsid() < 0)
+				bail("setsid failed");
+
+			if (setuid(0) < 0)
+				bail("setuid failed");
+
+			if (setgid(0) < 0)
+				bail("setgid failed");
+
+			if (setgroups(0, NULL) < 0)
+				bail("setgroups failed");
+
+			s = SYNC_CHILD_READY;
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+
+			/* Close sync pipes. */
+			close(syncpipe[0]);
+			close(syncpipe[1]);
+
+			/* Free netlink data. */
+			nl_free(&config);
+
+			/* Finish executing, let the Go runtime take over. */
+			return;
+		}
+	default:
+		bail("unexpected jump value");
+		break;
+	}
+
+	/* Should never be reached. */
+	bail("should never be reached");
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go
@@ -0,0 +1,551 @@
+// +build linux
+
+package selinux
+
+import (
+	"bufio"
+	"crypto/rand"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+// Enforcing, Permissive and Disabled are the mode values returned by
+// SelinuxGetEnforceMode. The unexported constants are the config file
+// location and tags read by readConfig, the path prefix used for the
+// "enforce" file, the xattr name used by Setfilecon/Getfilecon, and the
+// flag bit tested against the Statfs result for the selinuxfs mount.
+const (
+	Enforcing        = 1
+	Permissive       = 0
+	Disabled         = -1
+	selinuxDir       = "/etc/selinux/"
+	selinuxConfig    = selinuxDir + "config"
+	selinuxTypeTag   = "SELINUXTYPE"
+	selinuxTag       = "SELINUX"
+	selinuxPath      = "/sys/fs/selinux"
+	xattrNameSelinux = "security.selinux"
+	stRdOnly         = 0x01
+)
+
+// selinuxState caches lazily computed SELinux facts for the package and
+// tracks reserved MCS labels. The embedded mutex guards all fields.
+type selinuxState struct {
+	enabledSet   bool            // true once enabled holds a computed or forced value
+	enabled      bool            // cached result returned by getEnabled
+	selinuxfsSet bool            // true once selinuxfs holds a computed value
+	selinuxfs    string          // cached selinuxfs mount point ("" if none was found)
+	mcsList      map[string]bool // MCS labels currently reserved via mcsAdd
+	sync.Mutex
+}
+
+var (
+	// assignRegex splits a "key=value" line into key (group 1) and value
+	// (group 2); it is used by readConfig.
+	assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
+	// state is the package-wide cache; mcsList is allocated up front so it
+	// can be written to without a nil-map check.
+	state       = selinuxState{
+		mcsList: make(map[string]bool),
+	}
+)
+
+// SELinuxContext holds the components of an SELinux label keyed by "user",
+// "role", "type" and "level" (see NewContext and Get).
+type SELinuxContext map[string]string
+
+// setEnable records enabled as the cached answer, marks it as set, and
+// returns the stored value.
+func (s *selinuxState) setEnable(enabled bool) bool {
+	s.Lock()
+	s.enabledSet, s.enabled = true, enabled
+	stored := s.enabled
+	s.Unlock()
+	return stored
+}
+
+// getEnabled returns the cached enabled flag when one has been set.
+// Otherwise it reports enabled only if an selinuxfs mount point exists and
+// the current label is not "kernel", and caches that result via setEnable.
+func (s *selinuxState) getEnabled() bool {
+	s.Lock()
+	cached, known := s.enabled, s.enabledSet
+	s.Unlock()
+	if known {
+		return cached
+	}
+
+	detected := false
+	if getSelinuxMountPoint() != "" {
+		// The Getcon error is ignored here, as before: a failed read
+		// yields a label that is not "kernel".
+		con, _ := Getcon()
+		detected = con != "kernel"
+	}
+	return s.setEnable(detected)
+}
+
+// SetDisabled forces the package's cached SELinux state to "disabled", so
+// later SelinuxEnabled calls return false without probing the system.
+func SetDisabled() {
+	_ = state.setEnable(false)
+}
+
+// setSELinuxfs records selinuxfs as the cached mount point, marks it as
+// set, and returns the stored value.
+func (s *selinuxState) setSELinuxfs(selinuxfs string) string {
+	s.Lock()
+	s.selinuxfsSet, s.selinuxfs = true, selinuxfs
+	stored := s.selinuxfs
+	s.Unlock()
+	return stored
+}
+
+// getSELinuxfs returns the mount point of the selinuxfs filesystem, or ""
+// when none is usable. The answer is computed once from
+// /proc/self/mountinfo and then cached; a failure to open mountinfo returns
+// "" without caching, so a later call retries.
+//
+// A mount point that is read-only, or that cannot be inspected with
+// statfs, is treated as unusable.
+func (s *selinuxState) getSELinuxfs() string {
+	s.Lock()
+	selinuxfs := s.selinuxfs
+	selinuxfsSet := s.selinuxfsSet
+	s.Unlock()
+	if selinuxfsSet {
+		return selinuxfs
+	}
+
+	selinuxfs = ""
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return selinuxfs
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		txt := scanner.Text()
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		sepIdx := strings.Index(txt, " - ")
+		if sepIdx == -1 {
+			continue
+		}
+		// Only the part after the " - " separator is searched for the
+		// filesystem name.
+		if !strings.Contains(txt[sepIdx:], "selinuxfs") {
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		if len(fields) < 5 {
+			continue
+		}
+		// The fifth space-separated field is taken as the mount point.
+		selinuxfs = fields[4]
+		break
+	}
+
+	if selinuxfs != "" {
+		var buf syscall.Statfs_t
+		// Previously the Statfs error was dropped, so a failed call left
+		// buf zeroed and the mount was assumed to be writable.
+		if err := syscall.Statfs(selinuxfs, &buf); err != nil || buf.Flags&stRdOnly != 0 {
+			selinuxfs = ""
+		}
+	}
+	return s.setSELinuxfs(selinuxfs)
+}
+
+// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
+// filesystem, or an empty string if no mountpoint is found. Selinuxfs is a
+// proc-like pseudo-filesystem that exposes the selinux policy API to
+// processes. The existence of an selinuxfs mount is used to determine
+// whether selinux is currently enabled or not.
+func getSelinuxMountPoint() string {
+	mountPoint := state.getSELinuxfs()
+	return mountPoint
+}
+
+// SelinuxEnabled reports whether selinux is currently enabled, using the
+// package-level cached state.
+func SelinuxEnabled() bool {
+	enabled := state.getEnabled()
+	return enabled
+}
+
+func readConfig(target string) (value string) {
+	var (
+		val, key string
+		bufin    *bufio.Reader
+	)
+
+	in, err := os.Open(selinuxConfig)
+	if err != nil {
+		return ""
+	}
+	defer in.Close()
+
+	bufin = bufio.NewReader(in)
+
+	for done := false; !done; {
+		var line string
+		if line, err = bufin.ReadString('\n'); err != nil {
+			if err != io.EOF {
+				return ""
+			}
+			done = true
+		}
+		line = strings.TrimSpace(line)
+		if len(line) == 0 {
+			// Skip blank lines
+			continue
+		}
+		if line[0] == ';' || line[0] == '#' {
+			// Skip comments
+			continue
+		}
+		if groups := assignRegex.FindStringSubmatch(line); groups != nil {
+			key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2])
+			if key == target {
+				return strings.Trim(val, "\"")
+			}
+		}
+	}
+	return ""
+}
+
+// getSELinuxPolicyRoot returns selinuxDir followed by the SELINUXTYPE value
+// read from the SELinux config file.
+func getSELinuxPolicyRoot() string {
+	policyType := readConfig(selinuxTypeTag)
+	return selinuxDir + policyType
+}
+
+// readCon opens the file at name and scans a single whitespace-delimited
+// token from it. It returns the open error or the scan error, if any.
+func readCon(name string) (string, error) {
+	in, err := os.Open(name)
+	if err != nil {
+		return "", err
+	}
+	defer in.Close()
+
+	var label string
+	_, scanErr := fmt.Fscanf(in, "%s", &label)
+	return label, scanErr
+}
+
+// Setfilecon sets the SELinux label for this path by writing scon to the
+// security.selinux extended attribute, or returns an error.
+func Setfilecon(path string, scon string) error {
+	label := []byte(scon)
+	return system.Lsetxattr(path, xattrNameSelinux, label, 0)
+}
+
+// Getfilecon returns the SELinux label for this path, read from the
+// security.selinux extended attribute, or returns an error.
+func Getfilecon(path string) (string, error) {
+	raw, err := system.Lgetxattr(path, xattrNameSelinux)
+	if err != nil {
+		return "", err
+	}
+	// Drop a single trailing NUL byte from the attribute value, if present.
+	if n := len(raw); n > 0 && raw[n-1] == '\x00' {
+		raw = raw[:n-1]
+	}
+	return string(raw), nil
+}
+
+// Setfscreatecon writes scon to the calling thread's attr/fscreate file
+// under /proc/self/task/<tid>.
+func Setfscreatecon(scon string) error {
+	attrPath := fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid())
+	return writeCon(attrPath, scon)
+}
+
+// Getfscreatecon reads the calling thread's attr/fscreate file under
+// /proc/self/task/<tid>.
+func Getfscreatecon() (string, error) {
+	attrPath := fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid())
+	return readCon(attrPath)
+}
+
+// Getcon returns the SELinux label of the current process thread, read from
+// /proc/self/task/<tid>/attr/current, or an error.
+func Getcon() (string, error) {
+	attrPath := fmt.Sprintf("/proc/self/task/%d/attr/current", syscall.Gettid())
+	return readCon(attrPath)
+}
+
+// Getpidcon returns the SELinux label of the given pid, read from
+// /proc/<pid>/attr/current, or an error.
+func Getpidcon(pid int) (string, error) {
+	attrPath := fmt.Sprintf("/proc/%d/attr/current", pid)
+	return readCon(attrPath)
+}
+
+// Getexeccon reads the calling thread's attr/exec file under
+// /proc/self/task/<tid>.
+func Getexeccon() (string, error) {
+	attrPath := fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid())
+	return readCon(attrPath)
+}
+
+func writeCon(name string, val string) error {
+	out, err := os.OpenFile(name, os.O_WRONLY, 0)
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+
+	if val != "" {
+		_, err = out.Write([]byte(val))
+	} else {
+		_, err = out.Write(nil)
+	}
+	return err
+}
+
+// Setexeccon writes scon to the calling thread's attr/exec file under
+// /proc/self/task/<tid>.
+func Setexeccon(scon string) error {
+	attrPath := fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid())
+	return writeCon(attrPath, scon)
+}
+
+// Get renders the context as "user:role:type:level". Components missing
+// from the map appear as empty strings.
+func (c SELinuxContext) Get() string {
+	parts := []string{c["user"], c["role"], c["type"], c["level"]}
+	return strings.Join(parts, ":")
+}
+
+// NewContext parses an SELinux label of the form "user:role:type:level"
+// into a SELinuxContext. The label is split on the first three colons, so
+// the level component may itself contain colons.
+//
+// An empty label yields an empty context. A label with fewer than four
+// components fills only the leading keys instead of panicking with an
+// index-out-of-range error.
+func NewContext(scon string) SELinuxContext {
+	c := make(SELinuxContext)
+	if len(scon) == 0 {
+		return c
+	}
+
+	keys := [...]string{"user", "role", "type", "level"}
+	// SplitN returns at most len(keys) parts, so keys[i] is always in range.
+	for i, part := range strings.SplitN(scon, ":", len(keys)) {
+		c[keys[i]] = part
+	}
+	return c
+}
+
+// ReserveLabel marks the level (MCS) component of scon as in use so that
+// uniqMcs will not hand it out again. Empty labels, and labels that have no
+// fourth colon-separated component, are ignored rather than causing an
+// index-out-of-range panic.
+func ReserveLabel(scon string) {
+	if len(scon) == 0 {
+		return
+	}
+	con := strings.SplitN(scon, ":", 4)
+	if len(con) < 4 {
+		return
+	}
+	// The "already exists" error is dropped on purpose, as before: this
+	// function has no error return.
+	_ = mcsAdd(con[3])
+}
+
+// selinuxEnforcePath returns the path of the "enforce" file directly
+// under selinuxPath.
+func selinuxEnforcePath() string {
+	return fmt.Sprintf("%s/%s", selinuxPath, "enforce")
+}
+
+func SelinuxGetEnforce() int {
+	var enforce int
+
+	enforceS, err := readCon(selinuxEnforcePath())
+	if err != nil {
+		return -1
+	}
+
+	enforce, err = strconv.Atoi(string(enforceS))
+	if err != nil {
+		return -1
+	}
+	return enforce
+}
+
+// SelinuxSetEnforce writes mode, formatted as a decimal integer, to the
+// enforce file and returns the error from that write.
+func SelinuxSetEnforce(mode int) error {
+	value := strconv.Itoa(mode)
+	return writeCon(selinuxEnforcePath(), value)
+}
+
+// SelinuxGetEnforceMode maps the value readConfig returns for selinuxTag
+// onto a mode constant: "enforcing" gives Enforcing, "permissive" gives
+// Permissive, and anything else gives Disabled.
+func SelinuxGetEnforceMode() int {
+	configured := readConfig(selinuxTag)
+	if configured == "enforcing" {
+		return Enforcing
+	}
+	if configured == "permissive" {
+		return Permissive
+	}
+	return Disabled
+}
+
+// mcsAdd records mcs as reserved in state.mcsList while holding the state
+// lock. It returns an error when mcs is already marked as reserved.
+func mcsAdd(mcs string) error {
+	state.Lock()
+	defer state.Unlock()
+
+	if reserved := state.mcsList[mcs]; reserved {
+		return fmt.Errorf("MCS Label already exists")
+	}
+	state.mcsList[mcs] = true
+	return nil
+}
+
+// mcsDelete releases mcs while holding the state lock. The entry is kept
+// in state.mcsList and flipped to false rather than removed, which mcsAdd
+// treats as "not reserved".
+func mcsDelete(mcs string) {
+	state.Lock()
+	defer state.Unlock()
+
+	state.mcsList[mcs] = false
+}
+
+// IntToMcs deterministically maps a positive id onto an MCS level string
+// of the form "s0:cX,cY", where catRange is the size of the category set.
+//
+// ids are laid out in tiers: the first catRange ids share X=0, the next
+// catRange-1 ids share X=1, and so on; Y is always greater than X.
+//
+// It returns "" when id is outside 1..523776, or when id does not fit in
+// the tiers available for catRange. The latter case used to drive the tier
+// counter to zero and below, so the loop did not terminate in any
+// reasonable time (for example IntToMcs(11, 4)).
+func IntToMcs(id int, catRange uint32) string {
+	// 1024*1023/2: the historical upper bound on id, kept for compatibility.
+	const maxID = 523776
+
+	if id < 1 || id > maxID {
+		return ""
+	}
+
+	setSize := int(catRange)
+	tier := setSize
+	ord := id
+	for ord > tier {
+		ord -= tier
+		tier--
+		if tier <= 0 {
+			// Ran out of tiers: id is larger than catRange can encode.
+			return ""
+		}
+	}
+	tier = setSize - tier
+	ord += tier
+	return fmt.Sprintf("s0:c%d,c%d", tier, ord)
+}
+
+// uniqMcs draws random category pairs below catRange until it finds one
+// that mcsAdd accepts, and returns it as "s0:cLOW,cHIGH" with LOW < HIGH.
+// Pairs with two equal categories are redrawn.
+func uniqMcs(catRange uint32) string {
+	var raw uint32
+	for {
+		// Errors from reading random data are not checked; raw then keeps
+		// whatever value it had before the call.
+		binary.Read(rand.Reader, binary.LittleEndian, &raw)
+		lo := raw % catRange
+		binary.Read(rand.Reader, binary.LittleEndian, &raw)
+		hi := raw % catRange
+
+		if lo == hi {
+			continue
+		}
+		if lo > hi {
+			lo, hi = hi, lo
+		}
+
+		label := fmt.Sprintf("s0:c%d,c%d", lo, hi)
+		if mcsAdd(label) == nil {
+			return label
+		}
+	}
+}
+
+// FreeLxcContexts releases the level of scon (its fourth colon-separated
+// field) via mcsDelete. Empty labels and labels without a level field are
+// ignored; the latter used to panic on an out-of-range index.
+func FreeLxcContexts(scon string) {
+	if scon == "" {
+		return
+	}
+	con := strings.SplitN(scon, ":", 4)
+	if len(con) < 4 {
+		// No level field, so there is nothing to release.
+		return
+	}
+	mcsDelete(con[3])
+}
+
+var roFileLabel string
+
+// GetROFileLabel returns the current value of the package-level
+// roFileLabel variable.
+func GetROFileLabel() (fileLabel string) {
+	fileLabel = roFileLabel
+	return fileLabel
+}
+
+func GetLxcContexts() (processLabel string, fileLabel string) {
+	var (
+		val, key string
+		bufin    *bufio.Reader
+	)
+
+	if !SelinuxEnabled() {
+		return "", ""
+	}
+	lxcPath := fmt.Sprintf("%s/contexts/lxc_contexts", getSELinuxPolicyRoot())
+	in, err := os.Open(lxcPath)
+	if err != nil {
+		return "", ""
+	}
+	defer in.Close()
+
+	bufin = bufio.NewReader(in)
+
+	for done := false; !done; {
+		var line string
+		if line, err = bufin.ReadString('\n'); err != nil {
+			if err == io.EOF {
+				done = true
+			} else {
+				goto exit
+			}
+		}
+		line = strings.TrimSpace(line)
+		if len(line) == 0 {
+			// Skip blank lines
+			continue
+		}
+		if line[0] == ';' || line[0] == '#' {
+			// Skip comments
+			continue
+		}
+		if groups := assignRegex.FindStringSubmatch(line); groups != nil {
+			key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2])
+			if key == "process" {
+				processLabel = strings.Trim(val, "\"")
+			}
+			if key == "file" {
+				fileLabel = strings.Trim(val, "\"")
+			}
+			if key == "ro_file" {
+				roFileLabel = strings.Trim(val, "\"")
+			}
+		}
+	}
+
+	if processLabel == "" || fileLabel == "" {
+		return "", ""
+	}
+
+	if roFileLabel == "" {
+		roFileLabel = fileLabel
+	}
+exit:
+	//	mcs := IntToMcs(os.Getpid(), 1024)
+	mcs := uniqMcs(1024)
+	scon := NewContext(processLabel)
+	scon["level"] = mcs
+	processLabel = scon.Get()
+	scon = NewContext(fileLabel)
+	scon["level"] = mcs
+	fileLabel = scon.Get()
+	return processLabel, fileLabel
+}
+
+// SecurityCheckContext writes val to the "context" file under selinuxPath
+// and returns the error from that write, if any.
+// NOTE(review): presumably the kernel rejects the write when val is not a
+// valid context, which is what makes this a check — confirm.
+//
+// The path used to be built as selinuxPath + ".context", which names a
+// sibling of selinuxPath rather than a file inside it (selinuxEnforcePath
+// builds selinuxPath + "/enforce" for comparison).
+func SecurityCheckContext(val string) error {
+	return writeCon(fmt.Sprintf("%s/context", selinuxPath), val)
+}
+
+// CopyLevel returns dest with its level replaced by the level of src.
+// An empty src yields "" and no error. Both labels must pass
+// SecurityCheckContext (src is checked first); the first failure is
+// returned. The level previously carried by dest is released with
+// mcsDelete and the level of src is reserved with mcsAdd, whose result is
+// not inspected.
+func CopyLevel(src, dest string) (string, error) {
+	if src == "" {
+		return "", nil
+	}
+	for _, label := range []string{src, dest} {
+		if err := SecurityCheckContext(label); err != nil {
+			return "", err
+		}
+	}
+
+	from, to := NewContext(src), NewContext(dest)
+	level := from["level"]
+	mcsDelete(to["level"])
+	mcsAdd(level)
+	to["level"] = level
+	return to.Get(), nil
+}
+
+// Prevent users from relabing system files
+func badPrefix(fpath string) error {
+	var badprefixes = []string{"/usr"}
+
+	for _, prefix := range badprefixes {
+		if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) {
+			return fmt.Errorf("Relabeling content in %s is not allowed.", prefix)
+		}
+	}
+	return nil
+}
+
+// Chcon sets the label of fpath to scon via Setfilecon. With recurse set,
+// fpath is walked with filepath.Walk and every visited path is labeled.
+// An empty scon is a no-op, and paths rejected by badPrefix return that
+// error without touching anything.
+func Chcon(fpath string, scon string, recurse bool) error {
+	if scon == "" {
+		return nil
+	}
+	if err := badPrefix(fpath); err != nil {
+		return err
+	}
+	if !recurse {
+		return Setfilecon(fpath, scon)
+	}
+
+	// The error the walker hands to the callback is not inspected; each
+	// visited path is labeled and the callback returns Setfilecon's result.
+	return filepath.Walk(fpath, func(p string, _ os.FileInfo, _ error) error {
+		return Setfilecon(p, scon)
+	})
+}
+
+// DupSecOpt takes a process label and returns the "label=..." security
+// options (user, role, type and level, in that order) that reproduce it
+// for future container processes. It returns nil when src is empty or
+// when any of the four fields parsed by NewContext is empty.
+func DupSecOpt(src string) []string {
+	if src == "" {
+		return nil
+	}
+	con := NewContext(src)
+	user, role, typ, level := con["user"], con["role"], con["type"], con["level"]
+	if user == "" || role == "" || typ == "" || level == "" {
+		return nil
+	}
+	return []string{
+		"label=user:" + user,
+		"label=role:" + role,
+		"label=type:" + typ,
+		"label=level:" + level,
+	}
+}
+
+// DisableSecOpt returns the single security option "label=disable", which
+// can be used to disable labeling support for future container processes.
+func DisableSecOpt() []string {
+	opts := make([]string, 0, 1)
+	return append(opts, "label=disable")
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS
@@ -1,2 +0,0 @@
-Tianon Gravi <admwiggin@gmail.com> (@tianon)
-Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cmsg.h"
+
+/*
+ * Report a failure and bail out of the current function: prints the
+ * formatted message to stderr, prefixed with "nsenter: " and followed by
+ * the description of the current errno (%m), then sets errno to ECOMM and
+ * jumps to the `err` label. Only usable inside functions that define such
+ * a label. Relies on GNU C statement expressions and ##__VA_ARGS__.
+ */
+#define error(fmt, ...)							\
+	({								\
+		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
+		errno = ECOMM;						\
+		goto err; /* return value */				\
+	})
+
+/*
+ * Sends file.fd over the socket sockfd as SCM_RIGHTS ancillary data,
+ * together with file.name (NUL terminator included) as the ordinary
+ * payload. Returns the return value of sendmsg(2), or -1 with errno set to
+ * EINVAL when file.name is NULL. Any synchronisation and preparation of
+ * state should be done external to this (we expect the other side to be in
+ * recvfd() in the code).
+ */
+ssize_t sendfd(int sockfd, struct file_t file)
+{
+	struct msghdr msg = {0};
+	struct iovec iov[1] = {0};
+	struct cmsghdr *cmsg;
+
+	union {
+		char buf[CMSG_SPACE(sizeof(file.fd))];
+		struct cmsghdr align;
+	} u;
+
+	/* strlen() below would dereference a NULL name. */
+	if (!file.name) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Keep the whole control buffer (padding included) initialised. */
+	memset(&u, 0, sizeof(u));
+
+	/*
+	 * We need to send some other data along with the ancillary data,
+	 * otherwise the other side won't receive any data. This is very
+	 * well-hidden in the documentation (and only applies to
+	 * SOCK_STREAM). See the bottom part of unix(7).
+	 */
+	iov[0].iov_base = file.name;
+	iov[0].iov_len = strlen(file.name) + 1;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = u.buf;
+	msg.msg_controllen = sizeof(u.buf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(file.fd));
+
+	/* CMSG_DATA() is not guaranteed to be suitably aligned; copy bytes. */
+	memcpy(CMSG_DATA(cmsg), &file.fd, sizeof(file.fd));
+
+	return sendmsg(sockfd, &msg, 0);
+}
+
+/*
+ * Receives a file descriptor from the sockfd provided, together with the
+ * name sent alongside it by sendfd(). On success the returned file_t holds
+ * the descriptor and a malloc()ed, NUL-terminated name that the caller
+ * must free(). On failure a zeroed file_t is returned and errno is set
+ * (to ECOMM for malformed messages, or to the recvmsg(2) error). Any
+ * synchronisation and preparation of state should be done external to
+ * this (we expect the other side to be in sendfd() in the code).
+ */
+struct file_t recvfd(int sockfd)
+{
+	struct msghdr msg = {0};
+	struct iovec iov[1] = {0};
+	struct cmsghdr *cmsg;
+	struct file_t file = {0};
+	ssize_t ret;
+	int fd;
+	int olderrno;
+
+	union {
+		char buf[CMSG_SPACE(sizeof(file.fd))];
+		struct cmsghdr align;
+	} u;
+
+	/* Allocate a buffer. */
+	/* TODO: Make this dynamic with MSG_PEEK. */
+	file.name = malloc(TAG_BUFFER);
+	if (!file.name)
+		error("recvfd: failed to allocate file.tag buffer");
+
+	/*
+	 * We need to "receive" the non-ancillary data even though we don't
+	 * plan to use it at all. Otherwise, things won't work as expected.
+	 * See unix(7) and other well-hidden documentation.
+	 */
+	iov[0].iov_base = file.name;
+	iov[0].iov_len = TAG_BUFFER;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = u.buf;
+	msg.msg_controllen = sizeof(u.buf);
+
+	ret = recvmsg(sockfd, &msg, 0);
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * The buffer comes from malloc() and the peer is not obliged to send a
+	 * terminator, so always terminate the name ourselves. For a payload
+	 * that already ends in NUL this is a no-op.
+	 */
+	file.name[ret < TAG_BUFFER ? ret : TAG_BUFFER - 1] = '\0';
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg)
+		error("recvfd: got NULL from CMSG_FIRSTHDR");
+	if (cmsg->cmsg_level != SOL_SOCKET)
+		error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
+	if (cmsg->cmsg_type != SCM_RIGHTS)
+		error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
+	if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+		/* cmsg_len's type differs between libcs; cast to match %lu. */
+		error("recvfd: expected correct CMSG_LEN in cmsg: %lu", (unsigned long) cmsg->cmsg_len);
+
+	/* CMSG_DATA() is not guaranteed to be suitably aligned; copy bytes. */
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+	if (fd < 0)
+		error("recvfd: recieved invalid pointer");
+
+	file.fd = fd;
+	return file;
+
+err:
+	/* free() may clobber errno; preserve it for the caller. */
+	olderrno = errno;
+	free(file.name);
+	errno = olderrno;
+	return (struct file_t){0};
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@@ -0,0 +1,57 @@
+// +build linux
+
+package utils
+
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+#include <errno.h>
+#include <stdlib.h>
+#include "cmsg.h"
+*/
+import "C"
+
+import (
+	"os"
+	"unsafe"
+)
+
+// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
+// socket. The returned *os.File wraps the received descriptor and carries
+// the name that was sent as non-auxiliary data in the same payload.
+func RecvFd(socket *os.File) (*os.File, error) {
+	received, err := C.recvfd(C.int(socket.Fd()))
+	if err != nil {
+		return nil, err
+	}
+	// Copy the C string into Go memory before releasing the C buffer.
+	name := C.GoString(received.name)
+	C.free(unsafe.Pointer(received.name))
+	return os.NewFile(uintptr(received.fd), name), nil
+}
+
+// SendFd sends a file descriptor over the given AF_UNIX socket. In
+// addition, the file.Name() of the given file will also be sent as
+// non-auxilliary data in the same payload (allowing to send contextual
+// information for a file descriptor).
+func SendFd(socket, file *os.File) error {
+	var cfile C.struct_file_t
+	cfile.fd = C.int(file.Fd())
+	cfile.name = C.CString(file.Name())
+	defer C.free(unsafe.Pointer(cfile.name))
+
+	_, err := C.sendfd(C.int(socket.Fd()), cfile)
+	return err
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(CMSG_H)
+#define CMSG_H
+
+#include <sys/types.h>
+
+/*
+ * Size in bytes of the buffer recvfd() allocates for the received name.
+ * TODO: Implement this properly with MSG_PEEK.
+ */
+#define TAG_BUFFER 4096
+
+/*
+ * A file descriptor together with a name for it.
+ * This mirrors Go's (*os.File).
+ */
+struct file_t {
+	char *name; /* NUL-terminated name sent alongside the descriptor */
+	int fd;     /* the file descriptor itself */
+};
+
+/* Receives a descriptor and its name from sockfd; see cmsg.c. */
+struct file_t recvfd(int sockfd);
+/* Sends file.fd and file.name over sockfd; returns sendmsg(2)'s result. */
+ssize_t sendfd(int sockfd, struct file_t file);
+
+#endif /* !defined(CMSG_H) */
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@@ -0,0 +1,126 @@
+package utils
+
+import (
+	"crypto/rand"
+	"encoding/hex"
+	"encoding/json"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"unsafe"
+)
+
+const (
+	// exitSignalOffset is added to the signal number to form the exit
+	// status reported for a process that was terminated by a signal
+	// (see ExitStatus).
+	exitSignalOffset = 128
+)
+
+// GenerateRandomName returns prefix followed by the first size characters
+// of a random 64-character lowercase hex string (32 random bytes).
+//
+// size is clamped to the range [0, 64]: values above 64 yield the full
+// hex string, and negative values yield just the prefix (they used to
+// panic with a slice-bounds error). An error is returned only when the
+// random source cannot be read.
+func GenerateRandomName(prefix string, size int) (string, error) {
+	id := make([]byte, 32)
+	if _, err := io.ReadFull(rand.Reader, id); err != nil {
+		return "", err
+	}
+
+	encoded := hex.EncodeToString(id)
+	switch {
+	case size < 0:
+		size = 0
+	case size > len(encoded):
+		size = len(encoded)
+	}
+	return prefix + encoded[:size], nil
+}
+
+// ResolveRootfs makes uncleanRootfs absolute and then resolves every
+// symlink in it, returning the resulting path. Errors from either step
+// are returned with an empty path for the first step.
+func ResolveRootfs(uncleanRootfs string) (string, error) {
+	abs, err := filepath.Abs(uncleanRootfs)
+	if err == nil {
+		return filepath.EvalSymlinks(abs)
+	}
+	return "", err
+}
+
+// ExitStatus converts a wait status into a process exit code: the plain
+// exit status for a process that was not signaled, or exitSignalOffset
+// plus the signal number for one that was terminated by a signal.
+func ExitStatus(status syscall.WaitStatus) int {
+	if !status.Signaled() {
+		return status.ExitStatus()
+	}
+	return exitSignalOffset + int(status.Signal())
+}
+
+// WriteJSON marshals v with encoding/json and writes the result to w in a
+// single Write call. It returns the marshaling error, or otherwise the
+// error from the write.
+func WriteJSON(w io.Writer, v interface{}) error {
+	encoded, err := json.Marshal(v)
+	if err == nil {
+		_, err = w.Write(encoded)
+	}
+	return err
+}
+
+// CleanPath makes a path safe for use with filepath.Join. Absolute paths
+// are simply cleaned. Relative paths are cleaned, anchored at the root so
+// that leading ".." components collapse, cleaned again, and then made
+// relative once more. The result of prepending another path to it will
+// therefore always be lexically inside that path. Everything here is
+// lexical, so paths that include symlinks are not made safe by CleanPath.
+// The empty string is returned unchanged.
+func CleanPath(path string) string {
+	if path == "" {
+		return ""
+	}
+
+	// Collapses problematic inputs such as "/../../../../../".
+	cleaned := filepath.Clean(path)
+	if filepath.IsAbs(cleaned) {
+		// Absolute paths must stay absolute.
+		return filepath.Clean(cleaned)
+	}
+
+	// Handles inputs such as "../../../../<etc>/some/path".
+	root := string(os.PathSeparator)
+	rooted := filepath.Clean(root + cleaned)
+	// This can't fail, as (by definition) all paths are relative to root.
+	rel, _ := filepath.Rel(root, rooted)
+	return filepath.Clean(rel)
+}
+
+// SearchLabels looks through labels, each expected to be a "key=value"
+// pair, and returns the value of the first pair whose key equals query.
+// Entries without '=' are skipped; "" is returned when nothing matches.
+func SearchLabels(labels []string, query string) string {
+	for _, label := range labels {
+		sep := strings.Index(label, "=")
+		if sep < 0 {
+			continue
+		}
+		if label[:sep] == query {
+			return label[sep+1:]
+		}
+	}
+	return ""
+}
+
+// Annotations splits "key=value" labels into the bundle path (the value of
+// the "bundle" key, which is a label added by libcontainer) and a map of
+// all remaining pairs. Entries without '=' are skipped; later duplicates
+// overwrite earlier ones.
+func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
+	userAnnotations = make(map[string]string)
+	for _, label := range labels {
+		sep := strings.Index(label, "=")
+		if sep < 0 {
+			continue
+		}
+		key, value := label[:sep], label[sep+1:]
+		if key == "bundle" {
+			bundle = value
+			continue
+		}
+		userAnnotations[key] = value
+	}
+	return bundle, userAnnotations
+}
+
+func GetIntSize() int {
+	return int(unsafe.Sizeof(1))
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@@ -0,0 +1,33 @@
+// +build !windows
+
+package utils
+
+import (
+	"io/ioutil"
+	"strconv"
+	"syscall"
+)
+
+// CloseExecFrom sets the close-on-exec flag on every descriptor listed in
+// /proc/self/fd whose number is at least minFd. It returns an error only
+// when the directory cannot be read.
+func CloseExecFrom(minFd int) error {
+	entries, err := ioutil.ReadDir("/proc/self/fd")
+	if err != nil {
+		return err
+	}
+	for _, entry := range entries {
+		fd, convErr := strconv.Atoi(entry.Name())
+		// Skip non-numeric names and descriptors below the minimum.
+		if convErr != nil || fd < minFd {
+			continue
+		}
+
+		// Failures here are intentionally ignored: they basically mean the
+		// descriptor has already been closed (including, and especially,
+		// the one ioutil.ReadDir opened to list the directory).
+		syscall.CloseOnExec(fd)
+	}
+	return nil
+}