From 3b774e3657af4b49c2a673178e515adf5686d984 Mon Sep 17 00:00:00 2001 From: Justin Cormack Date: Tue, 13 Dec 2016 17:28:32 -0800 Subject: [PATCH] Make AUFS optional and ship choice of kernels This seems the best option, although none are great - build with `make AUFS=1` to build with AUFS support, currently with 4.8 kernel - default is to build without AUFS support, with 4.9 kernel This recognises that AUFS support is temporary #620 and only there until we can phase it out on desktop editions, and allow the other editions that never shipped with AUFS to ship something very close to mainline. However we do still apply the patches so that the non AUFS branch runs fine on all platforms, so it can be tested elsewhere. We may be able to move the kernel versions back in line when 4.9 aufs support is out. Plan is to shift CI to build both sets of images, and get the Desktop editions to pick up the aufs set automatically, once this is merged. Signed-off-by: Justin Cormack --- Makefile | 20 +- alpine/kernel/Dockerfile | 48 +- alpine/kernel/Dockerfile.aufs | 85 + alpine/kernel/Makefile | 16 +- alpine/kernel/kernel_config | 16 - alpine/kernel/kernel_config.aufs | 16 + ...-host-network-namespace-to-use-AF_VS.patch | 30 + ...-fix-the-race-when-querying-updating.patch | 0 ...03-hv_sock-introduce-Hyper-V-Sockets.patch | 1791 +++++++++++++++++ ...n-t-spam-the-logs-with-unknown-GUIDs.patch | 30 + 10 files changed, 1982 insertions(+), 70 deletions(-) create mode 100644 alpine/kernel/Dockerfile.aufs create mode 100644 alpine/kernel/kernel_config.aufs create mode 100644 alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch rename alpine/kernel/{patches => patches-aufs}/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch (100%) create mode 100644 alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch create mode 100644 alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch diff --git 
a/Makefile b/Makefile index f7d329c0c..6ac1691ca 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ all: $(MAKE) -C alpine +aufs: + $(MAKE) AUFS=true all + alpine/initrd.img: $(MAKE) -C alpine initrd.img @@ -42,15 +45,20 @@ test: Dockerfile.test alpine/initrd-test.img alpine/kernel/x86_64/vmlinuz64 TAG=$(shell git rev-parse HEAD) STATUS=$(shell git status -s) MOBYLINUX_TAG=alpine/mobylinux.tag +ifdef AUFS +AUFS_PREFIX=aufs- +endif +MEDIA_IMAGE=mobylinux/media:$(MEDIA_PREFIX)$(AUFS_PREFIX)$(TAG) +KERNEL_IMAGE=mobylinux/kernel:$(MEDIA_PREFIX)$(AUFS_PREFIX)$(TAG) media: Dockerfile.media alpine/initrd.img alpine/kernel/x86_64/vmlinuz64 alpine/mobylinux-efi.iso ifeq ($(STATUS),) - tar cf - $^ alpine/mobylinux.efi alpine/kernel/x86_64/vmlinux alpine/kernel/x86_64/kernel-headers.tar | docker build -f Dockerfile.media -t mobylinux/media:$(MEDIA_PREFIX)$(TAG) - - docker push mobylinux/media:$(MEDIA_PREFIX)$(TAG) + tar cf - $^ alpine/mobylinux.efi alpine/kernel/x86_64/vmlinux alpine/kernel/x86_64/kernel-headers.tar | docker build -f Dockerfile.media -t $(MEDIA_IMAGE) - + docker push $(MEDIA_IMAGE) [ -f $(MOBYLINUX_TAG) ] - docker tag $(shell cat $(MOBYLINUX_TAG)) mobylinux/mobylinux:$(MEDIA_PREFIX)$(TAG) - docker push mobylinux/mobylinux:$(MEDIA_PREFIX)$(TAG) - tar cf - Dockerfile.kernel alpine/kernel/x86_64/vmlinuz64 | docker build -f Dockerfile.kernel -t mobylinux/kernel:$(MEDIA_PREFIX)$(TAG) - - docker push mobylinux/kernel:$(MEDIA_PREFIX)$(TAG) + docker tag $(shell cat $(MOBYLINUX_TAG)) $(MEDIA_IMAGE) + docker push $(MEDIA_IMAGE) + tar cf - Dockerfile.kernel alpine/kernel/x86_64/vmlinuz64 | docker build -f Dockerfile.kernel -t $(KERNEL_IMAGE) - + docker push $(KERNEL_IMAGE) else $(error "git not clean") endif diff --git a/alpine/kernel/Dockerfile b/alpine/kernel/Dockerfile index 155b1f50b..6ad788b41 100644 --- a/alpine/kernel/Dockerfile +++ b/alpine/kernel/Dockerfile @@ -1,7 +1,7 @@ # Tag: 36aecb5cf4738737634140eec9abebe1f6559a39 FROM 
mobylinux/alpine-build-c@sha256:d66b9625abc831f28f8c584991a9cb6975e85d3bb3d3768474b592f1cf32a3a6 -ARG KERNEL_VERSION=4.8.14 +ARG KERNEL_VERSION=4.9 ENV KERNEL_SOURCE=https://www.kernel.org/pub/linux/kernel/v4.x/linux-${KERNEL_VERSION}.tar.xz @@ -9,42 +9,6 @@ RUN curl -fsSL -o linux-${KERNEL_VERSION}.tar.xz ${KERNEL_SOURCE} RUN cat linux-${KERNEL_VERSION}.tar.xz | tar --absolute-names -xJ && mv /linux-${KERNEL_VERSION} /linux -# this is aufs4.8 20161010 -ENV AUFS_REPO https://github.com/sfjro/aufs4-standalone -ENV AUFS_BRANCH aufs4.8 -ENV AUFS_COMMIT e9fd128dcb16167417683e199a5feb14f3c9eca8 - -# Download AUFS -RUN git clone -b "$AUFS_BRANCH" "$AUFS_REPO" /aufs && \ - cd /aufs && \ - git checkout -q "$AUFS_COMMIT" - -# aufs-util 20151116 -ENV AUFS_TOOLS_REPO https://github.com/ncopa/aufs-util.git -ENV AUFS_TOOLS_COMMIT 3b7c5e262b53598a8204a915e485489c46d4e7a4 - -# Download aufs tools -RUN git clone ${AUFS_TOOLS_REPO} && \ - cd /aufs-util && \ - git checkout "$AUFS_TOOLS_COMMIT" - -#BUILD -# patch kernel with aufs -RUN cd /linux && \ - cp -r /aufs/Documentation /linux && \ - cp -r /aufs/fs /linux && \ - cp -r /aufs/include/uapi/linux/aufs_type.h /linux/include/uapi/linux/ && \ - set -e && for patch in \ - /aufs/aufs*-kbuild.patch \ - /aufs/aufs*-base.patch \ - /aufs/aufs*-mmap.patch \ - /aufs/aufs*-standalone.patch \ - /aufs/aufs*-loopback.patch \ - /aufs/lockdep-debug.patch \ - ; do \ - patch -p1 < "$patch"; \ - done - COPY kernel_config /linux/arch/x86/configs/x86_64_defconfig COPY kernel_config.debug /linux/debug_config @@ -72,12 +36,4 @@ RUN make INSTALL_MOD_PATH=/tmp/kernel-modules modules_install && \ ( cd /tmp && tar cf /kernel-headers.tar include ) && \ ( cd /tmp/kernel-modules && tar cf /kernel-modules.tar . 
) -# Build aufs tools, do this here as they need kernel headers and to match aufs -# Fortunately they are built statically linked -RUN cd /aufs-util && \ - CPPFLAGS="-I/tmp/include" CFLAGS=$CPPFLAGS LDFLAGS=$CPPFLAGS make && \ - DESTDIR=/tmp/aufs-utils make install && \ - rm -rf /tmp/aufs-utils/usr/lib /tmp/aufs-utils/usr/share && \ - cd /tmp/aufs-utils && rm libau* && tar cf /aufs-utils.tar . - -RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\nAUFS_REPO=${AUFS_REPO}\nAUFS_BRANCH=${AUFS_BRANCH}\nAUFS_COMMIT=${AUFS_COMMIT}\nAUFS_TOOLS_REPO=${AUFS_TOOLS_REPO}\nAUFS_TOOLS_COMMIT=${AUFS_TOOLS_COMMIT}\n" > /kernel-source-info +RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\n" > /kernel-source-info diff --git a/alpine/kernel/Dockerfile.aufs b/alpine/kernel/Dockerfile.aufs new file mode 100644 index 000000000..c8adfa6ef --- /dev/null +++ b/alpine/kernel/Dockerfile.aufs @@ -0,0 +1,85 @@ +# Tag: 36aecb5cf4738737634140eec9abebe1f6559a39 +FROM mobylinux/alpine-build-c@sha256:d66b9625abc831f28f8c584991a9cb6975e85d3bb3d3768474b592f1cf32a3a6 + +ARG KERNEL_VERSION=4.8.14 + +ENV KERNEL_SOURCE=https://www.kernel.org/pub/linux/kernel/v4.x/linux-${KERNEL_VERSION}.tar.xz + +RUN curl -fsSL -o linux-${KERNEL_VERSION}.tar.xz ${KERNEL_SOURCE} + +RUN cat linux-${KERNEL_VERSION}.tar.xz | tar --absolute-names -xJ && mv /linux-${KERNEL_VERSION} /linux + +# this is aufs4.8 20161010 +ENV AUFS_REPO https://github.com/sfjro/aufs4-standalone +ENV AUFS_BRANCH aufs4.8 +ENV AUFS_COMMIT e9fd128dcb16167417683e199a5feb14f3c9eca8 + +# Download AUFS +RUN git clone -b "$AUFS_BRANCH" "$AUFS_REPO" /aufs && \ + cd /aufs && \ + git checkout -q "$AUFS_COMMIT" + +# aufs-util 20151116 +ENV AUFS_TOOLS_REPO https://github.com/ncopa/aufs-util.git +ENV AUFS_TOOLS_COMMIT 3b7c5e262b53598a8204a915e485489c46d4e7a4 + +# Download aufs tools +RUN git clone ${AUFS_TOOLS_REPO} && \ + cd /aufs-util && \ + git checkout "$AUFS_TOOLS_COMMIT" + +#BUILD +# patch kernel with aufs +RUN cd /linux && \ + cp -r /aufs/Documentation /linux && \ + 
cp -r /aufs/fs /linux && \ + cp -r /aufs/include/uapi/linux/aufs_type.h /linux/include/uapi/linux/ && \ + set -e && for patch in \ + /aufs/aufs*-kbuild.patch \ + /aufs/aufs*-base.patch \ + /aufs/aufs*-mmap.patch \ + /aufs/aufs*-standalone.patch \ + /aufs/aufs*-loopback.patch \ + /aufs/lockdep-debug.patch \ + ; do \ + patch -p1 < "$patch"; \ + done + +COPY kernel_config /linux/arch/x86/configs/x86_64_defconfig +COPY kernel_config.debug /linux/debug_config +COPY kernel_config.aufs /linux/aufs_config +RUN cat /linux/aufs_config >> /linux/arch/x86/configs/x86_64_defconfig + +ARG DEBUG=0 + +RUN if [ $DEBUG -ne "0" ]; then \ + sed -i 's/CONFIG_PANIC_ON_OOPS=y/# CONFIG_PANIC_ON_OOPS is not set/' /linux/arch/x86/configs/x86_64_defconfig; \ + cat /linux/debug_config >> /linux/arch/x86/configs/x86_64_defconfig; \ + fi + +# Apply local patches +COPY patches-aufs /patches +RUN cd /linux && \ + set -e && for patch in /patches/*.patch; do \ + echo "Applying $patch"; \ + patch -p1 < "$patch"; \ + done + +WORKDIR /linux +RUN make defconfig && \ + make oldconfig && \ + make -j "$(getconf _NPROCESSORS_ONLN)" KCFLAGS="-fno-pie" +RUN make INSTALL_MOD_PATH=/tmp/kernel-modules modules_install && \ + make INSTALL_HDR_PATH=/tmp headers_install && \ + ( cd /tmp && tar cf /kernel-headers.tar include ) && \ + ( cd /tmp/kernel-modules && tar cf /kernel-modules.tar . ) + +# Build aufs tools, do this here as they need kernel headers and to match aufs +# Fortunately they are built statically linked +RUN cd /aufs-util && \ + CPPFLAGS="-I/tmp/include" CFLAGS=$CPPFLAGS LDFLAGS=$CPPFLAGS make && \ + DESTDIR=/tmp/aufs-utils make install && \ + rm -rf /tmp/aufs-utils/usr/lib /tmp/aufs-utils/usr/share && \ + cd /tmp/aufs-utils && rm libau* && tar cf /aufs-utils.tar . 
+ +RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\nAUFS_REPO=${AUFS_REPO}\nAUFS_BRANCH=${AUFS_BRANCH}\nAUFS_COMMIT=${AUFS_COMMIT}\nAUFS_TOOLS_REPO=${AUFS_TOOLS_REPO}\nAUFS_TOOLS_COMMIT=${AUFS_TOOLS_COMMIT}\n" > /kernel-source-info diff --git a/alpine/kernel/Makefile b/alpine/kernel/Makefile index 82aaa7201..403ad165b 100644 --- a/alpine/kernel/Makefile +++ b/alpine/kernel/Makefile @@ -2,16 +2,28 @@ DEBUG ?= 0 all: x86_64/vmlinuz64 -x86_64/vmlinuz64: Dockerfile kernel_config +ifdef AUFS +x86_64/vmlinuz64: Dockerfile.aufs kernel_config kernel_config.debug kernel_config.aufs mkdir -p x86_64 etc - BUILD=$$( docker build --build-arg DEBUG=$(DEBUG) -q . ) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ + BUILD=$$( docker build -f Dockerfile.aufs --build-arg DEBUG=$(DEBUG) -q . ) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-modules.tar | tar xf - && \ docker run --rm --net=none --log-driver=none $$BUILD cat /aufs-utils.tar | tar xf - && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-source-info > etc/kernel-source-info && \ docker run --rm --net=none --log-driver=none $$BUILD cat /linux/vmlinux > x86_64/vmlinux && \ docker run --rm --net=none --log-driver=none $$BUILD cat /linux/arch/x86_64/boot/bzImage > $@ && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-headers.tar > x86_64/kernel-headers.tar && \ + cp -a patches-aufs etc/kernel-patches +else +x86_64/vmlinuz64: Dockerfile kernel_config kernel_config.debug + mkdir -p x86_64 etc + BUILD=$$( docker build --build-arg DEBUG=$(DEBUG) -q . 
) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-modules.tar | tar xf - && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-source-info > etc/kernel-source-info && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /linux/vmlinux > x86_64/vmlinux && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /linux/arch/x86_64/boot/bzImage > $@ && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-headers.tar > x86_64/kernel-headers.tar && \ cp -a patches etc/kernel-patches +endif clean: rm -rf x86_64 lib etc usr sbin diff --git a/alpine/kernel/kernel_config b/alpine/kernel/kernel_config index 9ccbb57e7..3d03892ae 100644 --- a/alpine/kernel/kernel_config +++ b/alpine/kernel/kernel_config @@ -2935,22 +2935,6 @@ CONFIG_PSTORE_ZLIB_COMPRESS=y # CONFIG_PSTORE_RAM is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set -CONFIG_AUFS_FS=y -# CONFIG_AUFS_BRANCH_MAX_127 is not set -# CONFIG_AUFS_BRANCH_MAX_511 is not set -# CONFIG_AUFS_BRANCH_MAX_1023 is not set -CONFIG_AUFS_BRANCH_MAX_32767=y -CONFIG_AUFS_SBILIST=y -# CONFIG_AUFS_HNOTIFY is not set -# CONFIG_AUFS_EXPORT is not set -CONFIG_AUFS_XATTR=y -# CONFIG_AUFS_FHSM is not set -# CONFIG_AUFS_RDU is not set -# CONFIG_AUFS_SHWH is not set -# CONFIG_AUFS_BR_RAMFS is not set -# CONFIG_AUFS_BR_FUSE is not set -CONFIG_AUFS_BDEV_LOOP=y -# CONFIG_AUFS_DEBUG is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V2 is not set diff --git a/alpine/kernel/kernel_config.aufs b/alpine/kernel/kernel_config.aufs new file mode 100644 index 000000000..3dbf17a99 --- /dev/null +++ b/alpine/kernel/kernel_config.aufs @@ -0,0 +1,16 @@ +CONFIG_AUFS_FS=y +# CONFIG_AUFS_BRANCH_MAX_127 is not set +# CONFIG_AUFS_BRANCH_MAX_511 is not set +# CONFIG_AUFS_BRANCH_MAX_1023 is not set +CONFIG_AUFS_BRANCH_MAX_32767=y +CONFIG_AUFS_SBILIST=y +# CONFIG_AUFS_HNOTIFY is not set +# CONFIG_AUFS_EXPORT is not set 
+CONFIG_AUFS_XATTR=y +# CONFIG_AUFS_FHSM is not set +# CONFIG_AUFS_RDU is not set +# CONFIG_AUFS_SHWH is not set +# CONFIG_AUFS_BR_RAMFS is not set +# CONFIG_AUFS_BR_FUSE is not set +CONFIG_AUFS_BDEV_LOOP=y +# CONFIG_AUFS_DEBUG is not set diff --git a/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch b/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch new file mode 100644 index 000000000..4855bf28f --- /dev/null +++ b/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch @@ -0,0 +1,30 @@ +From afc48615e62910f37b6076f9118c80d2f9613064 Mon Sep 17 00:00:00 2001 +From: Ian Campbell +Date: Mon, 4 Apr 2016 14:50:10 +0100 +Subject: [PATCH 1/5] VSOCK: Only allow host network namespace to use AF_VSOCK. + +The VSOCK addressing schema does not really lend itself to simply creating an +alternative end point address within a namespace. + +Signed-off-by: Ian Campbell +--- + net/vmw_vsock/af_vsock.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c +index 8a398b3..0edc54c 100644 +--- a/net/vmw_vsock/af_vsock.c ++++ b/net/vmw_vsock/af_vsock.c +@@ -1852,6 +1852,9 @@ static const struct proto_ops vsock_stream_ops = { + static int vsock_create(struct net *net, struct socket *sock, + int protocol, int kern) + { ++ if (!net_eq(net, &init_net)) ++ return -EAFNOSUPPORT; ++ + if (!sock) + return -EINVAL; + +-- +2.10.2 + diff --git a/alpine/kernel/patches/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch b/alpine/kernel/patches-aufs/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch similarity index 100% rename from alpine/kernel/patches/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch rename to alpine/kernel/patches-aufs/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch diff --git a/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch 
b/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch new file mode 100644 index 000000000..f40e48135 --- /dev/null +++ b/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch @@ -0,0 +1,1791 @@ +From e7db86231b7078971c613aa81c9090079571cf24 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Sat, 23 Jul 2016 01:35:51 +0000 +Subject: [PATCH 3/5] hv_sock: introduce Hyper-V Sockets + +Hyper-V Sockets (hv_sock) supplies a byte-stream based communication +mechanism between the host and the guest. It's somewhat like TCP over +VMBus, but the transportation layer (VMBus) is much simpler than IP. + +With Hyper-V Sockets, applications between the host and the guest can talk +to each other directly by the traditional BSD-style socket APIs. + +Hyper-V Sockets is only available on new Windows hosts, like Windows Server +2016. More info is in this article "Make your own integration services": +https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service + +The patch implements the necessary support in the guest side by introducing +a new socket address family AF_HYPERV. + +Signed-off-by: Dexuan Cui +Cc: "K. Y. 
Srinivasan" +Cc: Haiyang Zhang +Cc: Vitaly Kuznetsov +Cc: Cathy Avery +Cc: Olaf Hering +Origin: https://patchwork.kernel.org/patch/9244467/ +--- + MAINTAINERS | 2 + + include/linux/hyperv.h | 13 + + include/linux/socket.h | 4 +- + include/net/af_hvsock.h | 78 +++ + include/uapi/linux/hyperv.h | 23 + + net/Kconfig | 1 + + net/Makefile | 1 + + net/hv_sock/Kconfig | 10 + + net/hv_sock/Makefile | 3 + + net/hv_sock/af_hvsock.c | 1507 +++++++++++++++++++++++++++++++++++++++++++ + 10 files changed, 1641 insertions(+), 1 deletion(-) + create mode 100644 include/net/af_hvsock.h + create mode 100644 net/hv_sock/Kconfig + create mode 100644 net/hv_sock/Makefile + create mode 100644 net/hv_sock/af_hvsock.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index babaf82..6126545 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -5667,7 +5667,9 @@ F: drivers/pci/host/pci-hyperv.c + F: drivers/net/hyperv/ + F: drivers/scsi/storvsc_drv.c + F: drivers/video/fbdev/hyperv_fb.c ++F: net/hv_sock/ + F: include/linux/hyperv.h ++F: include/net/af_hvsock.h + F: tools/hv/ + F: Documentation/ABI/stable/sysfs-bus-vmbus + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index b10954a..50f8976 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1505,5 +1505,18 @@ static inline void commit_rd_index(struct vmbus_channel *channel) + vmbus_set_event(channel); + } + ++struct vmpipe_proto_header { ++ u32 pkt_type; ++ u32 data_size; ++}; ++ ++#define HVSOCK_HEADER_LEN (sizeof(struct vmpacket_descriptor) + \ ++ sizeof(struct vmpipe_proto_header)) ++ ++/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */ ++#define PREV_INDICES_LEN (sizeof(u64)) + ++#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ ++ ALIGN((payload_len), 8) + \ ++ PREV_INDICES_LEN) + #endif /* _HYPERV_H */ +diff --git a/include/linux/socket.h b/include/linux/socket.h +index b5cc5a6..0b68b58 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -202,8 +202,9 @@ struct 
ucred { + #define AF_VSOCK 40 /* vSockets */ + #define AF_KCM 41 /* Kernel Connection Multiplexor*/ + #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ ++#define AF_HYPERV 43 /* Hyper-V Sockets */ + +-#define AF_MAX 43 /* For now.. */ ++#define AF_MAX 44 /* For now.. */ + + /* Protocol families, same as address families. */ + #define PF_UNSPEC AF_UNSPEC +@@ -251,6 +252,7 @@ struct ucred { + #define PF_VSOCK AF_VSOCK + #define PF_KCM AF_KCM + #define PF_QIPCRTR AF_QIPCRTR ++#define PF_HYPERV AF_HYPERV + #define PF_MAX AF_MAX + + /* Maximum queue length specifiable by listen. */ +diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h +new file mode 100644 +index 0000000..e7a8a3a +--- /dev/null ++++ b/include/net/af_hvsock.h +@@ -0,0 +1,78 @@ ++#ifndef __AF_HVSOCK_H__ ++#define __AF_HVSOCK_H__ ++ ++#include ++#include ++#include ++ ++/* The host side's design of the feature requires 5 exact 4KB pages for ++ * recv/send rings respectively -- this is suboptimal considering memory ++ * consumption, however unluckily we have to live with it, before the ++ * host comes up with a better design in the future. ++ */ ++#define PAGE_SIZE_4K 4096 ++#define RINGBUFFER_HVSOCK_RCV_SIZE (PAGE_SIZE_4K * 5) ++#define RINGBUFFER_HVSOCK_SND_SIZE (PAGE_SIZE_4K * 5) ++ ++/* The MTU is 16KB per the host side's design. ++ * In future, the buffer can be elimiated when we switch to use the coming ++ * new VMBus ringbuffer "in-place consumption" APIs, by which we can ++ * directly copy data from VMBus ringbuffer into the userspace buffer. ++ */ ++#define HVSOCK_MTU_SIZE (1024 * 16) ++struct hvsock_recv_buf { ++ unsigned int data_len; ++ unsigned int data_offset; ++ ++ struct vmpipe_proto_header hdr; ++ u8 buf[HVSOCK_MTU_SIZE]; ++}; ++ ++/* In the VM, actually we can send up to HVSOCK_MTU_SIZE bytes of payload, ++ * but for now let's use a smaller size to minimize the dynamically-allocated ++ * buffer. 
Note: the buffer can be elimiated in future when we add new VMBus ++ * ringbuffer APIs that allow us to directly copy data from userspace buf to ++ * VMBus ringbuffer. ++ */ ++#define HVSOCK_MAX_SND_SIZE_BY_VM (1024 * 4) ++struct hvsock_send_buf { ++ struct vmpipe_proto_header hdr; ++ u8 buf[HVSOCK_MAX_SND_SIZE_BY_VM]; ++}; ++ ++struct hvsock_sock { ++ /* sk must be the first member. */ ++ struct sock sk; ++ ++ struct sockaddr_hv local_addr; ++ struct sockaddr_hv remote_addr; ++ ++ /* protected by the global hvsock_mutex */ ++ struct list_head bound_list; ++ struct list_head connected_list; ++ ++ struct list_head accept_queue; ++ /* used by enqueue and dequeue */ ++ struct mutex accept_queue_mutex; ++ ++ struct delayed_work dwork; ++ ++ u32 peer_shutdown; ++ ++ struct vmbus_channel *channel; ++ ++ struct hvsock_send_buf *send; ++ struct hvsock_recv_buf *recv; ++}; ++ ++static inline struct hvsock_sock *sk_to_hvsock(struct sock *sk) ++{ ++ return (struct hvsock_sock *)sk; ++} ++ ++static inline struct sock *hvsock_to_sk(struct hvsock_sock *hvsk) ++{ ++ return (struct sock *)hvsk; ++} ++ ++#endif /* __AF_HVSOCK_H__ */ +diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h +index e347b24..eb3e44b 100644 +--- a/include/uapi/linux/hyperv.h ++++ b/include/uapi/linux/hyperv.h +@@ -26,6 +26,7 @@ + #define _UAPI_HYPERV_H + + #include ++#include + + /* + * Framework version for util services. +@@ -396,4 +397,26 @@ struct hv_kvp_ip_msg { + struct hv_kvp_ipaddr_value kvp_ip_val; + } __attribute__((packed)); + ++/* This is the address format of Hyper-V Sockets. ++ * Note: here we just borrow the kernel's built-in type uuid_le. When ++ * an application calls bind() or connect(), the 2 members of struct ++ * sockaddr_hv must be of GUID. ++ * The GUID format differs from the UUID format only in the byte order of ++ * the first 3 fields. 
Refer to: ++ * https://en.wikipedia.org/wiki/Globally_unique_identifier ++ */ ++struct sockaddr_hv { ++ __kernel_sa_family_t shv_family; /* Address family */ ++ u16 reserved; /* Must be Zero */ ++ uuid_le shv_vm_guid; /* VM ID */ ++ uuid_le shv_service_guid; /* Service ID */ ++}; ++ ++#define SHV_VMID_GUEST NULL_UUID_LE ++#define SHV_VMID_HOST NULL_UUID_LE ++ ++#define SHV_SERVICE_ID_ANY NULL_UUID_LE ++ ++#define SHV_PROTO_RAW 1 ++ + #endif /* _UAPI_HYPERV_H */ +diff --git a/net/Kconfig b/net/Kconfig +index c2cdbce..921e86f 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -231,6 +231,7 @@ source "net/dns_resolver/Kconfig" + source "net/batman-adv/Kconfig" + source "net/openvswitch/Kconfig" + source "net/vmw_vsock/Kconfig" ++source "net/hv_sock/Kconfig" + source "net/netlink/Kconfig" + source "net/mpls/Kconfig" + source "net/hsr/Kconfig" +diff --git a/net/Makefile b/net/Makefile +index 9bd20bb..b4d4e9a 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -70,6 +70,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ + obj-$(CONFIG_NFC) += nfc/ + obj-$(CONFIG_OPENVSWITCH) += openvswitch/ + obj-$(CONFIG_VSOCKETS) += vmw_vsock/ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock/ + obj-$(CONFIG_MPLS) += mpls/ + obj-$(CONFIG_HSR) += hsr/ + ifneq ($(CONFIG_NET_SWITCHDEV),) +diff --git a/net/hv_sock/Kconfig b/net/hv_sock/Kconfig +new file mode 100644 +index 0000000..ff84875 +--- /dev/null ++++ b/net/hv_sock/Kconfig +@@ -0,0 +1,10 @@ ++config HYPERV_SOCK ++ tristate "Hyper-V Sockets" ++ depends on HYPERV ++ default m if HYPERV ++ help ++ Hyper-V Sockets is a socket interface for high speed ++ communication between Linux guest and Hyper-V host over VMBus. ++ ++ To compile this driver as a module, choose M here: the module ++ will be called hv_sock. 
+diff --git a/net/hv_sock/Makefile b/net/hv_sock/Makefile +new file mode 100644 +index 0000000..716c012 +--- /dev/null ++++ b/net/hv_sock/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock.o ++ ++hv_sock-y += af_hvsock.o +diff --git a/net/hv_sock/af_hvsock.c b/net/hv_sock/af_hvsock.c +new file mode 100644 +index 0000000..331d375 +--- /dev/null ++++ b/net/hv_sock/af_hvsock.c +@@ -0,0 +1,1507 @@ ++/* ++ * Hyper-V Sockets -- a socket-based communication channel between the ++ * Hyper-V host and the virtual machines running on it. ++ * ++ * Copyright (c) 2016 Microsoft Corporation. ++ * ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * Alternatively, this software may be distributed under the terms of the ++ * GNU General Public License ("GPL") version 2 as published by the Free ++ * Software Foundation. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, ++ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING ++ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ * POSSIBILITY OF SUCH DAMAGE. ++ */ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++ ++static struct proto hvsock_proto = { ++ .name = "HV_SOCK", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct hvsock_sock), ++}; ++ ++#define SS_LISTEN 255 ++ ++#define HVSOCK_CONNECT_TIMEOUT (30 * HZ) ++ ++/* This is an artificial limit */ ++#define HVSOCK_MAX_BACKLOG 128 ++ ++static LIST_HEAD(hvsock_bound_list); ++static LIST_HEAD(hvsock_connected_list); ++static DEFINE_MUTEX(hvsock_mutex); ++ ++static struct sock *hvsock_find_bound_socket(const struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_bound_list, bound_list) { ++ if (!uuid_le_cmp(addr->shv_service_guid, ++ hvsk->local_addr.shv_service_guid)) ++ return hvsock_to_sk(hvsk); ++ } ++ return NULL; ++} ++ ++static struct sock *hvsock_find_connected_socket_by_channel( ++ const struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_connected_list, connected_list) { ++ if (hvsk->channel == channel) ++ return hvsock_to_sk(hvsk); ++ } ++ return NULL; ++} ++ ++static void hvsock_enqueue_accept(struct sock *listener, ++ struct sock *connected) ++{ ++ struct hvsock_sock *hvconnected; ++ struct hvsock_sock *hvlistener; ++ ++ hvlistener = sk_to_hvsock(listener); ++ hvconnected = sk_to_hvsock(connected); ++ ++ sock_hold(connected); ++ sock_hold(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); 
++ list_add_tail(&hvconnected->accept_queue, &hvlistener->accept_queue); ++ listener->sk_ack_backlog++; ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++} ++ ++static struct sock *hvsock_dequeue_accept(struct sock *listener) ++{ ++ struct hvsock_sock *hvconnected; ++ struct hvsock_sock *hvlistener; ++ ++ hvlistener = sk_to_hvsock(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); ++ ++ if (list_empty(&hvlistener->accept_queue)) { ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ return NULL; ++ } ++ ++ hvconnected = list_entry(hvlistener->accept_queue.next, ++ struct hvsock_sock, accept_queue); ++ ++ list_del_init(&hvconnected->accept_queue); ++ listener->sk_ack_backlog--; ++ ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ ++ sock_put(listener); ++ /* The caller will need a reference on the connected socket so we let ++ * it call sock_put(). ++ */ ++ ++ return hvsock_to_sk(hvconnected); ++} ++ ++static bool hvsock_is_accept_queue_empty(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ mutex_lock(&hvsk->accept_queue_mutex); ++ ret = list_empty(&hvsk->accept_queue); ++ mutex_unlock(&hvsk->accept_queue_mutex); ++ ++ return ret; ++} ++ ++static void hvsock_addr_init(struct sockaddr_hv *addr, uuid_le service_id) ++{ ++ memset(addr, 0, sizeof(*addr)); ++ addr->shv_family = AF_HYPERV; ++ addr->shv_service_guid = service_id; ++} ++ ++static int hvsock_addr_validate(const struct sockaddr_hv *addr) ++{ ++ if (!addr) ++ return -EFAULT; ++ ++ if (addr->shv_family != AF_HYPERV) ++ return -EAFNOSUPPORT; ++ ++ if (addr->reserved != 0) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool hvsock_addr_bound(const struct sockaddr_hv *addr) ++{ ++ return !!uuid_le_cmp(addr->shv_service_guid, SHV_SERVICE_ID_ANY); ++} ++ ++static int hvsock_addr_cast(const struct sockaddr *addr, size_t len, ++ struct sockaddr_hv **out_addr) ++{ ++ if (len < sizeof(**out_addr)) ++ return -EFAULT; ++ ++ *out_addr = (struct sockaddr_hv *)addr; 
++ return hvsock_addr_validate(*out_addr); ++} ++ ++static int __hvsock_do_bind(struct hvsock_sock *hvsk, ++ struct sockaddr_hv *addr) ++{ ++ struct sockaddr_hv hv_addr; ++ int ret = 0; ++ ++ hvsock_addr_init(&hv_addr, addr->shv_service_guid); ++ ++ mutex_lock(&hvsock_mutex); ++ ++ if (!uuid_le_cmp(addr->shv_service_guid, SHV_SERVICE_ID_ANY)) { ++ do { ++ uuid_le_gen(&hv_addr.shv_service_guid); ++ } while (hvsock_find_bound_socket(&hv_addr)); ++ } else { ++ if (hvsock_find_bound_socket(&hv_addr)) { ++ ret = -EADDRINUSE; ++ goto out; ++ } ++ } ++ ++ hvsock_addr_init(&hvsk->local_addr, hv_addr.shv_service_guid); ++ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->bound_list, &hvsock_bound_list); ++out: ++ mutex_unlock(&hvsock_mutex); ++ ++ return ret; ++} ++ ++static int __hvsock_bind(struct sock *sk, struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return -EINVAL; ++ ++ switch (sk->sk_socket->type) { ++ case SOCK_STREAM: ++ ret = __hvsock_do_bind(hvsk, addr); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++/* Autobind this socket to the local address if necessary. 
*/ ++static int hvsock_auto_bind(struct hvsock_sock *hvsk) ++{ ++ struct sock *sk = hvsock_to_sk(hvsk); ++ struct sockaddr_hv local_addr; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return 0; ++ hvsock_addr_init(&local_addr, SHV_SERVICE_ID_ANY); ++ return __hvsock_bind(sk, &local_addr); ++} ++ ++static void hvsock_sk_destruct(struct sock *sk) ++{ ++ struct vmbus_channel *channel; ++ struct hvsock_sock *hvsk; ++ ++ hvsk = sk_to_hvsock(sk); ++ vfree(hvsk->send); ++ vfree(hvsk->recv); ++ ++ channel = hvsk->channel; ++ if (!channel) ++ return; ++ ++ vmbus_hvsock_device_unregister(channel); ++} ++ ++static void __hvsock_release(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *pending; ++ ++ hvsk = sk_to_hvsock(sk); ++ ++ mutex_lock(&hvsock_mutex); ++ ++ if (!list_empty(&hvsk->bound_list)) { ++ list_del_init(&hvsk->bound_list); ++ sock_put(&hvsk->sk); ++ } ++ ++ if (!list_empty(&hvsk->connected_list)) { ++ list_del_init(&hvsk->connected_list); ++ sock_put(&hvsk->sk); ++ } ++ ++ mutex_unlock(&hvsock_mutex); ++ ++ lock_sock(sk); ++ sock_orphan(sk); ++ sk->sk_shutdown = SHUTDOWN_MASK; ++ ++ /* Clean up any sockets that never were accepted. */ ++ while ((pending = hvsock_dequeue_accept(sk)) != NULL) { ++ __hvsock_release(pending); ++ sock_put(pending); ++ } ++ ++ release_sock(sk); ++ sock_put(sk); ++} ++ ++static int hvsock_release(struct socket *sock) ++{ ++ /* If accept() is interrupted by a signal, the temporary socket ++ * struct's sock->sk is NULL. 
++ */ ++ if (sock->sk) { ++ __hvsock_release(sock->sk); ++ sock->sk = NULL; ++ } ++ ++ sock->state = SS_FREE; ++ return 0; ++} ++ ++static struct sock *hvsock_create(struct net *net, struct socket *sock, ++ gfp_t priority, unsigned short type) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ sk = sk_alloc(net, AF_HYPERV, priority, &hvsock_proto, 0); ++ if (!sk) ++ return NULL; ++ ++ sock_init_data(sock, sk); ++ ++ /* sk->sk_type is normally set in sock_init_data, but only if sock ++ * is non-NULL. We make sure that our sockets always have a type by ++ * setting it here if needed. ++ */ ++ if (!sock) ++ sk->sk_type = type; ++ ++ sk->sk_destruct = hvsock_sk_destruct; ++ ++ /* Looks stream-based socket doesn't need this. */ ++ sk->sk_backlog_rcv = NULL; ++ ++ sk->sk_state = 0; ++ sock_reset_flag(sk, SOCK_DONE); ++ ++ hvsk = sk_to_hvsock(sk); ++ ++ hvsk->send = NULL; ++ hvsk->recv = NULL; ++ ++ hvsock_addr_init(&hvsk->local_addr, SHV_SERVICE_ID_ANY); ++ hvsock_addr_init(&hvsk->remote_addr, SHV_SERVICE_ID_ANY); ++ ++ INIT_LIST_HEAD(&hvsk->bound_list); ++ INIT_LIST_HEAD(&hvsk->connected_list); ++ ++ INIT_LIST_HEAD(&hvsk->accept_queue); ++ mutex_init(&hvsk->accept_queue_mutex); ++ ++ hvsk->peer_shutdown = 0; ++ ++ return sk; ++} ++ ++static int hvsock_bind(struct socket *sock, struct sockaddr *addr, ++ int addr_len) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ ++ if (hvsock_addr_cast(addr, addr_len, &hv_addr) != 0) ++ return -EINVAL; ++ ++ if (uuid_le_cmp(hv_addr->shv_vm_guid, NULL_UUID_LE)) ++ return -EINVAL; ++ ++ lock_sock(sk); ++ ret = __hvsock_bind(sk, hv_addr); ++ release_sock(sk); ++ ++ return ret; ++} ++ ++static int hvsock_getname(struct socket *sock, ++ struct sockaddr *addr, int *addr_len, int peer) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ret = 0; ++ ++ lock_sock(sk); ++ ++ if (peer) { ++ if 
(sock->state != SS_CONNECTED) { ++ ret = -ENOTCONN; ++ goto out; ++ } ++ hv_addr = &hvsk->remote_addr; ++ } else { ++ hv_addr = &hvsk->local_addr; ++ } ++ ++ __sockaddr_check_size(sizeof(*hv_addr)); ++ ++ memcpy(addr, hv_addr, sizeof(*hv_addr)); ++ *addr_len = sizeof(*hv_addr); ++ ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static void get_ringbuffer_rw_status(struct vmbus_channel *channel, ++ bool *can_read, bool *can_write) ++{ ++ u32 avl_read_bytes, avl_write_bytes, dummy; ++ ++ if (can_read) { ++ hv_get_ringbuffer_availbytes(&channel->inbound, ++ &avl_read_bytes, ++ &dummy); ++ /* 0-size payload means FIN */ ++ *can_read = avl_read_bytes >= HVSOCK_PKT_LEN(0); ++ } ++ ++ if (can_write) { ++ hv_get_ringbuffer_availbytes(&channel->outbound, ++ &dummy, ++ &avl_write_bytes); ++ ++ /* We only write if there is enough space */ ++ *can_write = avl_write_bytes > HVSOCK_PKT_LEN(PAGE_SIZE_4K); ++ } ++} ++ ++static size_t get_ringbuffer_writable_bytes(struct vmbus_channel *channel) ++{ ++ u32 avl_write_bytes, dummy; ++ size_t ret; ++ ++ hv_get_ringbuffer_availbytes(&channel->outbound, ++ &dummy, ++ &avl_write_bytes); ++ ++ /* The ringbuffer mustn't be 100% full, and we should reserve a ++ * zero-length-payload packet for the FIN: see hv_ringbuffer_write() ++ * and hvsock_shutdown(). ++ */ ++ if (avl_write_bytes < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) ++ return 0; ++ ret = avl_write_bytes - HVSOCK_PKT_LEN(1) - HVSOCK_PKT_LEN(0); ++ ++ return round_down(ret, 8); ++} ++ ++static int hvsock_get_send_buf(struct hvsock_sock *hvsk) ++{ ++ hvsk->send = vmalloc(sizeof(*hvsk->send)); ++ return hvsk->send ? 
0 : -ENOMEM; ++} ++ ++static void hvsock_put_send_buf(struct hvsock_sock *hvsk) ++{ ++ vfree(hvsk->send); ++ hvsk->send = NULL; ++} ++ ++static int hvsock_send_data(struct vmbus_channel *channel, ++ struct hvsock_sock *hvsk, ++ size_t to_write) ++{ ++ hvsk->send->hdr.pkt_type = 1; ++ hvsk->send->hdr.data_size = to_write; ++ return vmbus_sendpacket(channel, &hvsk->send->hdr, ++ sizeof(hvsk->send->hdr) + to_write, ++ 0, VM_PKT_DATA_INBAND, 0); ++} ++ ++static int hvsock_get_recv_buf(struct hvsock_sock *hvsk) ++{ ++ hvsk->recv = vmalloc(sizeof(*hvsk->recv)); ++ return hvsk->recv ? 0 : -ENOMEM; ++} ++ ++static void hvsock_put_recv_buf(struct hvsock_sock *hvsk) ++{ ++ vfree(hvsk->recv); ++ hvsk->recv = NULL; ++} ++ ++static int hvsock_recv_data(struct vmbus_channel *channel, ++ struct hvsock_sock *hvsk, ++ size_t *payload_len) ++{ ++ u32 buffer_actual_len; ++ u64 dummy_req_id; ++ int ret; ++ ++ ret = vmbus_recvpacket(channel, &hvsk->recv->hdr, ++ sizeof(hvsk->recv->hdr) + ++ sizeof(hvsk->recv->buf), ++ &buffer_actual_len, &dummy_req_id); ++ if (ret != 0 || buffer_actual_len <= sizeof(hvsk->recv->hdr)) ++ *payload_len = 0; ++ else ++ *payload_len = hvsk->recv->hdr.data_size; ++ ++ return ret; ++} ++ ++static int hvsock_shutdown(struct socket *sock, int mode) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ if (mode < SHUT_RD || mode > SHUT_RDWR) ++ return -EINVAL; ++ /* This maps: ++ * SHUT_RD (0) -> RCV_SHUTDOWN (1) ++ * SHUT_WR (1) -> SEND_SHUTDOWN (2) ++ * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) ++ */ ++ ++mode; ++ ++ if (sock->state != SS_CONNECTED) ++ return -ENOTCONN; ++ ++ sock->state = SS_DISCONNECTING; ++ ++ sk = sock->sk; ++ ++ lock_sock(sk); ++ ++ sk->sk_shutdown |= mode; ++ sk->sk_state_change(sk); ++ ++ if (mode & SEND_SHUTDOWN) { ++ hvsk = sk_to_hvsock(sk); ++ ++ ret = hvsock_get_send_buf(hvsk); ++ if (ret < 0) ++ goto out; ++ ++ /* It can't fail: see get_ringbuffer_writable_bytes(). 
*/ ++ (void)hvsock_send_data(hvsk->channel, hvsk, 0); ++ ++ hvsock_put_send_buf(hvsk); ++ } ++ ++out: ++ release_sock(sk); ++ ++ return ret; ++} ++ ++static unsigned int hvsock_poll(struct file *file, struct socket *sock, ++ poll_table *wait) ++{ ++ struct vmbus_channel *channel; ++ bool can_read, can_write; ++ struct hvsock_sock *hvsk; ++ unsigned int mask; ++ struct sock *sk; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ++ poll_wait(file, sk_sleep(sk), wait); ++ mask = 0; ++ ++ if (sk->sk_err) ++ /* Signify that there has been an error on this socket. */ ++ mask |= POLLERR; ++ ++ /* INET sockets treat local write shutdown and peer write shutdown as a ++ * case of POLLHUP set. ++ */ ++ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ++ ((sk->sk_shutdown & SEND_SHUTDOWN) && ++ (hvsk->peer_shutdown & SEND_SHUTDOWN))) { ++ mask |= POLLHUP; ++ } ++ ++ if (sk->sk_shutdown & RCV_SHUTDOWN || ++ hvsk->peer_shutdown & SEND_SHUTDOWN) { ++ mask |= POLLRDHUP; ++ } ++ ++ lock_sock(sk); ++ ++ /* Listening sockets that have connections in their accept ++ * queue can be read. ++ */ ++ if (sk->sk_state == SS_LISTEN && !hvsock_is_accept_queue_empty(sk)) ++ mask |= POLLIN | POLLRDNORM; ++ ++ /* The mutex is to against hvsock_open_connection() */ ++ mutex_lock(&hvsock_mutex); ++ ++ channel = hvsk->channel; ++ if (channel) { ++ /* If there is something in the queue then we can read */ ++ get_ringbuffer_rw_status(channel, &can_read, &can_write); ++ ++ if (!can_read && hvsk->recv) ++ can_read = true; ++ ++ if (!(sk->sk_shutdown & RCV_SHUTDOWN) && can_read) ++ mask |= POLLIN | POLLRDNORM; ++ } else { ++ can_write = false; ++ } ++ ++ mutex_unlock(&hvsock_mutex); ++ ++ /* Sockets whose connections have been closed terminated should ++ * also be considered read, and we check the shutdown flag for that. ++ */ ++ if (sk->sk_shutdown & RCV_SHUTDOWN || ++ hvsk->peer_shutdown & SEND_SHUTDOWN) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ ++ /* Connected sockets that can produce data can be written. 
*/ ++ if (sk->sk_state == SS_CONNECTED && can_write && ++ !(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ /* Remove POLLWRBAND since INET sockets are not setting it. ++ */ ++ mask |= POLLOUT | POLLWRNORM; ++ } ++ ++ /* Simulate INET socket poll behaviors, which sets ++ * POLLOUT|POLLWRNORM when peer is closed and nothing to read, ++ * but local send is not shutdown. ++ */ ++ if (sk->sk_state == SS_UNCONNECTED && ++ !(sk->sk_shutdown & SEND_SHUTDOWN)) ++ mask |= POLLOUT | POLLWRNORM; ++ ++ release_sock(sk); ++ ++ return mask; ++} ++ ++/* This function runs in the tasklet context of process_chn_event() */ ++static void hvsock_on_channel_cb(void *ctx) ++{ ++ struct sock *sk = (struct sock *)ctx; ++ struct vmbus_channel *channel; ++ struct hvsock_sock *hvsk; ++ bool can_read, can_write; ++ ++ hvsk = sk_to_hvsock(sk); ++ channel = hvsk->channel; ++ BUG_ON(!channel); ++ ++ get_ringbuffer_rw_status(channel, &can_read, &can_write); ++ ++ if (can_read) ++ sk->sk_data_ready(sk); ++ ++ if (can_write) ++ sk->sk_write_space(sk); ++} ++ ++static void hvsock_close_connection(struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ mutex_lock(&hvsock_mutex); ++ ++ sk = hvsock_find_connected_socket_by_channel(channel); ++ ++ /* The guest has already closed the connection? 
*/ ++ if (!sk) ++ goto out; ++ ++ sk->sk_state = SS_UNCONNECTED; ++ sock_set_flag(sk, SOCK_DONE); ++ ++ hvsk = sk_to_hvsock(sk); ++ hvsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; ++ ++ sk->sk_state_change(sk); ++out: ++ mutex_unlock(&hvsock_mutex); ++} ++ ++static int hvsock_open_connection(struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk = NULL, *new_hvsk = NULL; ++ uuid_le *instance, *service_id; ++ unsigned char conn_from_host; ++ struct sockaddr_hv hv_addr; ++ struct sock *sk, *new_sk = NULL; ++ int ret; ++ ++ instance = &channel->offermsg.offer.if_instance; ++ service_id = &channel->offermsg.offer.if_type; ++ ++ /* The first byte != 0 means the host initiated the connection. */ ++ conn_from_host = channel->offermsg.offer.u.pipe.user_def[0]; ++ ++ mutex_lock(&hvsock_mutex); ++ ++ hvsock_addr_init(&hv_addr, conn_from_host ? *service_id : *instance); ++ sk = hvsock_find_bound_socket(&hv_addr); ++ ++ if (!sk || (conn_from_host && sk->sk_state != SS_LISTEN) || ++ (!conn_from_host && sk->sk_state != SS_CONNECTING)) { ++ ret = -ENXIO; ++ goto out; ++ } ++ ++ if (conn_from_host) { ++ if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) { ++ ret = -ECONNREFUSED; ++ goto out; ++ } ++ ++ new_sk = hvsock_create(sock_net(sk), NULL, GFP_KERNEL, ++ sk->sk_type); ++ if (!new_sk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ new_sk->sk_state = SS_CONNECTING; ++ new_hvsk = sk_to_hvsock(new_sk); ++ new_hvsk->channel = channel; ++ hvsock_addr_init(&new_hvsk->local_addr, *service_id); ++ hvsock_addr_init(&new_hvsk->remote_addr, *instance); ++ } else { ++ hvsk = sk_to_hvsock(sk); ++ hvsk->channel = channel; ++ } ++ ++ set_channel_read_state(channel, false); ++ ret = vmbus_open(channel, RINGBUFFER_HVSOCK_SND_SIZE, ++ RINGBUFFER_HVSOCK_RCV_SIZE, NULL, 0, ++ hvsock_on_channel_cb, conn_from_host ? 
new_sk : sk); ++ if (ret != 0) { ++ if (conn_from_host) { ++ new_hvsk->channel = NULL; ++ sock_put(new_sk); ++ } else { ++ hvsk->channel = NULL; ++ } ++ goto out; ++ } ++ ++ vmbus_set_chn_rescind_callback(channel, hvsock_close_connection); ++ ++ /* see get_ringbuffer_rw_status() */ ++ set_channel_pending_send_size(channel, ++ HVSOCK_PKT_LEN(PAGE_SIZE_4K) + 1); ++ ++ if (conn_from_host) { ++ new_sk->sk_state = SS_CONNECTED; ++ ++ sock_hold(&new_hvsk->sk); ++ list_add(&new_hvsk->connected_list, &hvsock_connected_list); ++ ++ hvsock_enqueue_accept(sk, new_sk); ++ } else { ++ sk->sk_state = SS_CONNECTED; ++ sk->sk_socket->state = SS_CONNECTED; ++ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->connected_list, &hvsock_connected_list); ++ } ++ ++ sk->sk_state_change(sk); ++out: ++ mutex_unlock(&hvsock_mutex); ++ return ret; ++} ++ ++static void hvsock_connect_timeout(struct work_struct *work) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ hvsk = container_of(work, struct hvsock_sock, dwork.work); ++ sk = hvsock_to_sk(hvsk); ++ ++ lock_sock(sk); ++ if ((sk->sk_state == SS_CONNECTING) && ++ (sk->sk_shutdown != SHUTDOWN_MASK)) { ++ sk->sk_state = SS_UNCONNECTED; ++ sk->sk_err = ETIMEDOUT; ++ sk->sk_error_report(sk); ++ } ++ release_sock(sk); ++ ++ sock_put(sk); ++} ++ ++static int hvsock_connect_wait(struct socket *sock, ++ int flags, int current_ret) ++{ ++ struct sock *sk = sock->sk; ++ struct hvsock_sock *hvsk; ++ int ret = current_ret; ++ DEFINE_WAIT(wait); ++ long timeout; ++ ++ hvsk = sk_to_hvsock(sk); ++ timeout = HVSOCK_CONNECT_TIMEOUT; ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ ++ while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { ++ if (flags & O_NONBLOCK) { ++ /* If we're not going to block, we schedule a timeout ++ * function to generate a timeout on the connection ++ * attempt, in case the peer doesn't respond in a ++ * timely manner. We hold on to the socket until the ++ * timeout fires. 
++ */ ++ sock_hold(sk); ++ INIT_DELAYED_WORK(&hvsk->dwork, ++ hvsock_connect_timeout); ++ schedule_delayed_work(&hvsk->dwork, timeout); ++ ++ /* Skip ahead to preserve error code set above. */ ++ goto out_wait; ++ } ++ ++ release_sock(sk); ++ timeout = schedule_timeout(timeout); ++ lock_sock(sk); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait_error; ++ } else if (timeout == 0) { ++ ret = -ETIMEDOUT; ++ goto out_wait_error; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ } ++ ++ ret = sk->sk_err ? -sk->sk_err : 0; ++ ++out_wait_error: ++ if (ret < 0) { ++ sk->sk_state = SS_UNCONNECTED; ++ sock->state = SS_UNCONNECTED; ++ } ++out_wait: ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_connect(struct socket *sock, struct sockaddr *addr, ++ int addr_len, int flags) ++{ ++ struct sockaddr_hv *remote_addr; ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ++ lock_sock(sk); ++ ++ switch (sock->state) { ++ case SS_CONNECTED: ++ ret = -EISCONN; ++ goto out; ++ case SS_DISCONNECTING: ++ ret = -EINVAL; ++ goto out; ++ case SS_CONNECTING: ++ /* This continues on so we can move sock into the SS_CONNECTED ++ * state once the connection has completed (at which point err ++ * will be set to zero also). Otherwise, we will either wait ++ * for the connection or return -EALREADY should this be a ++ * non-blocking call. ++ */ ++ ret = -EALREADY; ++ break; ++ default: ++ if ((sk->sk_state == SS_LISTEN) || ++ hvsock_addr_cast(addr, addr_len, &remote_addr) != 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* Set the remote address that we are connecting to. 
*/ ++ memcpy(&hvsk->remote_addr, remote_addr, ++ sizeof(hvsk->remote_addr)); ++ ++ ret = hvsock_auto_bind(hvsk); ++ if (ret) ++ goto out; ++ ++ sk->sk_state = SS_CONNECTING; ++ ++ ret = vmbus_send_tl_connect_request( ++ &hvsk->local_addr.shv_service_guid, ++ &hvsk->remote_addr.shv_service_guid); ++ if (ret < 0) ++ goto out; ++ ++ /* Mark sock as connecting and set the error code to in ++ * progress in case this is a non-blocking connect. ++ */ ++ sock->state = SS_CONNECTING; ++ ret = -EINPROGRESS; ++ } ++ ++ ret = hvsock_connect_wait(sock, flags, ret); ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static int hvsock_accept_wait(struct sock *listener, ++ struct socket *newsock, int flags) ++{ ++ struct hvsock_sock *hvconnected; ++ struct sock *connected; ++ ++ DEFINE_WAIT(wait); ++ long timeout; ++ ++ int ret = 0; ++ ++ /* Wait for children sockets to appear; these are the new sockets ++ * created upon connection establishment. ++ */ ++ timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); ++ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); ++ ++ while ((connected = hvsock_dequeue_accept(listener)) == NULL && ++ listener->sk_err == 0) { ++ release_sock(listener); ++ timeout = schedule_timeout(timeout); ++ lock_sock(listener); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); ++ } ++ ++ if (listener->sk_err) ++ ret = -listener->sk_err; ++ ++ if (connected) { ++ lock_sock(connected); ++ hvconnected = sk_to_hvsock(connected); ++ ++ if (!ret) { ++ newsock->state = SS_CONNECTED; ++ sock_graft(connected, newsock); ++ } ++ release_sock(connected); ++ sock_put(connected); ++ } ++ ++out_wait: ++ finish_wait(sk_sleep(listener), &wait); ++ return ret; ++} ++ ++static int hvsock_accept(struct socket *sock, struct socket *newsock, ++ int flags) ++{ ++ struct sock *listener; ++ int 
ret; ++ ++ listener = sock->sk; ++ ++ lock_sock(listener); ++ ++ if (sock->type != SOCK_STREAM) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (listener->sk_state != SS_LISTEN) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = hvsock_accept_wait(listener, newsock, flags); ++out: ++ release_sock(listener); ++ return ret; ++} ++ ++static int hvsock_listen(struct socket *sock, int backlog) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ sk = sock->sk; ++ lock_sock(sk); ++ ++ if (sock->type != SOCK_STREAM) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (sock->state != SS_UNCONNECTED) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (backlog <= 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (backlog > HVSOCK_MAX_BACKLOG) ++ backlog = HVSOCK_MAX_BACKLOG; ++ ++ hvsk = sk_to_hvsock(sk); ++ if (!hvsock_addr_bound(&hvsk->local_addr)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ sk->sk_ack_backlog = 0; ++ sk->sk_max_ack_backlog = backlog; ++ sk->sk_state = SS_LISTEN; ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static int hvsock_sendmsg_wait(struct sock *sk, struct msghdr *msg, ++ size_t len) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ struct vmbus_channel *channel; ++ size_t total_to_write = len; ++ size_t total_written = 0; ++ DEFINE_WAIT(wait); ++ bool can_write; ++ long timeout; ++ int ret = -EIO; ++ ++ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ channel = hvsk->channel; ++ ++ while (total_to_write > 0) { ++ size_t to_write, max_writable; ++ ++ while (1) { ++ get_ringbuffer_rw_status(channel, NULL, &can_write); ++ ++ if (can_write || sk->sk_err != 0 || ++ (sk->sk_shutdown & SEND_SHUTDOWN) || ++ (hvsk->peer_shutdown & RCV_SHUTDOWN)) ++ break; ++ ++ /* Don't wait for non-blocking sockets. 
*/ ++ if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ release_sock(sk); ++ ++ timeout = schedule_timeout(timeout); ++ ++ lock_sock(sk); ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, ++ TASK_INTERRUPTIBLE); ++ } ++ ++ /* These checks occur both as part of and after the loop ++ * conditional since we need to check before and after ++ * sleeping. ++ */ ++ if (sk->sk_err) { ++ ret = -sk->sk_err; ++ goto out_wait; ++ } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || ++ (hvsk->peer_shutdown & RCV_SHUTDOWN)) { ++ ret = -EPIPE; ++ goto out_wait; ++ } ++ ++ /* Note: that write will only write as many bytes as possible ++ * in the ringbuffer. It is the caller's responsibility to ++ * check how many bytes we actually wrote. ++ */ ++ do { ++ max_writable = get_ringbuffer_writable_bytes(channel); ++ if (max_writable == 0) ++ goto out_wait; ++ ++ to_write = min_t(size_t, sizeof(hvsk->send->buf), ++ total_to_write); ++ if (to_write > max_writable) ++ to_write = max_writable; ++ ++ ret = hvsock_get_send_buf(hvsk); ++ if (ret < 0) ++ goto out_wait; ++ ++ ret = memcpy_from_msg(hvsk->send->buf, msg, to_write); ++ if (ret != 0) { ++ hvsock_put_send_buf(hvsk); ++ goto out_wait; ++ } ++ ++ ret = hvsock_send_data(channel, hvsk, to_write); ++ hvsock_put_send_buf(hvsk); ++ if (ret != 0) ++ goto out_wait; ++ ++ total_written += to_write; ++ total_to_write -= to_write; ++ } while (total_to_write > 0); ++ } ++ ++out_wait: ++ if (total_written > 0) ++ ret = total_written; ++ ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_sendmsg(struct socket *sock, struct msghdr *msg, ++ size_t len) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret; ++ ++ if (len == 0) ++ return -EINVAL; ++ ++ if (msg->msg_flags & ~MSG_DONTWAIT) ++ return -EOPNOTSUPP; ++ ++ sk = sock->sk; ++ hvsk = 
sk_to_hvsock(sk); ++ ++ lock_sock(sk); ++ ++ /* Callers should not provide a destination with stream sockets. */ ++ if (msg->msg_namelen) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ /* Send data only if both sides are not shutdown in the direction. */ ++ if (sk->sk_shutdown & SEND_SHUTDOWN || ++ hvsk->peer_shutdown & RCV_SHUTDOWN) { ++ ret = -EPIPE; ++ goto out; ++ } ++ ++ if (sk->sk_state != SS_CONNECTED || ++ !hvsock_addr_bound(&hvsk->local_addr)) { ++ ret = -ENOTCONN; ++ goto out; ++ } ++ ++ if (!hvsock_addr_bound(&hvsk->remote_addr)) { ++ ret = -EDESTADDRREQ; ++ goto out; ++ } ++ ++ ret = hvsock_sendmsg_wait(sk, msg, len); ++out: ++ release_sock(sk); ++ ++ /* ret should be a bigger-than-0 total_written or a negative err ++ * code. ++ */ ++ BUG_ON(ret == 0); ++ ++ return ret; ++} ++ ++static int hvsock_recvmsg_wait(struct sock *sk, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ size_t to_read, total_to_read = len; ++ struct vmbus_channel *channel; ++ DEFINE_WAIT(wait); ++ size_t copied = 0; ++ bool can_read; ++ long timeout; ++ int ret = 0; ++ ++ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ channel = hvsk->channel; ++ ++ while (1) { ++ bool need_refill = !hvsk->recv; ++ ++ if (need_refill) { ++ if (hvsk->peer_shutdown & SEND_SHUTDOWN) ++ can_read = false; ++ else ++ get_ringbuffer_rw_status(channel, &can_read, ++ NULL); ++ } else { ++ can_read = true; ++ } ++ ++ if (can_read) { ++ size_t payload_len; ++ ++ if (need_refill) { ++ ret = hvsock_get_recv_buf(hvsk); ++ if (ret < 0) { ++ if (copied > 0) ++ ret = copied; ++ goto out_wait; ++ } ++ ++ ret = hvsock_recv_data(channel, hvsk, ++ &payload_len); ++ if (ret != 0 || ++ payload_len > sizeof(hvsk->recv->buf)) { ++ ret = -EIO; ++ hvsock_put_recv_buf(hvsk); ++ goto out_wait; ++ } ++ ++ if (payload_len == 0) { ++ ret = copied; ++ hvsock_put_recv_buf(hvsk); ++ hvsk->peer_shutdown |= 
SEND_SHUTDOWN; ++ break; ++ } ++ ++ hvsk->recv->data_len = payload_len; ++ hvsk->recv->data_offset = 0; ++ } ++ ++ to_read = min_t(size_t, total_to_read, ++ hvsk->recv->data_len); ++ ++ ret = memcpy_to_msg(msg, hvsk->recv->buf + ++ hvsk->recv->data_offset, ++ to_read); ++ if (ret != 0) ++ break; ++ ++ copied += to_read; ++ total_to_read -= to_read; ++ ++ hvsk->recv->data_len -= to_read; ++ ++ if (hvsk->recv->data_len == 0) ++ hvsock_put_recv_buf(hvsk); ++ else ++ hvsk->recv->data_offset += to_read; ++ ++ if (total_to_read == 0) ++ break; ++ } else { ++ if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || ++ (hvsk->peer_shutdown & SEND_SHUTDOWN)) ++ break; ++ ++ /* Don't wait for non-blocking sockets. */ ++ if (timeout == 0) { ++ ret = -EAGAIN; ++ break; ++ } ++ ++ if (copied > 0) ++ break; ++ ++ release_sock(sk); ++ timeout = schedule_timeout(timeout); ++ lock_sock(sk); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ break; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ break; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, ++ TASK_INTERRUPTIBLE); ++ } ++ } ++ ++ if (sk->sk_err) ++ ret = -sk->sk_err; ++ else if (sk->sk_shutdown & RCV_SHUTDOWN) ++ ret = 0; ++ ++ if (copied > 0) ++ ret = copied; ++out_wait: ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_recvmsg(struct socket *sock, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int ret; ++ ++ lock_sock(sk); ++ ++ if (sk->sk_state != SS_CONNECTED) { ++ /* Recvmsg is supposed to return 0 if a peer performs an ++ * orderly shutdown. Differentiate between that case and when a ++ * peer has not connected or a local shutdown occurred with the ++ * SOCK_DONE flag. ++ */ ++ if (sock_flag(sk, SOCK_DONE)) ++ ret = 0; ++ else ++ ret = -ENOTCONN; ++ ++ goto out; ++ } ++ ++ /* We ignore msg->addr_name/len. 
*/ ++ if (flags & ~MSG_DONTWAIT) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ /* We don't check peer_shutdown flag here since peer may actually shut ++ * down, but there can be data in the queue that a local socket can ++ * receive. ++ */ ++ if (sk->sk_shutdown & RCV_SHUTDOWN) { ++ ret = 0; ++ goto out; ++ } ++ ++ /* It is valid on Linux to pass in a zero-length receive buffer. This ++ * is not an error. We may as well bail out now. ++ */ ++ if (!len) { ++ ret = 0; ++ goto out; ++ } ++ ++ ret = hvsock_recvmsg_wait(sk, msg, len, flags); ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static const struct proto_ops hvsock_ops = { ++ .family = PF_HYPERV, ++ .owner = THIS_MODULE, ++ .release = hvsock_release, ++ .bind = hvsock_bind, ++ .connect = hvsock_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = hvsock_accept, ++ .getname = hvsock_getname, ++ .poll = hvsock_poll, ++ .ioctl = sock_no_ioctl, ++ .listen = hvsock_listen, ++ .shutdown = hvsock_shutdown, ++ .setsockopt = sock_no_setsockopt, ++ .getsockopt = sock_no_getsockopt, ++ .sendmsg = hvsock_sendmsg, ++ .recvmsg = hvsock_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = sock_no_sendpage, ++}; ++ ++static int hvsock_create_sock(struct net *net, struct socket *sock, ++ int protocol, int kern) ++{ ++ struct sock *sk; ++ ++ if (protocol != 0 && protocol != SHV_PROTO_RAW) ++ return -EPROTONOSUPPORT; ++ ++ switch (sock->type) { ++ case SOCK_STREAM: ++ sock->ops = &hvsock_ops; ++ break; ++ default: ++ return -ESOCKTNOSUPPORT; ++ } ++ ++ sock->state = SS_UNCONNECTED; ++ ++ sk = hvsock_create(net, sock, GFP_KERNEL, 0); ++ return sk ? 
0 : -ENOMEM; ++} ++ ++static const struct net_proto_family hvsock_family_ops = { ++ .family = AF_HYPERV, ++ .create = hvsock_create_sock, ++ .owner = THIS_MODULE, ++}; ++ ++static int hvsock_probe(struct hv_device *hdev, ++ const struct hv_vmbus_device_id *dev_id) ++{ ++ struct vmbus_channel *channel = hdev->channel; ++ ++ /* We ignore the error return code to suppress the unnecessary ++ * error message in vmbus_probe(): on error the host will rescind ++ * the offer in 30 seconds and we can do cleanup at that time. ++ */ ++ (void)hvsock_open_connection(channel); ++ ++ return 0; ++} ++ ++static int hvsock_remove(struct hv_device *hdev) ++{ ++ struct vmbus_channel *channel = hdev->channel; ++ ++ vmbus_close(channel); ++ ++ return 0; ++} ++ ++/* It's not really used. See vmbus_match() and vmbus_probe(). */ ++static const struct hv_vmbus_device_id id_table[] = { ++ {}, ++}; ++ ++static struct hv_driver hvsock_drv = { ++ .name = "hv_sock", ++ .hvsock = true, ++ .id_table = id_table, ++ .probe = hvsock_probe, ++ .remove = hvsock_remove, ++}; ++ ++static int __init hvsock_init(void) ++{ ++ int ret; ++ ++ if (vmbus_proto_version < VERSION_WIN10) ++ return -ENODEV; ++ ++ ret = vmbus_driver_register(&hvsock_drv); ++ if (ret) { ++ pr_err("failed to register hv_sock driver\n"); ++ return ret; ++ } ++ ++ ret = proto_register(&hvsock_proto, 0); ++ if (ret) { ++ pr_err("failed to register protocol\n"); ++ goto unreg_hvsock_drv; ++ } ++ ++ ret = sock_register(&hvsock_family_ops); ++ if (ret) { ++ pr_err("failed to register address family\n"); ++ goto unreg_proto; ++ } ++ ++ return 0; ++ ++unreg_proto: ++ proto_unregister(&hvsock_proto); ++unreg_hvsock_drv: ++ vmbus_driver_unregister(&hvsock_drv); ++ return ret; ++} ++ ++static void __exit hvsock_exit(void) ++{ ++ sock_unregister(AF_HYPERV); ++ proto_unregister(&hvsock_proto); ++ vmbus_driver_unregister(&hvsock_drv); ++} ++ ++module_init(hvsock_init); ++module_exit(hvsock_exit); ++ ++MODULE_DESCRIPTION("Hyper-V Sockets"); 
++MODULE_LICENSE("Dual BSD/GPL"); +-- +2.10.2 + diff --git a/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch b/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch new file mode 100644 index 000000000..914a2576a --- /dev/null +++ b/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch @@ -0,0 +1,30 @@ +From e8c7a6dee61819c36b77108bc2cddafde26b9876 Mon Sep 17 00:00:00 2001 +From: Rolf Neugebauer +Date: Mon, 23 May 2016 18:55:45 +0100 +Subject: [PATCH 4/5] vmbus: Don't spam the logs with unknown GUIDs + +With Hyper-V sockets device types are introduced on the fly. The pr_info() +then prints a message on every connection, which is way too verbose. Since +there doesn't seem to be an easy way to check for registered services, +disable the pr_info() completely. + +Signed-off-by: Rolf Neugebauer +--- + drivers/hv/channel_mgmt.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 8f4e6070..ef4a512 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -147,7 +147,6 @@ static u16 hv_get_dev_type(const uuid_le *guid) + if (!uuid_le_cmp(*guid, vmbus_devs[i].guid)) + return i; + } +- pr_info("Unknown GUID: %pUl\n", guid); + return i; + } + +-- +2.10.2 +