diff --git a/Makefile b/Makefile index 2b3cdd2a9..c02b9ef0b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ all: $(MAKE) -C alpine +aufs: + $(MAKE) AUFS=true all + alpine/initrd.img: $(MAKE) -C alpine initrd.img @@ -46,15 +49,20 @@ test: Dockerfile.test alpine/initrd-test.img alpine/kernel/x86_64/vmlinuz64 TAG=$(shell git rev-parse HEAD) STATUS=$(shell git status -s) MOBYLINUX_TAG=alpine/mobylinux.tag +ifdef AUFS +AUFS_PREFIX=aufs- +endif +MEDIA_IMAGE=mobylinux/media:$(MEDIA_PREFIX)$(AUFS_PREFIX)$(TAG) +KERNEL_IMAGE=mobylinux/kernel:$(MEDIA_PREFIX)$(AUFS_PREFIX)$(TAG) media: Dockerfile.media alpine/initrd.img alpine/kernel/x86_64/vmlinuz64 alpine/mobylinux-efi.iso ifeq ($(STATUS),) - tar cf - $^ alpine/mobylinux.efi alpine/kernel/x86_64/vmlinux alpine/kernel/x86_64/kernel-headers.tar | docker build -f Dockerfile.media -t mobylinux/media:$(MEDIA_PREFIX)$(TAG) - - docker push mobylinux/media:$(MEDIA_PREFIX)$(TAG) + tar cf - $^ alpine/mobylinux.efi alpine/kernel/x86_64/vmlinux alpine/kernel/x86_64/kernel-headers.tar | docker build -f Dockerfile.media -t $(MEDIA_IMAGE) - + docker push $(MEDIA_IMAGE) [ -f $(MOBYLINUX_TAG) ] - docker tag $(shell cat $(MOBYLINUX_TAG)) mobylinux/mobylinux:$(MEDIA_PREFIX)$(TAG) - docker push mobylinux/mobylinux:$(MEDIA_PREFIX)$(TAG) - tar cf - Dockerfile.kernel alpine/kernel/x86_64/vmlinuz64 | docker build -f Dockerfile.kernel -t mobylinux/kernel:$(MEDIA_PREFIX)$(TAG) - - docker push mobylinux/kernel:$(MEDIA_PREFIX)$(TAG) + docker tag $(shell cat $(MOBYLINUX_TAG)) $(MEDIA_IMAGE) + docker push $(MEDIA_IMAGE) + tar cf - Dockerfile.kernel alpine/kernel/x86_64/vmlinuz64 | docker build -f Dockerfile.kernel -t $(KERNEL_IMAGE) - + docker push $(KERNEL_IMAGE) else $(error "git not clean") endif diff --git a/alpine/kernel/Dockerfile b/alpine/kernel/Dockerfile index 155b1f50b..6ad788b41 100644 --- a/alpine/kernel/Dockerfile +++ b/alpine/kernel/Dockerfile @@ -1,7 +1,7 @@ # Tag: 36aecb5cf4738737634140eec9abebe1f6559a39 FROM mobylinux/alpine-build-c@sha256:d66b9625abc831f28f8c584991a9cb6975e85d3bb3d3768474b592f1cf32a3a6 -ARG KERNEL_VERSION=4.8.14 +ARG KERNEL_VERSION=4.9 ENV KERNEL_SOURCE=https://www.kernel.org/pub/linux/kernel/v4.x/linux-${KERNEL_VERSION}.tar.xz @@ -9,42 +9,6 @@ RUN curl -fsSL -o linux-${KERNEL_VERSION}.tar.xz ${KERNEL_SOURCE} RUN cat linux-${KERNEL_VERSION}.tar.xz | tar --absolute-names -xJ && mv /linux-${KERNEL_VERSION} /linux -# this is aufs4.8 20161010 -ENV AUFS_REPO https://github.com/sfjro/aufs4-standalone -ENV AUFS_BRANCH aufs4.8 -ENV AUFS_COMMIT e9fd128dcb16167417683e199a5feb14f3c9eca8 - -# Download AUFS -RUN git clone -b "$AUFS_BRANCH" "$AUFS_REPO" /aufs && \ - cd /aufs && \ - git checkout -q "$AUFS_COMMIT" - -# aufs-util 20151116 -ENV AUFS_TOOLS_REPO https://github.com/ncopa/aufs-util.git -ENV AUFS_TOOLS_COMMIT 3b7c5e262b53598a8204a915e485489c46d4e7a4 - -# Download aufs tools -RUN git clone ${AUFS_TOOLS_REPO} && \ - cd /aufs-util && \ - git checkout "$AUFS_TOOLS_COMMIT" - -#BUILD -# patch kernel with aufs -RUN cd /linux && \ - cp -r /aufs/Documentation /linux && \ - cp -r /aufs/fs /linux && \ - cp -r /aufs/include/uapi/linux/aufs_type.h /linux/include/uapi/linux/ && \ - set -e && for patch in \ - /aufs/aufs*-kbuild.patch \ - /aufs/aufs*-base.patch \ - /aufs/aufs*-mmap.patch \ - /aufs/aufs*-standalone.patch \ - /aufs/aufs*-loopback.patch \ - /aufs/lockdep-debug.patch \ - ; do \ - patch -p1 < "$patch"; \ - done - COPY kernel_config /linux/arch/x86/configs/x86_64_defconfig COPY kernel_config.debug /linux/debug_config @@ -72,12 +36,4 @@ RUN make INSTALL_MOD_PATH=/tmp/kernel-modules modules_install && \ ( cd /tmp && tar cf /kernel-headers.tar include ) && \ ( cd /tmp/kernel-modules && tar cf /kernel-modules.tar . ) -# Build aufs tools, do this here as they need kernel headers and to match aufs -# Fortunately they are built statically linked -RUN cd /aufs-util && \ - CPPFLAGS="-I/tmp/include" CFLAGS=$CPPFLAGS LDFLAGS=$CPPFLAGS make && \ - DESTDIR=/tmp/aufs-utils make install && \ - rm -rf /tmp/aufs-utils/usr/lib /tmp/aufs-utils/usr/share && \ - cd /tmp/aufs-utils && rm libau* && tar cf /aufs-utils.tar . - -RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\nAUFS_REPO=${AUFS_REPO}\nAUFS_BRANCH=${AUFS_BRANCH}\nAUFS_COMMIT=${AUFS_COMMIT}\nAUFS_TOOLS_REPO=${AUFS_TOOLS_REPO}\nAUFS_TOOLS_COMMIT=${AUFS_TOOLS_COMMIT}\n" > /kernel-source-info +RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\n" > /kernel-source-info diff --git a/alpine/kernel/Dockerfile.aufs b/alpine/kernel/Dockerfile.aufs new file mode 100644 index 000000000..c8adfa6ef --- /dev/null +++ b/alpine/kernel/Dockerfile.aufs @@ -0,0 +1,85 @@ +# Tag: 36aecb5cf4738737634140eec9abebe1f6559a39 +FROM mobylinux/alpine-build-c@sha256:d66b9625abc831f28f8c584991a9cb6975e85d3bb3d3768474b592f1cf32a3a6 + +ARG KERNEL_VERSION=4.8.14 + +ENV KERNEL_SOURCE=https://www.kernel.org/pub/linux/kernel/v4.x/linux-${KERNEL_VERSION}.tar.xz + +RUN curl -fsSL -o linux-${KERNEL_VERSION}.tar.xz ${KERNEL_SOURCE} + +RUN cat linux-${KERNEL_VERSION}.tar.xz | tar --absolute-names -xJ && mv /linux-${KERNEL_VERSION} /linux + +# this is aufs4.8 20161010 +ENV AUFS_REPO https://github.com/sfjro/aufs4-standalone +ENV AUFS_BRANCH aufs4.8 +ENV AUFS_COMMIT e9fd128dcb16167417683e199a5feb14f3c9eca8 + +# Download AUFS +RUN git clone -b "$AUFS_BRANCH" "$AUFS_REPO" /aufs && \ + cd /aufs && \ + git checkout -q "$AUFS_COMMIT" + +# aufs-util 20151116 +ENV AUFS_TOOLS_REPO https://github.com/ncopa/aufs-util.git +ENV AUFS_TOOLS_COMMIT 3b7c5e262b53598a8204a915e485489c46d4e7a4 + +# Download aufs tools +RUN git clone ${AUFS_TOOLS_REPO} && \ + cd /aufs-util && \ + git checkout "$AUFS_TOOLS_COMMIT" + +#BUILD +# patch kernel with aufs +RUN cd /linux && \ + cp -r /aufs/Documentation /linux && \ + cp -r /aufs/fs /linux && \ + cp -r /aufs/include/uapi/linux/aufs_type.h /linux/include/uapi/linux/ && \ + set -e && for patch in \ + /aufs/aufs*-kbuild.patch \ + /aufs/aufs*-base.patch \ + /aufs/aufs*-mmap.patch \ + /aufs/aufs*-standalone.patch \ + /aufs/aufs*-loopback.patch \ + /aufs/lockdep-debug.patch \ + ; do \ + patch -p1 < "$patch"; \ + done + +COPY kernel_config /linux/arch/x86/configs/x86_64_defconfig +COPY kernel_config.debug /linux/debug_config +COPY kernel_config.aufs /linux/aufs_config +RUN cat /linux/aufs_config >> /linux/arch/x86/configs/x86_64_defconfig + +ARG DEBUG=0 + +RUN if [ $DEBUG -ne "0" ]; then \ + sed -i 's/CONFIG_PANIC_ON_OOPS=y/# CONFIG_PANIC_ON_OOPS is not set/' /linux/arch/x86/configs/x86_64_defconfig; \ + cat /linux/debug_config >> /linux/arch/x86/configs/x86_64_defconfig; \ + fi + +# Apply local patches +COPY patches-aufs /patches +RUN cd /linux && \ + set -e && for patch in /patches/*.patch; do \ + echo "Applying $patch"; \ + patch -p1 < "$patch"; \ + done + +WORKDIR /linux +RUN make defconfig && \ + make oldconfig && \ + make -j "$(getconf _NPROCESSORS_ONLN)" KCFLAGS="-fno-pie" +RUN make INSTALL_MOD_PATH=/tmp/kernel-modules modules_install && \ + make INSTALL_HDR_PATH=/tmp headers_install && \ + ( cd /tmp && tar cf /kernel-headers.tar include ) && \ + ( cd /tmp/kernel-modules && tar cf /kernel-modules.tar . ) + +# Build aufs tools, do this here as they need kernel headers and to match aufs +# Fortunately they are built statically linked +RUN cd /aufs-util && \ + CPPFLAGS="-I/tmp/include" CFLAGS=$CPPFLAGS LDFLAGS=$CPPFLAGS make && \ + DESTDIR=/tmp/aufs-utils make install && \ + rm -rf /tmp/aufs-utils/usr/lib /tmp/aufs-utils/usr/share && \ + cd /tmp/aufs-utils && rm libau* && tar cf /aufs-utils.tar . + +RUN printf "KERNEL_SOURCE=${KERNEL_SOURCE}\nAUFS_REPO=${AUFS_REPO}\nAUFS_BRANCH=${AUFS_BRANCH}\nAUFS_COMMIT=${AUFS_COMMIT}\nAUFS_TOOLS_REPO=${AUFS_TOOLS_REPO}\nAUFS_TOOLS_COMMIT=${AUFS_TOOLS_COMMIT}\n" > /kernel-source-info diff --git a/alpine/kernel/Makefile b/alpine/kernel/Makefile index 82aaa7201..403ad165b 100644 --- a/alpine/kernel/Makefile +++ b/alpine/kernel/Makefile @@ -2,16 +2,28 @@ DEBUG ?= 0 all: x86_64/vmlinuz64 -x86_64/vmlinuz64: Dockerfile kernel_config +ifdef AUFS +x86_64/vmlinuz64: Dockerfile.aufs kernel_config kernel_config.debug kernel_config.aufs mkdir -p x86_64 etc - BUILD=$$( docker build --build-arg DEBUG=$(DEBUG) -q . ) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ + BUILD=$$( docker build -f Dockerfile.aufs --build-arg DEBUG=$(DEBUG) -q . ) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-modules.tar | tar xf - && \ docker run --rm --net=none --log-driver=none $$BUILD cat /aufs-utils.tar | tar xf - && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-source-info > etc/kernel-source-info && \ docker run --rm --net=none --log-driver=none $$BUILD cat /linux/vmlinux > x86_64/vmlinux && \ docker run --rm --net=none --log-driver=none $$BUILD cat /linux/arch/x86_64/boot/bzImage > $@ && \ docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-headers.tar > x86_64/kernel-headers.tar && \ + cp -a patches-aufs etc/kernel-patches +else +x86_64/vmlinuz64: Dockerfile kernel_config kernel_config.debug + mkdir -p x86_64 etc + BUILD=$$( docker build --build-arg DEBUG=$(DEBUG) -q . ) && [ -n "$$BUILD" ] && echo "Built $$BUILD" && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-modules.tar | tar xf - && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-source-info > etc/kernel-source-info && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /linux/vmlinux > x86_64/vmlinux && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /linux/arch/x86_64/boot/bzImage > $@ && \ + docker run --rm --net=none --log-driver=none $$BUILD cat /kernel-headers.tar > x86_64/kernel-headers.tar && \ cp -a patches etc/kernel-patches +endif clean: rm -rf x86_64 lib etc usr sbin diff --git a/alpine/kernel/kernel_config b/alpine/kernel/kernel_config index 9ccbb57e7..3d03892ae 100644 --- a/alpine/kernel/kernel_config +++ b/alpine/kernel/kernel_config @@ -2935,22 +2935,6 @@ CONFIG_PSTORE_ZLIB_COMPRESS=y # CONFIG_PSTORE_RAM is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set -CONFIG_AUFS_FS=y -# CONFIG_AUFS_BRANCH_MAX_127 is not set -# CONFIG_AUFS_BRANCH_MAX_511 is not set -# CONFIG_AUFS_BRANCH_MAX_1023 is not set -CONFIG_AUFS_BRANCH_MAX_32767=y -CONFIG_AUFS_SBILIST=y -# CONFIG_AUFS_HNOTIFY is not set -# CONFIG_AUFS_EXPORT is not set -CONFIG_AUFS_XATTR=y -# CONFIG_AUFS_FHSM is not set -# CONFIG_AUFS_RDU is not set -# CONFIG_AUFS_SHWH is not set -# CONFIG_AUFS_BR_RAMFS is not set -# CONFIG_AUFS_BR_FUSE is not set -CONFIG_AUFS_BDEV_LOOP=y -# CONFIG_AUFS_DEBUG is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V2 is not set diff --git a/alpine/kernel/kernel_config.aufs b/alpine/kernel/kernel_config.aufs new file mode 100644 index 000000000..3dbf17a99 --- /dev/null +++ b/alpine/kernel/kernel_config.aufs @@ -0,0 +1,16 @@ +CONFIG_AUFS_FS=y +# CONFIG_AUFS_BRANCH_MAX_127 is not set +# CONFIG_AUFS_BRANCH_MAX_511 is not set +# CONFIG_AUFS_BRANCH_MAX_1023 is not set +CONFIG_AUFS_BRANCH_MAX_32767=y +CONFIG_AUFS_SBILIST=y +# CONFIG_AUFS_HNOTIFY is not set +# CONFIG_AUFS_EXPORT is not set +CONFIG_AUFS_XATTR=y +# CONFIG_AUFS_FHSM is not set +# CONFIG_AUFS_RDU is not set +# CONFIG_AUFS_SHWH is not set +# CONFIG_AUFS_BR_RAMFS is not set +# CONFIG_AUFS_BR_FUSE is not set +CONFIG_AUFS_BDEV_LOOP=y +# CONFIG_AUFS_DEBUG is not set diff --git a/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch b/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch new file mode 100644 index 000000000..4855bf28f --- /dev/null +++ b/alpine/kernel/patches-aufs/0001-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch @@ -0,0 +1,30 @@ +From afc48615e62910f37b6076f9118c80d2f9613064 Mon Sep 17 00:00:00 2001 +From: Ian Campbell +Date: Mon, 4 Apr 2016 14:50:10 +0100 +Subject: [PATCH 1/5] VSOCK: Only allow host network namespace to use AF_VSOCK. + +The VSOCK addressing schema does not really lend itself to simply creating an +alternative end point address within a namespace. + +Signed-off-by: Ian Campbell +--- + net/vmw_vsock/af_vsock.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c +index 8a398b3..0edc54c 100644 +--- a/net/vmw_vsock/af_vsock.c ++++ b/net/vmw_vsock/af_vsock.c +@@ -1852,6 +1852,9 @@ static const struct proto_ops vsock_stream_ops = { + static int vsock_create(struct net *net, struct socket *sock, + int protocol, int kern) + { ++ if (!net_eq(net, &init_net)) ++ return -EAFNOSUPPORT; ++ + if (!sock) + return -EINVAL; + +-- +2.10.2 + diff --git a/alpine/kernel/patches/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch b/alpine/kernel/patches-aufs/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch similarity index 100% rename from alpine/kernel/patches/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch rename to alpine/kernel/patches-aufs/0002-Drivers-hv-vmbus-fix-the-race-when-querying-updating.patch diff --git a/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch b/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch new file mode 100644 index 000000000..f40e48135 --- /dev/null +++ b/alpine/kernel/patches-aufs/0003-hv_sock-introduce-Hyper-V-Sockets.patch @@ -0,0 +1,1791 @@ +From e7db86231b7078971c613aa81c9090079571cf24 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Sat, 23 Jul 2016 01:35:51 +0000 +Subject: [PATCH 3/5] hv_sock: introduce Hyper-V Sockets + +Hyper-V Sockets (hv_sock) supplies a byte-stream based communication +mechanism between the host and the guest. It's somewhat like TCP over +VMBus, but the transportation layer (VMBus) is much simpler than IP. + +With Hyper-V Sockets, applications between the host and the guest can talk +to each other directly by the traditional BSD-style socket APIs. + +Hyper-V Sockets is only available on new Windows hosts, like Windows Server +2016. More info is in this article "Make your own integration services": +https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service + +The patch implements the necessary support in the guest side by introducing +a new socket address family AF_HYPERV. + +Signed-off-by: Dexuan Cui +Cc: "K. Y. Srinivasan" +Cc: Haiyang Zhang +Cc: Vitaly Kuznetsov +Cc: Cathy Avery +Cc: Olaf Hering +Origin: https://patchwork.kernel.org/patch/9244467/ +--- + MAINTAINERS | 2 + + include/linux/hyperv.h | 13 + + include/linux/socket.h | 4 +- + include/net/af_hvsock.h | 78 +++ + include/uapi/linux/hyperv.h | 23 + + net/Kconfig | 1 + + net/Makefile | 1 + + net/hv_sock/Kconfig | 10 + + net/hv_sock/Makefile | 3 + + net/hv_sock/af_hvsock.c | 1507 +++++++++++++++++++++++++++++++++++++++++++ + 10 files changed, 1641 insertions(+), 1 deletion(-) + create mode 100644 include/net/af_hvsock.h + create mode 100644 net/hv_sock/Kconfig + create mode 100644 net/hv_sock/Makefile + create mode 100644 net/hv_sock/af_hvsock.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index babaf82..6126545 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -5667,7 +5667,9 @@ F: drivers/pci/host/pci-hyperv.c + F: drivers/net/hyperv/ + F: drivers/scsi/storvsc_drv.c + F: drivers/video/fbdev/hyperv_fb.c ++F: net/hv_sock/ + F: include/linux/hyperv.h ++F: include/net/af_hvsock.h + F: tools/hv/ + F: Documentation/ABI/stable/sysfs-bus-vmbus + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index b10954a..50f8976 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1505,5 +1505,18 @@ static inline void commit_rd_index(struct vmbus_channel *channel) + vmbus_set_event(channel); + } + ++struct vmpipe_proto_header { ++ u32 pkt_type; ++ u32 data_size; ++}; ++ ++#define HVSOCK_HEADER_LEN (sizeof(struct vmpacket_descriptor) + \ ++ sizeof(struct vmpipe_proto_header)) ++ ++/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */ ++#define PREV_INDICES_LEN (sizeof(u64)) + ++#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ ++ ALIGN((payload_len), 8) + \ ++ PREV_INDICES_LEN) + #endif /* _HYPERV_H */ +diff --git a/include/linux/socket.h b/include/linux/socket.h +index b5cc5a6..0b68b58 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -202,8 +202,9 @@ struct ucred { + #define AF_VSOCK 40 /* vSockets */ + #define AF_KCM 41 /* Kernel Connection Multiplexor*/ + #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ ++#define AF_HYPERV 43 /* Hyper-V Sockets */ + +-#define AF_MAX 43 /* For now.. */ ++#define AF_MAX 44 /* For now.. */ + + /* Protocol families, same as address families. */ + #define PF_UNSPEC AF_UNSPEC +@@ -251,6 +252,7 @@ struct ucred { + #define PF_VSOCK AF_VSOCK + #define PF_KCM AF_KCM + #define PF_QIPCRTR AF_QIPCRTR ++#define PF_HYPERV AF_HYPERV + #define PF_MAX AF_MAX + + /* Maximum queue length specifiable by listen. */ +diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h +new file mode 100644 +index 0000000..e7a8a3a +--- /dev/null ++++ b/include/net/af_hvsock.h +@@ -0,0 +1,78 @@ ++#ifndef __AF_HVSOCK_H__ ++#define __AF_HVSOCK_H__ ++ ++#include ++#include ++#include ++ ++/* The host side's design of the feature requires 5 exact 4KB pages for ++ * recv/send rings respectively -- this is suboptimal considering memory ++ * consumption, however unluckily we have to live with it, before the ++ * host comes up with a better design in the future. ++ */ ++#define PAGE_SIZE_4K 4096 ++#define RINGBUFFER_HVSOCK_RCV_SIZE (PAGE_SIZE_4K * 5) ++#define RINGBUFFER_HVSOCK_SND_SIZE (PAGE_SIZE_4K * 5) ++ ++/* The MTU is 16KB per the host side's design. ++ * In future, the buffer can be elimiated when we switch to use the coming ++ * new VMBus ringbuffer "in-place consumption" APIs, by which we can ++ * directly copy data from VMBus ringbuffer into the userspace buffer. ++ */ ++#define HVSOCK_MTU_SIZE (1024 * 16) ++struct hvsock_recv_buf { ++ unsigned int data_len; ++ unsigned int data_offset; ++ ++ struct vmpipe_proto_header hdr; ++ u8 buf[HVSOCK_MTU_SIZE]; ++}; ++ ++/* In the VM, actually we can send up to HVSOCK_MTU_SIZE bytes of payload, ++ * but for now let's use a smaller size to minimize the dynamically-allocated ++ * buffer. Note: the buffer can be elimiated in future when we add new VMBus ++ * ringbuffer APIs that allow us to directly copy data from userspace buf to ++ * VMBus ringbuffer. ++ */ ++#define HVSOCK_MAX_SND_SIZE_BY_VM (1024 * 4) ++struct hvsock_send_buf { ++ struct vmpipe_proto_header hdr; ++ u8 buf[HVSOCK_MAX_SND_SIZE_BY_VM]; ++}; ++ ++struct hvsock_sock { ++ /* sk must be the first member. */ ++ struct sock sk; ++ ++ struct sockaddr_hv local_addr; ++ struct sockaddr_hv remote_addr; ++ ++ /* protected by the global hvsock_mutex */ ++ struct list_head bound_list; ++ struct list_head connected_list; ++ ++ struct list_head accept_queue; ++ /* used by enqueue and dequeue */ ++ struct mutex accept_queue_mutex; ++ ++ struct delayed_work dwork; ++ ++ u32 peer_shutdown; ++ ++ struct vmbus_channel *channel; ++ ++ struct hvsock_send_buf *send; ++ struct hvsock_recv_buf *recv; ++}; ++ ++static inline struct hvsock_sock *sk_to_hvsock(struct sock *sk) ++{ ++ return (struct hvsock_sock *)sk; ++} ++ ++static inline struct sock *hvsock_to_sk(struct hvsock_sock *hvsk) ++{ ++ return (struct sock *)hvsk; ++} ++ ++#endif /* __AF_HVSOCK_H__ */ +diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h +index e347b24..eb3e44b 100644 +--- a/include/uapi/linux/hyperv.h ++++ b/include/uapi/linux/hyperv.h +@@ -26,6 +26,7 @@ + #define _UAPI_HYPERV_H + + #include ++#include + + /* + * Framework version for util services. +@@ -396,4 +397,26 @@ struct hv_kvp_ip_msg { + struct hv_kvp_ipaddr_value kvp_ip_val; + } __attribute__((packed)); + ++/* This is the address format of Hyper-V Sockets. ++ * Note: here we just borrow the kernel's built-in type uuid_le. When ++ * an application calls bind() or connect(), the 2 members of struct ++ * sockaddr_hv must be of GUID. ++ * The GUID format differs from the UUID format only in the byte order of ++ * the first 3 fields. Refer to: ++ * https://en.wikipedia.org/wiki/Globally_unique_identifier ++ */ ++struct sockaddr_hv { ++ __kernel_sa_family_t shv_family; /* Address family */ ++ u16 reserved; /* Must be Zero */ ++ uuid_le shv_vm_guid; /* VM ID */ ++ uuid_le shv_service_guid; /* Service ID */ ++}; ++ ++#define SHV_VMID_GUEST NULL_UUID_LE ++#define SHV_VMID_HOST NULL_UUID_LE ++ ++#define SHV_SERVICE_ID_ANY NULL_UUID_LE ++ ++#define SHV_PROTO_RAW 1 ++ + #endif /* _UAPI_HYPERV_H */ +diff --git a/net/Kconfig b/net/Kconfig +index c2cdbce..921e86f 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -231,6 +231,7 @@ source "net/dns_resolver/Kconfig" + source "net/batman-adv/Kconfig" + source "net/openvswitch/Kconfig" + source "net/vmw_vsock/Kconfig" ++source "net/hv_sock/Kconfig" + source "net/netlink/Kconfig" + source "net/mpls/Kconfig" + source "net/hsr/Kconfig" +diff --git a/net/Makefile b/net/Makefile +index 9bd20bb..b4d4e9a 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -70,6 +70,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ + obj-$(CONFIG_NFC) += nfc/ + obj-$(CONFIG_OPENVSWITCH) += openvswitch/ + obj-$(CONFIG_VSOCKETS) += vmw_vsock/ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock/ + obj-$(CONFIG_MPLS) += mpls/ + obj-$(CONFIG_HSR) += hsr/ + ifneq ($(CONFIG_NET_SWITCHDEV),) +diff --git a/net/hv_sock/Kconfig b/net/hv_sock/Kconfig +new file mode 100644 +index 0000000..ff84875 +--- /dev/null ++++ b/net/hv_sock/Kconfig +@@ -0,0 +1,10 @@ ++config HYPERV_SOCK ++ tristate "Hyper-V Sockets" ++ depends on HYPERV ++ default m if HYPERV ++ help ++ Hyper-V Sockets is a socket interface for high speed ++ communication between Linux guest and Hyper-V host over VMBus. ++ ++ To compile this driver as a module, choose M here: the module ++ will be called hv_sock. +diff --git a/net/hv_sock/Makefile b/net/hv_sock/Makefile +new file mode 100644 +index 0000000..716c012 +--- /dev/null ++++ b/net/hv_sock/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock.o ++ ++hv_sock-y += af_hvsock.o +diff --git a/net/hv_sock/af_hvsock.c b/net/hv_sock/af_hvsock.c +new file mode 100644 +index 0000000..331d375 +--- /dev/null ++++ b/net/hv_sock/af_hvsock.c +@@ -0,0 +1,1507 @@ ++/* ++ * Hyper-V Sockets -- a socket-based communication channel between the ++ * Hyper-V host and the virtual machines running on it. ++ * ++ * Copyright (c) 2016 Microsoft Corporation. ++ * ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * Alternatively, this software may be distributed under the terms of the ++ * GNU General Public License ("GPL") version 2 as published by the Free ++ * Software Foundation. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, ++ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING ++ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ * POSSIBILITY OF SUCH DAMAGE. ++ */ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++ ++static struct proto hvsock_proto = { ++ .name = "HV_SOCK", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct hvsock_sock), ++}; ++ ++#define SS_LISTEN 255 ++ ++#define HVSOCK_CONNECT_TIMEOUT (30 * HZ) ++ ++/* This is an artificial limit */ ++#define HVSOCK_MAX_BACKLOG 128 ++ ++static LIST_HEAD(hvsock_bound_list); ++static LIST_HEAD(hvsock_connected_list); ++static DEFINE_MUTEX(hvsock_mutex); ++ ++static struct sock *hvsock_find_bound_socket(const struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_bound_list, bound_list) { ++ if (!uuid_le_cmp(addr->shv_service_guid, ++ hvsk->local_addr.shv_service_guid)) ++ return hvsock_to_sk(hvsk); ++ } ++ return NULL; ++} ++ ++static struct sock *hvsock_find_connected_socket_by_channel( ++ const struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_connected_list, connected_list) { ++ if (hvsk->channel == channel) ++ return hvsock_to_sk(hvsk); ++ } ++ return NULL; ++} ++ ++static void hvsock_enqueue_accept(struct sock *listener, ++ struct sock *connected) ++{ ++ struct hvsock_sock *hvconnected; ++ struct hvsock_sock *hvlistener; ++ ++ hvlistener = sk_to_hvsock(listener); ++ hvconnected = sk_to_hvsock(connected); ++ ++ sock_hold(connected); ++ sock_hold(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); ++ list_add_tail(&hvconnected->accept_queue, &hvlistener->accept_queue); ++ listener->sk_ack_backlog++; ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++} ++ ++static struct sock *hvsock_dequeue_accept(struct sock *listener) ++{ ++ struct hvsock_sock *hvconnected; ++ struct hvsock_sock *hvlistener; ++ ++ hvlistener = sk_to_hvsock(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); ++ ++ if (list_empty(&hvlistener->accept_queue)) { ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ return NULL; ++ } ++ ++ hvconnected = list_entry(hvlistener->accept_queue.next, ++ struct hvsock_sock, accept_queue); ++ ++ list_del_init(&hvconnected->accept_queue); ++ listener->sk_ack_backlog--; ++ ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ ++ sock_put(listener); ++ /* The caller will need a reference on the connected socket so we let ++ * it call sock_put(). ++ */ ++ ++ return hvsock_to_sk(hvconnected); ++} ++ ++static bool hvsock_is_accept_queue_empty(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ mutex_lock(&hvsk->accept_queue_mutex); ++ ret = list_empty(&hvsk->accept_queue); ++ mutex_unlock(&hvsk->accept_queue_mutex); ++ ++ return ret; ++} ++ ++static void hvsock_addr_init(struct sockaddr_hv *addr, uuid_le service_id) ++{ ++ memset(addr, 0, sizeof(*addr)); ++ addr->shv_family = AF_HYPERV; ++ addr->shv_service_guid = service_id; ++} ++ ++static int hvsock_addr_validate(const struct sockaddr_hv *addr) ++{ ++ if (!addr) ++ return -EFAULT; ++ ++ if (addr->shv_family != AF_HYPERV) ++ return -EAFNOSUPPORT; ++ ++ if (addr->reserved != 0) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool hvsock_addr_bound(const struct sockaddr_hv *addr) ++{ ++ return !!uuid_le_cmp(addr->shv_service_guid, SHV_SERVICE_ID_ANY); ++} ++ ++static int hvsock_addr_cast(const struct sockaddr *addr, size_t len, ++ struct sockaddr_hv **out_addr) ++{ ++ if (len < sizeof(**out_addr)) ++ return -EFAULT; ++ ++ *out_addr = (struct sockaddr_hv *)addr; ++ return hvsock_addr_validate(*out_addr); ++} ++ ++static int __hvsock_do_bind(struct hvsock_sock *hvsk, ++ struct sockaddr_hv *addr) ++{ ++ struct sockaddr_hv hv_addr; ++ int ret = 0; ++ ++ hvsock_addr_init(&hv_addr, addr->shv_service_guid); ++ ++ mutex_lock(&hvsock_mutex); ++ ++ if (!uuid_le_cmp(addr->shv_service_guid, SHV_SERVICE_ID_ANY)) { ++ do { ++ uuid_le_gen(&hv_addr.shv_service_guid); ++ } while (hvsock_find_bound_socket(&hv_addr)); ++ } else { ++ if (hvsock_find_bound_socket(&hv_addr)) { ++ ret = -EADDRINUSE; ++ goto out; ++ } ++ } ++ ++ hvsock_addr_init(&hvsk->local_addr, hv_addr.shv_service_guid); ++ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->bound_list, &hvsock_bound_list); ++out: ++ mutex_unlock(&hvsock_mutex); ++ ++ return ret; ++} ++ ++static int __hvsock_bind(struct sock *sk, struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return -EINVAL; ++ ++ switch (sk->sk_socket->type) { ++ case SOCK_STREAM: ++ ret = __hvsock_do_bind(hvsk, addr); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++/* Autobind this socket to the local address if necessary. */ ++static int hvsock_auto_bind(struct hvsock_sock *hvsk) ++{ ++ struct sock *sk = hvsock_to_sk(hvsk); ++ struct sockaddr_hv local_addr; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return 0; ++ hvsock_addr_init(&local_addr, SHV_SERVICE_ID_ANY); ++ return __hvsock_bind(sk, &local_addr); ++} ++ ++static void hvsock_sk_destruct(struct sock *sk) ++{ ++ struct vmbus_channel *channel; ++ struct hvsock_sock *hvsk; ++ ++ hvsk = sk_to_hvsock(sk); ++ vfree(hvsk->send); ++ vfree(hvsk->recv); ++ ++ channel = hvsk->channel; ++ if (!channel) ++ return; ++ ++ vmbus_hvsock_device_unregister(channel); ++} ++ ++static void __hvsock_release(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *pending; ++ ++ hvsk = sk_to_hvsock(sk); ++ ++ mutex_lock(&hvsock_mutex); ++ ++ if (!list_empty(&hvsk->bound_list)) { ++ list_del_init(&hvsk->bound_list); ++ sock_put(&hvsk->sk); ++ } ++ ++ if (!list_empty(&hvsk->connected_list)) { ++ list_del_init(&hvsk->connected_list); ++ sock_put(&hvsk->sk); ++ } ++ ++ mutex_unlock(&hvsock_mutex); ++ ++ lock_sock(sk); ++ sock_orphan(sk); ++ sk->sk_shutdown = SHUTDOWN_MASK; ++ ++ /* Clean up any sockets that never were accepted. */ ++ while ((pending = hvsock_dequeue_accept(sk)) != NULL) { ++ __hvsock_release(pending); ++ sock_put(pending); ++ } ++ ++ release_sock(sk); ++ sock_put(sk); ++} ++ ++static int hvsock_release(struct socket *sock) ++{ ++ /* If accept() is interrupted by a signal, the temporary socket ++ * struct's sock->sk is NULL. ++ */ ++ if (sock->sk) { ++ __hvsock_release(sock->sk); ++ sock->sk = NULL; ++ } ++ ++ sock->state = SS_FREE; ++ return 0; ++} ++ ++static struct sock *hvsock_create(struct net *net, struct socket *sock, ++ gfp_t priority, unsigned short type) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ sk = sk_alloc(net, AF_HYPERV, priority, &hvsock_proto, 0); ++ if (!sk) ++ return NULL; ++ ++ sock_init_data(sock, sk); ++ ++ /* sk->sk_type is normally set in sock_init_data, but only if sock ++ * is non-NULL. We make sure that our sockets always have a type by ++ * setting it here if needed. ++ */ ++ if (!sock) ++ sk->sk_type = type; ++ ++ sk->sk_destruct = hvsock_sk_destruct; ++ ++ /* Looks stream-based socket doesn't need this. */ ++ sk->sk_backlog_rcv = NULL; ++ ++ sk->sk_state = 0; ++ sock_reset_flag(sk, SOCK_DONE); ++ ++ hvsk = sk_to_hvsock(sk); ++ ++ hvsk->send = NULL; ++ hvsk->recv = NULL; ++ ++ hvsock_addr_init(&hvsk->local_addr, SHV_SERVICE_ID_ANY); ++ hvsock_addr_init(&hvsk->remote_addr, SHV_SERVICE_ID_ANY); ++ ++ INIT_LIST_HEAD(&hvsk->bound_list); ++ INIT_LIST_HEAD(&hvsk->connected_list); ++ ++ INIT_LIST_HEAD(&hvsk->accept_queue); ++ mutex_init(&hvsk->accept_queue_mutex); ++ ++ hvsk->peer_shutdown = 0; ++ ++ return sk; ++} ++ ++static int hvsock_bind(struct socket *sock, struct sockaddr *addr, ++ int addr_len) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ ++ if (hvsock_addr_cast(addr, addr_len, &hv_addr) != 0) ++ return -EINVAL; ++ ++ if (uuid_le_cmp(hv_addr->shv_vm_guid, NULL_UUID_LE)) ++ return -EINVAL; ++ ++ lock_sock(sk); ++ ret = __hvsock_bind(sk, hv_addr); ++ release_sock(sk); ++ ++ return ret; ++} ++ ++static int hvsock_getname(struct socket *sock, ++ struct sockaddr *addr, int *addr_len, int peer) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ret = 0; ++ ++ lock_sock(sk); ++ ++ if (peer) { ++ if (sock->state != SS_CONNECTED) { ++ ret = -ENOTCONN; ++ goto out; ++ } ++ hv_addr = &hvsk->remote_addr; ++ } else { ++ hv_addr = &hvsk->local_addr; ++ } ++ ++ __sockaddr_check_size(sizeof(*hv_addr)); ++ ++ memcpy(addr, hv_addr, sizeof(*hv_addr)); ++ *addr_len = sizeof(*hv_addr); ++ ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static void get_ringbuffer_rw_status(struct vmbus_channel *channel, ++ bool *can_read, bool *can_write) ++{ ++ u32 avl_read_bytes, avl_write_bytes, dummy; ++ ++ if (can_read) { ++ hv_get_ringbuffer_availbytes(&channel->inbound, ++ &avl_read_bytes, ++ &dummy); ++ /* 0-size payload means FIN */ ++ *can_read = avl_read_bytes >= HVSOCK_PKT_LEN(0); ++ } ++ ++ if (can_write) { ++ hv_get_ringbuffer_availbytes(&channel->outbound, ++ &dummy, ++ &avl_write_bytes); ++ ++ /* We only write if there is enough space */ ++ *can_write = avl_write_bytes > HVSOCK_PKT_LEN(PAGE_SIZE_4K); ++ } ++} ++ ++static size_t get_ringbuffer_writable_bytes(struct vmbus_channel *channel) ++{ ++ u32 avl_write_bytes, dummy; ++ size_t ret; ++ ++ hv_get_ringbuffer_availbytes(&channel->outbound, ++ &dummy, ++ &avl_write_bytes); ++ ++ /* The ringbuffer mustn't be 100% full, and we should reserve a ++ * zero-length-payload packet for the FIN: see hv_ringbuffer_write() ++ * and hvsock_shutdown(). ++ */ ++ if (avl_write_bytes < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) ++ return 0; ++ ret = avl_write_bytes - HVSOCK_PKT_LEN(1) - HVSOCK_PKT_LEN(0); ++ ++ return round_down(ret, 8); ++} ++ ++static int hvsock_get_send_buf(struct hvsock_sock *hvsk) ++{ ++ hvsk->send = vmalloc(sizeof(*hvsk->send)); ++ return hvsk->send ? 0 : -ENOMEM; ++} ++ ++static void hvsock_put_send_buf(struct hvsock_sock *hvsk) ++{ ++ vfree(hvsk->send); ++ hvsk->send = NULL; ++} ++ ++static int hvsock_send_data(struct vmbus_channel *channel, ++ struct hvsock_sock *hvsk, ++ size_t to_write) ++{ ++ hvsk->send->hdr.pkt_type = 1; ++ hvsk->send->hdr.data_size = to_write; ++ return vmbus_sendpacket(channel, &hvsk->send->hdr, ++ sizeof(hvsk->send->hdr) + to_write, ++ 0, VM_PKT_DATA_INBAND, 0); ++} ++ ++static int hvsock_get_recv_buf(struct hvsock_sock *hvsk) ++{ ++ hvsk->recv = vmalloc(sizeof(*hvsk->recv)); ++ return hvsk->recv ? 0 : -ENOMEM; ++} ++ ++static void hvsock_put_recv_buf(struct hvsock_sock *hvsk) ++{ ++ vfree(hvsk->recv); ++ hvsk->recv = NULL; ++} ++ ++static int hvsock_recv_data(struct vmbus_channel *channel, ++ struct hvsock_sock *hvsk, ++ size_t *payload_len) ++{ ++ u32 buffer_actual_len; ++ u64 dummy_req_id; ++ int ret; ++ ++ ret = vmbus_recvpacket(channel, &hvsk->recv->hdr, ++ sizeof(hvsk->recv->hdr) + ++ sizeof(hvsk->recv->buf), ++ &buffer_actual_len, &dummy_req_id); ++ if (ret != 0 || buffer_actual_len <= sizeof(hvsk->recv->hdr)) ++ *payload_len = 0; ++ else ++ *payload_len = hvsk->recv->hdr.data_size; ++ ++ return ret; ++} ++ ++static int hvsock_shutdown(struct socket *sock, int mode) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ if (mode < SHUT_RD || mode > SHUT_RDWR) ++ return -EINVAL; ++ /* This maps: ++ * SHUT_RD (0) -> RCV_SHUTDOWN (1) ++ * SHUT_WR (1) -> SEND_SHUTDOWN (2) ++ * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) ++ */ ++ ++mode; ++ ++ if (sock->state != SS_CONNECTED) ++ return -ENOTCONN; ++ ++ sock->state = SS_DISCONNECTING; ++ ++ sk = sock->sk; ++ ++ lock_sock(sk); ++ ++ sk->sk_shutdown |= mode; ++ sk->sk_state_change(sk); ++ ++ if (mode & SEND_SHUTDOWN) { ++ hvsk = sk_to_hvsock(sk); ++ ++ ret = hvsock_get_send_buf(hvsk); ++ if (ret < 0) ++ goto out; ++ ++ /* It can't fail: see get_ringbuffer_writable_bytes(). */ ++ (void)hvsock_send_data(hvsk->channel, hvsk, 0); ++ ++ hvsock_put_send_buf(hvsk); ++ } ++ ++out: ++ release_sock(sk); ++ ++ return ret; ++} ++ ++static unsigned int hvsock_poll(struct file *file, struct socket *sock, ++ poll_table *wait) ++{ ++ struct vmbus_channel *channel; ++ bool can_read, can_write; ++ struct hvsock_sock *hvsk; ++ unsigned int mask; ++ struct sock *sk; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ++ poll_wait(file, sk_sleep(sk), wait); ++ mask = 0; ++ ++ if (sk->sk_err) ++ /* Signify that there has been an error on this socket. */ ++ mask |= POLLERR; ++ ++ /* INET sockets treat local write shutdown and peer write shutdown as a ++ * case of POLLHUP set. ++ */ ++ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ++ ((sk->sk_shutdown & SEND_SHUTDOWN) && ++ (hvsk->peer_shutdown & SEND_SHUTDOWN))) { ++ mask |= POLLHUP; ++ } ++ ++ if (sk->sk_shutdown & RCV_SHUTDOWN || ++ hvsk->peer_shutdown & SEND_SHUTDOWN) { ++ mask |= POLLRDHUP; ++ } ++ ++ lock_sock(sk); ++ ++ /* Listening sockets that have connections in their accept ++ * queue can be read. ++ */ ++ if (sk->sk_state == SS_LISTEN && !hvsock_is_accept_queue_empty(sk)) ++ mask |= POLLIN | POLLRDNORM; ++ ++ /* The mutex is to against hvsock_open_connection() */ ++ mutex_lock(&hvsock_mutex); ++ ++ channel = hvsk->channel; ++ if (channel) { ++ /* If there is something in the queue then we can read */ ++ get_ringbuffer_rw_status(channel, &can_read, &can_write); ++ ++ if (!can_read && hvsk->recv) ++ can_read = true; ++ ++ if (!(sk->sk_shutdown & RCV_SHUTDOWN) && can_read) ++ mask |= POLLIN | POLLRDNORM; ++ } else { ++ can_write = false; ++ } ++ ++ mutex_unlock(&hvsock_mutex); ++ ++ /* Sockets whose connections have been closed terminated should ++ * also be considered read, and we check the shutdown flag for that. ++ */ ++ if (sk->sk_shutdown & RCV_SHUTDOWN || ++ hvsk->peer_shutdown & SEND_SHUTDOWN) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ ++ /* Connected sockets that can produce data can be written. */ ++ if (sk->sk_state == SS_CONNECTED && can_write && ++ !(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ /* Remove POLLWRBAND since INET sockets are not setting it. ++ */ ++ mask |= POLLOUT | POLLWRNORM; ++ } ++ ++ /* Simulate INET socket poll behaviors, which sets ++ * POLLOUT|POLLWRNORM when peer is closed and nothing to read, ++ * but local send is not shutdown. ++ */ ++ if (sk->sk_state == SS_UNCONNECTED && ++ !(sk->sk_shutdown & SEND_SHUTDOWN)) ++ mask |= POLLOUT | POLLWRNORM; ++ ++ release_sock(sk); ++ ++ return mask; ++} ++ ++/* This function runs in the tasklet context of process_chn_event() */ ++static void hvsock_on_channel_cb(void *ctx) ++{ ++ struct sock *sk = (struct sock *)ctx; ++ struct vmbus_channel *channel; ++ struct hvsock_sock *hvsk; ++ bool can_read, can_write; ++ ++ hvsk = sk_to_hvsock(sk); ++ channel = hvsk->channel; ++ BUG_ON(!channel); ++ ++ get_ringbuffer_rw_status(channel, &can_read, &can_write); ++ ++ if (can_read) ++ sk->sk_data_ready(sk); ++ ++ if (can_write) ++ sk->sk_write_space(sk); ++} ++ ++static void hvsock_close_connection(struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ mutex_lock(&hvsock_mutex); ++ ++ sk = hvsock_find_connected_socket_by_channel(channel); ++ ++ /* The guest has already closed the connection? */ ++ if (!sk) ++ goto out; ++ ++ sk->sk_state = SS_UNCONNECTED; ++ sock_set_flag(sk, SOCK_DONE); ++ ++ hvsk = sk_to_hvsock(sk); ++ hvsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; ++ ++ sk->sk_state_change(sk); ++out: ++ mutex_unlock(&hvsock_mutex); ++} ++ ++static int hvsock_open_connection(struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk = NULL, *new_hvsk = NULL; ++ uuid_le *instance, *service_id; ++ unsigned char conn_from_host; ++ struct sockaddr_hv hv_addr; ++ struct sock *sk, *new_sk = NULL; ++ int ret; ++ ++ instance = &channel->offermsg.offer.if_instance; ++ service_id = &channel->offermsg.offer.if_type; ++ ++ /* The first byte != 0 means the host initiated the connection. */ ++ conn_from_host = channel->offermsg.offer.u.pipe.user_def[0]; ++ ++ mutex_lock(&hvsock_mutex); ++ ++ hvsock_addr_init(&hv_addr, conn_from_host ? *service_id : *instance); ++ sk = hvsock_find_bound_socket(&hv_addr); ++ ++ if (!sk || (conn_from_host && sk->sk_state != SS_LISTEN) || ++ (!conn_from_host && sk->sk_state != SS_CONNECTING)) { ++ ret = -ENXIO; ++ goto out; ++ } ++ ++ if (conn_from_host) { ++ if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) { ++ ret = -ECONNREFUSED; ++ goto out; ++ } ++ ++ new_sk = hvsock_create(sock_net(sk), NULL, GFP_KERNEL, ++ sk->sk_type); ++ if (!new_sk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ new_sk->sk_state = SS_CONNECTING; ++ new_hvsk = sk_to_hvsock(new_sk); ++ new_hvsk->channel = channel; ++ hvsock_addr_init(&new_hvsk->local_addr, *service_id); ++ hvsock_addr_init(&new_hvsk->remote_addr, *instance); ++ } else { ++ hvsk = sk_to_hvsock(sk); ++ hvsk->channel = channel; ++ } ++ ++ set_channel_read_state(channel, false); ++ ret = vmbus_open(channel, RINGBUFFER_HVSOCK_SND_SIZE, ++ RINGBUFFER_HVSOCK_RCV_SIZE, NULL, 0, ++ hvsock_on_channel_cb, conn_from_host ? new_sk : sk); ++ if (ret != 0) { ++ if (conn_from_host) { ++ new_hvsk->channel = NULL; ++ sock_put(new_sk); ++ } else { ++ hvsk->channel = NULL; ++ } ++ goto out; ++ } ++ ++ vmbus_set_chn_rescind_callback(channel, hvsock_close_connection); ++ ++ /* see get_ringbuffer_rw_status() */ ++ set_channel_pending_send_size(channel, ++ HVSOCK_PKT_LEN(PAGE_SIZE_4K) + 1); ++ ++ if (conn_from_host) { ++ new_sk->sk_state = SS_CONNECTED; ++ ++ sock_hold(&new_hvsk->sk); ++ list_add(&new_hvsk->connected_list, &hvsock_connected_list); ++ ++ hvsock_enqueue_accept(sk, new_sk); ++ } else { ++ sk->sk_state = SS_CONNECTED; ++ sk->sk_socket->state = SS_CONNECTED; ++ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->connected_list, &hvsock_connected_list); ++ } ++ ++ sk->sk_state_change(sk); ++out: ++ mutex_unlock(&hvsock_mutex); ++ return ret; ++} ++ ++static void hvsock_connect_timeout(struct work_struct *work) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ hvsk = container_of(work, struct hvsock_sock, dwork.work); ++ sk = hvsock_to_sk(hvsk); ++ ++ lock_sock(sk); ++ if ((sk->sk_state == SS_CONNECTING) && ++ (sk->sk_shutdown != SHUTDOWN_MASK)) { ++ sk->sk_state = SS_UNCONNECTED; ++ sk->sk_err = ETIMEDOUT; ++ sk->sk_error_report(sk); ++ } ++ release_sock(sk); ++ ++ sock_put(sk); ++} ++ ++static int hvsock_connect_wait(struct socket *sock, ++ int flags, int current_ret) ++{ ++ struct sock *sk = sock->sk; ++ struct hvsock_sock *hvsk; ++ int ret = current_ret; ++ DEFINE_WAIT(wait); ++ long timeout; ++ ++ hvsk = sk_to_hvsock(sk); ++ timeout = HVSOCK_CONNECT_TIMEOUT; ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ ++ while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { ++ if (flags & O_NONBLOCK) { ++ /* If we're not going to block, we schedule a timeout ++ * function to generate a timeout on the connection ++ * attempt, in case the peer doesn't respond in a ++ * timely manner. We hold on to the socket until the ++ * timeout fires. ++ */ ++ sock_hold(sk); ++ INIT_DELAYED_WORK(&hvsk->dwork, ++ hvsock_connect_timeout); ++ schedule_delayed_work(&hvsk->dwork, timeout); ++ ++ /* Skip ahead to preserve error code set above. */ ++ goto out_wait; ++ } ++ ++ release_sock(sk); ++ timeout = schedule_timeout(timeout); ++ lock_sock(sk); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait_error; ++ } else if (timeout == 0) { ++ ret = -ETIMEDOUT; ++ goto out_wait_error; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ } ++ ++ ret = sk->sk_err ? -sk->sk_err : 0; ++ ++out_wait_error: ++ if (ret < 0) { ++ sk->sk_state = SS_UNCONNECTED; ++ sock->state = SS_UNCONNECTED; ++ } ++out_wait: ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_connect(struct socket *sock, struct sockaddr *addr, ++ int addr_len, int flags) ++{ ++ struct sockaddr_hv *remote_addr; ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ++ lock_sock(sk); ++ ++ switch (sock->state) { ++ case SS_CONNECTED: ++ ret = -EISCONN; ++ goto out; ++ case SS_DISCONNECTING: ++ ret = -EINVAL; ++ goto out; ++ case SS_CONNECTING: ++ /* This continues on so we can move sock into the SS_CONNECTED ++ * state once the connection has completed (at which point err ++ * will be set to zero also). Otherwise, we will either wait ++ * for the connection or return -EALREADY should this be a ++ * non-blocking call. ++ */ ++ ret = -EALREADY; ++ break; ++ default: ++ if ((sk->sk_state == SS_LISTEN) || ++ hvsock_addr_cast(addr, addr_len, &remote_addr) != 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* Set the remote address that we are connecting to. */ ++ memcpy(&hvsk->remote_addr, remote_addr, ++ sizeof(hvsk->remote_addr)); ++ ++ ret = hvsock_auto_bind(hvsk); ++ if (ret) ++ goto out; ++ ++ sk->sk_state = SS_CONNECTING; ++ ++ ret = vmbus_send_tl_connect_request( ++ &hvsk->local_addr.shv_service_guid, ++ &hvsk->remote_addr.shv_service_guid); ++ if (ret < 0) ++ goto out; ++ ++ /* Mark sock as connecting and set the error code to in ++ * progress in case this is a non-blocking connect. ++ */ ++ sock->state = SS_CONNECTING; ++ ret = -EINPROGRESS; ++ } ++ ++ ret = hvsock_connect_wait(sock, flags, ret); ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static int hvsock_accept_wait(struct sock *listener, ++ struct socket *newsock, int flags) ++{ ++ struct hvsock_sock *hvconnected; ++ struct sock *connected; ++ ++ DEFINE_WAIT(wait); ++ long timeout; ++ ++ int ret = 0; ++ ++ /* Wait for children sockets to appear; these are the new sockets ++ * created upon connection establishment. ++ */ ++ timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); ++ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); ++ ++ while ((connected = hvsock_dequeue_accept(listener)) == NULL && ++ listener->sk_err == 0) { ++ release_sock(listener); ++ timeout = schedule_timeout(timeout); ++ lock_sock(listener); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); ++ } ++ ++ if (listener->sk_err) ++ ret = -listener->sk_err; ++ ++ if (connected) { ++ lock_sock(connected); ++ hvconnected = sk_to_hvsock(connected); ++ ++ if (!ret) { ++ newsock->state = SS_CONNECTED; ++ sock_graft(connected, newsock); ++ } ++ release_sock(connected); ++ sock_put(connected); ++ } ++ ++out_wait: ++ finish_wait(sk_sleep(listener), &wait); ++ return ret; ++} ++ ++static int hvsock_accept(struct socket *sock, struct socket *newsock, ++ int flags) ++{ ++ struct sock *listener; ++ int ret; ++ ++ listener = sock->sk; ++ ++ lock_sock(listener); ++ ++ if (sock->type != SOCK_STREAM) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (listener->sk_state != SS_LISTEN) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = hvsock_accept_wait(listener, newsock, flags); ++out: ++ release_sock(listener); ++ return ret; ++} ++ ++static int hvsock_listen(struct socket *sock, int backlog) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret = 0; ++ ++ sk = sock->sk; ++ lock_sock(sk); ++ ++ if (sock->type != SOCK_STREAM) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (sock->state != SS_UNCONNECTED) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (backlog <= 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (backlog > HVSOCK_MAX_BACKLOG) ++ backlog = HVSOCK_MAX_BACKLOG; ++ ++ hvsk = sk_to_hvsock(sk); ++ if (!hvsock_addr_bound(&hvsk->local_addr)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ sk->sk_ack_backlog = 0; ++ sk->sk_max_ack_backlog = backlog; ++ sk->sk_state = SS_LISTEN; ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static int hvsock_sendmsg_wait(struct sock *sk, struct msghdr *msg, ++ size_t len) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ struct vmbus_channel *channel; ++ size_t total_to_write = len; ++ size_t total_written = 0; ++ DEFINE_WAIT(wait); ++ bool can_write; ++ long timeout; ++ int ret = -EIO; ++ ++ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ channel = hvsk->channel; ++ ++ while (total_to_write > 0) { ++ size_t to_write, max_writable; ++ ++ while (1) { ++ get_ringbuffer_rw_status(channel, NULL, &can_write); ++ ++ if (can_write || sk->sk_err != 0 || ++ (sk->sk_shutdown & SEND_SHUTDOWN) || ++ (hvsk->peer_shutdown & RCV_SHUTDOWN)) ++ break; ++ ++ /* Don't wait for non-blocking sockets. */ ++ if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ release_sock(sk); ++ ++ timeout = schedule_timeout(timeout); ++ ++ lock_sock(sk); ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ goto out_wait; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ goto out_wait; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, ++ TASK_INTERRUPTIBLE); ++ } ++ ++ /* These checks occur both as part of and after the loop ++ * conditional since we need to check before and after ++ * sleeping. ++ */ ++ if (sk->sk_err) { ++ ret = -sk->sk_err; ++ goto out_wait; ++ } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || ++ (hvsk->peer_shutdown & RCV_SHUTDOWN)) { ++ ret = -EPIPE; ++ goto out_wait; ++ } ++ ++ /* Note: that write will only write as many bytes as possible ++ * in the ringbuffer. It is the caller's responsibility to ++ * check how many bytes we actually wrote. ++ */ ++ do { ++ max_writable = get_ringbuffer_writable_bytes(channel); ++ if (max_writable == 0) ++ goto out_wait; ++ ++ to_write = min_t(size_t, sizeof(hvsk->send->buf), ++ total_to_write); ++ if (to_write > max_writable) ++ to_write = max_writable; ++ ++ ret = hvsock_get_send_buf(hvsk); ++ if (ret < 0) ++ goto out_wait; ++ ++ ret = memcpy_from_msg(hvsk->send->buf, msg, to_write); ++ if (ret != 0) { ++ hvsock_put_send_buf(hvsk); ++ goto out_wait; ++ } ++ ++ ret = hvsock_send_data(channel, hvsk, to_write); ++ hvsock_put_send_buf(hvsk); ++ if (ret != 0) ++ goto out_wait; ++ ++ total_written += to_write; ++ total_to_write -= to_write; ++ } while (total_to_write > 0); ++ } ++ ++out_wait: ++ if (total_written > 0) ++ ret = total_written; ++ ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_sendmsg(struct socket *sock, struct msghdr *msg, ++ size_t len) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret; ++ ++ if (len == 0) ++ return -EINVAL; ++ ++ if (msg->msg_flags & ~MSG_DONTWAIT) ++ return -EOPNOTSUPP; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ++ lock_sock(sk); ++ ++ /* Callers should not provide a destination with stream sockets. */ ++ if (msg->msg_namelen) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ /* Send data only if both sides are not shutdown in the direction. */ ++ if (sk->sk_shutdown & SEND_SHUTDOWN || ++ hvsk->peer_shutdown & RCV_SHUTDOWN) { ++ ret = -EPIPE; ++ goto out; ++ } ++ ++ if (sk->sk_state != SS_CONNECTED || ++ !hvsock_addr_bound(&hvsk->local_addr)) { ++ ret = -ENOTCONN; ++ goto out; ++ } ++ ++ if (!hvsock_addr_bound(&hvsk->remote_addr)) { ++ ret = -EDESTADDRREQ; ++ goto out; ++ } ++ ++ ret = hvsock_sendmsg_wait(sk, msg, len); ++out: ++ release_sock(sk); ++ ++ /* ret should be a bigger-than-0 total_written or a negative err ++ * code. ++ */ ++ BUG_ON(ret == 0); ++ ++ return ret; ++} ++ ++static int hvsock_recvmsg_wait(struct sock *sk, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ size_t to_read, total_to_read = len; ++ struct vmbus_channel *channel; ++ DEFINE_WAIT(wait); ++ size_t copied = 0; ++ bool can_read; ++ long timeout; ++ int ret = 0; ++ ++ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); ++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); ++ channel = hvsk->channel; ++ ++ while (1) { ++ bool need_refill = !hvsk->recv; ++ ++ if (need_refill) { ++ if (hvsk->peer_shutdown & SEND_SHUTDOWN) ++ can_read = false; ++ else ++ get_ringbuffer_rw_status(channel, &can_read, ++ NULL); ++ } else { ++ can_read = true; ++ } ++ ++ if (can_read) { ++ size_t payload_len; ++ ++ if (need_refill) { ++ ret = hvsock_get_recv_buf(hvsk); ++ if (ret < 0) { ++ if (copied > 0) ++ ret = copied; ++ goto out_wait; ++ } ++ ++ ret = hvsock_recv_data(channel, hvsk, ++ &payload_len); ++ if (ret != 0 || ++ payload_len > sizeof(hvsk->recv->buf)) { ++ ret = -EIO; ++ hvsock_put_recv_buf(hvsk); ++ goto out_wait; ++ } ++ ++ if (payload_len == 0) { ++ ret = copied; ++ hvsock_put_recv_buf(hvsk); ++ hvsk->peer_shutdown |= SEND_SHUTDOWN; ++ break; ++ } ++ ++ hvsk->recv->data_len = payload_len; ++ hvsk->recv->data_offset = 0; ++ } ++ ++ to_read = min_t(size_t, total_to_read, ++ hvsk->recv->data_len); ++ ++ ret = memcpy_to_msg(msg, hvsk->recv->buf + ++ hvsk->recv->data_offset, ++ to_read); ++ if (ret != 0) ++ break; ++ ++ copied += to_read; ++ total_to_read -= to_read; ++ ++ hvsk->recv->data_len -= to_read; ++ ++ if (hvsk->recv->data_len == 0) ++ hvsock_put_recv_buf(hvsk); ++ else ++ hvsk->recv->data_offset += to_read; ++ ++ if (total_to_read == 0) ++ break; ++ } else { ++ if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || ++ (hvsk->peer_shutdown & SEND_SHUTDOWN)) ++ break; ++ ++ /* Don't wait for non-blocking sockets. */ ++ if (timeout == 0) { ++ ret = -EAGAIN; ++ break; ++ } ++ ++ if (copied > 0) ++ break; ++ ++ release_sock(sk); ++ timeout = schedule_timeout(timeout); ++ lock_sock(sk); ++ ++ if (signal_pending(current)) { ++ ret = sock_intr_errno(timeout); ++ break; ++ } else if (timeout == 0) { ++ ret = -EAGAIN; ++ break; ++ } ++ ++ prepare_to_wait(sk_sleep(sk), &wait, ++ TASK_INTERRUPTIBLE); ++ } ++ } ++ ++ if (sk->sk_err) ++ ret = -sk->sk_err; ++ else if (sk->sk_shutdown & RCV_SHUTDOWN) ++ ret = 0; ++ ++ if (copied > 0) ++ ret = copied; ++out_wait: ++ finish_wait(sk_sleep(sk), &wait); ++ return ret; ++} ++ ++static int hvsock_recvmsg(struct socket *sock, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int ret; ++ ++ lock_sock(sk); ++ ++ if (sk->sk_state != SS_CONNECTED) { ++ /* Recvmsg is supposed to return 0 if a peer performs an ++ * orderly shutdown. Differentiate between that case and when a ++ * peer has not connected or a local shutdown occurred with the ++ * SOCK_DONE flag. ++ */ ++ if (sock_flag(sk, SOCK_DONE)) ++ ret = 0; ++ else ++ ret = -ENOTCONN; ++ ++ goto out; ++ } ++ ++ /* We ignore msg->addr_name/len. */ ++ if (flags & ~MSG_DONTWAIT) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ /* We don't check peer_shutdown flag here since peer may actually shut ++ * down, but there can be data in the queue that a local socket can ++ * receive. ++ */ ++ if (sk->sk_shutdown & RCV_SHUTDOWN) { ++ ret = 0; ++ goto out; ++ } ++ ++ /* It is valid on Linux to pass in a zero-length receive buffer. This ++ * is not an error. We may as well bail out now. ++ */ ++ if (!len) { ++ ret = 0; ++ goto out; ++ } ++ ++ ret = hvsock_recvmsg_wait(sk, msg, len, flags); ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static const struct proto_ops hvsock_ops = { ++ .family = PF_HYPERV, ++ .owner = THIS_MODULE, ++ .release = hvsock_release, ++ .bind = hvsock_bind, ++ .connect = hvsock_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = hvsock_accept, ++ .getname = hvsock_getname, ++ .poll = hvsock_poll, ++ .ioctl = sock_no_ioctl, ++ .listen = hvsock_listen, ++ .shutdown = hvsock_shutdown, ++ .setsockopt = sock_no_setsockopt, ++ .getsockopt = sock_no_getsockopt, ++ .sendmsg = hvsock_sendmsg, ++ .recvmsg = hvsock_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = sock_no_sendpage, ++}; ++ ++static int hvsock_create_sock(struct net *net, struct socket *sock, ++ int protocol, int kern) ++{ ++ struct sock *sk; ++ ++ if (protocol != 0 && protocol != SHV_PROTO_RAW) ++ return -EPROTONOSUPPORT; ++ ++ switch (sock->type) { ++ case SOCK_STREAM: ++ sock->ops = &hvsock_ops; ++ break; ++ default: ++ return -ESOCKTNOSUPPORT; ++ } ++ ++ sock->state = SS_UNCONNECTED; ++ ++ sk = hvsock_create(net, sock, GFP_KERNEL, 0); ++ return sk ? 0 : -ENOMEM; ++} ++ ++static const struct net_proto_family hvsock_family_ops = { ++ .family = AF_HYPERV, ++ .create = hvsock_create_sock, ++ .owner = THIS_MODULE, ++}; ++ ++static int hvsock_probe(struct hv_device *hdev, ++ const struct hv_vmbus_device_id *dev_id) ++{ ++ struct vmbus_channel *channel = hdev->channel; ++ ++ /* We ignore the error return code to suppress the unnecessary ++ * error message in vmbus_probe(): on error the host will rescind ++ * the offer in 30 seconds and we can do cleanup at that time. ++ */ ++ (void)hvsock_open_connection(channel); ++ ++ return 0; ++} ++ ++static int hvsock_remove(struct hv_device *hdev) ++{ ++ struct vmbus_channel *channel = hdev->channel; ++ ++ vmbus_close(channel); ++ ++ return 0; ++} ++ ++/* It's not really used. See vmbus_match() and vmbus_probe(). */ ++static const struct hv_vmbus_device_id id_table[] = { ++ {}, ++}; ++ ++static struct hv_driver hvsock_drv = { ++ .name = "hv_sock", ++ .hvsock = true, ++ .id_table = id_table, ++ .probe = hvsock_probe, ++ .remove = hvsock_remove, ++}; ++ ++static int __init hvsock_init(void) ++{ ++ int ret; ++ ++ if (vmbus_proto_version < VERSION_WIN10) ++ return -ENODEV; ++ ++ ret = vmbus_driver_register(&hvsock_drv); ++ if (ret) { ++ pr_err("failed to register hv_sock driver\n"); ++ return ret; ++ } ++ ++ ret = proto_register(&hvsock_proto, 0); ++ if (ret) { ++ pr_err("failed to register protocol\n"); ++ goto unreg_hvsock_drv; ++ } ++ ++ ret = sock_register(&hvsock_family_ops); ++ if (ret) { ++ pr_err("failed to register address family\n"); ++ goto unreg_proto; ++ } ++ ++ return 0; ++ ++unreg_proto: ++ proto_unregister(&hvsock_proto); ++unreg_hvsock_drv: ++ vmbus_driver_unregister(&hvsock_drv); ++ return ret; ++} ++ ++static void __exit hvsock_exit(void) ++{ ++ sock_unregister(AF_HYPERV); ++ proto_unregister(&hvsock_proto); ++ vmbus_driver_unregister(&hvsock_drv); ++} ++ ++module_init(hvsock_init); ++module_exit(hvsock_exit); ++ ++MODULE_DESCRIPTION("Hyper-V Sockets"); ++MODULE_LICENSE("Dual BSD/GPL"); +-- +2.10.2 + diff --git a/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch b/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch new file mode 100644 index 000000000..914a2576a --- /dev/null +++ b/alpine/kernel/patches-aufs/0004-vmbus-Don-t-spam-the-logs-with-unknown-GUIDs.patch @@ -0,0 +1,30 @@ +From e8c7a6dee61819c36b77108bc2cddafde26b9876 Mon Sep 17 00:00:00 2001 +From: Rolf Neugebauer +Date: Mon, 23 May 2016 18:55:45 +0100 +Subject: [PATCH 4/5] vmbus: Don't spam the logs with unknown GUIDs + +With Hyper-V sockets device types are introduced on the fly. The pr_info() +then prints a message on every connection, which is way too verbose. Since +there doesn't seem to be an easy way to check for registered services, +disable the pr_info() completely. + +Signed-off-by: Rolf Neugebauer +--- + drivers/hv/channel_mgmt.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 8f4e6070..ef4a512 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -147,7 +147,6 @@ static u16 hv_get_dev_type(const uuid_le *guid) + if (!uuid_le_cmp(*guid, vmbus_devs[i].guid)) + return i; + } +- pr_info("Unknown GUID: %pUl\n", guid); + return i; + } + +-- +2.10.2 +