diff --git a/kernel/patches-4.12.x/0001-vmbus-vmbus_open-reset-onchannel_callback-on-error.patch b/kernel/patches-4.12.x/0001-vmbus-vmbus_open-reset-onchannel_callback-on-error.patch new file mode 100644 index 000000000..a5cd07e2a --- /dev/null +++ b/kernel/patches-4.12.x/0001-vmbus-vmbus_open-reset-onchannel_callback-on-error.patch @@ -0,0 +1,34 @@ +From 516c2116aa11f4ea6cf09aa3a195951509d429ff Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:31:50 -0600 +Subject: [PATCH 01/15] vmbus: vmbus_open(): reset onchannel_callback on error + +No real issue is observed without the patch, but let's add this +just in case. + +Signed-off-by: Dexuan Cui +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit b20ffd850ad9763a10b563680ab4aa0c1c8894ac) +--- + drivers/hv/channel.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 3cea1216754e..6ef18e9e3eca 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -220,6 +220,8 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + get_order(send_ringbuffer_size + recv_ringbuffer_size)); + error_set_chnstate: + newchannel->state = CHANNEL_OPEN_STATE; ++ newchannel->onchannel_callback = NULL; ++ newchannel->channel_callback_context = NULL; + return err; + } + EXPORT_SYMBOL_GPL(vmbus_open); +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0002-vmbus-remove-goto-error_clean_msglist-in-vmbus_open.patch b/kernel/patches-4.12.x/0002-vmbus-remove-goto-error_clean_msglist-in-vmbus_open.patch new file mode 100644 index 000000000..826fd992d --- /dev/null +++ b/kernel/patches-4.12.x/0002-vmbus-remove-goto-error_clean_msglist-in-vmbus_open.patch @@ -0,0 +1,63 @@ +From 7dfdecc1c016087532580bb9a51e19fd2af4136d Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:31:53 -0600 +Subject: [PATCH 02/15] vmbus: remove "goto error_clean_msglist" in + 
vmbus_open() + +This is just a cleanup patch to simplify the code a little. +No semantic change. + +Signed-off-by: Dexuan Cui +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 981ae0c39f4a01eaa8e9db4b0e431028005ea1e0) +--- + drivers/hv/channel.c | 18 +++++++----------- + 1 file changed, 7 insertions(+), 11 deletions(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 6ef18e9e3eca..ee1e87eec16f 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -180,17 +180,18 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + ret = vmbus_post_msg(open_msg, + sizeof(struct vmbus_channel_open_channel), true); + +- if (ret != 0) { +- err = ret; +- goto error_clean_msglist; +- } +- +- wait_for_completion(&open_info->waitevent); ++ if (ret == 0) ++ wait_for_completion(&open_info->waitevent); + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&open_info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + ++ if (ret != 0) { ++ err = ret; ++ goto error_free_gpadl; ++ } ++ + if (newchannel->rescind) { + err = -ENODEV; + goto error_free_gpadl; +@@ -205,11 +206,6 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + kfree(open_info); + return 0; + +-error_clean_msglist: +- spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +- list_del(&open_info->msglistentry); +- spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +- + error_free_gpadl: + vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle); + kfree(open_info); +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0003-vmbus-dynamically-enqueue-dequeue-a-channel-on-vmbus.patch b/kernel/patches-4.12.x/0003-vmbus-dynamically-enqueue-dequeue-a-channel-on-vmbus.patch new file mode 100644 index 000000000..7b80e4334 --- /dev/null +++ 
b/kernel/patches-4.12.x/0003-vmbus-dynamically-enqueue-dequeue-a-channel-on-vmbus.patch @@ -0,0 +1,189 @@ +From 1854541966d93dc3e73deb8b262943781949e52d Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Fri, 5 May 2017 16:57:23 -0600 +Subject: [PATCH 03/15] vmbus: dynamically enqueue/dequeue a channel on + vmbus_open/close +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +A just-closed channel may have a pending interrupt, and later when a new +channel with the same channel ID is not being fully initialized, the +pending interrupt of the previous channel with the same channel ID can run +the channel callback on the new channel data structure, causing a crash +of NULL pointer dereferencing. + +Normally it’s pretty hard to reproduce the race condition, but it can +indeed happen with specially-designed hv_sock stress test cases. + +Signed-off-by: Dexuan Cui +Reported-by: Rolf Neugebauer +Tested-by: Rolf Neugebauer +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 1df677b35ff010d0def33f5420773015815cf843) +--- + drivers/hv/channel.c | 12 +++++++++--- + drivers/hv/channel_mgmt.c | 50 +++++++++++++++++++++-------------------------- + include/linux/hyperv.h | 3 +++ + 3 files changed, 34 insertions(+), 31 deletions(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index ee1e87eec16f..663e3d78247a 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -177,6 +177,8 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + &vmbus_connection.chn_msg_list); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + ++ hv_percpu_channel_enq(newchannel); ++ + ret = vmbus_post_msg(open_msg, + sizeof(struct vmbus_channel_open_channel), true); + +@@ -189,23 +191,25 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + + if (ret != 0) { + err = ret; +- goto error_free_gpadl; 
++ goto error_deq_channel; + } + + if (newchannel->rescind) { + err = -ENODEV; +- goto error_free_gpadl; ++ goto error_deq_channel; + } + + if (open_info->response.open_result.status) { + err = -EAGAIN; +- goto error_free_gpadl; ++ goto error_deq_channel; + } + + newchannel->state = CHANNEL_OPENED_STATE; + kfree(open_info); + return 0; + ++error_deq_channel: ++ hv_percpu_channel_deq(newchannel); + error_free_gpadl: + vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle); + kfree(open_info); +@@ -551,6 +555,8 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + goto out; + } + ++ hv_percpu_channel_deq(channel); ++ + channel->state = CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + /* Stop callback and cancel the timer asap */ +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 735f9363f2e4..f4dda5ecfa0b 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -363,6 +363,17 @@ static void percpu_channel_enq(void *arg) + list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); + } + ++void hv_percpu_channel_enq(struct vmbus_channel *channel) ++{ ++ if (channel->target_cpu != get_cpu()) ++ smp_call_function_single(channel->target_cpu, ++ percpu_channel_enq, channel, true); ++ else ++ percpu_channel_enq(channel); ++ ++ put_cpu(); ++} ++ + static void percpu_channel_deq(void *arg) + { + struct vmbus_channel *channel = arg; +@@ -370,6 +381,17 @@ static void percpu_channel_deq(void *arg) + list_del_rcu(&channel->percpu_list); + } + ++void hv_percpu_channel_deq(struct vmbus_channel *channel) ++{ ++ if (channel->target_cpu != get_cpu()) ++ smp_call_function_single(channel->target_cpu, ++ percpu_channel_deq, channel, true); ++ else ++ percpu_channel_deq(channel); ++ ++ put_cpu(); ++} ++ + + static void vmbus_release_relid(u32 relid) + { +@@ -390,15 +412,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) + BUG_ON(!channel->rescind); + 
BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + +- if (channel->target_cpu != get_cpu()) { +- put_cpu(); +- smp_call_function_single(channel->target_cpu, +- percpu_channel_deq, channel, true); +- } else { +- percpu_channel_deq(channel); +- put_cpu(); +- } +- + if (channel->primary_channel == NULL) { + list_del(&channel->listentry); + +@@ -491,16 +504,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + + init_vp_index(newchannel, dev_type); + +- if (newchannel->target_cpu != get_cpu()) { +- put_cpu(); +- smp_call_function_single(newchannel->target_cpu, +- percpu_channel_enq, +- newchannel, true); +- } else { +- percpu_channel_enq(newchannel); +- put_cpu(); +- } +- + /* + * This state is used to indicate a successful open + * so that when we do close the channel normally, we +@@ -549,15 +552,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + list_del(&newchannel->listentry); + mutex_unlock(&vmbus_connection.channel_mutex); + +- if (newchannel->target_cpu != get_cpu()) { +- put_cpu(); +- smp_call_function_single(newchannel->target_cpu, +- percpu_channel_deq, newchannel, true); +- } else { +- percpu_channel_deq(newchannel); +- put_cpu(); +- } +- + vmbus_release_relid(newchannel->offermsg.child_relid); + + err_free_chan: +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index e09fc8290c2f..b6975010e798 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1454,6 +1454,9 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, + const int *srv_version, int srv_vercnt, + int *nego_fw_version, int *nego_srv_version); + ++void hv_percpu_channel_enq(struct vmbus_channel *channel); ++void hv_percpu_channel_deq(struct vmbus_channel *channel); ++ + void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); + + void vmbus_setevent(struct vmbus_channel *channel); +-- +2.14.1 + diff --git 
a/kernel/patches-4.12.x/0004-hv_sock-implements-Hyper-V-transport-for-Virtual-Soc.patch b/kernel/patches-4.12.x/0004-hv_sock-implements-Hyper-V-transport-for-Virtual-Soc.patch new file mode 100644 index 000000000..64fb39eba --- /dev/null +++ b/kernel/patches-4.12.x/0004-hv_sock-implements-Hyper-V-transport-for-Virtual-Soc.patch @@ -0,0 +1,935 @@ +From e317beafc5bf9165f1f604d8c201bc0d09a884b5 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:31:56 -0600 +Subject: [PATCH 04/15] hv_sock: implements Hyper-V transport for Virtual + Sockets (AF_VSOCK) + +Hyper-V Sockets (hv_sock) supplies a byte-stream based communication +mechanism between the host and the guest. It uses VMBus ringbuffer as the +transportation layer. + +With hv_sock, applications between the host (Windows 10, Windows Server +2016 or newer) and the guest can talk with each other using the traditional +socket APIs. + +More info about Hyper-V Sockets is available here: + +"Make your own integration services": +https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-guide/make-integration-service + +The patch implements the necessary support in Linux guest by introducing a new +vsock transport for AF_VSOCK. + +Signed-off-by: Dexuan Cui +Cc: K. Y. 
Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Cc: Andy King +Cc: Dmitry Torokhov +Cc: George Zhang +Cc: Jorgen Hansen +Cc: Reilly Grant +Cc: Asias He +Cc: Stefan Hajnoczi +Cc: Vitaly Kuznetsov +Cc: Cathy Avery +Cc: Rolf Neugebauer +Cc: Marcelo Cerri +Origin: git@github.com:dcui/linux.git +(cherry picked from commit ca99361977429c3128fce64dcfe1093e4ab65247) +--- + MAINTAINERS | 1 + + net/vmw_vsock/Kconfig | 12 + + net/vmw_vsock/Makefile | 3 + + net/vmw_vsock/hyperv_transport.c | 829 +++++++++++++++++++++++++++++++++++++++ + 4 files changed, 845 insertions(+) + create mode 100644 net/vmw_vsock/hyperv_transport.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 767e9d202adf..db1209547aa4 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -6178,6 +6178,7 @@ F: drivers/net/hyperv/ + F: drivers/scsi/storvsc_drv.c + F: drivers/uio/uio_hv_generic.c + F: drivers/video/fbdev/hyperv_fb.c ++F: net/vmw_vsock/hyperv_transport.c + F: include/linux/hyperv.h + F: tools/hv/ + F: Documentation/ABI/stable/sysfs-bus-vmbus +diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig +index 8831e7c42167..a24369d175fd 100644 +--- a/net/vmw_vsock/Kconfig ++++ b/net/vmw_vsock/Kconfig +@@ -46,3 +46,15 @@ config VIRTIO_VSOCKETS_COMMON + This option is selected by any driver which needs to access + the virtio_vsock. The module will be called + vmw_vsock_virtio_transport_common. ++ ++config HYPERV_VSOCKETS ++ tristate "Hyper-V transport for Virtual Sockets" ++ depends on VSOCKETS && HYPERV ++ help ++ This module implements a Hyper-V transport for Virtual Sockets. ++ ++ Enable this transport if your Virtual Machine host supports Virtual ++ Sockets over Hyper-V VMBus. ++ ++ To compile this driver as a module, choose M here: the module will be ++ called hv_sock. If unsure, say N. 
+diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile +index 09fc2eb29dc8..e63d574234a9 100644 +--- a/net/vmw_vsock/Makefile ++++ b/net/vmw_vsock/Makefile +@@ -2,6 +2,7 @@ obj-$(CONFIG_VSOCKETS) += vsock.o + obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o + obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o + obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o ++obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o + + vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o + +@@ -11,3 +12,5 @@ vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ + vmw_vsock_virtio_transport-y += virtio_transport.o + + vmw_vsock_virtio_transport_common-y += virtio_transport_common.o ++ ++hv_sock-y += hyperv_transport.o +diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c +new file mode 100644 +index 000000000000..fd89bf357617 +--- /dev/null ++++ b/net/vmw_vsock/hyperv_transport.c +@@ -0,0 +1,829 @@ ++/* ++ * Hyper-V transport for vsock ++ * ++ * Hyper-V Sockets supplies a byte-stream based communication mechanism ++ * between the host and the VM. This driver implements the necessary ++ * support in the VM by introducing the new vsock transport. ++ * ++ * Copyright (c) 2017, Microsoft Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. 
++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* The host side's design of the feature requires 6 exact 4KB pages for ++ * recv/send rings respectively -- this is suboptimal considering memory ++ * consumption, however unluckily we have to live with it, before the ++ * host comes up with a better design in the future. ++ */ ++#define PAGE_SIZE_4K 4096 ++#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) ++#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) ++ ++/* The MTU is 16KB per the host side's design */ ++#define HVS_MTU_SIZE (1024 * 16) ++ ++struct vmpipe_proto_header { ++ u32 pkt_type; ++ u32 data_size; ++}; ++ ++/* For recv, we use the VMBus in-place packet iterator APIs to directly copy ++ * data from the ringbuffer into the userspace buffer. ++ */ ++struct hvs_recv_buf { ++ /* The header before the payload data */ ++ struct vmpipe_proto_header hdr; ++ ++ /* The payload */ ++ u8 data[HVS_MTU_SIZE]; ++}; ++ ++/* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use ++ * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated ++ * buffer, because tests show there is no significant performance difference. ++ * ++ * Note: the buffer can be eliminated in the future when we add new VMBus ++ * ringbuffer APIs that allow us to directly copy data from userspace buffer ++ * to VMBus ringbuffer. ++ */ ++#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) ++ ++struct hvs_send_buf { ++ /* The header before the payload data */ ++ struct vmpipe_proto_header hdr; ++ ++ /* The payload */ ++ u8 data[HVS_SEND_BUF_SIZE]; ++}; ++ ++#define HVS_HEADER_LEN (sizeof(struct vmpacket_descriptor) + \ ++ sizeof(struct vmpipe_proto_header)) ++ ++/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write(), and ++ * __hv_pkt_iter_next(). 
++ */ ++#define VMBUS_PKT_TRAILER (sizeof(u64)) ++ ++#define HVS_PKT_LEN(payload_len) (HVS_HEADER_LEN + \ ++ ALIGN((payload_len), 8) + \ ++ VMBUS_PKT_TRAILER) ++ ++/* Per-socket state (accessed via vsk->trans) */ ++struct hvsock { ++ struct vsock_sock *vsk; ++ ++ uuid_le vm_srv_id; ++ uuid_le host_srv_id; ++ ++ struct vmbus_channel *chan; ++ struct vmpacket_descriptor *recv_desc; ++ ++ /* The length of the payload not delivered to userland yet */ ++ u32 recv_data_len; ++ /* The offset of the payload */ ++ u32 recv_data_off; ++}; ++ ++/* In the VM, we support Hyper-V Sockets with AF_VSOCK, and the endpoint is ++ * <cid, port> (see struct sockaddr_vm). Note: cid is not really used here: ++ * when we write apps to connect to the host, we can only use VMADDR_CID_ANY ++ * or VMADDR_CID_HOST (both are equivalent) as the remote cid, and when we ++ * write apps to bind() & listen() in the VM, we can only use VMADDR_CID_ANY ++ * as the local cid. ++ * ++ * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: ++ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- ++ * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with ++ * the below sockaddr: ++ * ++ * struct SOCKADDR_HV ++ * { ++ * ADDRESS_FAMILY Family; ++ * USHORT Reserved; ++ * GUID VmId; ++ * GUID ServiceId; ++ * }; ++ * Note: VmID is not used by Linux VM and actually it isn't transmitted via ++ * VMBus, because here it's obvious the host and the VM can easily identify ++ * each other. Though the VmID is useful on the host, especially in the case ++ * of Windows container, Linux VM doesn't need it at all. ++ * ++ * To make use of the AF_VSOCK infrastructure in Linux VM, we have to limit ++ * the available GUID space of SOCKADDR_HV so that we can create a mapping ++ * between AF_VSOCK port and SOCKADDR_HV Service GUID.
The rule of writing ++ * Hyper-V Sockets apps on the host and in Linux VM is: ++ * ++ **************************************************************************** ++ * the only valid Service GUIDs, from the perspectives of both the host and * ++ * Linux VM, that can be connected by the other end, must conform to this * ++ * format: <port>-facb-11e6-bd58-64006a7986d3, and the "port" must be in * ++ * this range [0, 0x7FFFFFFF]. * ++ **************************************************************************** ++ * ++ * When we write apps on the host to connect(), the GUID ServiceID is used. ++ * When we write apps in Linux VM to connect(), we only need to specify the ++ * port and the driver will form the GUID and use that to request the host. ++ * ++ * From the perspective of Linux VM: ++ * 1. the local ephemeral port (i.e. the local auto-bound port when we call ++ * connect() without explicit bind()) is generated by __vsock_bind_stream(), ++ * and the range is [1024, 0xFFFFFFFF). ++ * 2. the remote ephemeral port (i.e. the auto-generated remote port for ++ * a connect request initiated by the host's connect()) is generated by ++ * hvs_remote_addr_init() and the range is [0x80000000, 0xFFFFFFFF).
++ */ ++ ++#define MAX_LISTEN_PORT ((u32)0x7FFFFFFF) ++#define MAX_VM_LISTEN_PORT MAX_LISTEN_PORT ++#define MAX_HOST_LISTEN_PORT MAX_LISTEN_PORT ++#define MIN_HOST_EPHEMERAL_PORT (MAX_HOST_LISTEN_PORT + 1) ++ ++/* 00000000-facb-11e6-bd58-64006a7986d3 */ ++static const uuid_le srv_id_template = ++ UUID_LE(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, ++ 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); ++ ++static inline bool is_valid_srv_id(const uuid_le *id) ++{ ++ return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(uuid_le) - 4); ++} ++ ++static inline unsigned int get_port_by_srv_id(const uuid_le *svr_id) ++{ ++ return *((unsigned int *)svr_id); ++} ++ ++static inline void hvs_addr_init(struct sockaddr_vm *addr, ++ const uuid_le *svr_id) ++{ ++ unsigned int port = get_port_by_srv_id(svr_id); ++ ++ vsock_addr_init(addr, VMADDR_CID_ANY, port); ++} ++ ++static inline void hvs_remote_addr_init(struct sockaddr_vm *remote, ++ struct sockaddr_vm *local) ++{ ++ static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; ++ struct sock *sk; ++ ++ vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY); ++ ++ while (1) { ++ /* Wrap around ? */ ++ if (host_ephemeral_port < MIN_HOST_EPHEMERAL_PORT || ++ host_ephemeral_port == VMADDR_PORT_ANY) ++ host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; ++ ++ remote->svm_port = host_ephemeral_port++; ++ ++ sk = vsock_find_connected_socket(remote, local); ++ if (!sk) { ++ /* Found an available ephemeral port */ ++ return; ++ } ++ ++ /* Release refcnt got in vsock_find_connected_socket */ ++ sock_put(sk); ++ } ++} ++ ++static bool hvs_channel_readable(struct vmbus_channel *chan) ++{ ++ u32 readable = hv_get_bytes_to_read(&chan->inbound); ++ ++ /* 0-size payload means FIN */ ++ return readable >= HVS_PKT_LEN(0); ++} ++ ++static int hvs_channel_readable_payload(struct vmbus_channel *chan) ++{ ++ u32 readable = hv_get_bytes_to_read(&chan->inbound); ++ ++ if (readable > HVS_PKT_LEN(0)) { ++ /* At least we have 1 byte to read. 
We don't need to return ++ * the exact readable bytes: see vsock_stream_recvmsg() -> ++ * vsock_stream_has_data(). ++ */ ++ return 1; ++ } ++ ++ if (readable == HVS_PKT_LEN(0)) { ++ /* 0-size payload means FIN */ ++ return 0; ++ } ++ ++ /* No payload or FIN */ ++ return -1; ++} ++ ++static inline size_t hvs_channel_writable_bytes(struct vmbus_channel *chan) ++{ ++ u32 writeable = hv_get_bytes_to_write(&chan->outbound); ++ size_t ret; ++ ++ /* The ringbuffer mustn't be 100% full, and we should reserve a ++ * zero-length-payload packet for the FIN: see hv_ringbuffer_write() ++ * and hvs_shutdown(). ++ */ ++ if (writeable <= HVS_PKT_LEN(1) + HVS_PKT_LEN(0)) ++ return 0; ++ ++ ret = writeable - HVS_PKT_LEN(1) - HVS_PKT_LEN(0); ++ ++ return round_down(ret, 8); ++} ++ ++static int hvs_send_data(struct vmbus_channel *chan, ++ struct hvs_send_buf *send_buf, size_t to_write) ++{ ++ send_buf->hdr.pkt_type = 1; ++ send_buf->hdr.data_size = to_write; ++ return vmbus_sendpacket(chan, &send_buf->hdr, ++ sizeof(send_buf->hdr) + to_write, ++ 0, VM_PKT_DATA_INBAND, 0); ++} ++ ++static void hvs_channel_cb(void *ctx) ++{ ++ struct sock *sk = (struct sock *)ctx; ++ struct vsock_sock *vsk = vsock_sk(sk); ++ struct hvsock *hvs = vsk->trans; ++ struct vmbus_channel *chan = hvs->chan; ++ ++ if (hvs_channel_readable(chan)) ++ sk->sk_data_ready(sk); ++ ++ /* Mark it writable only if there is enough space */ ++ if (hvs_channel_writable_bytes(chan) >= HVS_SEND_BUF_SIZE) ++ sk->sk_write_space(sk); ++} ++ ++static void hvs_close_connection(struct vmbus_channel *chan) ++{ ++ struct sock *sk = get_per_channel_state(chan); ++ struct vsock_sock *vsk = vsock_sk(sk); ++ ++ sk->sk_state = SS_UNCONNECTED; ++ sock_set_flag(sk, SOCK_DONE); ++ vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; ++ ++ sk->sk_state_change(sk); ++} ++ ++static void hvs_open_connection(struct vmbus_channel *chan) ++{ ++ uuid_le *if_instance, *if_type; ++ unsigned char conn_from_host; ++ ++ struct sockaddr_vm addr; ++ struct 
sock *sk, *new = NULL; ++ struct vsock_sock *vnew; ++ struct hvsock *hvs, *hvs_new; ++ int ret; ++ ++ if_type = &chan->offermsg.offer.if_type; ++ if_instance = &chan->offermsg.offer.if_instance; ++ conn_from_host = chan->offermsg.offer.u.pipe.user_def[0]; ++ ++ /* The host or the VM should only listen on a port in ++ * [0, MAX_LISTEN_PORT] ++ */ ++ if (!is_valid_srv_id(if_type) || ++ get_port_by_srv_id(if_type) > MAX_LISTEN_PORT) ++ return; ++ ++ hvs_addr_init(&addr, conn_from_host ? if_type : if_instance); ++ sk = vsock_find_bound_socket(&addr); ++ if (!sk) ++ return; ++ ++ if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || ++ (!conn_from_host && sk->sk_state != SS_CONNECTING)) ++ goto out; ++ ++ if (conn_from_host) { ++ if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) ++ goto out; ++ ++ new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, ++ sk->sk_type, 0); ++ if (!new) ++ goto out; ++ ++ new->sk_state = SS_CONNECTING; ++ vnew = vsock_sk(new); ++ hvs_new = vnew->trans; ++ hvs_new->chan = chan; ++ } else { ++ hvs = vsock_sk(sk)->trans; ++ hvs->chan = chan; ++ } ++ ++ set_channel_read_mode(chan, HV_CALL_DIRECT); ++ ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE, ++ RINGBUFFER_HVS_RCV_SIZE, NULL, 0, ++ hvs_channel_cb, conn_from_host ? new : sk); ++ if (ret != 0) { ++ if (conn_from_host) { ++ hvs_new->chan = NULL; ++ sock_put(new); ++ } else { ++ hvs->chan = NULL; ++ } ++ goto out; ++ } ++ ++ set_per_channel_state(chan, conn_from_host ? 
new : sk); ++ vmbus_set_chn_rescind_callback(chan, hvs_close_connection); ++ ++ /* See hvs_channel_cb() and hvs_notify_poll_out() */ ++ set_channel_pending_send_size(chan, ++ HVS_PKT_LEN(HVS_SEND_BUF_SIZE) + 1); ++ ++ if (conn_from_host) { ++ new->sk_state = SS_CONNECTED; ++ sk->sk_ack_backlog++; ++ ++ hvs_addr_init(&vnew->local_addr, if_type); ++ hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); ++ ++ hvs_new->vm_srv_id = *if_type; ++ hvs_new->host_srv_id = *if_instance; ++ ++ vsock_insert_connected(vnew); ++ vsock_enqueue_accept(sk, new); ++ } else { ++ sk->sk_state = SS_CONNECTED; ++ sk->sk_socket->state = SS_CONNECTED; ++ ++ vsock_insert_connected(vsock_sk(sk)); ++ } ++ ++ sk->sk_state_change(sk); ++ ++out: ++ /* Release refcnt obtained when we called vsock_find_bound_socket() */ ++ sock_put(sk); ++} ++ ++static u32 hvs_get_local_cid(void) ++{ ++ return VMADDR_CID_ANY; ++} ++ ++static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) ++{ ++ struct hvsock *hvs; ++ ++ hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); ++ if (!hvs) ++ return -ENOMEM; ++ ++ vsk->trans = hvs; ++ hvs->vsk = vsk; ++ ++ return 0; ++} ++ ++static int hvs_connect(struct vsock_sock *vsk) ++{ ++ struct hvsock *h = vsk->trans; ++ ++ h->vm_srv_id = srv_id_template; ++ h->host_srv_id = srv_id_template; ++ ++ *((u32 *)&h->vm_srv_id) = vsk->local_addr.svm_port; ++ *((u32 *)&h->host_srv_id) = vsk->remote_addr.svm_port; ++ ++ return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id); ++} ++ ++static int hvs_shutdown(struct vsock_sock *vsk, int mode) ++{ ++ struct vmpipe_proto_header hdr; ++ struct hvs_send_buf *send_buf; ++ struct hvsock *hvs; ++ ++ if (!(mode & SEND_SHUTDOWN)) ++ return 0; ++ ++ hvs = vsk->trans; ++ ++ send_buf = (struct hvs_send_buf *)&hdr; ++ ++ /* It can't fail: see hvs_channel_writable_bytes(). 
*/ ++ (void)hvs_send_data(hvs->chan, send_buf, 0); ++ ++ return 0; ++} ++ ++static void hvs_release(struct vsock_sock *vsk) ++{ ++ struct hvsock *hvs = vsk->trans; ++ struct vmbus_channel *chan = hvs->chan; ++ ++ if (chan) ++ hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN); ++ ++ vsock_remove_sock(vsk); ++} ++ ++static void hvs_destruct(struct vsock_sock *vsk) ++{ ++ struct hvsock *hvs = vsk->trans; ++ struct vmbus_channel *chan = hvs->chan; ++ ++ if (chan) ++ vmbus_hvsock_device_unregister(chan); ++ ++ kfree(hvs); ++} ++ ++static int hvs_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static int hvs_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static int hvs_dgram_enqueue(struct vsock_sock *vsk, ++ struct sockaddr_vm *remote, struct msghdr *msg, ++ size_t dgram_len) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static bool hvs_dgram_allow(u32 cid, u32 port) ++{ ++ return false; ++} ++ ++static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct hvsock *hvs = vsk->trans; ++ bool need_refill = !hvs->recv_desc; ++ struct hvs_recv_buf *recv_buf; ++ u32 payload_len, to_read; ++ int ret; ++ ++ if (flags & MSG_PEEK) ++ return -EOPNOTSUPP; ++ ++ if (need_refill) { ++ hvs->recv_desc = hv_pkt_iter_first(hvs->chan); ++ recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); ++ ++ payload_len = recv_buf->hdr.data_size; ++ if (payload_len == 0 || payload_len > HVS_MTU_SIZE) ++ return -EIO; ++ ++ hvs->recv_data_len = payload_len; ++ hvs->recv_data_off = 0; ++ } else { ++ recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); ++ } ++ ++ to_read = min_t(u32, len, hvs->recv_data_len); ++ ret = memcpy_to_msg(msg, recv_buf->data + hvs->recv_data_off, to_read); ++ if (ret != 0) ++ return ret; ++ ++ hvs->recv_data_len -= to_read; ++ ++ if (hvs->recv_data_len == 0) ++ hvs->recv_desc = hv_pkt_iter_next(hvs->chan, 
hvs->recv_desc); ++ else ++ hvs->recv_data_off += to_read; ++ ++ return to_read; ++} ++ ++static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, ++ size_t len) ++{ ++ struct hvsock *hvs = vsk->trans; ++ struct vmbus_channel *chan = hvs->chan; ++ struct hvs_send_buf *send_buf; ++ size_t to_write, max_writable, ret; ++ ++ BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); ++ ++ send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); ++ if (!send_buf) ++ return -ENOMEM; ++ ++ max_writable = hvs_channel_writable_bytes(chan); ++ to_write = min_t(size_t, len, max_writable); ++ to_write = min_t(size_t, to_write, HVS_SEND_BUF_SIZE); ++ ++ ret = memcpy_from_msg(send_buf->data, msg, to_write); ++ if (ret < 0) ++ goto out; ++ ++ ret = hvs_send_data(hvs->chan, send_buf, to_write); ++ if (ret < 0) ++ goto out; ++ ++ ret = to_write; ++out: ++ kfree(send_buf); ++ return ret; ++} ++ ++static s64 hvs_stream_has_data(struct vsock_sock *vsk) ++{ ++ struct hvsock *hvs = vsk->trans; ++ s64 ret; ++ ++ switch (hvs_channel_readable_payload(hvs->chan)) { ++ case 1: ++ ret = 1; ++ break; ++ case 0: ++ vsk->peer_shutdown |= SEND_SHUTDOWN; ++ ret = 0; ++ break; ++ default: /* -1 */ ++ ret = 0; ++ break; ++ } ++ ++ return ret; ++} ++ ++static s64 hvs_stream_has_space(struct vsock_sock *vsk) ++{ ++ struct hvsock *hvs = vsk->trans; ++ ++ return hvs_channel_writable_bytes(hvs->chan); ++} ++ ++static u64 hvs_stream_rcvhiwat(struct vsock_sock *vsk) ++{ ++ return HVS_MTU_SIZE + 1; ++} ++ ++static bool hvs_stream_is_active(struct vsock_sock *vsk) ++{ ++ struct hvsock *hvs = vsk->trans; ++ ++ return hvs->chan != NULL; ++} ++ ++static bool hvs_stream_allow(u32 cid, u32 port) ++{ ++ static const u32 valid_cids[] = { ++ VMADDR_CID_ANY, ++ VMADDR_CID_HOST, ++ }; ++ int i; ++ ++ /* The host's port range [MIN_HOST_EPHEMERAL_PORT, 0xFFFFFFFF) is ++ * reserved as ephemeral ports, which are used as the host's ports ++ * when the host initiates connections. 
++ */ ++ if (port > MAX_HOST_LISTEN_PORT) ++ return false; ++ ++ for (i = 0; i < ARRAY_SIZE(valid_cids); i++) { ++ if (cid == valid_cids[i]) ++ return true; ++ } ++ ++ return false; ++} ++ ++static ++int hvs_notify_poll_in(struct vsock_sock *vsk, size_t target, bool *readable) ++{ ++ struct hvsock *hvs = vsk->trans; ++ ++ *readable = hvs_channel_readable(hvs->chan); ++ return 0; ++} ++ ++static ++int hvs_notify_poll_out(struct vsock_sock *vsk, size_t target, bool *writable) ++{ ++ /* Report writable only if there is enough space */ ++ *writable = hvs_stream_has_space(vsk) >= HVS_SEND_BUF_SIZE; ++ ++ return 0; ++} ++ ++static ++int hvs_notify_recv_init(struct vsock_sock *vsk, size_t target, ++ struct vsock_transport_recv_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_recv_pre_block(struct vsock_sock *vsk, size_t target, ++ struct vsock_transport_recv_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_recv_pre_dequeue(struct vsock_sock *vsk, size_t target, ++ struct vsock_transport_recv_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_recv_post_dequeue(struct vsock_sock *vsk, size_t target, ++ ssize_t copied, bool data_read, ++ struct vsock_transport_recv_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_send_init(struct vsock_sock *vsk, ++ struct vsock_transport_send_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_send_pre_block(struct vsock_sock *vsk, ++ struct vsock_transport_send_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_send_pre_enqueue(struct vsock_sock *vsk, ++ struct vsock_transport_send_notify_data *d) ++{ ++ return 0; ++} ++ ++static ++int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, ++ struct vsock_transport_send_notify_data *d) ++{ ++ return 0; ++} ++ ++static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val) ++{ ++ /* Ignored. 
*/ ++} ++ ++static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val) ++{ ++ /* Ignored. */ ++} ++ ++static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val) ++{ ++ /* Ignored. */ ++} ++ ++static u64 hvs_get_buffer_size(struct vsock_sock *vsk) ++{ ++ return -ENOPROTOOPT; ++} ++ ++static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk) ++{ ++ return -ENOPROTOOPT; ++} ++ ++static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk) ++{ ++ return -ENOPROTOOPT; ++} ++ ++static struct vsock_transport hvs_transport = { ++ .get_local_cid = hvs_get_local_cid, ++ ++ .init = hvs_sock_init, ++ .destruct = hvs_destruct, ++ .release = hvs_release, ++ .connect = hvs_connect, ++ .shutdown = hvs_shutdown, ++ ++ .dgram_bind = hvs_dgram_bind, ++ .dgram_dequeue = hvs_dgram_dequeue, ++ .dgram_enqueue = hvs_dgram_enqueue, ++ .dgram_allow = hvs_dgram_allow, ++ ++ .stream_dequeue = hvs_stream_dequeue, ++ .stream_enqueue = hvs_stream_enqueue, ++ .stream_has_data = hvs_stream_has_data, ++ .stream_has_space = hvs_stream_has_space, ++ .stream_rcvhiwat = hvs_stream_rcvhiwat, ++ .stream_is_active = hvs_stream_is_active, ++ .stream_allow = hvs_stream_allow, ++ ++ .notify_poll_in = hvs_notify_poll_in, ++ .notify_poll_out = hvs_notify_poll_out, ++ .notify_recv_init = hvs_notify_recv_init, ++ .notify_recv_pre_block = hvs_notify_recv_pre_block, ++ .notify_recv_pre_dequeue = hvs_notify_recv_pre_dequeue, ++ .notify_recv_post_dequeue = hvs_notify_recv_post_dequeue, ++ .notify_send_init = hvs_notify_send_init, ++ .notify_send_pre_block = hvs_notify_send_pre_block, ++ .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, ++ .notify_send_post_enqueue = hvs_notify_send_post_enqueue, ++ ++ .set_buffer_size = hvs_set_buffer_size, ++ .set_min_buffer_size = hvs_set_min_buffer_size, ++ .set_max_buffer_size = hvs_set_max_buffer_size, ++ .get_buffer_size = hvs_get_buffer_size, ++ .get_min_buffer_size = hvs_get_min_buffer_size, ++ .get_max_buffer_size = hvs_get_max_buffer_size, ++}; ++ 
++static int hvs_probe(struct hv_device *hdev, ++ const struct hv_vmbus_device_id *dev_id) ++{ ++ struct vmbus_channel *chan = hdev->channel; ++ ++ hvs_open_connection(chan); ++ ++ /* Always return success to suppress the unnecessary error message ++ * in vmbus_probe(): on error the host will rescind the device in ++ * 30 seconds and we can do cleanup at that time in ++ * vmbus_onoffer_rescind(). ++ */ ++ return 0; ++} ++ ++static int hvs_remove(struct hv_device *hdev) ++{ ++ struct vmbus_channel *chan = hdev->channel; ++ ++ vmbus_close(chan); ++ ++ return 0; ++} ++ ++/* This isn't really used. See vmbus_match() and vmbus_probe() */ ++static const struct hv_vmbus_device_id id_table[] = { ++ {}, ++}; ++ ++static struct hv_driver hvs_drv = { ++ .name = "hv_sock", ++ .hvsock = true, ++ .id_table = id_table, ++ .probe = hvs_probe, ++ .remove = hvs_remove, ++}; ++ ++static int __init hvs_init(void) ++{ ++ int ret; ++ ++ if (vmbus_proto_version < VERSION_WIN10) ++ return -ENODEV; ++ ++ ret = vmbus_driver_register(&hvs_drv); ++ if (ret != 0) ++ return ret; ++ ++ ret = vsock_core_init(&hvs_transport); ++ if (ret) { ++ vmbus_driver_unregister(&hvs_drv); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __exit hvs_exit(void) ++{ ++ vsock_core_exit(); ++ vmbus_driver_unregister(&hvs_drv); ++} ++ ++module_init(hvs_init); ++module_exit(hvs_exit); ++ ++MODULE_DESCRIPTION("Hyper-V sockets"); ++MODULE_VERSION("1.0.0"); ++MODULE_LICENSE("GPL"); +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0005-VMCI-only-try-to-load-on-VMware-hypervisor.patch b/kernel/patches-4.12.x/0005-VMCI-only-try-to-load-on-VMware-hypervisor.patch new file mode 100644 index 000000000..a2b22f6af --- /dev/null +++ b/kernel/patches-4.12.x/0005-VMCI-only-try-to-load-on-VMware-hypervisor.patch @@ -0,0 +1,64 @@ +From 4250125671ac2be252badfff1783ab421ff9a360 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:31:58 -0600 +Subject: [PATCH 05/15] VMCI: only try to load on VMware hypervisor + 
+Without the patch, vmw_vsock_vmci_transport.ko and vmw_vmci.ko can +automatically load when an application creates an AF_VSOCK socket. + +This is the expected good behavior on VMware hypervisor, but as we +are adding hv_sock.ko (i.e. Hyper-V transport for AF_VSOCK), we should +make sure vmw_vsock_vmci_transport.ko can't load on Hyper-V, otherwise +there is a -EBUSY conflict when both vmw_vsock_vmci_transport.ko and +hv_sock.ko try to call vsock_core_init(). + +On the other hand, hv_sock.ko can only load on Hyper-V, because it +depends on hv_vmbus.ko, which detects Hyper-V in hv_acpi_init(). + +KVM's vsock_virtio_transport doesn't have the issue because it doesn't +define MODULE_ALIAS_NETPROTO(PF_VSOCK). + +Signed-off-by: Dexuan Cui +Cc: Alok Kataria +Cc: Andy King +Cc: Adit Ranadive +Cc: George Zhang +Cc: Jorgen Hansen +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 52d06e92fca7e91c5a774a5491239597f43261a3) +--- + drivers/misc/vmw_vmci/vmci_driver.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/drivers/misc/vmw_vmci/vmci_driver.c b/drivers/misc/vmw_vmci/vmci_driver.c +index d7eaf1eb11e7..1789ea71ff5d 100644 +--- a/drivers/misc/vmw_vmci/vmci_driver.c ++++ b/drivers/misc/vmw_vmci/vmci_driver.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "vmci_driver.h" + #include "vmci_event.h" +@@ -58,6 +59,13 @@ static int __init vmci_drv_init(void) + int vmci_err; + int error; + ++ /* ++ * Check if we are running on VMware's hypervisor and bail out ++ * if we are not. 
++ */ ++ if (x86_hyper != &x86_hyper_vmware) ++ return -ENODEV; ++ + vmci_err = vmci_event_init(); + if (vmci_err < VMCI_SUCCESS) { + pr_err("Failed to initialize VMCIEvent (result=%d)\n", +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0006-hv_sock-add-the-support-of-auto-loading.patch b/kernel/patches-4.12.x/0006-hv_sock-add-the-support-of-auto-loading.patch new file mode 100644 index 000000000..6b1cc28bc --- /dev/null +++ b/kernel/patches-4.12.x/0006-hv_sock-add-the-support-of-auto-loading.patch @@ -0,0 +1,30 @@ +From 3a056b90bb7e257e9edde271720a0676af9ae621 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:32:00 -0600 +Subject: [PATCH 06/15] hv_sock: add the support of auto-loading + +After we disable VMWare virtual sockets driver's auto-loading on Hyper-V, +we can enable hv_sock's auto-loading now. + +Signed-off-by: Dexuan Cui +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 6c9b2ebd7fc79eb00bc524b8eae11a218faa1769) +--- + net/vmw_vsock/hyperv_transport.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c +index fd89bf357617..f465b0b662df 100644 +--- a/net/vmw_vsock/hyperv_transport.c ++++ b/net/vmw_vsock/hyperv_transport.c +@@ -827,3 +827,4 @@ module_exit(hvs_exit); + MODULE_DESCRIPTION("Hyper-V sockets"); + MODULE_VERSION("1.0.0"); + MODULE_LICENSE("GPL"); ++MODULE_ALIAS_NETPROTO(PF_VSOCK); +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0007-hv_sock-fix-a-race-in-hvs_stream_dequeue.patch b/kernel/patches-4.12.x/0007-hv_sock-fix-a-race-in-hvs_stream_dequeue.patch new file mode 100644 index 000000000..deb305be3 --- /dev/null +++ b/kernel/patches-4.12.x/0007-hv_sock-fix-a-race-in-hvs_stream_dequeue.patch @@ -0,0 +1,116 @@ +From 5375fe4bf172da12535c273ea0deec4970f3166f Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:32:03 -0600 +Subject: [PATCH 07/15] hv_sock: fix a 
race in hvs_stream_dequeue() + +If hv_pkt_iter_next() returns a non-NULL pointer, we must update +the recv_data_len/data_off info, otherwise the received data will +be silently dropped, and let's fix hvs_stream_has_data() accordingly. + +Thank Rolf for finding this! + +Reported-by: Rolf Neugebauer +Signed-off-by: Dexuan Cui +Cc: K. Y. Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit f06df7f9be11c903fbb03c03adebb67fd222157d) +--- + net/vmw_vsock/hyperv_transport.c | 50 +++++++++++++++++++++++++++++----------- + 1 file changed, 36 insertions(+), 14 deletions(-) + +diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c +index f465b0b662df..30154836acd0 100644 +--- a/net/vmw_vsock/hyperv_transport.c ++++ b/net/vmw_vsock/hyperv_transport.c +@@ -476,13 +476,33 @@ static bool hvs_dgram_allow(u32 cid, u32 port) + return false; + } + ++static int hvs_update_recv_data(struct hvsock *hvs) ++{ ++ struct hvs_recv_buf *recv_buf; ++ u32 payload_len; ++ ++ recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); ++ payload_len = recv_buf->hdr.data_size; ++ ++ if (payload_len > HVS_MTU_SIZE) ++ return -EIO; ++ ++ if (payload_len == 0) ++ hvs->vsk->peer_shutdown |= SEND_SHUTDOWN; ++ ++ hvs->recv_data_len = payload_len; ++ hvs->recv_data_off = 0; ++ ++ return 0; ++} ++ + static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, + size_t len, int flags) + { + struct hvsock *hvs = vsk->trans; + bool need_refill = !hvs->recv_desc; + struct hvs_recv_buf *recv_buf; +- u32 payload_len, to_read; ++ u32 to_read; + int ret; + + if (flags & MSG_PEEK) +@@ -490,29 +510,28 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, + + if (need_refill) { + hvs->recv_desc = hv_pkt_iter_first(hvs->chan); +- recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); +- +- payload_len = recv_buf->hdr.data_size; +- if (payload_len == 0 || payload_len > HVS_MTU_SIZE) +- 
return -EIO; +- +- hvs->recv_data_len = payload_len; +- hvs->recv_data_off = 0; +- } else { +- recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); ++ ret = hvs_update_recv_data(hvs); ++ if (ret) ++ return ret; + } + ++ recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); + to_read = min_t(u32, len, hvs->recv_data_len); + ret = memcpy_to_msg(msg, recv_buf->data + hvs->recv_data_off, to_read); + if (ret != 0) + return ret; + + hvs->recv_data_len -= to_read; +- +- if (hvs->recv_data_len == 0) ++ if (hvs->recv_data_len == 0) { + hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc); +- else ++ if (hvs->recv_desc) { ++ ret = hvs_update_recv_data(hvs); ++ if (ret) ++ return ret; ++ } ++ } else { + hvs->recv_data_off += to_read; ++ } + + return to_read; + } +@@ -554,6 +573,9 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk) + struct hvsock *hvs = vsk->trans; + s64 ret; + ++ if (hvs->recv_data_len > 0) ++ return 1; ++ + switch (hvs_channel_readable_payload(hvs->chan)) { + case 1: + ret = 1; +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0008-vsock-fix-vsock_dequeue-enqueue_accept-race.patch b/kernel/patches-4.12.x/0008-vsock-fix-vsock_dequeue-enqueue_accept-race.patch new file mode 100644 index 000000000..cee4bc2a5 --- /dev/null +++ b/kernel/patches-4.12.x/0008-vsock-fix-vsock_dequeue-enqueue_accept-race.patch @@ -0,0 +1,54 @@ +From acb25be7f0f2379a7e1b656ce03543ab41d9cd86 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:32:06 -0600 +Subject: [PATCH 08/15] vsock: fix vsock_dequeue/enqueue_accept race + +We should add a lock to protect the concurrent access to the list. + +Signed-off-by: Dexuan Cui +Cc: K. Y. 
Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 719999b4908874178002a7a4850396e111780f59) +--- + net/vmw_vsock/af_vsock.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c +index dfc8c51e4d74..b7b2c66d91fd 100644 +--- a/net/vmw_vsock/af_vsock.c ++++ b/net/vmw_vsock/af_vsock.c +@@ -126,6 +126,7 @@ static struct proto vsock_proto = { + + static const struct vsock_transport *transport; + static DEFINE_MUTEX(vsock_register_mutex); ++static DEFINE_SPINLOCK(vsock_accept_queue_lock); + + /**** EXPORTS ****/ + +@@ -406,7 +407,10 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) + + sock_hold(connected); + sock_hold(listener); ++ ++ spin_lock(&vsock_accept_queue_lock); + list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); ++ spin_unlock(&vsock_accept_queue_lock); + } + EXPORT_SYMBOL_GPL(vsock_enqueue_accept); + +@@ -423,7 +427,10 @@ static struct sock *vsock_dequeue_accept(struct sock *listener) + vconnected = list_entry(vlistener->accept_queue.next, + struct vsock_sock, accept_queue); + ++ spin_lock(&vsock_accept_queue_lock); + list_del_init(&vconnected->accept_queue); ++ spin_unlock(&vsock_accept_queue_lock); ++ + sock_put(listener); + /* The caller will need a reference on the connected socket so we let + * it call sock_put(). +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0009-Drivers-hv-vmbus-Fix-rescind-handling.patch b/kernel/patches-4.12.x/0009-Drivers-hv-vmbus-Fix-rescind-handling.patch new file mode 100644 index 000000000..f8d39e384 --- /dev/null +++ b/kernel/patches-4.12.x/0009-Drivers-hv-vmbus-Fix-rescind-handling.patch @@ -0,0 +1,290 @@ +From 1cd7fb401154c625ae77aef8c6869b3f01b514f8 Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" +Date: Sun, 30 Apr 2017 16:21:18 -0700 +Subject: [PATCH 09/15] Drivers: hv: vmbus: Fix rescind handling + +Fix the rescind handling. 
This patch addresses the following rescind +scenario that is currently not handled correctly: + +If a rescind were to be received while the offer is still being +processed, we will be blocked indefinitely since the rescind message +is handled on the same work element as the offer message. Fix this +issue. + +I would like to thank Dexuan Cui and +Long Li for working with me on this patch. + +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +Origin: git@github.com:dcui/linux.git +(cherry picked from commit be1ce15dfbdfe3f42c8ed23c5904674d5d90b545) +--- + drivers/hv/channel.c | 8 ++++-- + drivers/hv/channel_mgmt.c | 69 ++++++++++++++++++++++++++++++++++++----------- + drivers/hv/connection.c | 7 +++-- + drivers/hv/hyperv_vmbus.h | 7 +++++ + drivers/hv/vmbus_drv.c | 29 +++++++++++++++++++- + 5 files changed, 99 insertions(+), 21 deletions(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 663e3d78247a..d4243b5c39b7 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -636,9 +636,13 @@ void vmbus_close(struct vmbus_channel *channel) + */ + list_for_each_safe(cur, tmp, &channel->sc_list) { + cur_channel = list_entry(cur, struct vmbus_channel, sc_list); +- if (cur_channel->state != CHANNEL_OPENED_STATE) +- continue; + vmbus_close_internal(cur_channel); ++ if (cur_channel->rescind) { ++ mutex_lock(&vmbus_connection.channel_mutex); ++ hv_process_channel_removal(cur_channel, ++ cur_channel->offermsg.child_relid); ++ mutex_unlock(&vmbus_connection.channel_mutex); ++ } + } + /* + * Now close the primary.
+diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index f4dda5ecfa0b..f5296548cdc9 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -441,7 +441,6 @@ void vmbus_free_channels(void) + { + struct vmbus_channel *channel, *tmp; + +- mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list, + listentry) { + /* hv_process_channel_removal() needs this */ +@@ -449,7 +448,6 @@ void vmbus_free_channels(void) + + vmbus_device_unregister(channel->device_obj); + } +- mutex_unlock(&vmbus_connection.channel_mutex); + } + + /* +@@ -496,8 +494,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + list_add_tail(&newchannel->sc_list, &channel->sc_list); + channel->num_sc++; + spin_unlock_irqrestore(&channel->lock, flags); +- } else ++ } else { ++ atomic_dec(&vmbus_connection.offer_in_progress); + goto err_free_chan; ++ } + } + + dev_type = hv_get_dev_type(newchannel); +@@ -514,6 +514,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + if (!fnew) { + if (channel->sc_creation_callback != NULL) + channel->sc_creation_callback(newchannel); ++ atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + +@@ -535,9 +536,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + * binding which eventually invokes the device driver's AddDevice() + * method. 
+ */ +- mutex_lock(&vmbus_connection.channel_mutex); + ret = vmbus_device_register(newchannel->device_obj); +- mutex_unlock(&vmbus_connection.channel_mutex); + + if (ret != 0) { + pr_err("unable to add child device object (relid %d)\n", +@@ -545,6 +544,8 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + kfree(newchannel->device_obj); + goto err_deq_chan; + } ++ ++ atomic_dec(&vmbus_connection.offer_in_progress); + return; + + err_deq_chan: +@@ -791,6 +792,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) + newchannel = alloc_channel(); + if (!newchannel) { + vmbus_release_relid(offer->child_relid); ++ atomic_dec(&vmbus_connection.offer_in_progress); + pr_err("Unable to allocate channel object\n"); + return; + } +@@ -837,16 +839,38 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + + rescind = (struct vmbus_channel_rescind_offer *)hdr; + ++ /* ++ * The offer msg and the corresponding rescind msg ++ * from the host are guranteed to be ordered - ++ * offer comes in first and then the rescind. ++ * Since we process these events in work elements, ++ * and with preemption, we may end up processing ++ * the events out of order. Given that we handle these ++ * work elements on the same CPU, this is possible only ++ * in the case of preemption. In any case wait here ++ * until the offer processing has moved beyond the ++ * point where the channel is discoverable. ++ */ ++ ++ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { ++ /* ++ * We wait here until any channel offer is currently ++ * being processed. ++ */ ++ msleep(1); ++ } ++ + mutex_lock(&vmbus_connection.channel_mutex); + channel = relid2channel(rescind->child_relid); ++ mutex_unlock(&vmbus_connection.channel_mutex); + + if (channel == NULL) { + /* +- * This is very impossible, because in +- * vmbus_process_offer(), we have already invoked +- * vmbus_release_relid() on error. 
++ * We failed in processing the offer message; ++ * we would have cleaned up the relid in that ++ * failure path. + */ +- goto out; ++ return; + } + + spin_lock_irqsave(&channel->lock, flags); +@@ -858,7 +882,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + if (channel->device_obj) { + if (channel->chn_rescind_callback) { + channel->chn_rescind_callback(channel); +- goto out; ++ return; + } + /* + * We will have to unregister this device from the +@@ -869,13 +893,26 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + vmbus_device_unregister(channel->device_obj); + put_device(dev); + } +- } else { +- hv_process_channel_removal(channel, +- channel->offermsg.child_relid); + } +- +-out: +- mutex_unlock(&vmbus_connection.channel_mutex); ++ if (channel->primary_channel != NULL) { ++ /* ++ * Sub-channel is being rescinded. Following is the channel ++ * close sequence when initiated from the driveri (refer to ++ * vmbus_close() for details): ++ * 1. Close all sub-channels first ++ * 2. Then close the primary channel. ++ */ ++ if (channel->state == CHANNEL_OPEN_STATE) { ++ /* ++ * The channel is currently not open; ++ * it is safe for us to cleanup the channel. ++ */ ++ mutex_lock(&vmbus_connection.channel_mutex); ++ hv_process_channel_removal(channel, ++ channel->offermsg.child_relid); ++ mutex_unlock(&vmbus_connection.channel_mutex); ++ } ++ } + } + + void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) +diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c +index fce27fb141cc..b78ee787def0 100644 +--- a/drivers/hv/connection.c ++++ b/drivers/hv/connection.c +@@ -93,10 +93,13 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, + * all the CPUs. This is needed for kexec to work correctly where + * the CPU attempting to connect may not be CPU 0. 
+ */ +- if (version >= VERSION_WIN8_1) ++ if (version >= VERSION_WIN8_1) { + msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; +- else ++ vmbus_connection.connect_cpu = smp_processor_id(); ++ } else { + msg->target_vcpu = 0; ++ vmbus_connection.connect_cpu = 0; ++ } + + /* + * Add to list before we send the request since we may +diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h +index 6113e915c50e..8ce4ae1c78d0 100644 +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -303,6 +303,13 @@ enum vmbus_connect_state { + #define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT + + struct vmbus_connection { ++ /* ++ * CPU on which the initial host contact was made. ++ */ ++ int connect_cpu; ++ ++ atomic_t offer_in_progress; ++ + enum vmbus_connect_state conn_state; + + atomic_t next_gpadl_handle; +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 0087b49095eb..59bb3efa6e10 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -798,8 +798,10 @@ static void vmbus_device_release(struct device *device) + struct hv_device *hv_dev = device_to_hv_device(device); + struct vmbus_channel *channel = hv_dev->channel; + ++ mutex_lock(&vmbus_connection.channel_mutex); + hv_process_channel_removal(channel, + channel->offermsg.child_relid); ++ mutex_unlock(&vmbus_connection.channel_mutex); + kfree(hv_dev); + + } +@@ -877,7 +879,32 @@ void vmbus_on_msg_dpc(unsigned long data) + INIT_WORK(&ctx->work, vmbus_onmessage_work); + memcpy(&ctx->msg, msg, sizeof(*msg)); + +- queue_work(vmbus_connection.work_queue, &ctx->work); ++ /* ++ * The host can generate a rescind message while we ++ * may still be handling the original offer. We deal with ++ * this condition by ensuring the processing is done on the ++ * same CPU. ++ */ ++ switch (hdr->msgtype) { ++ case CHANNELMSG_RESCIND_CHANNELOFFER: ++ /* ++ * If we are handling the rescind message; ++ * schedule the work on the global work queue. 
++ */ ++ schedule_work_on(vmbus_connection.connect_cpu, ++ &ctx->work); ++ break; ++ ++ case CHANNELMSG_OFFERCHANNEL: ++ atomic_inc(&vmbus_connection.offer_in_progress); ++ queue_work_on(vmbus_connection.connect_cpu, ++ vmbus_connection.work_queue, ++ &ctx->work); ++ break; ++ ++ default: ++ queue_work(vmbus_connection.work_queue, &ctx->work); ++ } + } else + entry->message_handler(hdr); + +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0010-vmbus-fix-hv_percpu_channel_deq-enq-race.patch b/kernel/patches-4.12.x/0010-vmbus-fix-hv_percpu_channel_deq-enq-race.patch new file mode 100644 index 000000000..4c533043b --- /dev/null +++ b/kernel/patches-4.12.x/0010-vmbus-fix-hv_percpu_channel_deq-enq-race.patch @@ -0,0 +1,246 @@ +From 12ded1f47ea422964fb8762f1fe7443bf97e067e Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 5 Jun 2017 16:13:18 +0800 +Subject: [PATCH 10/15] vmbus: fix hv_percpu_channel_deq/enq race + +Signed-off-by: Dexuan Cui +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 8457502df9dd379ddbdfa42a8c9a6421bb3482f1) +--- + drivers/hv/channel_mgmt.c | 32 +++++++++++++++++++++---- + drivers/hv/connection.c | 11 +++++++++ + drivers/hv/hyperv_vmbus.h | 1 + + drivers/hv/vmbus_drv.c | 59 ++++++++++++++++++++++++++++++++++++++++++++--- + 4 files changed, 95 insertions(+), 8 deletions(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index f5296548cdc9..19ad79099492 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -365,11 +365,16 @@ static void percpu_channel_enq(void *arg) + + void hv_percpu_channel_enq(struct vmbus_channel *channel) + { ++ unsigned long flags; ++ + if (channel->target_cpu != get_cpu()) + smp_call_function_single(channel->target_cpu, + percpu_channel_enq, channel, true); +- else ++ else { ++ local_irq_save(flags); + percpu_channel_enq(channel); ++ local_irq_restore(flags); ++ } + + put_cpu(); + } +@@ -383,11 +388,16 @@ static void percpu_channel_deq(void *arg) + + void 
hv_percpu_channel_deq(struct vmbus_channel *channel) + { ++ unsigned long flags; ++ + if (channel->target_cpu != get_cpu()) + smp_call_function_single(channel->target_cpu, + percpu_channel_deq, channel, true); +- else ++ else { ++ local_irq_save(flags); + percpu_channel_deq(channel); ++ local_irq_restore(flags); ++ } + + put_cpu(); + } +@@ -495,7 +505,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + channel->num_sc++; + spin_unlock_irqrestore(&channel->lock, flags); + } else { +- atomic_dec(&vmbus_connection.offer_in_progress); + goto err_free_chan; + } + } +@@ -549,6 +558,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + return; + + err_deq_chan: ++ atomic_dec(&vmbus_connection.offer_in_progress); + mutex_lock(&vmbus_connection.channel_mutex); + list_del(&newchannel->listentry); + mutex_unlock(&vmbus_connection.channel_mutex); +@@ -915,16 +925,28 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + } + } + ++static void vmbus_stop_rescind_handling_work(struct work_struct *work) ++{ ++ atomic_inc(&vmbus_connection.offer_in_progress); ++} ++ + void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) + { +- mutex_lock(&vmbus_connection.channel_mutex); ++ struct work_struct work; + + BUG_ON(!is_hvsock_channel(channel)); + ++ /* Prevent chn_rescind_callback from running in the rescind path */ ++ INIT_WORK(&work, vmbus_stop_rescind_handling_work); ++ queue_work_on(vmbus_connection.connect_cpu, ++ vmbus_connection.work_queue_rescind, &work); ++ flush_work(&work); ++ + channel->rescind = true; + vmbus_device_unregister(channel->device_obj); + +- mutex_unlock(&vmbus_connection.channel_mutex); ++ /* Unblock the rescind handling */ ++ atomic_dec(&vmbus_connection.offer_in_progress); + } + EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); + +diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c +index b78ee787def0..d2c4d61ff7d3 100644 +--- a/drivers/hv/connection.c ++++ 
b/drivers/hv/connection.c +@@ -156,6 +156,12 @@ int vmbus_connect(void) + goto cleanup; + } + ++ vmbus_connection.work_queue_rescind = create_workqueue("hv_vmbus_rsd"); ++ if (!vmbus_connection.work_queue_rescind) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ + INIT_LIST_HEAD(&vmbus_connection.chn_msg_list); + spin_lock_init(&vmbus_connection.channelmsg_lock); + +@@ -246,6 +252,11 @@ void vmbus_disconnect(void) + */ + vmbus_initiate_unload(false); + ++ if (vmbus_connection.work_queue_rescind) { ++ drain_workqueue(vmbus_connection.work_queue_rescind); ++ destroy_workqueue(vmbus_connection.work_queue_rescind); ++ } ++ + if (vmbus_connection.work_queue) { + drain_workqueue(vmbus_connection.work_queue); + destroy_workqueue(vmbus_connection.work_queue); +diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h +index 8ce4ae1c78d0..7b8603a00555 100644 +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -339,6 +339,7 @@ struct vmbus_connection { + struct mutex channel_mutex; + + struct workqueue_struct *work_queue; ++ struct workqueue_struct *work_queue_rescind; + }; + + +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 59bb3efa6e10..fd221cffeb4d 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -837,6 +837,52 @@ static void vmbus_onmessage_work(struct work_struct *work) + kfree(ctx); + } + ++static void vmbus_dispatch_msg_work(struct work_struct *work) ++{ ++ struct vmbus_channel_message_header *hdr; ++ struct onmessage_work_context *ctx, *context; ++ ++ ctx = container_of(work, struct onmessage_work_context, work); ++ hdr = (struct vmbus_channel_message_header *)ctx->msg.u.payload; ++ ++ context = kmalloc(sizeof(*context), GFP_KERNEL | __GFP_NOFAIL); ++ INIT_WORK(&context->work, vmbus_onmessage_work); ++ memcpy(&context->msg, &ctx->msg, sizeof(struct hv_message)); ++ ++ /* ++ * The host can generate a rescind message while we ++ * may still be handling the original offer. 
We deal with ++ * this condition by ensuring the processing is done on the ++ * same CPU. ++ */ ++ switch (hdr->msgtype) { ++ case CHANNELMSG_RESCIND_CHANNELOFFER: ++ /* ++ * If we are handling the rescind message; ++ * schedule the work on the global work queue. ++ */ ++ queue_work_on(vmbus_connection.connect_cpu, ++ vmbus_connection.work_queue_rescind, ++ &context->work); ++ break; ++ ++ case CHANNELMSG_OFFERCHANNEL: ++ /* XXX */ ++ flush_workqueue(vmbus_connection.work_queue_rescind); ++ ++ atomic_inc(&vmbus_connection.offer_in_progress); ++ queue_work_on(vmbus_connection.connect_cpu, ++ vmbus_connection.work_queue, ++ &context->work); ++ break; ++ ++ default: ++ queue_work(vmbus_connection.work_queue, &context->work); ++ } ++ ++ kfree(ctx); ++} ++ + static void hv_process_timer_expiration(struct hv_message *msg, + struct hv_per_cpu_context *hv_cpu) + { +@@ -876,9 +922,10 @@ void vmbus_on_msg_dpc(unsigned long data) + if (ctx == NULL) + return; + +- INIT_WORK(&ctx->work, vmbus_onmessage_work); ++ INIT_WORK(&ctx->work, vmbus_dispatch_msg_work); + memcpy(&ctx->msg, msg, sizeof(*msg)); + ++#if 0 + /* + * The host can generate a rescind message while we + * may still be handling the original offer. We deal with +@@ -891,8 +938,9 @@ void vmbus_on_msg_dpc(unsigned long data) + * If we are handling the rescind message; + * schedule the work on the global work queue. 
+ */ +- schedule_work_on(vmbus_connection.connect_cpu, +- &ctx->work); ++ queue_work_on(vmbus_connection.connect_cpu, ++ vmbus_connection.work_queue_rescind, ++ &ctx->work); + break; + + case CHANNELMSG_OFFERCHANNEL: +@@ -905,6 +953,9 @@ void vmbus_on_msg_dpc(unsigned long data) + default: + queue_work(vmbus_connection.work_queue, &ctx->work); + } ++#else ++ schedule_work(&ctx->work); ++#endif + } else + entry->message_handler(hdr); + +@@ -1202,6 +1253,8 @@ int vmbus_device_register(struct hv_device *child_device_obj) + child_device_obj->device.parent = &hv_acpi_dev->dev; + child_device_obj->device.release = vmbus_device_release; + ++ if (is_hvsock_channel(child_device_obj->channel)) ++ dev_set_uevent_suppress(&child_device_obj->device, 1); + /* + * Register with the LDM. This will kick off the driver/device + * binding...which will eventually call vmbus_match() and vmbus_probe() +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0011-vmbus-add-vmbus-onoffer-onoffer_rescind-sync.patch b/kernel/patches-4.12.x/0011-vmbus-add-vmbus-onoffer-onoffer_rescind-sync.patch new file mode 100644 index 000000000..d6ff72858 --- /dev/null +++ b/kernel/patches-4.12.x/0011-vmbus-add-vmbus-onoffer-onoffer_rescind-sync.patch @@ -0,0 +1,119 @@ +From 98f32826ce5f2aad27b940e32a4c12199a934bab Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 5 Jun 2017 21:32:00 +0800 +Subject: [PATCH 11/15] vmbus: add vmbus onoffer/onoffer_rescind sync. 
+ +Signed-off-by: Dexuan Cui +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 1b91aa6d0e745d9765e3d90058928829f0b0bd40) +--- + drivers/hv/channel_mgmt.c | 25 +++++++++++++++++++------ + drivers/hv/hyperv_vmbus.h | 1 + + drivers/hv/vmbus_drv.c | 1 + + 3 files changed, 21 insertions(+), 6 deletions(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 19ad79099492..a12b1eabc15e 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -467,6 +467,7 @@ void vmbus_free_channels(void) + static void vmbus_process_offer(struct vmbus_channel *newchannel) + { + struct vmbus_channel *channel; ++ struct hv_device *device_obj; + bool fnew = true; + unsigned long flags; + u16 dev_type; +@@ -524,6 +525,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + if (channel->sc_creation_callback != NULL) + channel->sc_creation_callback(newchannel); + atomic_dec(&vmbus_connection.offer_in_progress); ++ atomic_dec(&vmbus_connection.register_in_progress); + return; + } + +@@ -532,33 +534,36 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + * We need to set the DeviceObject field before calling + * vmbus_child_dev_add() + */ +- newchannel->device_obj = vmbus_device_create( ++ device_obj = vmbus_device_create( + &newchannel->offermsg.offer.if_type, + &newchannel->offermsg.offer.if_instance, + newchannel); +- if (!newchannel->device_obj) ++ if (!device_obj) + goto err_deq_chan; + +- newchannel->device_obj->device_id = dev_type; ++ device_obj->device_id = dev_type; + /* + * Add the new device to the bus. This will kick off device-driver + * binding which eventually invokes the device driver's AddDevice() + * method. 
+ */ +- ret = vmbus_device_register(newchannel->device_obj); ++ atomic_dec(&vmbus_connection.offer_in_progress); ++ ret = vmbus_device_register(device_obj); + + if (ret != 0) { + pr_err("unable to add child device object (relid %d)\n", + newchannel->offermsg.child_relid); +- kfree(newchannel->device_obj); ++ kfree(device_obj); + goto err_deq_chan; + } ++ newchannel->device_obj = device_obj; ++ atomic_dec(&vmbus_connection.register_in_progress); + +- atomic_dec(&vmbus_connection.offer_in_progress); + return; + + err_deq_chan: + atomic_dec(&vmbus_connection.offer_in_progress); ++ atomic_dec(&vmbus_connection.register_in_progress); + mutex_lock(&vmbus_connection.channel_mutex); + list_del(&newchannel->listentry); + mutex_unlock(&vmbus_connection.channel_mutex); +@@ -889,6 +894,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + + vmbus_rescind_cleanup(channel); + ++ while (atomic_read(&vmbus_connection.register_in_progress) != 0) { ++ /* ++ * We wait here until any channel offer is currently ++ * being processed. 
++ */ ++ msleep(1); ++ } ++ + if (channel->device_obj) { + if (channel->chn_rescind_callback) { + channel->chn_rescind_callback(channel); +diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h +index 7b8603a00555..30ae1d291527 100644 +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -309,6 +309,7 @@ struct vmbus_connection { + int connect_cpu; + + atomic_t offer_in_progress; ++ atomic_t register_in_progress; + + enum vmbus_connect_state conn_state; + +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index fd221cffeb4d..fc6a5893d54c 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -871,6 +871,7 @@ static void vmbus_dispatch_msg_work(struct work_struct *work) + flush_workqueue(vmbus_connection.work_queue_rescind); + + atomic_inc(&vmbus_connection.offer_in_progress); ++ atomic_inc(&vmbus_connection.register_in_progress); + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &context->work); +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0012-vmbus-fix-the-missed-signaling-in-hv_signal_on_read.patch b/kernel/patches-4.12.x/0012-vmbus-fix-the-missed-signaling-in-hv_signal_on_read.patch new file mode 100644 index 000000000..64a424573 --- /dev/null +++ b/kernel/patches-4.12.x/0012-vmbus-fix-the-missed-signaling-in-hv_signal_on_read.patch @@ -0,0 +1,41 @@ +From 19fdaff8faea41153ced91768194c58804c232cf Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 28 Jun 2017 23:50:38 +0800 +Subject: [PATCH 12/15] vmbus: fix the missed signaling in hv_signal_on_read() + +There is an off-by-one bug here, which can cause host-to-guest write to stall. + +When cur_write_sz == pending_sz, we shouldn't signal the host because it's +meaningless: the ring mustn't be 100% full. + +But when cached_write_sz == pending_sz, we must signal the host. 
+ +Signed-off-by: John Starks +Signed-off-by: Dexuan Cui +Origin: git@github.com:dcui/linux.git +(cherry picked from commit c49aced6328557e6c1f5cf6f58e1fae96fb58fa0) +--- + include/linux/hyperv.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index b6975010e798..511d47e3ee9d 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1518,11 +1518,11 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel) + + cur_write_sz = hv_get_bytes_to_write(rbi); + +- if (cur_write_sz < pending_sz) ++ if (cur_write_sz <= pending_sz) + return; + + cached_write_sz = hv_get_cached_bytes_to_write(rbi); +- if (cached_write_sz < pending_sz) ++ if (cached_write_sz <= pending_sz) + vmbus_setevent(channel); + } + +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0013-hv_sock-avoid-double-FINs-if-shutdown-is-called.patch b/kernel/patches-4.12.x/0013-hv_sock-avoid-double-FINs-if-shutdown-is-called.patch new file mode 100644 index 000000000..dc5e78a7d --- /dev/null +++ b/kernel/patches-4.12.x/0013-hv_sock-avoid-double-FINs-if-shutdown-is-called.patch @@ -0,0 +1,45 @@ +From 506b9c1e78a698217b918446ece7dbee08ae160f Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 26 Jul 2017 12:32:08 -0600 +Subject: [PATCH 13/15] hv_sock: avoid double FINs if shutdown() is called + +The host expects a single FIN according to Hyper-V team. +With the patch, the connection may not be cleanly closed. + +Signed-off-by: Dexuan Cui +Cc: K. Y. 
Srinivasan +Cc: Haiyang Zhang +Cc: Stephen Hemminger +Origin: git@github.com:dcui/linux.git +(cherry picked from commit 82235a6f52a493a18f61c4f7f7deffc8558d5c5e) +--- + net/vmw_vsock/hyperv_transport.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c +index 30154836acd0..693dfb7944a6 100644 +--- a/net/vmw_vsock/hyperv_transport.c ++++ b/net/vmw_vsock/hyperv_transport.c +@@ -95,6 +95,9 @@ struct hvsock { + u32 recv_data_len; + /* The offset of the payload */ + u32 recv_data_off; ++ ++ /* Have we sent the zero-length packet (FIN)? */ ++ unsigned long fin_sent; + }; + + /* In the VM, we support Hyper-V Sockets with AF_VSOCK, and the endpoint is +@@ -423,6 +426,9 @@ static int hvs_shutdown(struct vsock_sock *vsk, int mode) + + hvs = vsk->trans; + ++ if (test_and_set_bit(0, &hvs->fin_sent)) ++ return 0; ++ + send_buf = (struct hvs_send_buf *)&hdr; + + /* It can't fail: see hvs_channel_writable_bytes(). */ +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0014-Added-vsock-transport-support-to-9pfs.patch b/kernel/patches-4.12.x/0014-Added-vsock-transport-support-to-9pfs.patch new file mode 100644 index 000000000..a3f85ef61 --- /dev/null +++ b/kernel/patches-4.12.x/0014-Added-vsock-transport-support-to-9pfs.patch @@ -0,0 +1,151 @@ +From 76ef98c66146a53c5367c60f2291761f40e33355 Mon Sep 17 00:00:00 2001 +From: Cheng-mean Liu +Date: Tue, 11 Jul 2017 16:50:36 -0700 +Subject: [PATCH 14/15] Added vsock transport support to 9pfs + +Signed-off-by: Cheng-mean Liu +Origin: https://github.com/Microsoft/opengcs/blob/master/kernelconfig/4.11/patch_9pfs_vsock-transport.patch +--- + net/9p/trans_fd.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 84 insertions(+), 1 deletion(-) + +diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c +index 7bc2208b6cc4..dc34352145df 100644 +--- a/net/9p/trans_fd.c ++++ b/net/9p/trans_fd.c +@@ -44,8 +44,9 @@ + #include + #include + #include +- + 
#include /* killme */ ++//#include ++#include + + #define P9_PORT 564 + #define MAX_SOCK_BUF (64*1024) +@@ -155,6 +156,7 @@ struct p9_trans_fd { + struct p9_conn conn; + }; + ++ + static void p9_poll_workfn(struct work_struct *work); + + static DEFINE_SPINLOCK(p9_poll_lock); +@@ -740,6 +742,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) + opts->wfd = ~0; + opts->privport = 0; + ++ + if (!params) + return 0; + +@@ -1035,6 +1038,72 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) + return 0; + } + ++static int ++p9_fd_create_vsock(struct p9_client *client, const char *addr, char *args) ++{ ++ int err; ++ struct socket *csocket; ++ struct sockaddr_vm server_socket_addr; ++ struct p9_fd_opts opts; ++ ++ err = parse_opts(args, &opts); ++ if (err < 0) ++ return err; ++ ++ csocket = NULL; ++ ++ // for debugging purpose only ++ pr_err("%s:%s\n", __func__, addr); ++ // kgdb_breakpoint(); ++ ++ // create socket ++ err = __sock_create(current->nsproxy->net_ns, ++ AF_VSOCK, ++ SOCK_STREAM, ++ 0, ++ &csocket, 1); ++ if (err) { ++ pr_err("%s:__sock_create (%d): problem creating socket (err=%d)\n", ++ __func__, task_pid_nr(current), err); ++ return err; ++ } ++ ++ // server socket address information ++ memset((char *)&server_socket_addr, 0, sizeof(struct sockaddr_vm)); ++ server_socket_addr.svm_family = AF_VSOCK; ++ server_socket_addr.svm_reserved1 = 0; ++ server_socket_addr.svm_cid = VMADDR_CID_HOST; ++ ++ /* Connecting to the host's 0000pppp-facb-11e6-bd58-64006a7986d3 */ ++ server_socket_addr.svm_port = opts.port; ++ ++ pr_err("%s:opts.port=(%d)(0x%x)\n", __func__, opts.port, opts.port); ++ pr_err("%s: service_id:(hex) 0000%x%x-facb-11e6-bd58-64006a7986d3\n", ++ __func__, ++ (__u8)((opts.port & 0xff00) >> 8), ++ (__u8)(opts.port & 0x00ff)); ++ ++ pr_err("%s: connecting", __func__); ++ err = csocket->ops->connect(csocket, ++ (struct sockaddr *)&server_socket_addr, ++ sizeof(struct sockaddr_vm), 0); ++ if (err < 0) { ++ 
pr_err("%s:connect (%d): problem connecting socket to %s (err = %d)\n", ++ __func__, task_pid_nr(current), addr, err); ++ sock_release(csocket); ++ return err; ++ } ++ ++ pr_err("%s: open socket", __func__); ++ err = p9_socket_open(client, csocket); ++ if (err < 0) { ++ pr_err("%s: p9_socket_open failed\n", __func__); ++ } ++ ++ pr_err("Leaving %s\n", __func__); ++ return err; ++} ++ + static struct p9_trans_module p9_tcp_trans = { + .name = "tcp", + .maxsize = MAX_SOCK_BUF, +@@ -1071,6 +1140,18 @@ static struct p9_trans_module p9_fd_trans = { + .owner = THIS_MODULE, + }; + ++static struct p9_trans_module p9_vsock_trans = { ++ .name = "vsock", ++ .maxsize = MAX_SOCK_BUF, ++ .def = 0, ++ .create = p9_fd_create_vsock, ++ .close = p9_fd_close, ++ .request = p9_fd_request, ++ .cancel = p9_fd_cancel, ++ .cancelled = p9_fd_cancelled, ++ .owner = THIS_MODULE, ++}; ++ + /** + * p9_poll_proc - poll worker thread + * @a: thread state and arguments +@@ -1108,6 +1189,7 @@ int p9_trans_fd_init(void) + v9fs_register_trans(&p9_tcp_trans); + v9fs_register_trans(&p9_unix_trans); + v9fs_register_trans(&p9_fd_trans); ++ v9fs_register_trans(&p9_vsock_trans); + + return 0; + } +@@ -1118,4 +1200,5 @@ void p9_trans_fd_exit(void) + v9fs_unregister_trans(&p9_tcp_trans); + v9fs_unregister_trans(&p9_unix_trans); + v9fs_unregister_trans(&p9_fd_trans); ++ v9fs_unregister_trans(&p9_vsock_trans); + } +-- +2.14.1 + diff --git a/kernel/patches-4.12.x/0015-NVDIMM-reducded-ND_MIN_NAMESPACE_SIZE-from-4MB-to-4K.patch b/kernel/patches-4.12.x/0015-NVDIMM-reducded-ND_MIN_NAMESPACE_SIZE-from-4MB-to-4K.patch new file mode 100644 index 000000000..44b3bf028 --- /dev/null +++ b/kernel/patches-4.12.x/0015-NVDIMM-reducded-ND_MIN_NAMESPACE_SIZE-from-4MB-to-4K.patch @@ -0,0 +1,28 @@ +From e1580d5ac40822f7f44ab4a95973b48cba8c9f63 Mon Sep 17 00:00:00 2001 +From: Cheng-mean Liu +Date: Tue, 11 Jul 2017 16:58:26 -0700 +Subject: [PATCH 15/15] NVDIMM: reducded ND_MIN_NAMESPACE_SIZE from 4MB to 4KB + (page size) + 
+Signed-off-by: Cheng-mean Liu +Origin: https://github.com/Microsoft/opengcs/blob/master/kernelconfig/4.11/patch_lower-the-minimum-PMEM-size.patch +--- + include/uapi/linux/ndctl.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h +index 7ad3863cb88b..670ffdec0924 100644 +--- a/include/uapi/linux/ndctl.h ++++ b/include/uapi/linux/ndctl.h +@@ -260,7 +260,7 @@ enum nd_driver_flags { + }; + + enum { +- ND_MIN_NAMESPACE_SIZE = 0x00400000, ++ ND_MIN_NAMESPACE_SIZE = 0x00001000, + }; + + enum ars_masks { +-- +2.14.1 +