diff --git a/alpine/kernel/kernel_config b/alpine/kernel/kernel_config index 30a0a9d23..fbb585b42 100644 --- a/alpine/kernel/kernel_config +++ b/alpine/kernel/kernel_config @@ -1197,6 +1197,7 @@ CONFIG_OPENVSWITCH_VXLAN=y CONFIG_VSOCKETS=y CONFIG_VIRTIO_VSOCKETS=y CONFIG_VIRTIO_VSOCKETS_COMMON=y +CONFIG_HYPERV_SOCK=y CONFIG_NETLINK_MMAP=y CONFIG_NETLINK_DIAG=y CONFIG_MPLS=y @@ -1225,6 +1226,7 @@ CONFIG_NET_FLOW_LIMIT=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set +# CONFIG_AF_KCM is not set CONFIG_FIB_RULES=y # CONFIG_WIRELESS is not set # CONFIG_WIMAX is not set diff --git a/alpine/kernel/patches/0001-virtio-make-find_vqs-checkpatch.pl-friendly.patch b/alpine/kernel/patches/0001-virtio-make-find_vqs-checkpatch.pl-friendly.patch index 99216ebc9..b28621584 100644 --- a/alpine/kernel/patches/0001-virtio-make-find_vqs-checkpatch.pl-friendly.patch +++ b/alpine/kernel/patches/0001-virtio-make-find_vqs-checkpatch.pl-friendly.patch @@ -1,7 +1,7 @@ From d8f7730e3211cdb16cd9d26143121aeb05f22509 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Dec 2015 16:53:43 +0800 -Subject: [PATCH 1/9] virtio: make find_vqs() checkpatch.pl-friendly +Subject: [PATCH 01/25] virtio: make find_vqs() checkpatch.pl-friendly checkpatch.pl wants arrays of strings declared as follows: diff --git a/alpine/kernel/patches/0002-VSOCK-constify-vmci_transport_notify_ops-structures.patch b/alpine/kernel/patches/0002-VSOCK-constify-vmci_transport_notify_ops-structures.patch index ba54a3e74..3e379264c 100644 --- a/alpine/kernel/patches/0002-VSOCK-constify-vmci_transport_notify_ops-structures.patch +++ b/alpine/kernel/patches/0002-VSOCK-constify-vmci_transport_notify_ops-structures.patch @@ -1,7 +1,7 @@ From 0260029492a1503e871236767ed86e2fc3862cc2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 Nov 2015 18:39:17 +0100 -Subject: [PATCH 2/9] VSOCK: constify vmci_transport_notify_ops structures +Subject: [PATCH 02/25] VSOCK: constify vmci_transport_notify_ops structures The vmci_transport_notify_ops structures are never modified, so declare them as const. 
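The constify change above follows a general kernel pattern: a table of function pointers that is never modified can be declared const, which places it in read-only data and turns accidental writes into compile-time errors. A minimal, self-contained sketch of that pattern — the names here are generic placeholders, not the actual vmci_transport ops:

    #include <stdio.h>

    /* An ops table of function pointers, analogous in shape to the
     * vmci_transport_notify_ops structures constified in the patch above. */
    struct notify_ops {
        void (*notify)(const char *what);
    };

    static void print_notify(const char *what)
    {
        printf("notify: %s\n", what);
    }

    /* const: the table lives in read-only memory and any attempt to
     * reassign a member is rejected at compile time. */
    static const struct notify_ops example_ops = {
        .notify = print_notify,
    };

    int main(void)
    {
        example_ops.notify("const ops table");
        return 0;
    }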
diff --git a/alpine/kernel/patches/0003-AF_VSOCK-Shrink-the-area-influenced-by-prepare_to_wa.patch b/alpine/kernel/patches/0003-AF_VSOCK-Shrink-the-area-influenced-by-prepare_to_wa.patch index ea277f413..330abb911 100644 --- a/alpine/kernel/patches/0003-AF_VSOCK-Shrink-the-area-influenced-by-prepare_to_wa.patch +++ b/alpine/kernel/patches/0003-AF_VSOCK-Shrink-the-area-influenced-by-prepare_to_wa.patch @@ -1,7 +1,7 @@ From 6a585c01a353551a69af45bf31606f13115480d1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 22 Mar 2016 17:05:52 +0100 -Subject: [PATCH 3/9] AF_VSOCK: Shrink the area influenced by prepare_to_wait +Subject: [PATCH 03/25] AF_VSOCK: Shrink the area influenced by prepare_to_wait When a thread is prepared for waiting by calling prepare_to_wait, sleeping is not allowed until either the wait has taken place or finish_wait has diff --git a/alpine/kernel/patches/0004-VSOCK-transport-specific-vsock_transport-functions.patch b/alpine/kernel/patches/0004-VSOCK-transport-specific-vsock_transport-functions.patch index 2d822bbba..f990d3abd 100644 --- a/alpine/kernel/patches/0004-VSOCK-transport-specific-vsock_transport-functions.patch +++ b/alpine/kernel/patches/0004-VSOCK-transport-specific-vsock_transport-functions.patch @@ -1,7 +1,7 @@ From a3f136168f164f66de1de277a08b76f54b289d5a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Dec 2015 11:10:21 +0800 -Subject: [PATCH 4/9] VSOCK: transport-specific vsock_transport functions +Subject: [PATCH 04/25] VSOCK: transport-specific vsock_transport functions struct vsock_transport contains function pointers called by AF_VSOCK core code. The transport may want its own transport-specific function diff --git a/alpine/kernel/patches/0005-VSOCK-Introduce-virtio_vsock_common.ko.patch b/alpine/kernel/patches/0005-VSOCK-Introduce-virtio_vsock_common.ko.patch index 3048e9acc..beffe7e53 100644 --- a/alpine/kernel/patches/0005-VSOCK-Introduce-virtio_vsock_common.ko.patch +++ b/alpine/kernel/patches/0005-VSOCK-Introduce-virtio_vsock_common.ko.patch @@ -1,7 +1,7 @@ From 4018aa8a812fd6f1a64e3d227550bf5752127314 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 13 Jun 2013 18:27:00 +0800 -Subject: [PATCH 5/9] VSOCK: Introduce virtio_vsock_common.ko +Subject: [PATCH 05/25] VSOCK: Introduce virtio_vsock_common.ko This module contains the common code and header files for the following virtio_transporto and vhost_vsock kernel modules. diff --git a/alpine/kernel/patches/0006-VSOCK-Introduce-virtio_transport.ko.patch b/alpine/kernel/patches/0006-VSOCK-Introduce-virtio_transport.ko.patch index 2132384e1..d8d5deba1 100644 --- a/alpine/kernel/patches/0006-VSOCK-Introduce-virtio_transport.ko.patch +++ b/alpine/kernel/patches/0006-VSOCK-Introduce-virtio_transport.ko.patch @@ -1,7 +1,7 @@ From ccaac837ceb4a9582bb57f71e0cac791f7336b19 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 13 Jun 2013 18:28:48 +0800 -Subject: [PATCH 6/9] VSOCK: Introduce virtio_transport.ko +Subject: [PATCH 06/25] VSOCK: Introduce virtio_transport.ko VM sockets virtio transport implementation. This driver runs in the guest. 
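With CONFIG_VIRTIO_VSOCKETS=y (enabled in the kernel_config hunk above), guest userspace reaches this transport through ordinary AF_VSOCK sockets. A hedged guest-side client sketch: AF_VSOCK, struct sockaddr_vm, and VMADDR_CID_HOST are the standard definitions from <linux/vm_sockets.h>, while the port number 1234 and the existence of a host listener on it are assumptions for illustration only:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/vm_sockets.h>

    int main(void)
    {
        struct sockaddr_vm addr;
        int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.svm_family = AF_VSOCK;
        addr.svm_cid = VMADDR_CID_HOST; /* CID 2: address the host side */
        addr.svm_port = 1234;           /* assumed: a host service listens here */

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
            perror("connect");
            close(fd);
            return 1;
        }

        write(fd, "hello from the guest\n", 21);
        close(fd);
        return 0;
    }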
diff --git a/alpine/kernel/patches/0007-VSOCK-Introduce-vhost_vsock.ko.patch b/alpine/kernel/patches/0007-VSOCK-Introduce-vhost_vsock.ko.patch index fc39b0ff8..7070782a7 100644 --- a/alpine/kernel/patches/0007-VSOCK-Introduce-vhost_vsock.ko.patch +++ b/alpine/kernel/patches/0007-VSOCK-Introduce-vhost_vsock.ko.patch @@ -1,7 +1,7 @@ From f52efbc874c742a671939ea6408c59545025007d Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 13 Jun 2013 18:29:21 +0800 -Subject: [PATCH 7/9] VSOCK: Introduce vhost_vsock.ko +Subject: [PATCH 07/25] VSOCK: Introduce vhost_vsock.ko VM sockets vhost transport implementation. This driver runs on the host. diff --git a/alpine/kernel/patches/0008-VSOCK-Add-Makefile-and-Kconfig.patch b/alpine/kernel/patches/0008-VSOCK-Add-Makefile-and-Kconfig.patch index 79d729b4f..42ddbf249 100644 --- a/alpine/kernel/patches/0008-VSOCK-Add-Makefile-and-Kconfig.patch +++ b/alpine/kernel/patches/0008-VSOCK-Add-Makefile-and-Kconfig.patch @@ -1,7 +1,7 @@ From e8c8f5299fd202db5d56a10f1dc0a4e464e9a211 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 13 Jun 2013 18:30:19 +0800 -Subject: [PATCH 8/9] VSOCK: Add Makefile and Kconfig +Subject: [PATCH 08/25] VSOCK: Add Makefile and Kconfig Enable virtio-vsock and vhost-vsock. diff --git a/alpine/kernel/patches/0009-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch b/alpine/kernel/patches/0009-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch index 25e65e28d..3c0bdd5c3 100644 --- a/alpine/kernel/patches/0009-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch +++ b/alpine/kernel/patches/0009-VSOCK-Only-allow-host-network-namespace-to-use-AF_VS.patch @@ -1,7 +1,8 @@ From 550ec4c8f90f2bf99c1bcb13b2f8476780f42418 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 4 Apr 2016 14:50:10 +0100 -Subject: [PATCH 9/9] VSOCK: Only allow host network namespace to use AF_VSOCK. +Subject: [PATCH 09/25] VSOCK: Only allow host network namespace to use + AF_VSOCK. The VSOCK addressing schema does not really lend itself to simply creating an alternative end point address within a namespace. diff --git a/alpine/kernel/patches/0010-Drivers-hv-vmbus-serialize-process_chn_event-and-vmb.patch b/alpine/kernel/patches/0010-Drivers-hv-vmbus-serialize-process_chn_event-and-vmb.patch new file mode 100644 index 000000000..35003286a --- /dev/null +++ b/alpine/kernel/patches/0010-Drivers-hv-vmbus-serialize-process_chn_event-and-vmb.patch @@ -0,0 +1,87 @@ +From 1f7906c43fe139e15c19f35a4493a7ca61a6463f Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 14 Dec 2015 16:01:47 -0800 +Subject: [PATCH 10/25] Drivers: hv: vmbus: serialize process_chn_event() and + vmbus_close_internal() + +process_chn_event(), running in the tasklet, can race with +vmbus_close_internal() in the case of SMP guest, e.g., when the former is +accessing channel->inbound.ring_buffer, the latter could be freeing the +ring_buffer pages. + +To resolve the race, we can serialize them by disabling the tasklet when +the latter is running here. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. 
Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 63d55b2aeb5e4faa170316fee73c3c47ea9268c7) +--- + drivers/hv/channel.c | 21 +++++++++++++++++++-- + 1 file changed, 19 insertions(+), 2 deletions(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 9098f13..6a90c69 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include "hyperv_vmbus.h" + +@@ -496,8 +497,21 @@ static void reset_channel_cb(void *arg) + static int vmbus_close_internal(struct vmbus_channel *channel) + { + struct vmbus_channel_close_channel *msg; ++ struct tasklet_struct *tasklet; + int ret; + ++ /* ++ * process_chn_event(), running in the tasklet, can race ++ * with vmbus_close_internal() in the case of SMP guest, e.g., when ++ * the former is accessing channel->inbound.ring_buffer, the latter ++ * could be freeing the ring_buffer pages. ++ * ++ * To resolve the race, we can serialize them by disabling the ++ * tasklet when the latter is running here. ++ */ ++ tasklet = hv_context.event_dpc[channel->target_cpu]; ++ tasklet_disable(tasklet); ++ + channel->state = CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + /* Stop callback and cancel the timer asap */ +@@ -525,7 +539,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + * If we failed to post the close msg, + * it is perhaps better to leak memory. + */ +- return ret; ++ goto out; + } + + /* Tear down the gpadl for the channel's ring buffer */ +@@ -538,7 +552,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + * If we failed to teardown gpadl, + * it is perhaps better to leak memory. + */ +- return ret; ++ goto out; + } + } + +@@ -555,6 +569,9 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + if (channel->rescind) + hv_process_channel_removal(channel, + channel->offermsg.child_relid); ++out: ++ tasklet_enable(tasklet); ++ + return ret; + } + +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0011-Drivers-hv-vmbus-do-sanity-check-of-channel-state-in.patch b/alpine/kernel/patches/0011-Drivers-hv-vmbus-do-sanity-check-of-channel-state-in.patch new file mode 100644 index 000000000..10c5342aa --- /dev/null +++ b/alpine/kernel/patches/0011-Drivers-hv-vmbus-do-sanity-check-of-channel-state-in.patch @@ -0,0 +1,42 @@ +From 00375a20748490730b2f004bfe44e83abecec5f1 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 14 Dec 2015 16:01:48 -0800 +Subject: [PATCH 11/25] Drivers: hv: vmbus: do sanity check of channel state in + vmbus_close_internal() + +This fixes an incorrect assumption of channel state in the function. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 64b7faf903dae2df94d89edf2c688b16751800e4) +--- + drivers/hv/channel.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 6a90c69..b3c14ca 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -512,6 +512,18 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + tasklet = hv_context.event_dpc[channel->target_cpu]; + tasklet_disable(tasklet); + ++ /* ++ * In case a device driver's probe() fails (e.g., ++ * util_probe() -> vmbus_open() returns -ENOMEM) and the device is ++ * rescinded later (e.g., we dynamically disble an Integrated Service ++ * in Hyper-V Manager), the driver's remove() invokes vmbus_close(): ++ * here we should skip most of the below cleanup work. 
++ */ ++ if (channel->state != CHANNEL_OPENED_STATE) { ++ ret = -EINVAL; ++ goto out; ++ } ++ + channel->state = CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + /* Stop callback and cancel the timer asap */ +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0012-Drivers-hv-vmbus-fix-rescind-offer-handling-for-devi.patch b/alpine/kernel/patches/0012-Drivers-hv-vmbus-fix-rescind-offer-handling-for-devi.patch new file mode 100644 index 000000000..e78aa0276 --- /dev/null +++ b/alpine/kernel/patches/0012-Drivers-hv-vmbus-fix-rescind-offer-handling-for-devi.patch @@ -0,0 +1,122 @@ +From 53cd041cabf572ec98d5b911abfff1a3baf1ccaa Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 14 Dec 2015 16:01:49 -0800 +Subject: [PATCH 12/25] Drivers: hv: vmbus: fix rescind-offer handling for + device without a driver + +In the path vmbus_onoffer_rescind() -> vmbus_device_unregister() -> +device_unregister() -> ... -> __device_release_driver(), we can see for a +device without a driver loaded: dev->driver is NULL, so +dev->bus->remove(dev), namely vmbus_remove(), isn't invoked. + +As a result, vmbus_remove() -> hv_process_channel_removal() isn't invoked +and some cleanups(like sending a CHANNELMSG_RELID_RELEASED message to the +host) aren't done. + +We can demo the issue this way: +1. rmmod hv_utils; +2. disable the Heartbeat Integration Service in Hyper-V Manager and lsvmbus +shows the device disappears. +3. re-enable the Heartbeat in Hyper-V Manager and modprobe hv_utils, but +lsvmbus shows the device can't appear again. +This is because, the host thinks the VM hasn't released the relid, so can't +re-offer the device to the VM. + +We can fix the issue by moving hv_process_channel_removal() +from vmbus_close_internal() to vmbus_device_release(), since the latter is +always invoked on device_unregister(), whether or not the dev has a driver +loaded. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 34c6801e3310ad286c7bb42bc88d42926b8f99bf) +--- + drivers/hv/channel.c | 6 ------ + drivers/hv/channel_mgmt.c | 6 +++--- + drivers/hv/vmbus_drv.c | 15 +++------------ + 3 files changed, 6 insertions(+), 21 deletions(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index b3c14ca..2889d97 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -575,12 +575,6 @@ static int vmbus_close_internal(struct vmbus_channel *channel) + free_pages((unsigned long)channel->ringbuffer_pages, + get_order(channel->ringbuffer_pagecount * PAGE_SIZE)); + +- /* +- * If the channel has been rescinded; process device removal. +- */ +- if (channel->rescind) +- hv_process_channel_removal(channel, +- channel->offermsg.child_relid); + out: + tasklet_enable(tasklet); + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 652afd1..bd2e9f6 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -191,6 +191,8 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) + if (channel == NULL) + return; + ++ BUG_ON(!channel->rescind); ++ + if (channel->target_cpu != get_cpu()) { + put_cpu(); + smp_call_function_single(channel->target_cpu, +@@ -230,9 +232,7 @@ void vmbus_free_channels(void) + + list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list, + listentry) { +- /* if we don't set rescind to true, vmbus_close_internal() +- * won't invoke hv_process_channel_removal(). 
+- */ ++ /* hv_process_channel_removal() needs this */ + channel->rescind = true; + + vmbus_device_unregister(channel->device_obj); +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index f19b6f7..7e46a48 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -602,23 +602,11 @@ static int vmbus_remove(struct device *child_device) + { + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); +- u32 relid = dev->channel->offermsg.child_relid; + + if (child_device->driver) { + drv = drv_to_hv_drv(child_device->driver); + if (drv->remove) + drv->remove(dev); +- else { +- hv_process_channel_removal(dev->channel, relid); +- pr_err("remove not set for driver %s\n", +- dev_name(child_device)); +- } +- } else { +- /* +- * We don't have a driver for this device; deal with the +- * rescind message by removing the channel. +- */ +- hv_process_channel_removal(dev->channel, relid); + } + + return 0; +@@ -653,7 +641,10 @@ static void vmbus_shutdown(struct device *child_device) + static void vmbus_device_release(struct device *device) + { + struct hv_device *hv_dev = device_to_hv_device(device); ++ struct vmbus_channel *channel = hv_dev->channel; + ++ hv_process_channel_removal(channel, ++ channel->offermsg.child_relid); + kfree(hv_dev); + + } +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0013-Drivers-hv-vmbus-release-relid-on-error-in-vmbus_pro.patch b/alpine/kernel/patches/0013-Drivers-hv-vmbus-release-relid-on-error-in-vmbus_pro.patch new file mode 100644 index 000000000..f080bde66 --- /dev/null +++ b/alpine/kernel/patches/0013-Drivers-hv-vmbus-release-relid-on-error-in-vmbus_pro.patch @@ -0,0 +1,74 @@ +From 613d19efd48c06602018c0e7c6b4bf8d191105cd Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 14 Dec 2015 16:01:50 -0800 +Subject: [PATCH 13/25] Drivers: hv: vmbus: release relid on error in + vmbus_process_offer() + +We want to simplify vmbus_onoffer_rescind() by not invoking +hv_process_channel_removal(NULL, ...). + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. 
Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit f52078cf5711ce47c113a58702b35c8ff5f212f5) +--- + drivers/hv/channel_mgmt.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index bd2e9f6..df76a71 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -177,19 +177,22 @@ static void percpu_channel_deq(void *arg) + } + + +-void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) ++static void vmbus_release_relid(u32 relid) + { + struct vmbus_channel_relid_released msg; +- unsigned long flags; +- struct vmbus_channel *primary_channel; + + memset(&msg, 0, sizeof(struct vmbus_channel_relid_released)); + msg.child_relid = relid; + msg.header.msgtype = CHANNELMSG_RELID_RELEASED; + vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released)); ++} + +- if (channel == NULL) +- return; ++void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) ++{ ++ unsigned long flags; ++ struct vmbus_channel *primary_channel; ++ ++ vmbus_release_relid(relid); + + BUG_ON(!channel->rescind); + +@@ -336,6 +339,8 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + return; + + err_deq_chan: ++ vmbus_release_relid(newchannel->offermsg.child_relid); ++ + spin_lock_irqsave(&vmbus_connection.channel_lock, flags); + list_del(&newchannel->listentry); + spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); +@@ -585,7 +590,11 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + channel = relid2channel(rescind->child_relid); + + if (channel == NULL) { +- hv_process_channel_removal(NULL, rescind->child_relid); ++ /* ++ * This is very impossible, because in ++ * vmbus_process_offer(), we have already invoked ++ * vmbus_release_relid() on error. ++ */ + return; + } + +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0014-Drivers-hv-vmbus-channge-vmbus_connection.channel_lo.patch b/alpine/kernel/patches/0014-Drivers-hv-vmbus-channge-vmbus_connection.channel_lo.patch new file mode 100644 index 000000000..c13702f70 --- /dev/null +++ b/alpine/kernel/patches/0014-Drivers-hv-vmbus-channge-vmbus_connection.channel_lo.patch @@ -0,0 +1,116 @@ +From b9a136e91171bea99a140195ccb4bbea2a65551d Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 14 Dec 2015 16:01:51 -0800 +Subject: [PATCH 14/25] Drivers: hv: vmbus: channge + vmbus_connection.channel_lock to mutex + +spinlock is unnecessary here. +mutex is enough. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. 
Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit d6f591e339d23f434efda11917da511870891472) +--- + drivers/hv/channel_mgmt.c | 12 ++++++------ + drivers/hv/connection.c | 7 +++---- + drivers/hv/hyperv_vmbus.h | 2 +- + 3 files changed, 10 insertions(+), 11 deletions(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index df76a71..bd4f084 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -206,9 +206,9 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) + } + + if (channel->primary_channel == NULL) { +- spin_lock_irqsave(&vmbus_connection.channel_lock, flags); ++ mutex_lock(&vmbus_connection.channel_mutex); + list_del(&channel->listentry); +- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); ++ mutex_unlock(&vmbus_connection.channel_mutex); + + primary_channel = channel; + } else { +@@ -253,7 +253,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + unsigned long flags; + + /* Make sure this is a new offer */ +- spin_lock_irqsave(&vmbus_connection.channel_lock, flags); ++ mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!uuid_le_cmp(channel->offermsg.offer.if_type, +@@ -269,7 +269,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + list_add_tail(&newchannel->listentry, + &vmbus_connection.chn_list); + +- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); ++ mutex_unlock(&vmbus_connection.channel_mutex); + + if (!fnew) { + /* +@@ -341,9 +341,9 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + err_deq_chan: + vmbus_release_relid(newchannel->offermsg.child_relid); + +- spin_lock_irqsave(&vmbus_connection.channel_lock, flags); ++ mutex_lock(&vmbus_connection.channel_mutex); + list_del(&newchannel->listentry); +- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); ++ mutex_unlock(&vmbus_connection.channel_mutex); + + if (newchannel->target_cpu != get_cpu()) { + put_cpu(); +diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c +index 4fc2e88..521f48e 100644 +--- a/drivers/hv/connection.c ++++ b/drivers/hv/connection.c +@@ -146,7 +146,7 @@ int vmbus_connect(void) + spin_lock_init(&vmbus_connection.channelmsg_lock); + + INIT_LIST_HEAD(&vmbus_connection.chn_list); +- spin_lock_init(&vmbus_connection.channel_lock); ++ mutex_init(&vmbus_connection.channel_mutex); + + /* + * Setup the vmbus event connection for channel interrupt +@@ -282,11 +282,10 @@ struct vmbus_channel *relid2channel(u32 relid) + { + struct vmbus_channel *channel; + struct vmbus_channel *found_channel = NULL; +- unsigned long flags; + struct list_head *cur, *tmp; + struct vmbus_channel *cur_sc; + +- spin_lock_irqsave(&vmbus_connection.channel_lock, flags); ++ mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (channel->offermsg.child_relid == relid) { + found_channel = channel; +@@ -305,7 +304,7 @@ struct vmbus_channel *relid2channel(u32 relid) + } + } + } +- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); ++ mutex_unlock(&vmbus_connection.channel_mutex); + + return found_channel; + } +diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h +index 3782636..d9937be 100644 +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -678,7 +678,7 @@ struct vmbus_connection { + + /* List of channels */ + struct list_head chn_list; +- spinlock_t 
channel_lock; ++ struct mutex channel_mutex; + + struct workqueue_struct *work_queue; + }; +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0015-Drivers-hv-vmbus-add-a-helper-function-to-set-a-chan.patch b/alpine/kernel/patches/0015-Drivers-hv-vmbus-add-a-helper-function-to-set-a-chan.patch new file mode 100644 index 000000000..874326ac8 --- /dev/null +++ b/alpine/kernel/patches/0015-Drivers-hv-vmbus-add-a-helper-function-to-set-a-chan.patch @@ -0,0 +1,36 @@ +From 37e0f1616a680b0b209c3555812c0691dacd74e0 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:37 -0800 +Subject: [PATCH 15/25] Drivers: hv: vmbus: add a helper function to set a + channel's pending send size + +This will be used by the coming net/hvsock driver. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 3c75354d043ad546148d6992e40033ecaefc5ea5) +--- + include/linux/hyperv.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index ae6a711..fda6310 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -792,6 +792,12 @@ static inline void *get_per_channel_state(struct vmbus_channel *c) + return c->per_channel_state; + } + ++static inline void set_channel_pending_send_size(struct vmbus_channel *c, ++ u32 size) ++{ ++ c->outbound.ring_buffer->pending_send_sz = size; ++} ++ + void vmbus_onmessage(void *context); + + int vmbus_request_offers(void); +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0016-Drivers-hv-vmbus-define-the-new-offer-type-for-Hyper.patch b/alpine/kernel/patches/0016-Drivers-hv-vmbus-define-the-new-offer-type-for-Hyper.patch new file mode 100644 index 000000000..210f8bdc0 --- /dev/null +++ b/alpine/kernel/patches/0016-Drivers-hv-vmbus-define-the-new-offer-type-for-Hyper.patch @@ -0,0 +1,44 @@ +From 7d695c9e75755b005a7f45f99dfd7d3bb641e3a8 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:38 -0800 +Subject: [PATCH 16/25] Drivers: hv: vmbus: define the new offer type for + Hyper-V socket (hvsock) + +A helper function is also added. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. 
Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit e8d6ca023efce3bd80050dcd9e708ee3cf8babd4) +--- + include/linux/hyperv.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index fda6310..9fb2130 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -237,6 +237,7 @@ struct vmbus_channel_offer { + #define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 + #define VMBUS_CHANNEL_PARENT_OFFER 0x200 + #define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 ++#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 + + struct vmpacket_descriptor { + u16 type; +@@ -771,6 +772,12 @@ struct vmbus_channel { + enum hv_signal_policy signal_policy; + }; + ++static inline bool is_hvsock_channel(const struct vmbus_channel *c) ++{ ++ return !!(c->offermsg.offer.chn_flags & ++ VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); ++} ++ + static inline void set_channel_signal_state(struct vmbus_channel *c, + enum hv_signal_policy policy) + { +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0017-Drivers-hv-vmbus-vmbus_sendpacket_ctl-hvsock-avoid-u.patch b/alpine/kernel/patches/0017-Drivers-hv-vmbus-vmbus_sendpacket_ctl-hvsock-avoid-u.patch new file mode 100644 index 000000000..2d916c9e3 --- /dev/null +++ b/alpine/kernel/patches/0017-Drivers-hv-vmbus-vmbus_sendpacket_ctl-hvsock-avoid-u.patch @@ -0,0 +1,45 @@ +From 8507cfd5b7af092d5cc5e99ff9852b3bc46c48c0 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:39 -0800 +Subject: [PATCH 17/25] Drivers: hv: vmbus: vmbus_sendpacket_ctl: hvsock: avoid + unnecessary signaling + +When the hvsock channel's outbound ringbuffer is full (i.e., +hv_ringbuffer_write() returns -EAGAIN), we should avoid the unnecessary +signaling the host. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 5f363bc38f810d238d1e8b19998625ddec3b8138) +--- + drivers/hv/channel.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 2889d97..a7f9e3e 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -659,6 +659,9 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, + * If we cannot write to the ring-buffer; signal the host + * even if we may not have written anything. This is a rare + * enough condition that it should not matter. ++ * NOTE: in this case, the hvsock channel is an exception, because ++ * it looks the host side's hvsock implementation has a throttling ++ * mechanism which can hurt the performance otherwise. 
+ */ + + if (channel->signal_policy) +@@ -666,7 +669,8 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, + else + kick_q = true; + +- if (((ret == 0) && kick_q && signal) || (ret)) ++ if (((ret == 0) && kick_q && signal) || ++ (ret && !is_hvsock_channel(channel))) + vmbus_setevent(channel); + + return ret; +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0018-Drivers-hv-vmbus-define-a-new-VMBus-message-type-for.patch b/alpine/kernel/patches/0018-Drivers-hv-vmbus-define-a-new-VMBus-message-type-for.patch new file mode 100644 index 000000000..b3a1fa2fb --- /dev/null +++ b/alpine/kernel/patches/0018-Drivers-hv-vmbus-define-a-new-VMBus-message-type-for.patch @@ -0,0 +1,101 @@ +From 746cdb5f4c824ef3af9d12909818a077a0cf303c Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:40 -0800 +Subject: [PATCH 18/25] Drivers: hv: vmbus: define a new VMBus message type for + hvsock + +A function to send the type of message is also added. + +The coming net/hvsock driver will use this function to proactively request +the host to offer a VMBus channel for a new hvsock connection. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 5c23a1a5c60b0f472cfa61cd7d8279f8aaeb5b64) +--- + drivers/hv/channel.c | 15 +++++++++++++++ + drivers/hv/channel_mgmt.c | 4 ++++ + include/linux/hyperv.h | 13 +++++++++++++ + 3 files changed, 32 insertions(+) + +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index a7f9e3e..239b072 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -219,6 +219,21 @@ error0: + } + EXPORT_SYMBOL_GPL(vmbus_open); + ++/* Used for Hyper-V Socket: a guest client's connect() to the host */ ++int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id, ++ const uuid_le *shv_host_servie_id) ++{ ++ struct vmbus_channel_tl_connect_request conn_msg; ++ ++ memset(&conn_msg, 0, sizeof(conn_msg)); ++ conn_msg.header.msgtype = CHANNELMSG_TL_CONNECT_REQUEST; ++ conn_msg.guest_endpoint_id = *shv_guest_servie_id; ++ conn_msg.host_service_id = *shv_host_servie_id; ++ ++ return vmbus_post_msg(&conn_msg, sizeof(conn_msg)); ++} ++EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); ++ + /* + * create_gpadl_header - Creates a gpadl for the specified buffer + */ +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index bd4f084..4d61f41 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -820,6 +820,10 @@ struct vmbus_channel_message_table_entry + {CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response}, + {CHANNELMSG_UNLOAD, 0, NULL}, + {CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response}, ++ {CHANNELMSG_18, 0, NULL}, ++ {CHANNELMSG_19, 0, NULL}, ++ {CHANNELMSG_20, 0, NULL}, ++ {CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL}, + }; + + /* +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index 9fb2130..3f485a4 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -394,6 +394,10 @@ enum vmbus_channel_message_type { + CHANNELMSG_VERSION_RESPONSE = 15, + CHANNELMSG_UNLOAD = 16, + CHANNELMSG_UNLOAD_RESPONSE = 17, ++ CHANNELMSG_18 = 18, ++ CHANNELMSG_19 = 19, ++ CHANNELMSG_20 = 20, ++ CHANNELMSG_TL_CONNECT_REQUEST = 21, + CHANNELMSG_COUNT + }; + +@@ -564,6 +568,13 @@ struct vmbus_channel_initiate_contact { + u64 monitor_page2; + } __packed; + ++/* Hyper-V socket: guest's connect()-ing to host */ ++struct vmbus_channel_tl_connect_request { ++ struct vmbus_channel_message_header header; ++ uuid_le guest_endpoint_id; 
++ uuid_le host_service_id; ++} __packed; ++ + struct vmbus_channel_version_response { + struct vmbus_channel_message_header header; + u8 version_supported; +@@ -1276,4 +1287,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); + + extern __u32 vmbus_proto_version; + ++int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id, ++ const uuid_le *shv_host_servie_id); + #endif /* _HYPERV_H */ +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0019-Drivers-hv-vmbus-add-a-hvsock-flag-in-struct-hv_driv.patch b/alpine/kernel/patches/0019-Drivers-hv-vmbus-add-a-hvsock-flag-in-struct-hv_driv.patch new file mode 100644 index 000000000..5546345c1 --- /dev/null +++ b/alpine/kernel/patches/0019-Drivers-hv-vmbus-add-a-hvsock-flag-in-struct-hv_driv.patch @@ -0,0 +1,64 @@ +From fb6b14a429dae008b7fee772a1e27cb09db459b7 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:41 -0800 +Subject: [PATCH 19/25] Drivers: hv: vmbus: add a hvsock flag in struct + hv_driver + +Only the coming hv_sock driver has a "true" value for this flag. + +We treat the hvsock offers/channels as special VMBus devices. +Since the hv_sock driver handles all the hvsock offers/channels, we need to +tweak vmbus_match() for hv_sock driver, so we introduce this flag. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 8981da320a11217589aa3c50f9e891bcdef07ece) +--- + drivers/hv/vmbus_drv.c | 4 ++++ + include/linux/hyperv.h | 14 ++++++++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 7e46a48..7d607ad 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -563,6 +563,10 @@ static int vmbus_match(struct device *device, struct device_driver *driver) + struct hv_driver *drv = drv_to_hv_drv(driver); + struct hv_device *hv_dev = device_to_hv_device(device); + ++ /* The hv_sock driver handles all hv_sock offers. */ ++ if (is_hvsock_channel(hv_dev->channel)) ++ return drv->hvsock; ++ + if (hv_vmbus_get_id(drv->id_table, hv_dev->dev_type.b)) + return 1; + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index 3f485a4..9ee79af 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -966,6 +966,20 @@ extern void vmbus_ontimer(unsigned long data); + struct hv_driver { + const char *name; + ++ /* ++ * A hvsock offer, which has a VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER ++ * channel flag, actually doesn't mean a synthetic device because the ++ * offer's if_type/if_instance can change for every new hvsock ++ * connection. ++ * ++ * However, to facilitate the notification of new-offer/rescind-offer ++ * from vmbus driver to hvsock driver, we can handle hvsock offer as ++ * a special vmbus device, and hence we need the below flag to ++ * indicate if the driver is the hvsock driver or not: we need to ++ * specially treat the hvosck offer & driver in vmbus_match(). 
++ */ ++ bool hvsock; ++ + /* the device type supported by this driver */ + uuid_le dev_type; + const struct hv_vmbus_device_id *id_table; +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0020-Drivers-hv-vmbus-add-a-per-channel-rescind-callback.patch b/alpine/kernel/patches/0020-Drivers-hv-vmbus-add-a-per-channel-rescind-callback.patch new file mode 100644 index 000000000..cd3eb9128 --- /dev/null +++ b/alpine/kernel/patches/0020-Drivers-hv-vmbus-add-a-per-channel-rescind-callback.patch @@ -0,0 +1,72 @@ +From 4a2d55757c137c2e574500227cb2efe77a26ee3a Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:42 -0800 +Subject: [PATCH 20/25] Drivers: hv: vmbus: add a per-channel rescind callback + +This will be used by the coming hv_sock driver. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 499e8401a515d04daa986b995da710d2b9737764) +--- + drivers/hv/channel_mgmt.c | 11 +++++++++++ + include/linux/hyperv.h | 9 +++++++++ + 2 files changed, 20 insertions(+) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 4d61f41..421e3dd 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -603,6 +603,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + spin_unlock_irqrestore(&channel->lock, flags); + + if (channel->device_obj) { ++ if (channel->chn_rescind_callback) { ++ channel->chn_rescind_callback(channel); ++ return; ++ } + /* + * We will have to unregister this device from the + * driver core. +@@ -972,3 +976,10 @@ bool vmbus_are_subchannels_present(struct vmbus_channel *primary) + return ret; + } + EXPORT_SYMBOL_GPL(vmbus_are_subchannels_present); ++ ++void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel, ++ void (*chn_rescind_cb)(struct vmbus_channel *)) ++{ ++ channel->chn_rescind_callback = chn_rescind_cb; ++} ++EXPORT_SYMBOL_GPL(vmbus_set_chn_rescind_callback); +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index 9ee79af..09e9ec1 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -742,6 +742,12 @@ struct vmbus_channel { + void (*sc_creation_callback)(struct vmbus_channel *new_sc); + + /* ++ * Channel rescind callback. Some channels (the hvsock ones), need to ++ * register a callback which is invoked in vmbus_onoffer_rescind(). ++ */ ++ void (*chn_rescind_callback)(struct vmbus_channel *channel); ++ ++ /* + * The spinlock to protect the structure. It is being used to protect + * test-and-set access to various attributes of the structure as well + * as all sc_list operations. +@@ -827,6 +833,9 @@ int vmbus_request_offers(void); + void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel, + void (*sc_cr_cb)(struct vmbus_channel *new_sc)); + ++void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel, ++ void (*chn_rescind_cb)(struct vmbus_channel *)); ++ + /* + * Retrieve the (sub) channel on which to send an outgoing request. 
+ * When a primary channel has multiple sub-channels, we choose a +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0021-Drivers-hv-vmbus-add-an-API-vmbus_hvsock_device_unre.patch b/alpine/kernel/patches/0021-Drivers-hv-vmbus-add-an-API-vmbus_hvsock_device_unre.patch new file mode 100644 index 000000000..554a7dd77 --- /dev/null +++ b/alpine/kernel/patches/0021-Drivers-hv-vmbus-add-an-API-vmbus_hvsock_device_unre.patch @@ -0,0 +1,153 @@ +From b6c9f23164d3e7460d8983d27f2df194ab5e9a0b Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Wed, 27 Jan 2016 22:29:43 -0800 +Subject: [PATCH 21/25] Drivers: hv: vmbus: add an API + vmbus_hvsock_device_unregister() + +The hvsock driver needs this API to release all the resources related +to the channel. + +Signed-off-by: Dexuan Cui +Signed-off-by: K. Y. Srinivasan +Signed-off-by: Greg Kroah-Hartman +(cherry picked from commit 85d9aa705184a4504d0330017e3956fcdae8a9d6) +--- + drivers/hv/channel_mgmt.c | 33 ++++++++++++++++++++++++++++----- + drivers/hv/connection.c | 4 ++-- + include/linux/hyperv.h | 2 ++ + 3 files changed, 32 insertions(+), 7 deletions(-) + +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index 421e3dd..0fe9665 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -195,6 +195,7 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) + vmbus_release_relid(relid); + + BUG_ON(!channel->rescind); ++ BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + + if (channel->target_cpu != get_cpu()) { + put_cpu(); +@@ -206,9 +207,7 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) + } + + if (channel->primary_channel == NULL) { +- mutex_lock(&vmbus_connection.channel_mutex); + list_del(&channel->listentry); +- mutex_unlock(&vmbus_connection.channel_mutex); + + primary_channel = channel; + } else { +@@ -251,6 +250,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + struct vmbus_channel *channel; + bool fnew = true; + unsigned long flags; ++ int ret; + + /* Make sure this is a new offer */ + mutex_lock(&vmbus_connection.channel_mutex); +@@ -330,7 +330,11 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) + * binding which eventually invokes the device driver's AddDevice() + * method. + */ +- if (vmbus_device_register(newchannel->device_obj) != 0) { ++ mutex_lock(&vmbus_connection.channel_mutex); ++ ret = vmbus_device_register(newchannel->device_obj); ++ mutex_unlock(&vmbus_connection.channel_mutex); ++ ++ if (ret != 0) { + pr_err("unable to add child device object (relid %d)\n", + newchannel->offermsg.child_relid); + kfree(newchannel->device_obj); +@@ -587,6 +591,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + struct device *dev; + + rescind = (struct vmbus_channel_rescind_offer *)hdr; ++ ++ mutex_lock(&vmbus_connection.channel_mutex); + channel = relid2channel(rescind->child_relid); + + if (channel == NULL) { +@@ -595,7 +601,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + * vmbus_process_offer(), we have already invoked + * vmbus_release_relid() on error. 
+ */ +- return; ++ goto out; + } + + spin_lock_irqsave(&channel->lock, flags); +@@ -605,7 +611,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + if (channel->device_obj) { + if (channel->chn_rescind_callback) { + channel->chn_rescind_callback(channel); +- return; ++ goto out; + } + /* + * We will have to unregister this device from the +@@ -620,8 +626,25 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) + hv_process_channel_removal(channel, + channel->offermsg.child_relid); + } ++ ++out: ++ mutex_unlock(&vmbus_connection.channel_mutex); + } + ++void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) ++{ ++ mutex_lock(&vmbus_connection.channel_mutex); ++ ++ BUG_ON(!is_hvsock_channel(channel)); ++ ++ channel->rescind = true; ++ vmbus_device_unregister(channel->device_obj); ++ ++ mutex_unlock(&vmbus_connection.channel_mutex); ++} ++EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); ++ ++ + /* + * vmbus_onoffers_delivered - + * This is invoked when all offers have been delivered. +diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c +index 521f48e..09c08b5 100644 +--- a/drivers/hv/connection.c ++++ b/drivers/hv/connection.c +@@ -285,7 +285,8 @@ struct vmbus_channel *relid2channel(u32 relid) + struct list_head *cur, *tmp; + struct vmbus_channel *cur_sc; + +- mutex_lock(&vmbus_connection.channel_mutex); ++ BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); ++ + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (channel->offermsg.child_relid == relid) { + found_channel = channel; +@@ -304,7 +305,6 @@ struct vmbus_channel *relid2channel(u32 relid) + } + } + } +- mutex_unlock(&vmbus_connection.channel_mutex); + + return found_channel; + } +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index 09e9ec1..af7ee0a 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1043,6 +1043,8 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver, + const char *mod_name); + void vmbus_driver_unregister(struct hv_driver *hv_driver); + ++void vmbus_hvsock_device_unregister(struct vmbus_channel *channel); ++ + int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, + resource_size_t min, resource_size_t max, + resource_size_t size, resource_size_t align, +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0022-kcm-Kernel-Connection-Multiplexor-module.patch b/alpine/kernel/patches/0022-kcm-Kernel-Connection-Multiplexor-module.patch new file mode 100644 index 000000000..48e1c0bd9 --- /dev/null +++ b/alpine/kernel/patches/0022-kcm-Kernel-Connection-Multiplexor-module.patch @@ -0,0 +1,2315 @@ +From f483aa8ef1890f82d6a362d296c21786c5ee9f30 Mon Sep 17 00:00:00 2001 +From: Tom Herbert +Date: Mon, 7 Mar 2016 14:11:06 -0800 +Subject: [PATCH 22/25] kcm: Kernel Connection Multiplexor module + +This module implements the Kernel Connection Multiplexor. + +Kernel Connection Multiplexor (KCM) is a facility that provides a +message based interface over TCP for generic application protocols. +With KCM an application can efficiently send and receive application +protocol messages over TCP using datagram sockets. + +For more information see the included Documentation/networking/kcm.txt + +Signed-off-by: Tom Herbert +Signed-off-by: David S. 
Miller +(cherry picked from commit ab7ac4eb9832e32a09f4e8042705484d2fb0aad3) +--- + include/linux/socket.h | 6 +- + include/net/kcm.h | 125 +++ + include/uapi/linux/kcm.h | 40 + + net/Kconfig | 1 + + net/Makefile | 1 + + net/kcm/Kconfig | 10 + + net/kcm/Makefile | 3 + + net/kcm/kcmsock.c | 2016 ++++++++++++++++++++++++++++++++++++++++++++++ + 8 files changed, 2201 insertions(+), 1 deletion(-) + create mode 100644 include/net/kcm.h + create mode 100644 include/uapi/linux/kcm.h + create mode 100644 net/kcm/Kconfig + create mode 100644 net/kcm/Makefile + create mode 100644 net/kcm/kcmsock.c + +diff --git a/include/linux/socket.h b/include/linux/socket.h +index 5bf59c8..4e1ea53 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -200,7 +200,9 @@ struct ucred { + #define AF_ALG 38 /* Algorithm sockets */ + #define AF_NFC 39 /* NFC sockets */ + #define AF_VSOCK 40 /* vSockets */ +-#define AF_MAX 41 /* For now.. */ ++#define AF_KCM 41 /* Kernel Connection Multiplexor*/ ++ ++#define AF_MAX 42 /* For now.. */ + + /* Protocol families, same as address families. */ + #define PF_UNSPEC AF_UNSPEC +@@ -246,6 +248,7 @@ struct ucred { + #define PF_ALG AF_ALG + #define PF_NFC AF_NFC + #define PF_VSOCK AF_VSOCK ++#define PF_KCM AF_KCM + #define PF_MAX AF_MAX + + /* Maximum queue length specifiable by listen. */ +@@ -322,6 +325,7 @@ struct ucred { + #define SOL_CAIF 278 + #define SOL_ALG 279 + #define SOL_NFC 280 ++#define SOL_KCM 281 + + /* IPX options */ + #define IPX_TYPE 1 +diff --git a/include/net/kcm.h b/include/net/kcm.h +new file mode 100644 +index 0000000..1bcae39 +--- /dev/null ++++ b/include/net/kcm.h +@@ -0,0 +1,125 @@ ++/* ++ * Kernel Connection Multiplexor ++ * ++ * Copyright (c) 2016 Tom Herbert ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation. 
++ */ ++ ++#ifndef __NET_KCM_H_ ++#define __NET_KCM_H_ ++ ++#include ++#include ++#include ++ ++extern unsigned int kcm_net_id; ++ ++struct kcm_tx_msg { ++ unsigned int sent; ++ unsigned int fragidx; ++ unsigned int frag_offset; ++ unsigned int msg_flags; ++ struct sk_buff *frag_skb; ++ struct sk_buff *last_skb; ++}; ++ ++struct kcm_rx_msg { ++ int full_len; ++ int accum_len; ++ int offset; ++}; ++ ++/* Socket structure for KCM client sockets */ ++struct kcm_sock { ++ struct sock sk; ++ struct kcm_mux *mux; ++ struct list_head kcm_sock_list; ++ int index; ++ u32 done : 1; ++ struct work_struct done_work; ++ ++ /* Transmit */ ++ struct kcm_psock *tx_psock; ++ struct work_struct tx_work; ++ struct list_head wait_psock_list; ++ struct sk_buff *seq_skb; ++ ++ /* Don't use bit fields here, these are set under different locks */ ++ bool tx_wait; ++ bool tx_wait_more; ++ ++ /* Receive */ ++ struct kcm_psock *rx_psock; ++ struct list_head wait_rx_list; /* KCMs waiting for receiving */ ++ bool rx_wait; ++ u32 rx_disabled : 1; ++}; ++ ++struct bpf_prog; ++ ++/* Structure for an attached lower socket */ ++struct kcm_psock { ++ struct sock *sk; ++ struct kcm_mux *mux; ++ int index; ++ ++ u32 tx_stopped : 1; ++ u32 rx_stopped : 1; ++ u32 done : 1; ++ u32 unattaching : 1; ++ ++ void (*save_state_change)(struct sock *sk); ++ void (*save_data_ready)(struct sock *sk); ++ void (*save_write_space)(struct sock *sk); ++ ++ struct list_head psock_list; ++ ++ /* Receive */ ++ struct sk_buff *rx_skb_head; ++ struct sk_buff **rx_skb_nextp; ++ struct sk_buff *ready_rx_msg; ++ struct list_head psock_ready_list; ++ struct work_struct rx_work; ++ struct delayed_work rx_delayed_work; ++ struct bpf_prog *bpf_prog; ++ struct kcm_sock *rx_kcm; ++ ++ /* Transmit */ ++ struct kcm_sock *tx_kcm; ++ struct list_head psock_avail_list; ++}; ++ ++/* Per net MUX list */ ++struct kcm_net { ++ struct mutex mutex; ++ struct list_head mux_list; ++ int count; ++}; ++ ++/* Structure for a MUX */ ++struct kcm_mux { ++ struct list_head kcm_mux_list; ++ struct rcu_head rcu; ++ struct kcm_net *knet; ++ ++ struct list_head kcm_socks; /* All KCM sockets on MUX */ ++ int kcm_socks_cnt; /* Total KCM socket count for MUX */ ++ struct list_head psocks; /* List of all psocks on MUX */ ++ int psocks_cnt; /* Total attached sockets */ ++ ++ /* Receive */ ++ spinlock_t rx_lock ____cacheline_aligned_in_smp; ++ struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */ ++ struct list_head psocks_ready; /* List of psocks with a msg ready */ ++ struct sk_buff_head rx_hold_queue; ++ ++ /* Transmit */ ++ spinlock_t lock ____cacheline_aligned_in_smp; /* TX and mux locking */ ++ struct list_head psocks_avail; /* List of available psocks */ ++ struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ ++}; ++ ++#endif /* __NET_KCM_H_ */ +diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h +new file mode 100644 +index 0000000..a5a53094 +--- /dev/null ++++ b/include/uapi/linux/kcm.h +@@ -0,0 +1,40 @@ ++/* ++ * Kernel Connection Multiplexor ++ * ++ * Copyright (c) 2016 Tom Herbert ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation. ++ * ++ * User API to clone KCM sockets and attach transport socket to a KCM ++ * multiplexor. 
++ */ ++ ++#ifndef KCM_KERNEL_H ++#define KCM_KERNEL_H ++ ++struct kcm_attach { ++ int fd; ++ int bpf_fd; ++}; ++ ++struct kcm_unattach { ++ int fd; ++}; ++ ++struct kcm_clone { ++ int fd; ++}; ++ ++#define SIOCKCMATTACH (SIOCPROTOPRIVATE + 0) ++#define SIOCKCMUNATTACH (SIOCPROTOPRIVATE + 1) ++#define SIOCKCMCLONE (SIOCPROTOPRIVATE + 2) ++ ++#define KCMPROTO_CONNECTED 0 ++ ++/* Socket options */ ++#define KCM_RECV_DISABLE 1 ++ ++#endif ++ +diff --git a/net/Kconfig b/net/Kconfig +index 127da94..b8439e6 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -351,6 +351,7 @@ source "net/can/Kconfig" + source "net/irda/Kconfig" + source "net/bluetooth/Kconfig" + source "net/rxrpc/Kconfig" ++source "net/kcm/Kconfig" + + config FIB_RULES + bool +diff --git a/net/Makefile b/net/Makefile +index a5d0409..81d1411 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/ + obj-$(CONFIG_BT) += bluetooth/ + obj-$(CONFIG_SUNRPC) += sunrpc/ + obj-$(CONFIG_AF_RXRPC) += rxrpc/ ++obj-$(CONFIG_AF_KCM) += kcm/ + obj-$(CONFIG_ATM) += atm/ + obj-$(CONFIG_L2TP) += l2tp/ + obj-$(CONFIG_DECNET) += decnet/ +diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig +new file mode 100644 +index 0000000..5db94d9 +--- /dev/null ++++ b/net/kcm/Kconfig +@@ -0,0 +1,10 @@ ++ ++config AF_KCM ++ tristate "KCM sockets" ++ depends on INET ++ select BPF_SYSCALL ++ ---help--- ++ KCM (Kernel Connection Multiplexor) sockets provide a method ++ for multiplexing messages of a message based application ++ protocol over kernel connectons (e.g. TCP connections). ++ +diff --git a/net/kcm/Makefile b/net/kcm/Makefile +new file mode 100644 +index 0000000..cb525f7 +--- /dev/null ++++ b/net/kcm/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_AF_KCM) += kcm.o ++ ++kcm-y := kcmsock.o +diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c +new file mode 100644 +index 0000000..30ef69a +--- /dev/null ++++ b/net/kcm/kcmsock.c +@@ -0,0 +1,2016 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++unsigned int kcm_net_id; ++ ++static struct kmem_cache *kcm_psockp __read_mostly; ++static struct kmem_cache *kcm_muxp __read_mostly; ++static struct workqueue_struct *kcm_wq; ++ ++static inline struct kcm_sock *kcm_sk(const struct sock *sk) ++{ ++ return (struct kcm_sock *)sk; ++} ++ ++static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) ++{ ++ return (struct kcm_tx_msg *)skb->cb; ++} ++ ++static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) ++{ ++ return (struct kcm_rx_msg *)((void *)skb->cb + ++ offsetof(struct qdisc_skb_cb, data)); ++} ++ ++static void report_csk_error(struct sock *csk, int err) ++{ ++ csk->sk_err = EPIPE; ++ csk->sk_error_report(csk); ++} ++ ++/* Callback lock held */ ++static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, ++ struct sk_buff *skb) ++{ ++ struct sock *csk = psock->sk; ++ ++ /* Unrecoverable error in receive */ ++ ++ if (psock->rx_stopped) ++ return; ++ ++ psock->rx_stopped = 1; ++ ++ /* Report an error on the lower socket */ ++ report_csk_error(csk, err); ++} ++ ++static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, ++ bool wakeup_kcm) ++{ ++ struct sock *csk = psock->sk; ++ struct kcm_mux *mux = psock->mux; ++ ++ /* Unrecoverable error in transmit */ ++ ++ spin_lock_bh(&mux->lock); ++ ++ if (psock->tx_stopped) { ++ spin_unlock_bh(&mux->lock); ++ return; ++ } ++ ++ psock->tx_stopped = 1; ++ ++ 
if (!psock->tx_kcm) { ++ /* Take off psocks_avail list */ ++ list_del(&psock->psock_avail_list); ++ } else if (wakeup_kcm) { ++ /* In this case psock is being aborted while outside of ++ * write_msgs and psock is reserved. Schedule tx_work ++ * to handle the failure there. Need to commit tx_stopped ++ * before queuing work. ++ */ ++ smp_mb(); ++ ++ queue_work(kcm_wq, &psock->tx_kcm->tx_work); ++ } ++ ++ spin_unlock_bh(&mux->lock); ++ ++ /* Report error on lower socket */ ++ report_csk_error(csk, err); ++} ++ ++static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); ++ ++/* KCM is ready to receive messages on its queue-- either the KCM is new or ++ * has become unblocked after being blocked on full socket buffer. Queue any ++ * pending ready messages on a psock. RX mux lock held. ++ */ ++static void kcm_rcv_ready(struct kcm_sock *kcm) ++{ ++ struct kcm_mux *mux = kcm->mux; ++ struct kcm_psock *psock; ++ struct sk_buff *skb; ++ ++ if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) ++ return; ++ ++ while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { ++ if (kcm_queue_rcv_skb(&kcm->sk, skb)) { ++ /* Assuming buffer limit has been reached */ ++ skb_queue_head(&mux->rx_hold_queue, skb); ++ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); ++ return; ++ } ++ } ++ ++ while (!list_empty(&mux->psocks_ready)) { ++ psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, ++ psock_ready_list); ++ ++ if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { ++ /* Assuming buffer limit has been reached */ ++ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); ++ return; ++ } ++ ++ /* Consumed the ready message on the psock. Schedule rx_work to ++ * get more messages. ++ */ ++ list_del(&psock->psock_ready_list); ++ psock->ready_rx_msg = NULL; ++ ++ /* Commit clearing of ready_rx_msg for queuing work */ ++ smp_mb(); ++ ++ queue_work(kcm_wq, &psock->rx_work); ++ } ++ ++ /* Buffer limit is okay now, add to ready list */ ++ list_add_tail(&kcm->wait_rx_list, ++ &kcm->mux->kcm_rx_waiters); ++ kcm->rx_wait = true; ++} ++ ++static void kcm_rfree(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ struct kcm_sock *kcm = kcm_sk(sk); ++ struct kcm_mux *mux = kcm->mux; ++ unsigned int len = skb->truesize; ++ ++ sk_mem_uncharge(sk, len); ++ atomic_sub(len, &sk->sk_rmem_alloc); ++ ++ /* For reading rx_wait and rx_psock without holding lock */ ++ smp_mb__after_atomic(); ++ ++ if (!kcm->rx_wait && !kcm->rx_psock && ++ sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { ++ spin_lock_bh(&mux->rx_lock); ++ kcm_rcv_ready(kcm); ++ spin_unlock_bh(&mux->rx_lock); ++ } ++} ++ ++static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ struct sk_buff_head *list = &sk->sk_receive_queue; ++ ++ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) ++ return -ENOMEM; ++ ++ if (!sk_rmem_schedule(sk, skb, skb->truesize)) ++ return -ENOBUFS; ++ ++ skb->dev = NULL; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = kcm_rfree; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ sk_mem_charge(sk, skb->truesize); ++ ++ skb_queue_tail(list, skb); ++ ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk->sk_data_ready(sk); ++ ++ return 0; ++} ++ ++/* Requeue received messages for a kcm socket to other kcm sockets. This is ++ * called with a kcm socket is receive disabled. ++ * RX mux lock held. 
++ */ ++static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) ++{ ++ struct sk_buff *skb; ++ struct kcm_sock *kcm; ++ ++ while ((skb = __skb_dequeue(head))) { ++ /* Reset destructor to avoid calling kcm_rcv_ready */ ++ skb->destructor = sock_rfree; ++ skb_orphan(skb); ++try_again: ++ if (list_empty(&mux->kcm_rx_waiters)) { ++ skb_queue_tail(&mux->rx_hold_queue, skb); ++ continue; ++ } ++ ++ kcm = list_first_entry(&mux->kcm_rx_waiters, ++ struct kcm_sock, wait_rx_list); ++ ++ if (kcm_queue_rcv_skb(&kcm->sk, skb)) { ++ /* Should mean socket buffer full */ ++ list_del(&kcm->wait_rx_list); ++ kcm->rx_wait = false; ++ ++ /* Commit rx_wait to read in kcm_free */ ++ smp_wmb(); ++ ++ goto try_again; ++ } ++ } ++} ++ ++/* Lower sock lock held */ ++static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, ++ struct sk_buff *head) ++{ ++ struct kcm_mux *mux = psock->mux; ++ struct kcm_sock *kcm; ++ ++ WARN_ON(psock->ready_rx_msg); ++ ++ if (psock->rx_kcm) ++ return psock->rx_kcm; ++ ++ spin_lock_bh(&mux->rx_lock); ++ ++ if (psock->rx_kcm) { ++ spin_unlock_bh(&mux->rx_lock); ++ return psock->rx_kcm; ++ } ++ ++ if (list_empty(&mux->kcm_rx_waiters)) { ++ psock->ready_rx_msg = head; ++ list_add_tail(&psock->psock_ready_list, ++ &mux->psocks_ready); ++ spin_unlock_bh(&mux->rx_lock); ++ return NULL; ++ } ++ ++ kcm = list_first_entry(&mux->kcm_rx_waiters, ++ struct kcm_sock, wait_rx_list); ++ list_del(&kcm->wait_rx_list); ++ kcm->rx_wait = false; ++ ++ psock->rx_kcm = kcm; ++ kcm->rx_psock = psock; ++ ++ spin_unlock_bh(&mux->rx_lock); ++ ++ return kcm; ++} ++ ++static void kcm_done(struct kcm_sock *kcm); ++ ++static void kcm_done_work(struct work_struct *w) ++{ ++ kcm_done(container_of(w, struct kcm_sock, done_work)); ++} ++ ++/* Lower sock held */ ++static void unreserve_rx_kcm(struct kcm_psock *psock, ++ bool rcv_ready) ++{ ++ struct kcm_sock *kcm = psock->rx_kcm; ++ struct kcm_mux *mux = psock->mux; ++ ++ if (!kcm) ++ return; ++ ++ spin_lock_bh(&mux->rx_lock); ++ ++ psock->rx_kcm = NULL; ++ kcm->rx_psock = NULL; ++ ++ /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with ++ * kcm_rfree ++ */ ++ smp_mb(); ++ ++ if (unlikely(kcm->done)) { ++ spin_unlock_bh(&mux->rx_lock); ++ ++ /* Need to run kcm_done in a task since we need to qcquire ++ * callback locks which may already be held here. ++ */ ++ INIT_WORK(&kcm->done_work, kcm_done_work); ++ schedule_work(&kcm->done_work); ++ return; ++ } ++ ++ if (unlikely(kcm->rx_disabled)) { ++ requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); ++ } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { ++ /* Check for degenerative race with rx_wait that all ++ * data was dequeued (accounted for in kcm_rfree). ++ */ ++ kcm_rcv_ready(kcm); ++ } ++ spin_unlock_bh(&mux->rx_lock); ++} ++ ++/* Macro to invoke filter function. 
++ */
++#define KCM_RUN_FILTER(prog, ctx) \
++	(*prog->bpf_func)(ctx, prog->insnsi)
++
++/* Lower socket lock held */
++static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
++			unsigned int orig_offset, size_t orig_len)
++{
++	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
++	struct kcm_rx_msg *rxm;
++	struct kcm_sock *kcm;
++	struct sk_buff *head, *skb;
++	size_t eaten = 0, cand_len;
++	ssize_t extra;
++	int err;
++	bool cloned_orig = false;
++
++	if (psock->ready_rx_msg)
++		return 0;
++
++	head = psock->rx_skb_head;
++	if (head) {
++		/* Message already in progress */
++
++		if (unlikely(orig_offset)) {
++			/* Getting data with a non-zero offset when a message is
++			 * in progress is not expected. If it does happen, we
++			 * need to clone and pull since we can't deal with
++			 * offsets in the skbs for a message except in the head.
++			 */
++			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
++			if (!orig_skb) {
++				desc->error = -ENOMEM;
++				return 0;
++			}
++			if (!pskb_pull(orig_skb, orig_offset)) {
++				kfree_skb(orig_skb);
++				desc->error = -ENOMEM;
++				return 0;
++			}
++			cloned_orig = true;
++			orig_offset = 0;
++		}
++
++		if (!psock->rx_skb_nextp) {
++			/* We are going to append to the frags_list of head.
++			 * Need to unshare the frag_list.
++			 */
++			err = skb_unclone(head, GFP_ATOMIC);
++			if (err) {
++				desc->error = err;
++				return 0;
++			}
++
++			if (unlikely(skb_shinfo(head)->frag_list)) {
++				/* We can't append to an sk_buff that already
++				 * has a frag_list. We create a new head, point
++				 * the frag_list of that to the old head, and
++				 * then are able to use the old head->next for
++				 * appending to the message.
++				 */
++				if (WARN_ON(head->next)) {
++					desc->error = -EINVAL;
++					return 0;
++				}
++
++				skb = alloc_skb(0, GFP_ATOMIC);
++				if (!skb) {
++					desc->error = -ENOMEM;
++					return 0;
++				}
++				skb->len = head->len;
++				skb->data_len = head->len;
++				skb->truesize = head->truesize;
++				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
++				psock->rx_skb_nextp = &head->next;
++				skb_shinfo(skb)->frag_list = head;
++				psock->rx_skb_head = skb;
++				head = skb;
++			} else {
++				psock->rx_skb_nextp =
++					&skb_shinfo(head)->frag_list;
++			}
++		}
++	}
++
++	while (eaten < orig_len) {
++		/* Always clone since we will consume something */
++		skb = skb_clone(orig_skb, GFP_ATOMIC);
++		if (!skb) {
++			desc->error = -ENOMEM;
++			break;
++		}
++
++		cand_len = orig_len - eaten;
++
++		head = psock->rx_skb_head;
++		if (!head) {
++			head = skb;
++			psock->rx_skb_head = head;
++			/* Will set rx_skb_nextp on next packet if needed */
++			psock->rx_skb_nextp = NULL;
++			rxm = kcm_rx_msg(head);
++			memset(rxm, 0, sizeof(*rxm));
++			rxm->offset = orig_offset + eaten;
++		} else {
++			/* Unclone since we may be appending to an skb that we
++			 * already share a frag_list with.
++ */
++			err = skb_unclone(skb, GFP_ATOMIC);
++			if (err) {
++				desc->error = err;
++				break;
++			}
++
++			rxm = kcm_rx_msg(head);
++			*psock->rx_skb_nextp = skb;
++			psock->rx_skb_nextp = &skb->next;
++			head->data_len += skb->len;
++			head->len += skb->len;
++			head->truesize += skb->truesize;
++		}
++
++		if (!rxm->full_len) {
++			ssize_t len;
++
++			len = KCM_RUN_FILTER(psock->bpf_prog, head);
++
++			if (!len) {
++				/* Need more header to determine length */
++				rxm->accum_len += cand_len;
++				eaten += cand_len;
++				WARN_ON(eaten != orig_len);
++				break;
++			} else if (len <= (ssize_t)head->len -
++					  skb->len - rxm->offset) {
++				/* Length must be into new skb (and also
++				 * greater than zero)
++				 */
++				desc->error = -EPROTO;
++				psock->rx_skb_head = NULL;
++				kcm_abort_rx_psock(psock, EPROTO, head);
++				break;
++			}
++
++			rxm->full_len = len;
++		}
++
++		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
++
++		if (extra < 0) {
++			/* Message not complete yet. */
++			rxm->accum_len += cand_len;
++			eaten += cand_len;
++			WARN_ON(eaten != orig_len);
++			break;
++		}
++
++		/* Positive extra indicates more bytes than needed for the
++		 * message
++		 */
++
++		WARN_ON(extra > cand_len);
++
++		eaten += (cand_len - extra);
++
++		/* Hurray, we have a new message! */
++		psock->rx_skb_head = NULL;
++
++try_queue:
++		kcm = reserve_rx_kcm(psock, head);
++		if (!kcm) {
++			/* Unable to reserve a KCM, message is held in psock. */
++			break;
++		}
++
++		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
++			/* Should mean socket buffer full */
++			unreserve_rx_kcm(psock, false);
++			goto try_queue;
++		}
++	}
++
++	if (cloned_orig)
++		kfree_skb(orig_skb);
++
++	return eaten;
++}
++
++/* Called with lock held on lower socket */
++static int psock_tcp_read_sock(struct kcm_psock *psock)
++{
++	read_descriptor_t desc;
++
++	desc.arg.data = psock;
++	desc.error = 0;
++	desc.count = 1; /* give more than one skb per call */
++
++	/* sk should be locked here, so okay to do tcp_read_sock */
++	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
++
++	unreserve_rx_kcm(psock, true);
++
++	return desc.error;
++}
++
++/* Lower sock lock held */
++static void psock_tcp_data_ready(struct sock *sk)
++{
++	struct kcm_psock *psock;
++
++	read_lock_bh(&sk->sk_callback_lock);
++
++	psock = (struct kcm_psock *)sk->sk_user_data;
++	if (unlikely(!psock || psock->rx_stopped))
++		goto out;
++
++	if (psock->ready_rx_msg)
++		goto out;
++
++	if (psock_tcp_read_sock(psock) == -ENOMEM)
++		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
++
++out:
++	read_unlock_bh(&sk->sk_callback_lock);
++}
++
++static void do_psock_rx_work(struct kcm_psock *psock)
++{
++	read_descriptor_t rd_desc;
++	struct sock *csk = psock->sk;
++
++	/* We need the read lock to synchronize with psock_tcp_data_ready. We
++	 * need the socket lock for calling tcp_read_sock.
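++ *
++ * (Editorial sketch, not part of the original patch: the resulting
++ * lock order in this work function is
++ *
++ *	lock_sock(csk);				// process context
++ *	read_lock_bh(&csk->sk_callback_lock);	// pins sk_user_data
++ *	psock_tcp_read_sock(psock);		// -> tcp_read_sock()
++ *	read_unlock_bh(&csk->sk_callback_lock);
++ *	release_sock(csk);
++ *
++ * whereas psock_tcp_data_ready() above runs from the lower socket's
++ * data-ready callback and takes only the callback lock, the lower
++ * sock lock already being held by its caller.)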
++ */ ++ lock_sock(csk); ++ read_lock_bh(&csk->sk_callback_lock); ++ ++ if (unlikely(csk->sk_user_data != psock)) ++ goto out; ++ ++ if (unlikely(psock->rx_stopped)) ++ goto out; ++ ++ if (psock->ready_rx_msg) ++ goto out; ++ ++ rd_desc.arg.data = psock; ++ ++ if (psock_tcp_read_sock(psock) == -ENOMEM) ++ queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); ++ ++out: ++ read_unlock_bh(&csk->sk_callback_lock); ++ release_sock(csk); ++} ++ ++static void psock_rx_work(struct work_struct *w) ++{ ++ do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); ++} ++ ++static void psock_rx_delayed_work(struct work_struct *w) ++{ ++ do_psock_rx_work(container_of(w, struct kcm_psock, ++ rx_delayed_work.work)); ++} ++ ++static void psock_tcp_state_change(struct sock *sk) ++{ ++ /* TCP only does a POLLIN for a half close. Do a POLLHUP here ++ * since application will normally not poll with POLLIN ++ * on the TCP sockets. ++ */ ++ ++ report_csk_error(sk, EPIPE); ++} ++ ++static void psock_tcp_write_space(struct sock *sk) ++{ ++ struct kcm_psock *psock; ++ struct kcm_mux *mux; ++ struct kcm_sock *kcm; ++ ++ read_lock_bh(&sk->sk_callback_lock); ++ ++ psock = (struct kcm_psock *)sk->sk_user_data; ++ if (unlikely(!psock)) ++ goto out; ++ ++ mux = psock->mux; ++ ++ spin_lock_bh(&mux->lock); ++ ++ /* Check if the socket is reserved so someone is waiting for sending. */ ++ kcm = psock->tx_kcm; ++ if (kcm) ++ queue_work(kcm_wq, &kcm->tx_work); ++ ++ spin_unlock_bh(&mux->lock); ++out: ++ read_unlock_bh(&sk->sk_callback_lock); ++} ++ ++static void unreserve_psock(struct kcm_sock *kcm); ++ ++/* kcm sock is locked. */ ++static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) ++{ ++ struct kcm_mux *mux = kcm->mux; ++ struct kcm_psock *psock; ++ ++ psock = kcm->tx_psock; ++ ++ smp_rmb(); /* Must read tx_psock before tx_wait */ ++ ++ if (psock) { ++ WARN_ON(kcm->tx_wait); ++ if (unlikely(psock->tx_stopped)) ++ unreserve_psock(kcm); ++ else ++ return kcm->tx_psock; ++ } ++ ++ spin_lock_bh(&mux->lock); ++ ++ /* Check again under lock to see if psock was reserved for this ++ * psock via psock_unreserve. ++ */ ++ psock = kcm->tx_psock; ++ if (unlikely(psock)) { ++ WARN_ON(kcm->tx_wait); ++ spin_unlock_bh(&mux->lock); ++ return kcm->tx_psock; ++ } ++ ++ if (!list_empty(&mux->psocks_avail)) { ++ psock = list_first_entry(&mux->psocks_avail, ++ struct kcm_psock, ++ psock_avail_list); ++ list_del(&psock->psock_avail_list); ++ if (kcm->tx_wait) { ++ list_del(&kcm->wait_psock_list); ++ kcm->tx_wait = false; ++ } ++ kcm->tx_psock = psock; ++ psock->tx_kcm = kcm; ++ } else if (!kcm->tx_wait) { ++ list_add_tail(&kcm->wait_psock_list, ++ &mux->kcm_tx_waiters); ++ kcm->tx_wait = true; ++ } ++ ++ spin_unlock_bh(&mux->lock); ++ ++ return psock; ++} ++ ++/* mux lock held */ ++static void psock_now_avail(struct kcm_psock *psock) ++{ ++ struct kcm_mux *mux = psock->mux; ++ struct kcm_sock *kcm; ++ ++ if (list_empty(&mux->kcm_tx_waiters)) { ++ list_add_tail(&psock->psock_avail_list, ++ &mux->psocks_avail); ++ } else { ++ kcm = list_first_entry(&mux->kcm_tx_waiters, ++ struct kcm_sock, ++ wait_psock_list); ++ list_del(&kcm->wait_psock_list); ++ kcm->tx_wait = false; ++ psock->tx_kcm = kcm; ++ ++ /* Commit before changing tx_psock since that is read in ++ * reserve_psock before queuing work. ++ */ ++ smp_mb(); ++ ++ kcm->tx_psock = psock; ++ queue_work(kcm_wq, &kcm->tx_work); ++ } ++} ++ ++/* kcm sock is locked. 
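++ *
++ * (Editorial sketch, not part of the original patch: reserve_psock()
++ * above and unreserve_psock() below bracket transmission, roughly
++ *
++ *	psock = reserve_psock(kcm);	// NULL parks kcm on kcm_tx_waiters
++ *	if (psock) {
++ *		... transmit queued messages on psock->sk ...
++ *		unreserve_psock(kcm);	// psock_now_avail() or deferred free
++ *	}
++ *
++ * so each lower socket carries traffic for at most one KCM socket at a
++ * time, which is what preserves message framing on the TCP stream.)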
++ */
++static void unreserve_psock(struct kcm_sock *kcm)
++{
++	struct kcm_psock *psock;
++	struct kcm_mux *mux = kcm->mux;
++
++	spin_lock_bh(&mux->lock);
++
++	psock = kcm->tx_psock;
++
++	if (WARN_ON(!psock)) {
++		spin_unlock_bh(&mux->lock);
++		return;
++	}
++
++	smp_rmb(); /* Read tx_psock before tx_wait */
++
++	WARN_ON(kcm->tx_wait);
++
++	kcm->tx_psock = NULL;
++	psock->tx_kcm = NULL;
++
++	if (unlikely(psock->tx_stopped)) {
++		if (psock->done) {
++			/* Deferred free */
++			list_del(&psock->psock_list);
++			mux->psocks_cnt--;
++			sock_put(psock->sk);
++			fput(psock->sk->sk_socket->file);
++			kmem_cache_free(kcm_psockp, psock);
++		}
++
++		/* Don't put back on available list */
++
++		spin_unlock_bh(&mux->lock);
++
++		return;
++	}
++
++	psock_now_avail(psock);
++
++	spin_unlock_bh(&mux->lock);
++}
++
++/* Write any messages ready on the kcm socket. Called with kcm sock lock
++ * held. Return bytes actually sent or error.
++ */
++static int kcm_write_msgs(struct kcm_sock *kcm)
++{
++	struct sock *sk = &kcm->sk;
++	struct kcm_psock *psock;
++	struct sk_buff *skb, *head;
++	struct kcm_tx_msg *txm;
++	unsigned short fragidx, frag_offset;
++	unsigned int sent, total_sent = 0;
++	int ret = 0;
++
++	kcm->tx_wait_more = false;
++	psock = kcm->tx_psock;
++	if (unlikely(psock && psock->tx_stopped)) {
++		/* A reserved psock was aborted asynchronously. Unreserve
++		 * it and we'll retry the message.
++		 */
++		unreserve_psock(kcm);
++		if (skb_queue_empty(&sk->sk_write_queue))
++			return 0;
++
++		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
++
++	} else if (skb_queue_empty(&sk->sk_write_queue)) {
++		return 0;
++	}
++
++	head = skb_peek(&sk->sk_write_queue);
++	txm = kcm_tx_msg(head);
++
++	if (txm->sent) {
++		/* Send of first skbuff in queue already in progress */
++		if (WARN_ON(!psock)) {
++			ret = -EINVAL;
++			goto out;
++		}
++		sent = txm->sent;
++		frag_offset = txm->frag_offset;
++		fragidx = txm->fragidx;
++		skb = txm->frag_skb;
++
++		goto do_frag;
++	}
++
++try_again:
++	psock = reserve_psock(kcm);
++	if (!psock)
++		goto out;
++
++	do {
++		skb = head;
++		txm = kcm_tx_msg(head);
++		sent = 0;
++
++do_frag_list:
++		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
++			ret = -EINVAL;
++			goto out;
++		}
++
++		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
++		     fragidx++) {
++			skb_frag_t *frag;
++
++			frag_offset = 0;
++do_frag:
++			frag = &skb_shinfo(skb)->frags[fragidx];
++			if (WARN_ON(!frag->size)) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			ret = kernel_sendpage(psock->sk->sk_socket,
++					      frag->page.p,
++					      frag->page_offset + frag_offset,
++					      frag->size - frag_offset,
++					      MSG_DONTWAIT);
++			if (ret <= 0) {
++				if (ret == -EAGAIN) {
++					/* Save state to try again when there's
++					 * write space on the socket
++					 */
++					txm->sent = sent;
++					txm->frag_offset = frag_offset;
++					txm->fragidx = fragidx;
++					txm->frag_skb = skb;
++
++					ret = 0;
++					goto out;
++				}
++
++				/* Hard failure in sending message, abort this
++				 * psock since it has lost framing
++				 * synchronization and retry sending the
++				 * message from the beginning.
++				 */
++				kcm_abort_tx_psock(psock, ret ?
-ret : EPIPE, ++ true); ++ unreserve_psock(kcm); ++ ++ txm->sent = 0; ++ ret = 0; ++ ++ goto try_again; ++ } ++ ++ sent += ret; ++ frag_offset += ret; ++ if (frag_offset < frag->size) { ++ /* Not finished with this frag */ ++ goto do_frag; ++ } ++ } ++ ++ if (skb == head) { ++ if (skb_has_frag_list(skb)) { ++ skb = skb_shinfo(skb)->frag_list; ++ goto do_frag_list; ++ } ++ } else if (skb->next) { ++ skb = skb->next; ++ goto do_frag_list; ++ } ++ ++ /* Successfully sent the whole packet, account for it. */ ++ skb_dequeue(&sk->sk_write_queue); ++ kfree_skb(head); ++ sk->sk_wmem_queued -= sent; ++ total_sent += sent; ++ } while ((head = skb_peek(&sk->sk_write_queue))); ++out: ++ if (!head) { ++ /* Done with all queued messages. */ ++ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); ++ unreserve_psock(kcm); ++ } ++ ++ /* Check if write space is available */ ++ sk->sk_write_space(sk); ++ ++ return total_sent ? : ret; ++} ++ ++static void kcm_tx_work(struct work_struct *w) ++{ ++ struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); ++ struct sock *sk = &kcm->sk; ++ int err; ++ ++ lock_sock(sk); ++ ++ /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx ++ * aborts ++ */ ++ err = kcm_write_msgs(kcm); ++ if (err < 0) { ++ /* Hard failure in write, report error on KCM socket */ ++ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); ++ report_csk_error(&kcm->sk, -err); ++ goto out; ++ } ++ ++ /* Primarily for SOCK_SEQPACKET sockets */ ++ if (likely(sk->sk_socket) && ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { ++ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); ++ sk->sk_write_space(sk); ++ } ++ ++out: ++ release_sock(sk); ++} ++ ++static void kcm_push(struct kcm_sock *kcm) ++{ ++ if (kcm->tx_wait_more) ++ kcm_write_msgs(kcm); ++} ++ ++static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) ++{ ++ struct sock *sk = sock->sk; ++ struct kcm_sock *kcm = kcm_sk(sk); ++ struct sk_buff *skb = NULL, *head = NULL; ++ size_t copy, copied = 0; ++ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ++ int eor = (sock->type == SOCK_DGRAM) ? ++ !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); ++ int err = -EPIPE; ++ ++ lock_sock(sk); ++ ++ /* Per tcp_sendmsg this should be in poll */ ++ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); ++ ++ if (sk->sk_err) ++ goto out_error; ++ ++ if (kcm->seq_skb) { ++ /* Previously opened message */ ++ head = kcm->seq_skb; ++ skb = kcm_tx_msg(head)->last_skb; ++ goto start; ++ } ++ ++ /* Call the sk_stream functions to manage the sndbuf mem. */ ++ if (!sk_stream_memory_free(sk)) { ++ kcm_push(kcm); ++ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); ++ err = sk_stream_wait_memory(sk, &timeo); ++ if (err) ++ goto out_error; ++ } ++ ++ /* New message, alloc head skb */ ++ head = alloc_skb(0, sk->sk_allocation); ++ while (!head) { ++ kcm_push(kcm); ++ err = sk_stream_wait_memory(sk, &timeo); ++ if (err) ++ goto out_error; ++ ++ head = alloc_skb(0, sk->sk_allocation); ++ } ++ ++ skb = head; ++ ++ /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling ++ * csum_and_copy_from_iter from skb_do_copy_data_nocache. 
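++ *
++ * (Editorial sketch, not part of the original patch: on a
++ * SOCK_SEQPACKET KCM socket the eor/tx_wait_more logic below lets
++ * userspace build and batch messages; kcm_fd is a hypothetical
++ * connected KCM descriptor:
++ *
++ *	send(kcm_fd, hdr, hlen, 0);			// message stays open
++ *	send(kcm_fd, body, blen, MSG_EOR | MSG_BATCH);	// complete, defer flush
++ *	send(kcm_fd, next, nlen, MSG_EOR);		// complete and flush all
++ * )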
++ */ ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++start: ++ while (msg_data_left(msg)) { ++ bool merge = true; ++ int i = skb_shinfo(skb)->nr_frags; ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ goto wait_for_memory; ++ ++ if (!skb_can_coalesce(skb, i, pfrag->page, ++ pfrag->offset)) { ++ if (i == MAX_SKB_FRAGS) { ++ struct sk_buff *tskb; ++ ++ tskb = alloc_skb(0, sk->sk_allocation); ++ if (!tskb) ++ goto wait_for_memory; ++ ++ if (head == skb) ++ skb_shinfo(head)->frag_list = tskb; ++ else ++ skb->next = tskb; ++ ++ skb = tskb; ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ continue; ++ } ++ merge = false; ++ } ++ ++ copy = min_t(int, msg_data_left(msg), ++ pfrag->size - pfrag->offset); ++ ++ if (!sk_wmem_schedule(sk, copy)) ++ goto wait_for_memory; ++ ++ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, ++ pfrag->page, ++ pfrag->offset, ++ copy); ++ if (err) ++ goto out_error; ++ ++ /* Update the skb. */ ++ if (merge) { ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); ++ } else { ++ skb_fill_page_desc(skb, i, pfrag->page, ++ pfrag->offset, copy); ++ get_page(pfrag->page); ++ } ++ ++ pfrag->offset += copy; ++ copied += copy; ++ if (head != skb) { ++ head->len += copy; ++ head->data_len += copy; ++ } ++ ++ continue; ++ ++wait_for_memory: ++ kcm_push(kcm); ++ err = sk_stream_wait_memory(sk, &timeo); ++ if (err) ++ goto out_error; ++ } ++ ++ if (eor) { ++ bool not_busy = skb_queue_empty(&sk->sk_write_queue); ++ ++ /* Message complete, queue it on send buffer */ ++ __skb_queue_tail(&sk->sk_write_queue, head); ++ kcm->seq_skb = NULL; ++ ++ if (msg->msg_flags & MSG_BATCH) { ++ kcm->tx_wait_more = true; ++ } else if (kcm->tx_wait_more || not_busy) { ++ err = kcm_write_msgs(kcm); ++ if (err < 0) { ++ /* We got a hard error in write_msgs but have ++ * already queued this message. Report an error ++ * in the socket, but don't affect return value ++ * from sendmsg ++ */ ++ pr_warn("KCM: Hard failure on kcm_write_msgs\n"); ++ report_csk_error(&kcm->sk, -err); ++ } ++ } ++ } else { ++ /* Message not complete, save state */ ++partial_message: ++ kcm->seq_skb = head; ++ kcm_tx_msg(head)->last_skb = skb; ++ } ++ ++ release_sock(sk); ++ return copied; ++ ++out_error: ++ kcm_push(kcm); ++ ++ if (copied && sock->type == SOCK_SEQPACKET) { ++ /* Wrote some bytes before encountering an ++ * error, return partial success. 
++ */ ++ goto partial_message; ++ } ++ ++ if (head != kcm->seq_skb) ++ kfree_skb(head); ++ ++ err = sk_stream_error(sk, msg->msg_flags, err); ++ ++ /* make sure we wake any epoll edge trigger waiter */ ++ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) ++ sk->sk_write_space(sk); ++ ++ release_sock(sk); ++ return err; ++} ++ ++static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, ++ long timeo, int *err) ++{ ++ struct sk_buff *skb; ++ ++ while (!(skb = skb_peek(&sk->sk_receive_queue))) { ++ if (sk->sk_err) { ++ *err = sock_error(sk); ++ return NULL; ++ } ++ ++ if (sock_flag(sk, SOCK_DONE)) ++ return NULL; ++ ++ if ((flags & MSG_DONTWAIT) || !timeo) { ++ *err = -EAGAIN; ++ return NULL; ++ } ++ ++ sk_wait_data(sk, &timeo, NULL); ++ ++ /* Handle signals */ ++ if (signal_pending(current)) { ++ *err = sock_intr_errno(timeo); ++ return NULL; ++ } ++ } ++ ++ return skb; ++} ++ ++static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, ++ size_t len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ long timeo; ++ struct kcm_rx_msg *rxm; ++ int copied = 0; ++ struct sk_buff *skb; ++ ++ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); ++ ++ lock_sock(sk); ++ ++ skb = kcm_wait_data(sk, flags, timeo, &err); ++ if (!skb) ++ goto out; ++ ++ /* Okay, have a message on the receive queue */ ++ ++ rxm = kcm_rx_msg(skb); ++ ++ if (len > rxm->full_len) ++ len = rxm->full_len; ++ ++ err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); ++ if (err < 0) ++ goto out; ++ ++ copied = len; ++ if (likely(!(flags & MSG_PEEK))) { ++ if (copied < rxm->full_len) { ++ if (sock->type == SOCK_DGRAM) { ++ /* Truncated message */ ++ msg->msg_flags |= MSG_TRUNC; ++ goto msg_finished; ++ } ++ rxm->offset += copied; ++ rxm->full_len -= copied; ++ } else { ++msg_finished: ++ /* Finished with message */ ++ msg->msg_flags |= MSG_EOR; ++ skb_unlink(skb, &sk->sk_receive_queue); ++ kfree_skb(skb); ++ } ++ } ++ ++out: ++ release_sock(sk); ++ ++ return copied ? : err; ++} ++ ++/* kcm sock lock held */ ++static void kcm_recv_disable(struct kcm_sock *kcm) ++{ ++ struct kcm_mux *mux = kcm->mux; ++ ++ if (kcm->rx_disabled) ++ return; ++ ++ spin_lock_bh(&mux->rx_lock); ++ ++ kcm->rx_disabled = 1; ++ ++ /* If a psock is reserved we'll do cleanup in unreserve */ ++ if (!kcm->rx_psock) { ++ if (kcm->rx_wait) { ++ list_del(&kcm->wait_rx_list); ++ kcm->rx_wait = false; ++ } ++ ++ requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); ++ } ++ ++ spin_unlock_bh(&mux->rx_lock); ++} ++ ++/* kcm sock lock held */ ++static void kcm_recv_enable(struct kcm_sock *kcm) ++{ ++ struct kcm_mux *mux = kcm->mux; ++ ++ if (!kcm->rx_disabled) ++ return; ++ ++ spin_lock_bh(&mux->rx_lock); ++ ++ kcm->rx_disabled = 0; ++ kcm_rcv_ready(kcm); ++ ++ spin_unlock_bh(&mux->rx_lock); ++} ++ ++static int kcm_setsockopt(struct socket *sock, int level, int optname, ++ char __user *optval, unsigned int optlen) ++{ ++ struct kcm_sock *kcm = kcm_sk(sock->sk); ++ int val, valbool; ++ int err = 0; ++ ++ if (level != SOL_KCM) ++ return -ENOPROTOOPT; ++ ++ if (optlen < sizeof(int)) ++ return -EINVAL; ++ ++ if (get_user(val, (int __user *)optval)) ++ return -EINVAL; ++ ++ valbool = val ? 
1 : 0; ++ ++ switch (optname) { ++ case KCM_RECV_DISABLE: ++ lock_sock(&kcm->sk); ++ if (valbool) ++ kcm_recv_disable(kcm); ++ else ++ kcm_recv_enable(kcm); ++ release_sock(&kcm->sk); ++ break; ++ default: ++ err = -ENOPROTOOPT; ++ } ++ ++ return err; ++} ++ ++static int kcm_getsockopt(struct socket *sock, int level, int optname, ++ char __user *optval, int __user *optlen) ++{ ++ struct kcm_sock *kcm = kcm_sk(sock->sk); ++ int val, len; ++ ++ if (level != SOL_KCM) ++ return -ENOPROTOOPT; ++ ++ if (get_user(len, optlen)) ++ return -EFAULT; ++ ++ len = min_t(unsigned int, len, sizeof(int)); ++ if (len < 0) ++ return -EINVAL; ++ ++ switch (optname) { ++ case KCM_RECV_DISABLE: ++ val = kcm->rx_disabled; ++ break; ++ default: ++ return -ENOPROTOOPT; ++ } ++ ++ if (put_user(len, optlen)) ++ return -EFAULT; ++ if (copy_to_user(optval, &val, len)) ++ return -EFAULT; ++ return 0; ++} ++ ++static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) ++{ ++ struct kcm_sock *tkcm; ++ struct list_head *head; ++ int index = 0; ++ ++ /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so ++ * we set sk_state, otherwise epoll_wait always returns right away with ++ * POLLHUP ++ */ ++ kcm->sk.sk_state = TCP_ESTABLISHED; ++ ++ /* Add to mux's kcm sockets list */ ++ kcm->mux = mux; ++ spin_lock_bh(&mux->lock); ++ ++ head = &mux->kcm_socks; ++ list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { ++ if (tkcm->index != index) ++ break; ++ head = &tkcm->kcm_sock_list; ++ index++; ++ } ++ ++ list_add(&kcm->kcm_sock_list, head); ++ kcm->index = index; ++ ++ mux->kcm_socks_cnt++; ++ spin_unlock_bh(&mux->lock); ++ ++ INIT_WORK(&kcm->tx_work, kcm_tx_work); ++ ++ spin_lock_bh(&mux->rx_lock); ++ kcm_rcv_ready(kcm); ++ spin_unlock_bh(&mux->rx_lock); ++} ++ ++static int kcm_attach(struct socket *sock, struct socket *csock, ++ struct bpf_prog *prog) ++{ ++ struct kcm_sock *kcm = kcm_sk(sock->sk); ++ struct kcm_mux *mux = kcm->mux; ++ struct sock *csk; ++ struct kcm_psock *psock = NULL, *tpsock; ++ struct list_head *head; ++ int index = 0; ++ ++ if (csock->ops->family != PF_INET && ++ csock->ops->family != PF_INET6) ++ return -EINVAL; ++ ++ csk = csock->sk; ++ if (!csk) ++ return -EINVAL; ++ ++ /* Only support TCP for now */ ++ if (csk->sk_protocol != IPPROTO_TCP) ++ return -EINVAL; ++ ++ psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); ++ if (!psock) ++ return -ENOMEM; ++ ++ psock->mux = mux; ++ psock->sk = csk; ++ psock->bpf_prog = prog; ++ INIT_WORK(&psock->rx_work, psock_rx_work); ++ INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); ++ ++ sock_hold(csk); ++ ++ write_lock_bh(&csk->sk_callback_lock); ++ psock->save_data_ready = csk->sk_data_ready; ++ psock->save_write_space = csk->sk_write_space; ++ psock->save_state_change = csk->sk_state_change; ++ csk->sk_user_data = psock; ++ csk->sk_data_ready = psock_tcp_data_ready; ++ csk->sk_write_space = psock_tcp_write_space; ++ csk->sk_state_change = psock_tcp_state_change; ++ write_unlock_bh(&csk->sk_callback_lock); ++ ++ /* Finished initialization, now add the psock to the MUX. 
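++ *
++ * (Editorial sketch, not part of the original patch: the matching
++ * userspace call, where tcp_fd is a connected TCP socket and prog_fd a
++ * hypothetical BPF_PROG_TYPE_SOCKET_FILTER program that returns the
++ * length of the message at the head of the stream:
++ *
++ *	struct kcm_attach info = {
++ *		.fd = tcp_fd,
++ *		.bpf_fd = prog_fd,
++ *	};
++ *	ioctl(kcm_fd, SIOCKCMATTACH, &info);
++ * )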
*/ ++ spin_lock_bh(&mux->lock); ++ head = &mux->psocks; ++ list_for_each_entry(tpsock, &mux->psocks, psock_list) { ++ if (tpsock->index != index) ++ break; ++ head = &tpsock->psock_list; ++ index++; ++ } ++ ++ list_add(&psock->psock_list, head); ++ psock->index = index; ++ ++ mux->psocks_cnt++; ++ psock_now_avail(psock); ++ spin_unlock_bh(&mux->lock); ++ ++ /* Schedule RX work in case there are already bytes queued */ ++ queue_work(kcm_wq, &psock->rx_work); ++ ++ return 0; ++} ++ ++static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) ++{ ++ struct socket *csock; ++ struct bpf_prog *prog; ++ int err; ++ ++ csock = sockfd_lookup(info->fd, &err); ++ if (!csock) ++ return -ENOENT; ++ ++ prog = bpf_prog_get(info->bpf_fd); ++ if (IS_ERR(prog)) { ++ err = PTR_ERR(prog); ++ goto out; ++ } ++ ++ if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { ++ bpf_prog_put(prog); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = kcm_attach(sock, csock, prog); ++ if (err) { ++ bpf_prog_put(prog); ++ goto out; ++ } ++ ++ /* Keep reference on file also */ ++ ++ return 0; ++out: ++ fput(csock->file); ++ return err; ++} ++ ++static void kcm_unattach(struct kcm_psock *psock) ++{ ++ struct sock *csk = psock->sk; ++ struct kcm_mux *mux = psock->mux; ++ ++ /* Stop getting callbacks from TCP socket. After this there should ++ * be no way to reserve a kcm for this psock. ++ */ ++ write_lock_bh(&csk->sk_callback_lock); ++ csk->sk_user_data = NULL; ++ csk->sk_data_ready = psock->save_data_ready; ++ csk->sk_write_space = psock->save_write_space; ++ csk->sk_state_change = psock->save_state_change; ++ psock->rx_stopped = 1; ++ ++ if (WARN_ON(psock->rx_kcm)) { ++ write_unlock_bh(&csk->sk_callback_lock); ++ return; ++ } ++ ++ spin_lock_bh(&mux->rx_lock); ++ ++ /* Stop receiver activities. After this point psock should not be ++ * able to get onto ready list either through callbacks or work. ++ */ ++ if (psock->ready_rx_msg) { ++ list_del(&psock->psock_ready_list); ++ kfree_skb(psock->ready_rx_msg); ++ psock->ready_rx_msg = NULL; ++ } ++ ++ spin_unlock_bh(&mux->rx_lock); ++ ++ write_unlock_bh(&csk->sk_callback_lock); ++ ++ cancel_work_sync(&psock->rx_work); ++ cancel_delayed_work_sync(&psock->rx_delayed_work); ++ ++ bpf_prog_put(psock->bpf_prog); ++ ++ kfree_skb(psock->rx_skb_head); ++ psock->rx_skb_head = NULL; ++ ++ spin_lock_bh(&mux->lock); ++ ++ if (psock->tx_kcm) { ++ /* psock was reserved. Just mark it finished and we will clean ++ * up in the kcm paths, we need kcm lock which can not be ++ * acquired here. ++ */ ++ spin_unlock_bh(&mux->lock); ++ ++ /* We are unattaching a socket that is reserved. Abort the ++ * socket since we may be out of sync in sending on it. We need ++ * to do this without the mux lock. 
++ */ ++ kcm_abort_tx_psock(psock, EPIPE, false); ++ ++ spin_lock_bh(&mux->lock); ++ if (!psock->tx_kcm) { ++ /* psock now unreserved in window mux was unlocked */ ++ goto no_reserved; ++ } ++ psock->done = 1; ++ ++ /* Commit done before queuing work to process it */ ++ smp_mb(); ++ ++ /* Queue tx work to make sure psock->done is handled */ ++ queue_work(kcm_wq, &psock->tx_kcm->tx_work); ++ spin_unlock_bh(&mux->lock); ++ } else { ++no_reserved: ++ if (!psock->tx_stopped) ++ list_del(&psock->psock_avail_list); ++ list_del(&psock->psock_list); ++ mux->psocks_cnt--; ++ spin_unlock_bh(&mux->lock); ++ ++ sock_put(csk); ++ fput(csk->sk_socket->file); ++ kmem_cache_free(kcm_psockp, psock); ++ } ++} ++ ++static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) ++{ ++ struct kcm_sock *kcm = kcm_sk(sock->sk); ++ struct kcm_mux *mux = kcm->mux; ++ struct kcm_psock *psock; ++ struct socket *csock; ++ struct sock *csk; ++ int err; ++ ++ csock = sockfd_lookup(info->fd, &err); ++ if (!csock) ++ return -ENOENT; ++ ++ csk = csock->sk; ++ if (!csk) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = -ENOENT; ++ ++ spin_lock_bh(&mux->lock); ++ ++ list_for_each_entry(psock, &mux->psocks, psock_list) { ++ if (psock->sk != csk) ++ continue; ++ ++ /* Found the matching psock */ ++ ++ if (psock->unattaching || WARN_ON(psock->done)) { ++ err = -EALREADY; ++ break; ++ } ++ ++ psock->unattaching = 1; ++ ++ spin_unlock_bh(&mux->lock); ++ ++ kcm_unattach(psock); ++ ++ err = 0; ++ goto out; ++ } ++ ++ spin_unlock_bh(&mux->lock); ++ ++out: ++ fput(csock->file); ++ return err; ++} ++ ++static struct proto kcm_proto = { ++ .name = "KCM", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct kcm_sock), ++}; ++ ++/* Clone a kcm socket. */ ++static int kcm_clone(struct socket *osock, struct kcm_clone *info, ++ struct socket **newsockp) ++{ ++ struct socket *newsock; ++ struct sock *newsk; ++ struct file *newfile; ++ int err, newfd; ++ ++ err = -ENFILE; ++ newsock = sock_alloc(); ++ if (!newsock) ++ goto out; ++ ++ newsock->type = osock->type; ++ newsock->ops = osock->ops; ++ ++ __module_get(newsock->ops->owner); ++ ++ newfd = get_unused_fd_flags(0); ++ if (unlikely(newfd < 0)) { ++ err = newfd; ++ goto out_fd_fail; ++ } ++ ++ newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); ++ if (unlikely(IS_ERR(newfile))) { ++ err = PTR_ERR(newfile); ++ goto out_sock_alloc_fail; ++ } ++ ++ newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, ++ &kcm_proto, true); ++ if (!newsk) { ++ err = -ENOMEM; ++ goto out_sk_alloc_fail; ++ } ++ ++ sock_init_data(newsock, newsk); ++ init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); ++ ++ fd_install(newfd, newfile); ++ *newsockp = newsock; ++ info->fd = newfd; ++ ++ return 0; ++ ++out_sk_alloc_fail: ++ fput(newfile); ++out_sock_alloc_fail: ++ put_unused_fd(newfd); ++out_fd_fail: ++ sock_release(newsock); ++out: ++ return err; ++} ++ ++static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ switch (cmd) { ++ case SIOCKCMATTACH: { ++ struct kcm_attach info; ++ ++ if (copy_from_user(&info, (void __user *)arg, sizeof(info))) ++ err = -EFAULT; ++ ++ err = kcm_attach_ioctl(sock, &info); ++ ++ break; ++ } ++ case SIOCKCMUNATTACH: { ++ struct kcm_unattach info; ++ ++ if (copy_from_user(&info, (void __user *)arg, sizeof(info))) ++ err = -EFAULT; ++ ++ err = kcm_unattach_ioctl(sock, &info); ++ ++ break; ++ } ++ case SIOCKCMCLONE: { ++ struct kcm_clone info; ++ struct socket *newsock = NULL; ++ ++ if 
(copy_from_user(&info, (void __user *)arg, sizeof(info))) ++ err = -EFAULT; ++ ++ err = kcm_clone(sock, &info, &newsock); ++ ++ if (!err) { ++ if (copy_to_user((void __user *)arg, &info, ++ sizeof(info))) { ++ err = -EFAULT; ++ sock_release(newsock); ++ } ++ } ++ ++ break; ++ } ++ default: ++ err = -ENOIOCTLCMD; ++ break; ++ } ++ ++ return err; ++} ++ ++static void free_mux(struct rcu_head *rcu) ++{ ++ struct kcm_mux *mux = container_of(rcu, ++ struct kcm_mux, rcu); ++ ++ kmem_cache_free(kcm_muxp, mux); ++} ++ ++static void release_mux(struct kcm_mux *mux) ++{ ++ struct kcm_net *knet = mux->knet; ++ struct kcm_psock *psock, *tmp_psock; ++ ++ /* Release psocks */ ++ list_for_each_entry_safe(psock, tmp_psock, ++ &mux->psocks, psock_list) { ++ if (!WARN_ON(psock->unattaching)) ++ kcm_unattach(psock); ++ } ++ ++ if (WARN_ON(mux->psocks_cnt)) ++ return; ++ ++ __skb_queue_purge(&mux->rx_hold_queue); ++ ++ mutex_lock(&knet->mutex); ++ list_del_rcu(&mux->kcm_mux_list); ++ knet->count--; ++ mutex_unlock(&knet->mutex); ++ ++ call_rcu(&mux->rcu, free_mux); ++} ++ ++static void kcm_done(struct kcm_sock *kcm) ++{ ++ struct kcm_mux *mux = kcm->mux; ++ struct sock *sk = &kcm->sk; ++ int socks_cnt; ++ ++ spin_lock_bh(&mux->rx_lock); ++ if (kcm->rx_psock) { ++ /* Cleanup in unreserve_rx_kcm */ ++ WARN_ON(kcm->done); ++ kcm->rx_disabled = 1; ++ kcm->done = 1; ++ spin_unlock_bh(&mux->rx_lock); ++ return; ++ } ++ ++ if (kcm->rx_wait) { ++ list_del(&kcm->wait_rx_list); ++ kcm->rx_wait = false; ++ } ++ /* Move any pending receive messages to other kcm sockets */ ++ requeue_rx_msgs(mux, &sk->sk_receive_queue); ++ ++ spin_unlock_bh(&mux->rx_lock); ++ ++ if (WARN_ON(sk_rmem_alloc_get(sk))) ++ return; ++ ++ /* Detach from MUX */ ++ spin_lock_bh(&mux->lock); ++ ++ list_del(&kcm->kcm_sock_list); ++ mux->kcm_socks_cnt--; ++ socks_cnt = mux->kcm_socks_cnt; ++ ++ spin_unlock_bh(&mux->lock); ++ ++ if (!socks_cnt) { ++ /* We are done with the mux now. */ ++ release_mux(mux); ++ } ++ ++ WARN_ON(kcm->rx_wait); ++ ++ sock_put(&kcm->sk); ++} ++ ++/* Called by kcm_release to close a KCM socket. ++ * If this is the last KCM socket on the MUX, destroy the MUX. ++ */ ++static int kcm_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct kcm_sock *kcm; ++ struct kcm_mux *mux; ++ struct kcm_psock *psock; ++ ++ if (!sk) ++ return 0; ++ ++ kcm = kcm_sk(sk); ++ mux = kcm->mux; ++ ++ sock_orphan(sk); ++ kfree_skb(kcm->seq_skb); ++ ++ lock_sock(sk); ++ /* Purge queue under lock to avoid race condition with tx_work trying ++ * to act when queue is nonempty. If tx_work runs after this point ++ * it will just return. ++ */ ++ __skb_queue_purge(&sk->sk_write_queue); ++ release_sock(sk); ++ ++ spin_lock_bh(&mux->lock); ++ if (kcm->tx_wait) { ++ /* Take of tx_wait list, after this point there should be no way ++ * that a psock will be assigned to this kcm. ++ */ ++ list_del(&kcm->wait_psock_list); ++ kcm->tx_wait = false; ++ } ++ spin_unlock_bh(&mux->lock); ++ ++ /* Cancel work. After this point there should be no outside references ++ * to the kcm socket. ++ */ ++ cancel_work_sync(&kcm->tx_work); ++ ++ lock_sock(sk); ++ psock = kcm->tx_psock; ++ if (psock) { ++ /* A psock was reserved, so we need to kill it since it ++ * may already have some bytes queued from a message. We ++ * need to do this after removing kcm from tx_wait list. 
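++ *
++ * (Editorial note, not part of the original patch: the teardown order
++ * here is deliberate -- the kcm is first taken off kcm_tx_waiters and
++ * tx_work is cancelled, so nothing can re-reserve a psock for it, and
++ * only then is any still-reserved psock aborted and unreserved, e.g.
++ *
++ *	close(kcm_fd);	// -> kcm_release() -> kcm_done()
++ *
++ * Receive-side state is handed off in kcm_done(), which requeues any
++ * undelivered messages to sibling KCM sockets on the mux. Note also
++ * that the ioctl handlers above overwrite err after a failed
++ * copy_from_user(); later kernels return -EFAULT immediately there.)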
++ */ ++ kcm_abort_tx_psock(psock, EPIPE, false); ++ unreserve_psock(kcm); ++ } ++ release_sock(sk); ++ ++ WARN_ON(kcm->tx_wait); ++ WARN_ON(kcm->tx_psock); ++ ++ sock->sk = NULL; ++ ++ kcm_done(kcm); ++ ++ return 0; ++} ++ ++static const struct proto_ops kcm_ops = { ++ .family = PF_KCM, ++ .owner = THIS_MODULE, ++ .release = kcm_release, ++ .bind = sock_no_bind, ++ .connect = sock_no_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = sock_no_getname, ++ .poll = datagram_poll, ++ .ioctl = kcm_ioctl, ++ .listen = sock_no_listen, ++ .shutdown = sock_no_shutdown, ++ .setsockopt = kcm_setsockopt, ++ .getsockopt = kcm_getsockopt, ++ .sendmsg = kcm_sendmsg, ++ .recvmsg = kcm_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = sock_no_sendpage, ++}; ++ ++/* Create proto operation for kcm sockets */ ++static int kcm_create(struct net *net, struct socket *sock, ++ int protocol, int kern) ++{ ++ struct kcm_net *knet = net_generic(net, kcm_net_id); ++ struct sock *sk; ++ struct kcm_mux *mux; ++ ++ switch (sock->type) { ++ case SOCK_DGRAM: ++ case SOCK_SEQPACKET: ++ sock->ops = &kcm_ops; ++ break; ++ default: ++ return -ESOCKTNOSUPPORT; ++ } ++ ++ if (protocol != KCMPROTO_CONNECTED) ++ return -EPROTONOSUPPORT; ++ ++ sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); ++ if (!sk) ++ return -ENOMEM; ++ ++ /* Allocate a kcm mux, shared between KCM sockets */ ++ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); ++ if (!mux) { ++ sk_free(sk); ++ return -ENOMEM; ++ } ++ ++ spin_lock_init(&mux->lock); ++ spin_lock_init(&mux->rx_lock); ++ INIT_LIST_HEAD(&mux->kcm_socks); ++ INIT_LIST_HEAD(&mux->kcm_rx_waiters); ++ INIT_LIST_HEAD(&mux->kcm_tx_waiters); ++ ++ INIT_LIST_HEAD(&mux->psocks); ++ INIT_LIST_HEAD(&mux->psocks_ready); ++ INIT_LIST_HEAD(&mux->psocks_avail); ++ ++ mux->knet = knet; ++ ++ /* Add new MUX to list */ ++ mutex_lock(&knet->mutex); ++ list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); ++ knet->count++; ++ mutex_unlock(&knet->mutex); ++ ++ skb_queue_head_init(&mux->rx_hold_queue); ++ ++ /* Init KCM socket */ ++ sock_init_data(sock, sk); ++ init_kcm_sock(kcm_sk(sk), mux); ++ ++ return 0; ++} ++ ++static struct net_proto_family kcm_family_ops = { ++ .family = PF_KCM, ++ .create = kcm_create, ++ .owner = THIS_MODULE, ++}; ++ ++static __net_init int kcm_init_net(struct net *net) ++{ ++ struct kcm_net *knet = net_generic(net, kcm_net_id); ++ ++ INIT_LIST_HEAD_RCU(&knet->mux_list); ++ mutex_init(&knet->mutex); ++ ++ return 0; ++} ++ ++static __net_exit void kcm_exit_net(struct net *net) ++{ ++ struct kcm_net *knet = net_generic(net, kcm_net_id); ++ ++ /* All KCM sockets should be closed at this point, which should mean ++ * that all multiplexors and psocks have been destroyed. 
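++ *
++ * (Editorial sketch, not part of the original patch: each mux is
++ * created by kcm_create() above when userspace opens the first socket
++ * of a group:
++ *
++ *	int kcm_fd = socket(AF_KCM, SOCK_SEQPACKET, KCMPROTO_CONNECTED);
++ *
++ * and released again when the last KCM socket on it is closed, which
++ * is why an empty mux_list is asserted here.)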
++ */ ++ WARN_ON(!list_empty(&knet->mux_list)); ++} ++ ++static struct pernet_operations kcm_net_ops = { ++ .init = kcm_init_net, ++ .exit = kcm_exit_net, ++ .id = &kcm_net_id, ++ .size = sizeof(struct kcm_net), ++}; ++ ++static int __init kcm_init(void) ++{ ++ int err = -ENOMEM; ++ ++ kcm_muxp = kmem_cache_create("kcm_mux_cache", ++ sizeof(struct kcm_mux), 0, ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ++ if (!kcm_muxp) ++ goto fail; ++ ++ kcm_psockp = kmem_cache_create("kcm_psock_cache", ++ sizeof(struct kcm_psock), 0, ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ++ if (!kcm_psockp) ++ goto fail; ++ ++ kcm_wq = create_singlethread_workqueue("kkcmd"); ++ if (!kcm_wq) ++ goto fail; ++ ++ err = proto_register(&kcm_proto, 1); ++ if (err) ++ goto fail; ++ ++ err = sock_register(&kcm_family_ops); ++ if (err) ++ goto sock_register_fail; ++ ++ err = register_pernet_device(&kcm_net_ops); ++ if (err) ++ goto net_ops_fail; ++ ++ return 0; ++ ++net_ops_fail: ++ sock_unregister(PF_KCM); ++ ++sock_register_fail: ++ proto_unregister(&kcm_proto); ++ ++fail: ++ kmem_cache_destroy(kcm_muxp); ++ kmem_cache_destroy(kcm_psockp); ++ ++ if (kcm_wq) ++ destroy_workqueue(kcm_wq); ++ ++ return err; ++} ++ ++static void __exit kcm_exit(void) ++{ ++ unregister_pernet_device(&kcm_net_ops); ++ sock_unregister(PF_KCM); ++ proto_unregister(&kcm_proto); ++ destroy_workqueue(kcm_wq); ++ ++ kmem_cache_destroy(kcm_muxp); ++ kmem_cache_destroy(kcm_psockp); ++} ++ ++module_init(kcm_init); ++module_exit(kcm_exit); ++ ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_NETPROTO(PF_KCM); ++ +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0023-net-add-the-AF_KCM-entries-to-family-name-tables.patch b/alpine/kernel/patches/0023-net-add-the-AF_KCM-entries-to-family-name-tables.patch new file mode 100644 index 000000000..d6596ea66 --- /dev/null +++ b/alpine/kernel/patches/0023-net-add-the-AF_KCM-entries-to-family-name-tables.patch @@ -0,0 +1,52 @@ +From 4e7679280dd0ad8e28f9ebeea70127ed4385222a Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 21 Mar 2016 02:51:09 -0700 +Subject: [PATCH 23/25] net: add the AF_KCM entries to family name tables + +This is for the recent kcm driver, which introduces AF_KCM(41) in +b7ac4eb(kcm: Kernel Connection Multiplexor module). 
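+
+(Editorial note, not part of the original commit message: net/core/sock.c
+indexes these string tables by sk->sk_family when it initializes lockdep
+class names, roughly
+
+	sock_lock_init_class_and_name(sk,
+		af_family_slock_key_strings[sk->sk_family], ...);
+
+so a family without an entry would be reported under whatever string
+previously sat at its index. Hence every new AF_* value must extend all
+three tables.)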
+ +Signed-off-by: Dexuan Cui +Cc: Signed-off-by: Tom Herbert +Origin: https://patchwork.ozlabs.org/patch/600006 +--- + net/core/sock.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/net/core/sock.c b/net/core/sock.c +index 0d91f7d..925def4 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -263,7 +263,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , + "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , +- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" ++ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , ++ "sk_lock-AF_MAX" + }; + static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , +@@ -279,7 +280,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , + "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , +- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" ++ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , ++ "slock-AF_MAX" + }; + static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , +@@ -295,7 +297,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , + "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , +- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" ++ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , ++ "clock-AF_MAX" + }; + + /* +-- +2.8.0.rc3 + diff --git a/alpine/kernel/patches/0024-hv_sock-introduce-Hyper-V-Sockets.patch b/alpine/kernel/patches/0024-hv_sock-introduce-Hyper-V-Sockets.patch new file mode 100644 index 000000000..a35e8d4a9 --- /dev/null +++ b/alpine/kernel/patches/0024-hv_sock-introduce-Hyper-V-Sockets.patch @@ -0,0 +1,1730 @@ +From 65ca3b4d64bbe02b726a91e837605c6d082fb9b9 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Mon, 21 Mar 2016 02:52:49 -0700 +Subject: [PATCH 24/25] hv_sock: introduce Hyper-V Sockets + +Hyper-V Sockets (hv_sock) supplies a byte-stream based communication +mechanism between the host and the guest. It's somewhat like TCP over +VMBus, but the transportation layer (VMBus) is much simpler than IP. + +With Hyper-V Sockets, applications between the host and the guest can talk +to each other directly by the traditional BSD-style socket APIs. + +Hyper-V Sockets is only available on new Windows hosts, like Windows Server +2016. More info is in this article "Make your own integration services": +https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service + +The patch implements the necessary support in the guest side by introducing +a new socket address family AF_HYPERV. + +Signed-off-by: Dexuan Cui +Cc: "K. Y. 
Srinivasan" +Cc: Haiyang Zhang +Cc: Vitaly Kuznetsov +Origin: https://patchwork.ozlabs.org/patch/600008 +--- + MAINTAINERS | 2 + + include/linux/hyperv.h | 16 + + include/linux/socket.h | 5 +- + include/net/af_hvsock.h | 51 ++ + include/uapi/linux/hyperv.h | 16 + + net/Kconfig | 1 + + net/Makefile | 1 + + net/hv_sock/Kconfig | 10 + + net/hv_sock/Makefile | 3 + + net/hv_sock/af_hvsock.c | 1480 +++++++++++++++++++++++++++++++++++++++++++ + 10 files changed, 1583 insertions(+), 2 deletions(-) + create mode 100644 include/net/af_hvsock.h + create mode 100644 net/hv_sock/Kconfig + create mode 100644 net/hv_sock/Makefile + create mode 100644 net/hv_sock/af_hvsock.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 530bce8..3af3740 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -5136,7 +5136,9 @@ F: drivers/input/serio/hyperv-keyboard.c + F: drivers/net/hyperv/ + F: drivers/scsi/storvsc_drv.c + F: drivers/video/fbdev/hyperv_fb.c ++F: net/hv_sock/ + F: include/linux/hyperv.h ++F: include/net/af_hvsock.h + F: tools/hv/ + F: Documentation/ABI/stable/sysfs-bus-vmbus + +diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h +index af7ee0a..b78205e 100644 +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1314,4 +1314,20 @@ extern __u32 vmbus_proto_version; + + int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id, + const uuid_le *shv_host_servie_id); ++struct vmpipe_proto_header { ++ u32 pkt_type; ++ u32 data_size; ++} __packed; ++ ++#define HVSOCK_HEADER_LEN (sizeof(struct vmpacket_descriptor) + \ ++ sizeof(struct vmpipe_proto_header)) ++ ++/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */ ++#define PREV_INDICES_LEN (sizeof(u64)) ++ ++#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ ++ ALIGN((payload_len), 8) + \ ++ PREV_INDICES_LEN) ++#define HVSOCK_MIN_PKT_LEN HVSOCK_PKT_LEN(1) ++ + #endif /* _HYPERV_H */ +diff --git a/include/linux/socket.h b/include/linux/socket.h +index 4e1ea53..2a28c5f 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -201,8 +201,8 @@ struct ucred { + #define AF_NFC 39 /* NFC sockets */ + #define AF_VSOCK 40 /* vSockets */ + #define AF_KCM 41 /* Kernel Connection Multiplexor*/ +- +-#define AF_MAX 42 /* For now.. */ ++#define AF_HYPERV 42 /* Hyper-V Sockets */ ++#define AF_MAX 43 /* For now.. */ + + /* Protocol families, same as address families. */ + #define PF_UNSPEC AF_UNSPEC +@@ -249,6 +249,7 @@ struct ucred { + #define PF_NFC AF_NFC + #define PF_VSOCK AF_VSOCK + #define PF_KCM AF_KCM ++#define PF_HYPERV AF_HYPERV + #define PF_MAX AF_MAX + + /* Maximum queue length specifiable by listen. */ +diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h +new file mode 100644 +index 0000000..a5aa28d +--- /dev/null ++++ b/include/net/af_hvsock.h +@@ -0,0 +1,51 @@ ++#ifndef __AF_HVSOCK_H__ ++#define __AF_HVSOCK_H__ ++ ++#include ++#include ++#include ++ ++#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE) ++#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE) ++ ++#define HVSOCK_RCV_BUF_SZ VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV ++#define HVSOCK_SND_BUF_SZ PAGE_SIZE ++ ++#define sk_to_hvsock(__sk) ((struct hvsock_sock *)(__sk)) ++#define hvsock_to_sk(__hvsk) ((struct sock *)(__hvsk)) ++ ++struct hvsock_sock { ++ /* sk must be the first member. 
*/ ++ struct sock sk; ++ ++ struct sockaddr_hv local_addr; ++ struct sockaddr_hv remote_addr; ++ ++ /* protected by the global hvsock_mutex */ ++ struct list_head bound_list; ++ struct list_head connected_list; ++ ++ struct list_head accept_queue; ++ /* used by enqueue and dequeue */ ++ struct mutex accept_queue_mutex; ++ ++ struct delayed_work dwork; ++ ++ u32 peer_shutdown; ++ ++ struct vmbus_channel *channel; ++ ++ struct { ++ struct vmpipe_proto_header hdr; ++ char buf[HVSOCK_SND_BUF_SZ]; ++ } __packed send; ++ ++ struct { ++ struct vmpipe_proto_header hdr; ++ char buf[HVSOCK_RCV_BUF_SZ]; ++ unsigned int data_len; ++ unsigned int data_offset; ++ } __packed recv; ++}; ++ ++#endif /* __AF_HVSOCK_H__ */ +diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h +index e4c0a35..18ca18a 100644 +--- a/include/uapi/linux/hyperv.h ++++ b/include/uapi/linux/hyperv.h +@@ -26,6 +26,7 @@ + #define _UAPI_HYPERV_H + + #include ++#include + + /* + * Framework version for util services. +@@ -395,4 +396,19 @@ struct hv_kvp_ip_msg { + struct hv_kvp_ipaddr_value kvp_ip_val; + } __attribute__((packed)); + ++/* This is the Hyper-V socket's address format. */ ++struct sockaddr_hv { ++ __kernel_sa_family_t shv_family; /* Address family */ ++ __le16 reserved; /* Must be Zero */ ++ uuid_le shv_vm_id; /* Not used. Must be Zero. */ ++ uuid_le shv_service_id; /* Service ID */ ++}; ++ ++#define SHV_VMID_GUEST NULL_UUID_LE ++#define SHV_VMID_HOST NULL_UUID_LE ++ ++#define SHV_SERVICE_ID_ANY NULL_UUID_LE ++ ++#define SHV_PROTO_RAW 1 ++ + #endif /* _UAPI_HYPERV_H */ +diff --git a/net/Kconfig b/net/Kconfig +index b8439e6..ebc8f20 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -228,6 +228,7 @@ source "net/dns_resolver/Kconfig" + source "net/batman-adv/Kconfig" + source "net/openvswitch/Kconfig" + source "net/vmw_vsock/Kconfig" ++source "net/hv_sock/Kconfig" + source "net/netlink/Kconfig" + source "net/mpls/Kconfig" + source "net/hsr/Kconfig" +diff --git a/net/Makefile b/net/Makefile +index 81d1411..d115c31 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -70,6 +70,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ + obj-$(CONFIG_NFC) += nfc/ + obj-$(CONFIG_OPENVSWITCH) += openvswitch/ + obj-$(CONFIG_VSOCKETS) += vmw_vsock/ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock/ + obj-$(CONFIG_MPLS) += mpls/ + obj-$(CONFIG_HSR) += hsr/ + ifneq ($(CONFIG_NET_SWITCHDEV),) +diff --git a/net/hv_sock/Kconfig b/net/hv_sock/Kconfig +new file mode 100644 +index 0000000..1f41848 +--- /dev/null ++++ b/net/hv_sock/Kconfig +@@ -0,0 +1,10 @@ ++config HYPERV_SOCK ++ tristate "Hyper-V Sockets" ++ depends on HYPERV ++ default m if HYPERV ++ help ++ Hyper-V Sockets is somewhat like TCP over VMBus, allowing ++ communication between Linux guest and Hyper-V host without TCP/IP. ++ ++ To compile this driver as a module, choose M here: the module ++ will be called hv_sock. +diff --git a/net/hv_sock/Makefile b/net/hv_sock/Makefile +new file mode 100644 +index 0000000..716c012 +--- /dev/null ++++ b/net/hv_sock/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_HYPERV_SOCK) += hv_sock.o ++ ++hv_sock-y += af_hvsock.o +diff --git a/net/hv_sock/af_hvsock.c b/net/hv_sock/af_hvsock.c +new file mode 100644 +index 0000000..e5639eb +--- /dev/null ++++ b/net/hv_sock/af_hvsock.c +@@ -0,0 +1,1480 @@ ++/* ++ * Hyper-V Socket driver ++ * ++ * Copyright(c) 2016, Microsoft Corporation. All rights reserved. 
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. The name of the author may not be used to endorse or promote
++ *    products derived from this software without specific prior written
++ *    permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
++ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
++ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++ * POSSIBILITY OF SUCH DAMAGE.
++ */
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include
++#include
++#include
++
++static struct proto hvsock_proto = {
++	.name = "HV_SOCK",
++	.owner = THIS_MODULE,
++	.obj_size = sizeof(struct hvsock_sock),
++};
++
++#define SS_LISTEN 255
++
++static LIST_HEAD(hvsock_bound_list);
++static LIST_HEAD(hvsock_connected_list);
++static DEFINE_MUTEX(hvsock_mutex);
++
++static bool uuid_equals(uuid_le u1, uuid_le u2)
++{
++	return !uuid_le_cmp(u1, u2);
++}
++
++/* NOTE: hvsock_mutex must be held when the below helper functions, whose
++ * names begin with __hvsock, are invoked.
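++ *
++ * (Editorial sketch, not part of the original patch: callers follow
++ * the pattern
++ *
++ *	mutex_lock(&hvsock_mutex);
++ *	sk = __hvsock_find_bound_socket(&addr);
++ *	...
++ *	mutex_unlock(&hvsock_mutex);
++ *
++ * as in __hvsock_do_bind() and hvsock_open_connection() below.)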
++ */ ++static void __hvsock_insert_bound(struct list_head *list, ++ struct hvsock_sock *hvsk) ++{ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->bound_list, list); ++} ++ ++static void __hvsock_insert_connected(struct list_head *list, ++ struct hvsock_sock *hvsk) ++{ ++ sock_hold(&hvsk->sk); ++ list_add(&hvsk->connected_list, list); ++} ++ ++static void __hvsock_remove_bound(struct hvsock_sock *hvsk) ++{ ++ list_del_init(&hvsk->bound_list); ++ sock_put(&hvsk->sk); ++} ++ ++static void __hvsock_remove_connected(struct hvsock_sock *hvsk) ++{ ++ list_del_init(&hvsk->connected_list); ++ sock_put(&hvsk->sk); ++} ++ ++static struct sock *__hvsock_find_bound_socket(const struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_bound_list, bound_list) ++ if (uuid_equals(addr->shv_service_id, ++ hvsk->local_addr.shv_service_id)) ++ return hvsock_to_sk(hvsk); ++ return NULL; ++} ++ ++static struct sock *__hvsock_find_connected_socket_by_channel( ++ const struct vmbus_channel *channel) ++{ ++ struct hvsock_sock *hvsk; ++ ++ list_for_each_entry(hvsk, &hvsock_connected_list, connected_list) ++ if (hvsk->channel == channel) ++ return hvsock_to_sk(hvsk); ++ return NULL; ++} ++ ++static bool __hvsock_in_bound_list(struct hvsock_sock *hvsk) ++{ ++ return !list_empty(&hvsk->bound_list); ++} ++ ++static bool __hvsock_in_connected_list(struct hvsock_sock *hvsk) ++{ ++ return !list_empty(&hvsk->connected_list); ++} ++ ++static void hvsock_insert_connected(struct hvsock_sock *hvsk) ++{ ++ __hvsock_insert_connected(&hvsock_connected_list, hvsk); ++} ++ ++static ++void hvsock_enqueue_accept(struct sock *listener, struct sock *connected) ++{ ++ struct hvsock_sock *hvlistener; ++ struct hvsock_sock *hvconnected; ++ ++ hvlistener = sk_to_hvsock(listener); ++ hvconnected = sk_to_hvsock(connected); ++ ++ sock_hold(connected); ++ sock_hold(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); ++ list_add_tail(&hvconnected->accept_queue, &hvlistener->accept_queue); ++ listener->sk_ack_backlog++; ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++} ++ ++static struct sock *hvsock_dequeue_accept(struct sock *listener) ++{ ++ struct hvsock_sock *hvlistener; ++ struct hvsock_sock *hvconnected; ++ ++ hvlistener = sk_to_hvsock(listener); ++ ++ mutex_lock(&hvlistener->accept_queue_mutex); ++ ++ if (list_empty(&hvlistener->accept_queue)) { ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ return NULL; ++ } ++ ++ hvconnected = list_entry(hvlistener->accept_queue.next, ++ struct hvsock_sock, accept_queue); ++ ++ list_del_init(&hvconnected->accept_queue); ++ listener->sk_ack_backlog--; ++ ++ mutex_unlock(&hvlistener->accept_queue_mutex); ++ ++ sock_put(listener); ++ /* The caller will need a reference on the connected socket so we let ++ * it call sock_put(). 
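++ *
++ * (Editorial note, not part of the original patch: the reference taken
++ * in hvsock_enqueue_accept() is thus transferred to the caller, which
++ * is expected to end with
++ *
++ *	connected = hvsock_dequeue_accept(listener);
++ *	...
++ *	sock_put(connected);
++ *
++ * once it is done with the connected socket.)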
++ */ ++ ++ return hvsock_to_sk(hvconnected); ++} ++ ++static bool hvsock_is_accept_queue_empty(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ mutex_lock(&hvsk->accept_queue_mutex); ++ ret = list_empty(&hvsk->accept_queue); ++ mutex_unlock(&hvsk->accept_queue_mutex); ++ ++ return ret; ++} ++ ++static void hvsock_addr_init(struct sockaddr_hv *addr, uuid_le service_id) ++{ ++ memset(addr, 0, sizeof(*addr)); ++ addr->shv_family = AF_HYPERV; ++ addr->shv_service_id = service_id; ++} ++ ++static int hvsock_addr_validate(const struct sockaddr_hv *addr) ++{ ++ if (!addr) ++ return -EFAULT; ++ ++ if (addr->shv_family != AF_HYPERV) ++ return -EAFNOSUPPORT; ++ ++ if (addr->reserved != 0) ++ return -EINVAL; ++ ++ if (!uuid_equals(addr->shv_vm_id, NULL_UUID_LE)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool hvsock_addr_bound(const struct sockaddr_hv *addr) ++{ ++ return !uuid_equals(addr->shv_service_id, SHV_SERVICE_ID_ANY); ++} ++ ++static int hvsock_addr_cast(const struct sockaddr *addr, size_t len, ++ struct sockaddr_hv **out_addr) ++{ ++ if (len < sizeof(**out_addr)) ++ return -EFAULT; ++ ++ *out_addr = (struct sockaddr_hv *)addr; ++ return hvsock_addr_validate(*out_addr); ++} ++ ++static int __hvsock_do_bind(struct hvsock_sock *hvsk, ++ struct sockaddr_hv *addr) ++{ ++ struct sockaddr_hv hv_addr; ++ int ret = 0; ++ ++ hvsock_addr_init(&hv_addr, addr->shv_service_id); ++ ++ mutex_lock(&hvsock_mutex); ++ ++ if (uuid_equals(addr->shv_service_id, SHV_SERVICE_ID_ANY)) { ++ do { ++ uuid_le_gen(&hv_addr.shv_service_id); ++ } while (__hvsock_find_bound_socket(&hv_addr)); ++ } else { ++ if (__hvsock_find_bound_socket(&hv_addr)) { ++ ret = -EADDRINUSE; ++ goto out; ++ } ++ } ++ ++ hvsock_addr_init(&hvsk->local_addr, hv_addr.shv_service_id); ++ __hvsock_insert_bound(&hvsock_bound_list, hvsk); ++ ++out: ++ mutex_unlock(&hvsock_mutex); ++ ++ return ret; ++} ++ ++static int __hvsock_bind(struct sock *sk, struct sockaddr_hv *addr) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ int ret; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return -EINVAL; ++ ++ switch (sk->sk_socket->type) { ++ case SOCK_STREAM: ++ ret = __hvsock_do_bind(hvsk, addr); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++/* Autobind this socket to the local address if necessary. */ ++static int hvsock_auto_bind(struct hvsock_sock *hvsk) ++{ ++ struct sock *sk = hvsock_to_sk(hvsk); ++ struct sockaddr_hv local_addr; ++ ++ if (hvsock_addr_bound(&hvsk->local_addr)) ++ return 0; ++ hvsock_addr_init(&local_addr, SHV_SERVICE_ID_ANY); ++ return __hvsock_bind(sk, &local_addr); ++} ++ ++static void hvsock_sk_destruct(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk = sk_to_hvsock(sk); ++ struct vmbus_channel *channel = hvsk->channel; ++ ++ if (!channel) ++ return; ++ ++ vmbus_hvsock_device_unregister(channel); ++} ++ ++static void __hvsock_release(struct sock *sk) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *pending; ++ ++ hvsk = sk_to_hvsock(sk); ++ ++ mutex_lock(&hvsock_mutex); ++ if (__hvsock_in_bound_list(hvsk)) ++ __hvsock_remove_bound(hvsk); ++ ++ if (__hvsock_in_connected_list(hvsk)) ++ __hvsock_remove_connected(hvsk); ++ mutex_unlock(&hvsock_mutex); ++ ++ lock_sock(sk); ++ sock_orphan(sk); ++ sk->sk_shutdown = SHUTDOWN_MASK; ++ ++ /* Clean up any sockets that never were accepted. 
*/ ++ while ((pending = hvsock_dequeue_accept(sk)) != NULL) { ++ __hvsock_release(pending); ++ sock_put(pending); ++ } ++ ++ release_sock(sk); ++ sock_put(sk); ++} ++ ++static int hvsock_release(struct socket *sock) ++{ ++ /* If accept() is interrupted by a signal, the temporary socket ++ * struct's sock->sk is NULL. ++ */ ++ if (sock->sk) { ++ __hvsock_release(sock->sk); ++ sock->sk = NULL; ++ } ++ ++ sock->state = SS_FREE; ++ return 0; ++} ++ ++static struct sock *__hvsock_create(struct net *net, struct socket *sock, ++ gfp_t priority, unsigned short type) ++{ ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ ++ sk = sk_alloc(net, AF_HYPERV, priority, &hvsock_proto, 0); ++ if (!sk) ++ return NULL; ++ ++ sock_init_data(sock, sk); ++ ++ /* sk->sk_type is normally set in sock_init_data, but only if sock is ++ * non-NULL. We make sure that our sockets always have a type by ++ * setting it here if needed. ++ */ ++ if (!sock) ++ sk->sk_type = type; ++ ++ hvsk = sk_to_hvsock(sk); ++ hvsock_addr_init(&hvsk->local_addr, SHV_SERVICE_ID_ANY); ++ hvsock_addr_init(&hvsk->remote_addr, SHV_SERVICE_ID_ANY); ++ ++ sk->sk_destruct = hvsock_sk_destruct; ++ ++ /* Looks stream-based socket doesn't need this. */ ++ sk->sk_backlog_rcv = NULL; ++ ++ sk->sk_state = 0; ++ sock_reset_flag(sk, SOCK_DONE); ++ ++ INIT_LIST_HEAD(&hvsk->bound_list); ++ INIT_LIST_HEAD(&hvsk->connected_list); ++ ++ INIT_LIST_HEAD(&hvsk->accept_queue); ++ mutex_init(&hvsk->accept_queue_mutex); ++ ++ hvsk->peer_shutdown = 0; ++ ++ hvsk->recv.data_len = 0; ++ hvsk->recv.data_offset = 0; ++ ++ return sk; ++} ++ ++static int hvsock_bind(struct socket *sock, struct sockaddr *addr, ++ int addr_len) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ ++ if (hvsock_addr_cast(addr, addr_len, &hv_addr) != 0) ++ return -EINVAL; ++ ++ lock_sock(sk); ++ ret = __hvsock_bind(sk, hv_addr); ++ release_sock(sk); ++ ++ return ret; ++} ++ ++static int hvsock_getname(struct socket *sock, ++ struct sockaddr *addr, int *addr_len, int peer) ++{ ++ struct sockaddr_hv *hv_addr; ++ struct hvsock_sock *hvsk; ++ struct sock *sk; ++ int ret; ++ ++ sk = sock->sk; ++ hvsk = sk_to_hvsock(sk); ++ ret = 0; ++ ++ lock_sock(sk); ++ ++ if (peer) { ++ if (sock->state != SS_CONNECTED) { ++ ret = -ENOTCONN; ++ goto out; ++ } ++ hv_addr = &hvsk->remote_addr; ++ } else { ++ hv_addr = &hvsk->local_addr; ++ } ++ ++ __sockaddr_check_size(sizeof(*hv_addr)); ++ ++ memcpy(addr, hv_addr, sizeof(*hv_addr)); ++ *addr_len = sizeof(*hv_addr); ++ ++out: ++ release_sock(sk); ++ return ret; ++} ++ ++static int hvsock_shutdown(struct socket *sock, int mode) ++{ ++ struct sock *sk; ++ ++ if (mode < SHUT_RD || mode > SHUT_RDWR) ++ return -EINVAL; ++ /* This maps: ++ * SHUT_RD (0) -> RCV_SHUTDOWN (1) ++ * SHUT_WR (1) -> SEND_SHUTDOWN (2) ++ * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) ++ */ ++ ++mode; ++ ++ if (sock->state == SS_UNCONNECTED) ++ return -ENOTCONN; ++ ++ sock->state = SS_DISCONNECTING; ++ ++ sk = sock->sk; ++ ++ lock_sock(sk); ++ ++ sk->sk_shutdown |= mode; ++ sk->sk_state_change(sk); ++ ++ /* TODO: how to send a FIN if we haven't done that? 
++	if (mode & SEND_SHUTDOWN)
++		;
++
++	release_sock(sk);
++
++	return 0;
++}
++
++static void get_ringbuffer_rw_status(struct vmbus_channel *channel,
++				     bool *can_read, bool *can_write)
++{
++	u32 avl_read_bytes, avl_write_bytes, dummy;
++
++	if (can_read) {
++		hv_get_ringbuffer_availbytes(&channel->inbound,
++					     &avl_read_bytes,
++					     &dummy);
++		*can_read = avl_read_bytes >= HVSOCK_MIN_PKT_LEN;
++	}
++
++	/* We write into the ringbuffer only when we're able to write a
++	 * payload of 4096 bytes (the actual written payload's length may
++	 * be less than 4096).
++	 */
++	if (can_write) {
++		hv_get_ringbuffer_availbytes(&channel->outbound,
++					     &dummy,
++					     &avl_write_bytes);
++		*can_write = avl_write_bytes > HVSOCK_PKT_LEN(PAGE_SIZE);
++	}
++}
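++
++/* (A short worked example of the write threshold above: even a 1-byte
++ * sendmsg() waits until more than HVSOCK_PKT_LEN(PAGE_SIZE) bytes are
++ * free in the outbound ring, i.e. room for a header plus a full
++ * 4096-byte payload, so a packet never has to be written out in
++ * pieces.)
++ */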
++
++static unsigned int hvsock_poll(struct file *file, struct socket *sock,
++				poll_table *wait)
++{
++	struct vmbus_channel *channel;
++	bool can_read, can_write;
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++	unsigned int mask;
++
++	sk = sock->sk;
++	hvsk = sk_to_hvsock(sk);
++
++	poll_wait(file, sk_sleep(sk), wait);
++	mask = 0;
++
++	if (sk->sk_err)
++		/* Signify that there has been an error on this socket. */
++		mask |= POLLERR;
++
++	/* INET sockets treat local write shutdown and peer write shutdown as a
++	 * case of POLLHUP set.
++	 */
++	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
++	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
++	     (hvsk->peer_shutdown & SEND_SHUTDOWN))) {
++		mask |= POLLHUP;
++	}
++
++	if (sk->sk_shutdown & RCV_SHUTDOWN ||
++	    hvsk->peer_shutdown & SEND_SHUTDOWN) {
++		mask |= POLLRDHUP;
++	}
++
++	lock_sock(sk);
++
++	/* Listening sockets that have connections in their accept
++	 * queue can be read.
++	 */
++	if (sk->sk_state == SS_LISTEN && !hvsock_is_accept_queue_empty(sk))
++		mask |= POLLIN | POLLRDNORM;
++
++	/* The mutex is to guard against hvsock_open_connection() */
++	mutex_lock(&hvsock_mutex);
++
++	channel = hvsk->channel;
++	if (channel) {
++		/* If there is something in the queue then we can read */
++		get_ringbuffer_rw_status(channel, &can_read, &can_write);
++
++		if (!can_read && hvsk->recv.data_len > 0)
++			can_read = true;
++
++		if (!(sk->sk_shutdown & RCV_SHUTDOWN) && can_read)
++			mask |= POLLIN | POLLRDNORM;
++	} else {
++		can_read = false;
++		can_write = false;
++	}
++
++	mutex_unlock(&hvsock_mutex);
++
++	/* Sockets whose connections have been closed or terminated should
++	 * also be considered readable, and we check the shutdown flag for
++	 * that.
++	 */
++	if (sk->sk_shutdown & RCV_SHUTDOWN ||
++	    hvsk->peer_shutdown & SEND_SHUTDOWN) {
++		mask |= POLLIN | POLLRDNORM;
++	}
++
++	/* Connected sockets that can produce data can be written. */
++	if (sk->sk_state == SS_CONNECTED && can_write &&
++	    !(sk->sk_shutdown & SEND_SHUTDOWN)) {
++		/* Remove POLLWRBAND since INET sockets do not set it.
++		 */
++		mask |= POLLOUT | POLLWRNORM;
++	}
++
++	/* Simulate INET socket poll behavior, which sets
++	 * POLLOUT | POLLWRNORM when the peer is closed and there is
++	 * nothing to read, but local send is not shut down.
++	 */
++	if (sk->sk_state == SS_UNCONNECTED &&
++	    !(sk->sk_shutdown & SEND_SHUTDOWN))
++		mask |= POLLOUT | POLLWRNORM;
++
++	release_sock(sk);
++
++	return mask;
++}
++
++/* This function runs in the tasklet context of process_chn_event() */
++static void hvsock_on_channel_cb(void *ctx)
++{
++	struct sock *sk = (struct sock *)ctx;
++	struct hvsock_sock *hvsk = sk_to_hvsock(sk);
++	struct vmbus_channel *channel = hvsk->channel;
++	bool can_read, can_write;
++
++	if (!channel) {
++		WARN_ONCE(1, "NULL channel! There is a programming bug.\n");
++		return;
++	}
++
++	get_ringbuffer_rw_status(channel, &can_read, &can_write);
++
++	if (can_read)
++		sk->sk_data_ready(sk);
++
++	if (can_write)
++		sk->sk_write_space(sk);
++}
++
++static void hvsock_close_connection(struct vmbus_channel *channel)
++{
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++
++	mutex_lock(&hvsock_mutex);
++
++	sk = __hvsock_find_connected_socket_by_channel(channel);
++
++	/* The guest has already closed the connection? */
++	if (!sk)
++		goto out;
++
++	sk->sk_socket->state = SS_UNCONNECTED;
++	sk->sk_state = SS_UNCONNECTED;
++	sock_set_flag(sk, SOCK_DONE);
++
++	hvsk = sk_to_hvsock(sk);
++	hvsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN;
++
++	sk->sk_state_change(sk);
++out:
++	mutex_unlock(&hvsock_mutex);
++}
++
++static int hvsock_open_connection(struct vmbus_channel *channel)
++{
++	struct hvsock_sock *hvsk, *new_hvsk;
++	struct sockaddr_hv hv_addr;
++	struct sock *sk, *new_sk;
++
++	uuid_le *instance, *service_id;
++	int ret;
++
++	instance = &channel->offermsg.offer.if_instance;
++	service_id = &channel->offermsg.offer.if_type;
++
++	hvsock_addr_init(&hv_addr, *instance);
++
++	mutex_lock(&hvsock_mutex);
++
++	sk = __hvsock_find_bound_socket(&hv_addr);
++
++	if (sk) {
++		/* It is from the guest client's connect() */
++		if (sk->sk_state != SS_CONNECTING) {
++			ret = -ENXIO;
++			goto out;
++		}
++
++		hvsk = sk_to_hvsock(sk);
++		hvsk->channel = channel;
++		set_channel_read_state(channel, false);
++		vmbus_set_chn_rescind_callback(channel,
++					       hvsock_close_connection);
++		ret = vmbus_open(channel, VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND,
++				 VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV, NULL, 0,
++				 hvsock_on_channel_cb, sk);
++		if (ret != 0) {
++			hvsk->channel = NULL;
++			goto out;
++		}
++
++		set_channel_pending_send_size(channel,
++					      HVSOCK_PKT_LEN(PAGE_SIZE));
++		sk->sk_state = SS_CONNECTED;
++		sk->sk_socket->state = SS_CONNECTED;
++		hvsock_insert_connected(hvsk);
++		sk->sk_state_change(sk);
++		goto out;
++	}
++
++	/* Now we assume it is from a host client's connect() */
++	hvsock_addr_init(&hv_addr, *service_id);
++	sk = __hvsock_find_bound_socket(&hv_addr);
++
++	/* No guest server listening? Well, let's ignore the offer */
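++	/* (A guest-initiated connect() was handled above by matching the
++	 * offer's if_instance GUID against a bound, connecting socket;
++	 * here the offer's if_type, i.e. the service ID, must match a
++	 * bound, listening socket.)
++	 */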
++	if (!sk || sk->sk_state != SS_LISTEN) {
++		ret = -ENXIO;
++		goto out;
++	}
++
++	if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) {
++		ret = -EMFILE;
++		goto out;
++	}
++
++	new_sk = __hvsock_create(sock_net(sk), NULL, GFP_KERNEL, sk->sk_type);
++	if (!new_sk) {
++		ret = -ENOMEM;
++		goto out;
++	}
++
++	new_hvsk = sk_to_hvsock(new_sk);
++	new_sk->sk_state = SS_CONNECTING;
++	hvsock_addr_init(&new_hvsk->local_addr, *service_id);
++	hvsock_addr_init(&new_hvsk->remote_addr, *instance);
++
++	set_channel_read_state(channel, false);
++	new_hvsk->channel = channel;
++	vmbus_set_chn_rescind_callback(channel, hvsock_close_connection);
++	ret = vmbus_open(channel, VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND,
++			 VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV, NULL, 0,
++			 hvsock_on_channel_cb, new_sk);
++	if (ret != 0) {
++		new_hvsk->channel = NULL;
++		sock_put(new_sk);
++		goto out;
++	}
++	set_channel_pending_send_size(channel, HVSOCK_PKT_LEN(PAGE_SIZE));
++
++	new_sk->sk_state = SS_CONNECTED;
++	hvsock_insert_connected(new_hvsk);
++	hvsock_enqueue_accept(sk, new_sk);
++	sk->sk_state_change(sk);
++out:
++	mutex_unlock(&hvsock_mutex);
++	return ret;
++}
++
++static void hvsock_connect_timeout(struct work_struct *work)
++{
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++
++	hvsk = container_of(work, struct hvsock_sock, dwork.work);
++	sk = hvsock_to_sk(hvsk);
++
++	lock_sock(sk);
++	if ((sk->sk_state == SS_CONNECTING) &&
++	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
++		sk->sk_state = SS_UNCONNECTED;
++		sk->sk_err = ETIMEDOUT;
++		sk->sk_error_report(sk);
++	}
++	release_sock(sk);
++
++	sock_put(sk);
++}
++
++static int hvsock_connect(struct socket *sock, struct sockaddr *addr,
++			  int addr_len, int flags)
++{
++	struct sockaddr_hv *remote_addr;
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++
++	DEFINE_WAIT(wait);
++	long timeout;
++
++	int ret = 0;
++
++	sk = sock->sk;
++	hvsk = sk_to_hvsock(sk);
++
++	lock_sock(sk);
++
++	switch (sock->state) {
++	case SS_CONNECTED:
++		ret = -EISCONN;
++		goto out;
++	case SS_DISCONNECTING:
++		ret = -EINVAL;
++		goto out;
++	case SS_CONNECTING:
++		/* This continues on so we can move sock into the SS_CONNECTED
++		 * state once the connection has completed (at which point err
++		 * will be set to zero also). Otherwise, we will either wait
++		 * for the connection or return -EALREADY should this be a
++		 * non-blocking call.
++		 */
++		ret = -EALREADY;
++		break;
++	default:
++		if ((sk->sk_state == SS_LISTEN) ||
++		    hvsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
++			ret = -EINVAL;
++			goto out;
++		}
++
++		/* Set the remote address that we are connecting to. */
++		memcpy(&hvsk->remote_addr, remote_addr,
++		       sizeof(hvsk->remote_addr));
++
++		ret = hvsock_auto_bind(hvsk);
++		if (ret)
++			goto out;
++
++		sk->sk_state = SS_CONNECTING;
++
++		ret = vmbus_send_tl_connect_request(
++					&hvsk->local_addr.shv_service_id,
++					&hvsk->remote_addr.shv_service_id);
++		if (ret < 0)
++			goto out;
++
++		/* Mark sock as connecting and set the error code to in
++		 * progress in case this is a non-blocking connect.
++		 */
++		sock->state = SS_CONNECTING;
++		ret = -EINPROGRESS;
++	}
++
++	/* The receive path will handle all communication until we are able to
++	 * enter the connected state. Here we wait for the connection to be
++	 * completed or a notification of an error.
++	 */
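++	/* (Note: the connect timeout below is a fixed 30 seconds rather
++	 * than the socket's sock_sndtimeo() value.)
++	 */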
++	timeout = 30 * HZ;
++	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++
++	while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) {
++		if (flags & O_NONBLOCK) {
++			/* If we're not going to block, we schedule a timeout
++			 * function to generate a timeout on the connection
++			 * attempt, in case the peer doesn't respond in a
++			 * timely manner. We hold on to the socket until the
++			 * timeout fires.
++			 */
++			sock_hold(sk);
++			INIT_DELAYED_WORK(&hvsk->dwork,
++					  hvsock_connect_timeout);
++			schedule_delayed_work(&hvsk->dwork, timeout);
++
++			/* Skip ahead to preserve error code set above. */
++			goto out_wait;
++		}
++
++		release_sock(sk);
++		timeout = schedule_timeout(timeout);
++		lock_sock(sk);
++
++		if (signal_pending(current)) {
++			ret = sock_intr_errno(timeout);
++			goto out_wait_error;
++		} else if (timeout == 0) {
++			ret = -ETIMEDOUT;
++			goto out_wait_error;
++		}
++
++		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++	}
++
++	ret = sk->sk_err ? -sk->sk_err : 0;
++
++out_wait_error:
++	if (ret < 0) {
++		sk->sk_state = SS_UNCONNECTED;
++		sock->state = SS_UNCONNECTED;
++	}
++out_wait:
++	finish_wait(sk_sleep(sk), &wait);
++out:
++	release_sock(sk);
++	return ret;
++}
++
++static
++int hvsock_accept(struct socket *sock, struct socket *newsock, int flags)
++{
++	struct hvsock_sock *hvconnected;
++	struct sock *connected;
++	struct sock *listener;
++
++	DEFINE_WAIT(wait);
++	long timeout;
++
++	int ret = 0;
++
++	listener = sock->sk;
++
++	lock_sock(listener);
++
++	if (sock->type != SOCK_STREAM) {
++		ret = -EOPNOTSUPP;
++		goto out;
++	}
++
++	if (listener->sk_state != SS_LISTEN) {
++		ret = -EINVAL;
++		goto out;
++	}
++
++	/* Wait for child sockets to appear; these are the new sockets
++	 * created upon connection establishment.
++	 */
++	timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
++	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
++
++	while ((connected = hvsock_dequeue_accept(listener)) == NULL &&
++	       listener->sk_err == 0) {
++		release_sock(listener);
++		timeout = schedule_timeout(timeout);
++		lock_sock(listener);
++
++		if (signal_pending(current)) {
++			ret = sock_intr_errno(timeout);
++			goto out_wait;
++		} else if (timeout == 0) {
++			ret = -EAGAIN;
++			goto out_wait;
++		}
++
++		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
++	}
++
++	if (listener->sk_err)
++		ret = -listener->sk_err;
++
++	if (connected) {
++		lock_sock(connected);
++		hvconnected = sk_to_hvsock(connected);
++
++		/* If the listener socket has received an error, then we should
++		 * reject this socket and return. Note that we simply mark the
++		 * socket rejected, drop our reference, and let the cleanup
++		 * function handle the cleanup; the fact that we found it in
++		 * the listener's accept queue guarantees that the cleanup
++		 * function hasn't run yet.
++		 */
++		if (ret) {
++			release_sock(connected);
++			sock_put(connected);
++			goto out_wait;
++		}
++
++		newsock->state = SS_CONNECTED;
++		sock_graft(connected, newsock);
++		release_sock(connected);
++		sock_put(connected);
++	}
++
++out_wait:
++	finish_wait(sk_sleep(listener), &wait);
++out:
++	release_sock(listener);
++	return ret;
++}
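++
++/* (Note that hvsock_accept() above bounds its wait with
++ * sock_sndtimeo(), so SO_SNDTIMEO on the listener is what limits a
++ * blocking accept().)
++ */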
++
++static int hvsock_listen(struct socket *sock, int backlog)
++{
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++	int ret = 0;
++
++	sk = sock->sk;
++	lock_sock(sk);
++
++	if (sock->type != SOCK_STREAM) {
++		ret = -EOPNOTSUPP;
++		goto out;
++	}
++
++	if (sock->state != SS_UNCONNECTED) {
++		ret = -EINVAL;
++		goto out;
++	}
++
++	if (backlog <= 0) {
++		ret = -EINVAL;
++		goto out;
++	}
++	/* This is an artificial limit */
++	if (backlog > 128)
++		backlog = 128;
++
++	hvsk = sk_to_hvsock(sk);
++	if (!hvsock_addr_bound(&hvsk->local_addr)) {
++		ret = -EINVAL;
++		goto out;
++	}
++
++	sk->sk_ack_backlog = 0;
++	sk->sk_max_ack_backlog = backlog;
++	sk->sk_state = SS_LISTEN;
++out:
++	release_sock(sk);
++	return ret;
++}
++
++static int hvsock_setsockopt(struct socket *sock,
++			     int level,
++			     int optname,
++			     char __user *optval, unsigned int optlen)
++{
++	return -ENOPROTOOPT;
++}
++
++static int hvsock_getsockopt(struct socket *sock,
++			     int level,
++			     int optname,
++			     char __user *optval, int __user *optlen)
++{
++	return -ENOPROTOOPT;
++}
++
++static int hvsock_send_data(struct vmbus_channel *channel,
++			    struct hvsock_sock *hvsk,
++			    size_t to_write)
++{
++	hvsk->send.hdr.pkt_type = 1;
++	hvsk->send.hdr.data_size = to_write;
++	return vmbus_sendpacket(channel, &hvsk->send.hdr,
++				sizeof(hvsk->send.hdr) + to_write,
++				0, VM_PKT_DATA_INBAND, 0);
++}
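++
++/* (The wire format produced above is simply the fixed header followed
++ * by the payload in one in-band VMBus packet: pkt_type is hard-coded
++ * to 1 and data_size carries the payload length.)
++ */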
++
++static int hvsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
++{
++	struct vmbus_channel *channel;
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++
++	size_t total_to_write = len;
++	size_t total_written = 0;
++
++	bool can_write;
++	long timeout;
++	int ret = 0;
++
++	DEFINE_WAIT(wait);
++
++	if (len == 0)
++		return -EINVAL;
++
++	if (msg->msg_flags & ~MSG_DONTWAIT) {
++		pr_err("hvsock_sendmsg: unsupported flags=0x%x\n",
++		       msg->msg_flags);
++		return -EOPNOTSUPP;
++	}
++
++	sk = sock->sk;
++	hvsk = sk_to_hvsock(sk);
++	channel = hvsk->channel;
++
++	lock_sock(sk);
++
++	/* Callers should not provide a destination with stream sockets. */
++	if (msg->msg_namelen) {
++		ret = -EOPNOTSUPP;
++		goto out;
++	}
++
++	/* Send data only if neither side has shut down in this direction. */
++	if (sk->sk_shutdown & SEND_SHUTDOWN ||
++	    hvsk->peer_shutdown & RCV_SHUTDOWN) {
++		ret = -EPIPE;
++		goto out;
++	}
++
++	if (sk->sk_state != SS_CONNECTED ||
++	    !hvsock_addr_bound(&hvsk->local_addr)) {
++		ret = -ENOTCONN;
++		goto out;
++	}
++
++	if (!hvsock_addr_bound(&hvsk->remote_addr)) {
++		ret = -EDESTADDRREQ;
++		goto out;
++	}
++
++	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
++
++	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++
++	while (total_to_write > 0) {
++		size_t to_write;
++
++		while (1) {
++			get_ringbuffer_rw_status(channel, NULL, &can_write);
++
++			if (can_write || sk->sk_err != 0 ||
++			    (sk->sk_shutdown & SEND_SHUTDOWN) ||
++			    (hvsk->peer_shutdown & RCV_SHUTDOWN))
++				break;
++
++			/* Don't wait for non-blocking sockets. */
++			if (timeout == 0) {
++				ret = -EAGAIN;
++				goto out_wait;
++			}
++
++			release_sock(sk);
++
++			timeout = schedule_timeout(timeout);
++
++			lock_sock(sk);
++			if (signal_pending(current)) {
++				ret = sock_intr_errno(timeout);
++				goto out_wait;
++			} else if (timeout == 0) {
++				ret = -EAGAIN;
++				goto out_wait;
++			}
++
++			prepare_to_wait(sk_sleep(sk), &wait,
++					TASK_INTERRUPTIBLE);
++		}
++
++		/* These checks occur both as part of and after the loop
++		 * conditional since we need to check before and after
++		 * sleeping.
++		 */
++		if (sk->sk_err) {
++			ret = -sk->sk_err;
++			goto out_wait;
++		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
++			   (hvsk->peer_shutdown & RCV_SHUTDOWN)) {
++			ret = -EPIPE;
++			goto out_wait;
++		}
++
++		/* Note that the write will only write as many bytes as
++		 * possible into the ring buffer. It is the caller's
++		 * responsibility to check how many bytes we actually wrote.
++		 */
++		do {
++			to_write = min_t(size_t, HVSOCK_SND_BUF_SZ,
++					 total_to_write);
++			ret = memcpy_from_msg(hvsk->send.buf, msg, to_write);
++			if (ret != 0)
++				goto out_wait;
++
++			ret = hvsock_send_data(channel, hvsk, to_write);
++			if (ret != 0)
++				goto out_wait;
++
++			total_written += to_write;
++			total_to_write -= to_write;
++		} while (total_to_write > 0);
++	}
++out_wait:
++	if (total_written > 0)
++		ret = total_written;
++
++	finish_wait(sk_sleep(sk), &wait);
++out:
++	release_sock(sk);
++
++	/* ret is a positive total_written or a negative error code. */
++	if (ret == 0) {
++		WARN(1, "unexpected return value of 0\n");
++		ret = -EIO;
++	}
++
++	return ret;
++}
++
++static int hvsock_recv_data(struct vmbus_channel *channel,
++			    struct hvsock_sock *hvsk,
++			    size_t *payload_len)
++{
++	u32 buffer_actual_len;
++	u64 dummy_req_id;
++	int ret;
++
++	ret = vmbus_recvpacket(channel, &hvsk->recv.hdr,
++			       sizeof(hvsk->recv.hdr) + sizeof(hvsk->recv.buf),
++			       &buffer_actual_len, &dummy_req_id);
++	if (ret != 0 || buffer_actual_len <= sizeof(hvsk->recv.hdr))
++		*payload_len = 0;
++	else
++		*payload_len = hvsk->recv.hdr.data_size;
++
++	return ret;
++}
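++
++/* (hvsock_recv_data() above reports a zero payload both for an empty
++ * packet and for one shorter than the header; its caller treats
++ * payload_len == 0 as -EIO.)
++ */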
++
++static int hvsock_recvmsg(struct socket *sock, struct msghdr *msg,
++			  size_t len, int flags)
++{
++	struct vmbus_channel *channel;
++	struct hvsock_sock *hvsk;
++	struct sock *sk;
++
++	size_t total_to_read = len;
++	size_t copied;
++
++	bool can_read;
++	long timeout;
++
++	int ret = 0;
++
++	DEFINE_WAIT(wait);
++
++	sk = sock->sk;
++	hvsk = sk_to_hvsock(sk);
++	channel = hvsk->channel;
++
++	lock_sock(sk);
++
++	if (sk->sk_state != SS_CONNECTED) {
++		/* Recvmsg is supposed to return 0 if a peer performs an
++		 * orderly shutdown. Differentiate between that case and when a
++		 * peer has not connected or a local shutdown occurred with the
++		 * SOCK_DONE flag.
++		 */
++		if (sock_flag(sk, SOCK_DONE))
++			ret = 0;
++		else
++			ret = -ENOTCONN;
++
++		goto out;
++	}
++
++	/* We ignore msg->msg_name/msg_namelen. */
++	if (flags & ~MSG_DONTWAIT) {
++		pr_err("hvsock_recvmsg: unsupported flags=0x%x\n", flags);
++		ret = -EOPNOTSUPP;
++		goto out;
++	}
++
++	/* We don't check the peer_shutdown flag here since the peer may
++	 * actually have shut down, but there can be data in the queue
++	 * that a local socket can still receive.
++	 */
++	if (sk->sk_shutdown & RCV_SHUTDOWN) {
++		ret = 0;
++		goto out;
++	}
++
++	/* It is valid on Linux to pass in a zero-length receive buffer. This
++	 * is not an error. We may as well bail out now.
++	 */
++	if (!len) {
++		ret = 0;
++		goto out;
++	}
++
++	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
++	copied = 0;
++
++	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++
++	while (1) {
++		bool need_refill = hvsk->recv.data_len == 0;
++
++		if (need_refill)
++			get_ringbuffer_rw_status(channel, &can_read, NULL);
++		else
++			can_read = true;
++
++		if (can_read) {
++			size_t payload_len;
++
++			if (need_refill) {
++				ret = hvsock_recv_data(channel, hvsk,
++						       &payload_len);
++				if (ret != 0 || payload_len == 0 ||
++				    payload_len > HVSOCK_RCV_BUF_SZ) {
++					ret = -EIO;
++					goto out_wait;
++				}
++
++				hvsk->recv.data_len = payload_len;
++				hvsk->recv.data_offset = 0;
++			}
++
++			if (hvsk->recv.data_len <= total_to_read) {
++				ret = memcpy_to_msg(msg, hvsk->recv.buf +
++						    hvsk->recv.data_offset,
++						    hvsk->recv.data_len);
++				if (ret != 0)
++					break;
++
++				copied += hvsk->recv.data_len;
++				total_to_read -= hvsk->recv.data_len;
++				hvsk->recv.data_len = 0;
++				hvsk->recv.data_offset = 0;
++
++				if (total_to_read == 0)
++					break;
++			} else {
++				ret = memcpy_to_msg(msg, hvsk->recv.buf +
++						    hvsk->recv.data_offset,
++						    total_to_read);
++				if (ret != 0)
++					break;
++
++				copied += total_to_read;
++				hvsk->recv.data_len -= total_to_read;
++				hvsk->recv.data_offset += total_to_read;
++				total_to_read = 0;
++				break;
++			}
++		} else {
++			if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
++			    (hvsk->peer_shutdown & SEND_SHUTDOWN))
++				break;
++
++			/* Don't wait for non-blocking sockets. */
++			if (timeout == 0) {
++				ret = -EAGAIN;
++				break;
++			}
++
++			if (copied > 0)
++				break;
++
++			release_sock(sk);
++			timeout = schedule_timeout(timeout);
++			lock_sock(sk);
++
++			if (signal_pending(current)) {
++				ret = sock_intr_errno(timeout);
++				break;
++			} else if (timeout == 0) {
++				ret = -EAGAIN;
++				break;
++			}
++
++			prepare_to_wait(sk_sleep(sk), &wait,
++					TASK_INTERRUPTIBLE);
++		}
++	}
++
++	if (sk->sk_err)
++		ret = -sk->sk_err;
++	else if (sk->sk_shutdown & RCV_SHUTDOWN)
++		ret = 0;
++
++	if (copied > 0) {
++		ret = copied;
++
++		/* If the other side has shutdown for sending and there
++		 * is nothing more to read, then we modify the socket
++		 * state.
++		 */
++		if ((hvsk->peer_shutdown & SEND_SHUTDOWN) &&
++		    hvsk->recv.data_len == 0) {
++			get_ringbuffer_rw_status(channel, &can_read, NULL);
++			if (!can_read) {
++				sk->sk_state = SS_UNCONNECTED;
++				sock_set_flag(sk, SOCK_DONE);
++				sk->sk_state_change(sk);
++			}
++		}
++	}
++out_wait:
++	finish_wait(sk_sleep(sk), &wait);
++out:
++	release_sock(sk);
++	return ret;
++}
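++
++/* (Like TCP, an orderly shutdown by the peer is surfaced as a 0 return
++ * from recvmsg() above once the ring buffer and the pending receive
++ * buffer have drained.)
++ */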
++
++static const struct proto_ops hvsock_ops = {
++	.family = PF_HYPERV,
++	.owner = THIS_MODULE,
++	.release = hvsock_release,
++	.bind = hvsock_bind,
++	.connect = hvsock_connect,
++	.socketpair = sock_no_socketpair,
++	.accept = hvsock_accept,
++	.getname = hvsock_getname,
++	.poll = hvsock_poll,
++	.ioctl = sock_no_ioctl,
++	.listen = hvsock_listen,
++	.shutdown = hvsock_shutdown,
++	.setsockopt = hvsock_setsockopt,
++	.getsockopt = hvsock_getsockopt,
++	.sendmsg = hvsock_sendmsg,
++	.recvmsg = hvsock_recvmsg,
++	.mmap = sock_no_mmap,
++	.sendpage = sock_no_sendpage,
++};
++
++static int hvsock_create(struct net *net, struct socket *sock,
++			 int protocol, int kern)
++{
++	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN))
++		return -EPERM;
++
++	if (protocol != 0 && protocol != SHV_PROTO_RAW)
++		return -EPROTONOSUPPORT;
++
++	switch (sock->type) {
++	case SOCK_STREAM:
++		sock->ops = &hvsock_ops;
++		break;
++	default:
++		return -ESOCKTNOSUPPORT;
++	}
++
++	sock->state = SS_UNCONNECTED;
++
++	return __hvsock_create(net, sock, GFP_KERNEL, 0) ? 0 : -ENOMEM;
++}
++
++static const struct net_proto_family hvsock_family_ops = {
++	.family = AF_HYPERV,
++	.create = hvsock_create,
++	.owner = THIS_MODULE,
++};
++
++static int hvsock_probe(struct hv_device *hdev,
++			const struct hv_vmbus_device_id *dev_id)
++{
++	struct vmbus_channel *channel = hdev->channel;
++
++	/* We ignore the error return code to suppress the unnecessary
++	 * error message in vmbus_probe(): on error the host will rescind
++	 * the offer in 30 seconds and we can do cleanup at that time.
++	 */
++	(void)hvsock_open_connection(channel);
++
++	return 0;
++}
++
++static int hvsock_remove(struct hv_device *hdev)
++{
++	struct vmbus_channel *channel = hdev->channel;
++
++	vmbus_close(channel);
++
++	return 0;
++}
++
++/* It's not really used. See vmbus_match() and vmbus_probe(). */
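++/* (Presumably the table below can stay empty because vmbus_match()
++ * keys on hv_driver.hvsock for hv_sock channels rather than on GUIDs.)
++ */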
++static const struct hv_vmbus_device_id id_table[] = {
++	{},
++};
++
++static struct hv_driver hvsock_drv = {
++	.name = "hv_sock",
++	.hvsock = true,
++	.id_table = id_table,
++	.probe = hvsock_probe,
++	.remove = hvsock_remove,
++};
++
++static int __init hvsock_init(void)
++{
++	int ret;
++
++	/* Hyper-V socket requires at least VMBus 4.0 */
++	if ((vmbus_proto_version >> 16) < 4) {
++		pr_err("failed to load: VMBus 4 or later is required\n");
++		return -ENODEV;
++	}
++
++	ret = vmbus_driver_register(&hvsock_drv);
++	if (ret) {
++		pr_err("failed to register hv_sock driver\n");
++		return ret;
++	}
++
++	ret = proto_register(&hvsock_proto, 0);
++	if (ret) {
++		pr_err("failed to register protocol\n");
++		goto unreg_hvsock_drv;
++	}
++
++	ret = sock_register(&hvsock_family_ops);
++	if (ret) {
++		pr_err("failed to register address family\n");
++		goto unreg_proto;
++	}
++
++	return 0;
++
++unreg_proto:
++	proto_unregister(&hvsock_proto);
++unreg_hvsock_drv:
++	vmbus_driver_unregister(&hvsock_drv);
++	return ret;
++}
++
++static void __exit hvsock_exit(void)
++{
++	sock_unregister(AF_HYPERV);
++	proto_unregister(&hvsock_proto);
++	vmbus_driver_unregister(&hvsock_drv);
++}
++
++module_init(hvsock_init);
++module_exit(hvsock_exit);
++
++MODULE_DESCRIPTION("Hyper-V Sockets");
++MODULE_LICENSE("Dual BSD/GPL");
+--
+2.8.0.rc3
+
diff --git a/alpine/kernel/patches/0025-net-add-the-AF_HYPERV-entries-to-family-name-tables.patch b/alpine/kernel/patches/0025-net-add-the-AF_HYPERV-entries-to-family-name-tables.patch
new file mode 100644
index 000000000..a29ee0cb4
--- /dev/null
+++ b/alpine/kernel/patches/0025-net-add-the-AF_HYPERV-entries-to-family-name-tables.patch
@@ -0,0 +1,49 @@
+From 0198717a05de80bc7769ed1d2c3a0cdf3c40fd7c Mon Sep 17 00:00:00 2001
+From: Dexuan Cui
+Date: Mon, 21 Mar 2016 02:53:08 -0700
+Subject: [PATCH 25/25] net: add the AF_HYPERV entries to family name tables
+
+This is for the hv_sock driver, which introduces AF_HYPERV(42).
+
+Signed-off-by: Dexuan Cui
+Cc: "K. Y. Srinivasan"
Srinivasan" +Cc: Haiyang Zhang +Origin: https://patchwork.ozlabs.org/patch/600009 +--- + net/core/sock.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/core/sock.c b/net/core/sock.c +index 925def4..323f7a3 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -264,7 +264,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , +- "sk_lock-AF_MAX" ++ "sk_lock-AF_HYPERV", "sk_lock-AF_MAX" + }; + static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , +@@ -281,7 +281,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , +- "slock-AF_MAX" ++ "slock-AF_HYPERV", "slock-AF_MAX" + }; + static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , +@@ -298,7 +298,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , +- "clock-AF_MAX" ++ "clock-AF_HYPERV", "clock-AF_MAX" + }; + + /* +-- +2.8.0.rc3 +