mirror of
https://github.com/linuxkit/linuxkit.git
synced 2025-07-22 02:21:34 +00:00
kernel: Add VMBus stability improvements
The four new patches prevent swamping VMBus with too many notifications, which eventually leads Hyper-V to assume there is a DoS attempt and throttle the VM for up to 5s. The first three new patches were cherry-picked from upstream. The final patch was received by email; this will be updated once it has been submitted/accepted upstream. Generated from: https://github.com/rneugeba/linux-stable/tree/v4.9.5-moby Signed-off-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
This commit is contained in:
parent
7f4d396ea6
commit
e7575f88da
@ -1,7 +1,8 @@
|
||||
From 1fc22bc5c2e3544786420355405038e4a12ffd72 Mon Sep 17 00:00:00 2001
|
||||
From: Ian Campbell <ian.campbell@docker.com>
|
||||
Date: Mon, 4 Apr 2016 14:50:10 +0100
|
||||
Subject: [PATCH 1/7] VSOCK: Only allow host network namespace to use AF_VSOCK.
|
||||
Subject: [PATCH 01/11] VSOCK: Only allow host network namespace to use
|
||||
AF_VSOCK.
|
||||
|
||||
The VSOCK addressing schema does not really lend itself to simply creating an
|
||||
alternative end point address within a namespace.
|
||||
|
@ -1,7 +1,7 @@
|
||||
From 0d9748d2d26216421225b41643c6167fda91c26f Mon Sep 17 00:00:00 2001
|
||||
From: Dexuan Cui <decui@microsoft.com>
|
||||
Date: Sat, 23 Jul 2016 01:35:51 +0000
|
||||
Subject: [PATCH 2/7] hv_sock: introduce Hyper-V Sockets
|
||||
Subject: [PATCH 02/11] hv_sock: introduce Hyper-V Sockets
|
||||
|
||||
Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
|
||||
mechanism between the host and the guest. It's somewhat like TCP over
|
||||
|
@ -1,7 +1,7 @@
|
||||
From c9168a8ff6c84924df9efec05257f74ed93981b2 Mon Sep 17 00:00:00 2001
|
||||
From: Rolf Neugebauer <rolf.neugebauer@gmail.com>
|
||||
Date: Mon, 23 May 2016 18:55:45 +0100
|
||||
Subject: [PATCH 3/7] vmbus: Don't spam the logs with unknown GUIDs
|
||||
Subject: [PATCH 03/11] vmbus: Don't spam the logs with unknown GUIDs
|
||||
|
||||
With Hyper-V sockets device types are introduced on the fly. The pr_info()
|
||||
then prints a message on every connection, which is way too verbose. Since
|
||||
|
@ -1,8 +1,8 @@
|
||||
From 24a17904f276ced95d717fc427c4f6ff256ac0a8 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Ng <alexng@messages.microsoft.com>
|
||||
Date: Sun, 6 Nov 2016 13:14:07 -0800
|
||||
Subject: [PATCH 4/7] Drivers: hv: utils: Fix the mapping between host version
|
||||
and protocol to use
|
||||
Subject: [PATCH 04/11] Drivers: hv: utils: Fix the mapping between host
|
||||
version and protocol to use
|
||||
|
||||
We should intentionally declare the protocols to use for every known host
|
||||
and default to using the latest protocol if the host is unknown or new.
|
||||
|
@ -1,8 +1,8 @@
|
||||
From 5d339781f71a37083d4c4d304ba45b89aede184f Mon Sep 17 00:00:00 2001
|
||||
From: Rolf Neugebauer <rolf.neugebauer@docker.com>
|
||||
Date: Wed, 11 Jan 2017 22:40:38 +0000
|
||||
Subject: [PATCH 5/7] Drivers: hv: utils: Force TimeSync version 3.0 on Windows
|
||||
10
|
||||
Subject: [PATCH 05/11] Drivers: hv: utils: Force TimeSync version 3.0 on
|
||||
Windows 10
|
||||
|
||||
Some older Windows 10 builds, including 10586 do not seem to
|
||||
support TimeSync protocol 4.0 causing loss of time synchronisation
|
||||
|
@ -1,7 +1,7 @@
|
||||
From 4c63fbce6abc0eb36a21d760959aae954427dcf3 Mon Sep 17 00:00:00 2001
|
||||
From: David Sheets <david.sheets@docker.com>
|
||||
Date: Fri, 13 Jan 2017 15:58:30 +0000
|
||||
Subject: [PATCH 6/7] fuse: fix time_to_jiffies nsec sanity check
|
||||
Subject: [PATCH 06/11] fuse: fix time_to_jiffies nsec sanity check
|
||||
|
||||
Commit bcb6f6d2b9c2 ("fuse: use timespec64") introduced clamped nsec values
|
||||
in time_to_jiffies but used the max of nsec and NSEC_PER_SEC - 1 instead of
|
||||
|
@ -1,7 +1,7 @@
|
||||
From 00ef9fe4743da2cdd5b55d97335e678304329e24 Mon Sep 17 00:00:00 2001
|
||||
From: Rolf Neugebauer <rolf.neugebauer@docker.com>
|
||||
Date: Tue, 17 Jan 2017 18:13:51 +0000
|
||||
Subject: [PATCH 7/7] virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit
|
||||
Subject: [PATCH 07/11] virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit
|
||||
|
||||
This patch part reverts fd2a0437dc33 and e858fae2b0b8 which introduced a
|
||||
subtle change in how the virtio_net flags are derived from the SKBs
|
||||
|
@ -0,0 +1,103 @@
|
||||
From 72c60164c3508d229686d64e624d1d2f47d01676 Mon Sep 17 00:00:00 2001
|
||||
From: "K. Y. Srinivasan" <kys@microsoft.com>
|
||||
Date: Sun, 6 Nov 2016 13:14:16 -0800
|
||||
Subject: [PATCH 08/11] Drivers: hv: vmbus: Base host signaling strictly on the
|
||||
ring state
|
||||
|
||||
One of the factors that can result in the host concluding that a given
|
||||
guest is mounting a DOS attack is if the guest generates interrupts
|
||||
to the host when the host is not expecting it. If these "spurious"
|
||||
interrupts reach a certain rate, the host can throttle the guest to
|
||||
minimize the impact. The host computation of the "expected number
|
||||
of interrupts" is strictly based on the ring transitions. Until
|
||||
the host logic is fixed, base the guest logic to interrupt solely
|
||||
on the ring state.
|
||||
|
||||
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
|
||||
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
(cherry picked from commit 74198eb4a42c4a3c4fbef08fa01a291a282f7c2e)
|
||||
---
|
||||
drivers/hv/channel.c | 23 ++++++++++++++++++++---
|
||||
drivers/hv/channel_mgmt.c | 2 --
|
||||
drivers/hv/ring_buffer.c | 7 -------
|
||||
3 files changed, 20 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
|
||||
index 16f91c8490fe..5e482d7f60cb 100644
|
||||
--- a/drivers/hv/channel.c
|
||||
+++ b/drivers/hv/channel.c
|
||||
@@ -676,10 +676,18 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
|
||||
* NOTE: in this case, the hvsock channel is an exception, because
|
||||
* it looks the host side's hvsock implementation has a throttling
|
||||
* mechanism which can hurt the performance otherwise.
|
||||
+ *
|
||||
+ * KYS: Oct. 30, 2016:
|
||||
+ * It looks like Windows hosts have logic to deal with DOS attacks that
|
||||
+ * can be triggered if it receives interrupts when it is not expecting
|
||||
+ * the interrupt. The host expects interrupts only when the ring
|
||||
+ * transitions from empty to non-empty (or full to non full on the guest
|
||||
+ * to host ring).
|
||||
+ * So, base the signaling decision solely on the ring state until the
|
||||
+ * host logic is fixed.
|
||||
*/
|
||||
|
||||
- if (((ret == 0) && kick_q && signal) ||
|
||||
- (ret && !is_hvsock_channel(channel)))
|
||||
+ if (((ret == 0) && signal))
|
||||
vmbus_setevent(channel);
|
||||
|
||||
return ret;
|
||||
@@ -786,9 +794,18 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
|
||||
* If we cannot write to the ring-buffer; signal the host
|
||||
* even if we may not have written anything. This is a rare
|
||||
* enough condition that it should not matter.
|
||||
+ *
|
||||
+ * KYS: Oct. 30, 2016:
|
||||
+ * It looks like Windows hosts have logic to deal with DOS attacks that
|
||||
+ * can be triggered if it receives interrupts when it is not expecting
|
||||
+ * the interrupt. The host expects interrupts only when the ring
|
||||
+ * transitions from empty to non-empty (or full to non full on the guest
|
||||
+ * to host ring).
|
||||
+ * So, base the signaling decision solely on the ring state until the
|
||||
+ * host logic is fixed.
|
||||
*/
|
||||
|
||||
- if (((ret == 0) && kick_q && signal) || (ret))
|
||||
+ if (((ret == 0) && signal))
|
||||
vmbus_setevent(channel);
|
||||
|
||||
return ret;
|
||||
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
|
||||
index db5bccf4fa7e..8f3d9f787288 100644
|
||||
--- a/drivers/hv/channel_mgmt.c
|
||||
+++ b/drivers/hv/channel_mgmt.c
|
||||
@@ -448,8 +448,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
|
||||
}
|
||||
|
||||
dev_type = hv_get_dev_type(newchannel);
|
||||
- if (dev_type == HV_NIC)
|
||||
- set_channel_signal_state(newchannel, HV_SIGNAL_POLICY_EXPLICIT);
|
||||
|
||||
init_vp_index(newchannel, dev_type);
|
||||
|
||||
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
|
||||
index 08043da1a61c..5d11d93eedf4 100644
|
||||
--- a/drivers/hv/ring_buffer.c
|
||||
+++ b/drivers/hv/ring_buffer.c
|
||||
@@ -75,13 +75,6 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi,
|
||||
if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
|
||||
return false;
|
||||
|
||||
- /*
|
||||
- * When the client wants to control signaling,
|
||||
- * we only honour the host interrupt mask.
|
||||
- */
|
||||
- if (policy == HV_SIGNAL_POLICY_EXPLICIT)
|
||||
- return true;
|
||||
-
|
||||
/* check interrupt_mask before read_index */
|
||||
virt_rmb();
|
||||
/*
|
||||
--
|
||||
2.11.0
|
||||
|
@ -0,0 +1,321 @@
|
||||
From f0472d013423117cae77e00089485f7d976138f2 Mon Sep 17 00:00:00 2001
|
||||
From: "K. Y. Srinivasan" <kys@microsoft.com>
|
||||
Date: Sun, 6 Nov 2016 13:14:17 -0800
|
||||
Subject: [PATCH 09/11] Drivers: hv: vmbus: On write cleanup the logic to
|
||||
interrupt the host
|
||||
|
||||
Signal the host when we determine the host is to be signaled.
|
||||
The current code determines the need to signal in the ringbuffer
|
||||
code and actually issues the signal elsewhere. This can result
|
||||
in the host viewing this interrupt as spurious since the host may also
|
||||
poll the channel. Make the necessary adjustments.
|
||||
|
||||
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
|
||||
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
(cherry picked from commit 1f6ee4e7d83586c8b10bd4f2f4346353d04ce884)
|
||||
---
|
||||
drivers/hv/channel.c | 99 +++++------------------------------------------
|
||||
drivers/hv/hyperv_vmbus.h | 6 +--
|
||||
drivers/hv/ring_buffer.c | 30 +++++++++-----
|
||||
include/linux/hyperv.h | 1 +
|
||||
4 files changed, 35 insertions(+), 101 deletions(-)
|
||||
|
||||
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
|
||||
index 5e482d7f60cb..8a8148f7b842 100644
|
||||
--- a/drivers/hv/channel.c
|
||||
+++ b/drivers/hv/channel.c
|
||||
@@ -39,7 +39,7 @@
|
||||
* vmbus_setevent- Trigger an event notification on the specified
|
||||
* channel.
|
||||
*/
|
||||
-static void vmbus_setevent(struct vmbus_channel *channel)
|
||||
+void vmbus_setevent(struct vmbus_channel *channel)
|
||||
{
|
||||
struct hv_monitor_page *monitorpage;
|
||||
|
||||
@@ -65,6 +65,7 @@ static void vmbus_setevent(struct vmbus_channel *channel)
|
||||
vmbus_set_event(channel);
|
||||
}
|
||||
}
|
||||
+EXPORT_SYMBOL_GPL(vmbus_setevent);
|
||||
|
||||
/*
|
||||
* vmbus_open - Open the specified channel.
|
||||
@@ -635,8 +636,6 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
|
||||
u32 packetlen_aligned = ALIGN(packetlen, sizeof(u64));
|
||||
struct kvec bufferlist[3];
|
||||
u64 aligned_data = 0;
|
||||
- int ret;
|
||||
- bool signal = false;
|
||||
bool lock = channel->acquire_ring_lock;
|
||||
int num_vecs = ((bufferlen != 0) ? 3 : 1);
|
||||
|
||||
@@ -656,41 +655,9 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
|
||||
bufferlist[2].iov_base = &aligned_data;
|
||||
bufferlist[2].iov_len = (packetlen_aligned - packetlen);
|
||||
|
||||
- ret = hv_ringbuffer_write(&channel->outbound, bufferlist, num_vecs,
|
||||
- &signal, lock, channel->signal_policy);
|
||||
-
|
||||
- /*
|
||||
- * Signalling the host is conditional on many factors:
|
||||
- * 1. The ring state changed from being empty to non-empty.
|
||||
- * This is tracked by the variable "signal".
|
||||
- * 2. The variable kick_q tracks if more data will be placed
|
||||
- * on the ring. We will not signal if more data is
|
||||
- * to be placed.
|
||||
- *
|
||||
- * Based on the channel signal state, we will decide
|
||||
- * which signaling policy will be applied.
|
||||
- *
|
||||
- * If we cannot write to the ring-buffer; signal the host
|
||||
- * even if we may not have written anything. This is a rare
|
||||
- * enough condition that it should not matter.
|
||||
- * NOTE: in this case, the hvsock channel is an exception, because
|
||||
- * it looks the host side's hvsock implementation has a throttling
|
||||
- * mechanism which can hurt the performance otherwise.
|
||||
- *
|
||||
- * KYS: Oct. 30, 2016:
|
||||
- * It looks like Windows hosts have logic to deal with DOS attacks that
|
||||
- * can be triggered if it receives interrupts when it is not expecting
|
||||
- * the interrupt. The host expects interrupts only when the ring
|
||||
- * transitions from empty to non-empty (or full to non full on the guest
|
||||
- * to host ring).
|
||||
- * So, base the signaling decision solely on the ring state until the
|
||||
- * host logic is fixed.
|
||||
- */
|
||||
-
|
||||
- if (((ret == 0) && signal))
|
||||
- vmbus_setevent(channel);
|
||||
+ return hv_ringbuffer_write(channel, bufferlist, num_vecs,
|
||||
+ lock, kick_q);
|
||||
|
||||
- return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vmbus_sendpacket_ctl);
|
||||
|
||||
@@ -731,7 +698,6 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
|
||||
u32 flags,
|
||||
bool kick_q)
|
||||
{
|
||||
- int ret;
|
||||
int i;
|
||||
struct vmbus_channel_packet_page_buffer desc;
|
||||
u32 descsize;
|
||||
@@ -739,7 +705,6 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
|
||||
u32 packetlen_aligned;
|
||||
struct kvec bufferlist[3];
|
||||
u64 aligned_data = 0;
|
||||
- bool signal = false;
|
||||
bool lock = channel->acquire_ring_lock;
|
||||
|
||||
if (pagecount > MAX_PAGE_BUFFER_COUNT)
|
||||
@@ -777,38 +742,8 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
|
||||
bufferlist[2].iov_base = &aligned_data;
|
||||
bufferlist[2].iov_len = (packetlen_aligned - packetlen);
|
||||
|
||||
- ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3,
|
||||
- &signal, lock, channel->signal_policy);
|
||||
-
|
||||
- /*
|
||||
- * Signalling the host is conditional on many factors:
|
||||
- * 1. The ring state changed from being empty to non-empty.
|
||||
- * This is tracked by the variable "signal".
|
||||
- * 2. The variable kick_q tracks if more data will be placed
|
||||
- * on the ring. We will not signal if more data is
|
||||
- * to be placed.
|
||||
- *
|
||||
- * Based on the channel signal state, we will decide
|
||||
- * which signaling policy will be applied.
|
||||
- *
|
||||
- * If we cannot write to the ring-buffer; signal the host
|
||||
- * even if we may not have written anything. This is a rare
|
||||
- * enough condition that it should not matter.
|
||||
- *
|
||||
- * KYS: Oct. 30, 2016:
|
||||
- * It looks like Windows hosts have logic to deal with DOS attacks that
|
||||
- * can be triggered if it receives interrupts when it is not expecting
|
||||
- * the interrupt. The host expects interrupts only when the ring
|
||||
- * transitions from empty to non-empty (or full to non full on the guest
|
||||
- * to host ring).
|
||||
- * So, base the signaling decision solely on the ring state until the
|
||||
- * host logic is fixed.
|
||||
- */
|
||||
-
|
||||
- if (((ret == 0) && signal))
|
||||
- vmbus_setevent(channel);
|
||||
-
|
||||
- return ret;
|
||||
+ return hv_ringbuffer_write(channel, bufferlist, 3,
|
||||
+ lock, kick_q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl);
|
||||
|
||||
@@ -839,12 +774,10 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
|
||||
u32 desc_size,
|
||||
void *buffer, u32 bufferlen, u64 requestid)
|
||||
{
|
||||
- int ret;
|
||||
u32 packetlen;
|
||||
u32 packetlen_aligned;
|
||||
struct kvec bufferlist[3];
|
||||
u64 aligned_data = 0;
|
||||
- bool signal = false;
|
||||
bool lock = channel->acquire_ring_lock;
|
||||
|
||||
packetlen = desc_size + bufferlen;
|
||||
@@ -865,13 +798,8 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
|
||||
bufferlist[2].iov_base = &aligned_data;
|
||||
bufferlist[2].iov_len = (packetlen_aligned - packetlen);
|
||||
|
||||
- ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3,
|
||||
- &signal, lock, channel->signal_policy);
|
||||
-
|
||||
- if (ret == 0 && signal)
|
||||
- vmbus_setevent(channel);
|
||||
-
|
||||
- return ret;
|
||||
+ return hv_ringbuffer_write(channel, bufferlist, 3,
|
||||
+ lock, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc);
|
||||
|
||||
@@ -883,14 +811,12 @@ int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
|
||||
struct hv_multipage_buffer *multi_pagebuffer,
|
||||
void *buffer, u32 bufferlen, u64 requestid)
|
||||
{
|
||||
- int ret;
|
||||
struct vmbus_channel_packet_multipage_buffer desc;
|
||||
u32 descsize;
|
||||
u32 packetlen;
|
||||
u32 packetlen_aligned;
|
||||
struct kvec bufferlist[3];
|
||||
u64 aligned_data = 0;
|
||||
- bool signal = false;
|
||||
bool lock = channel->acquire_ring_lock;
|
||||
u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset,
|
||||
multi_pagebuffer->len);
|
||||
@@ -930,13 +856,8 @@ int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
|
||||
bufferlist[2].iov_base = &aligned_data;
|
||||
bufferlist[2].iov_len = (packetlen_aligned - packetlen);
|
||||
|
||||
- ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3,
|
||||
- &signal, lock, channel->signal_policy);
|
||||
-
|
||||
- if (ret == 0 && signal)
|
||||
- vmbus_setevent(channel);
|
||||
-
|
||||
- return ret;
|
||||
+ return hv_ringbuffer_write(channel, bufferlist, 3,
|
||||
+ lock, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer);
|
||||
|
||||
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
|
||||
index a5b4442433c8..fa782e13c8da 100644
|
||||
--- a/drivers/hv/hyperv_vmbus.h
|
||||
+++ b/drivers/hv/hyperv_vmbus.h
|
||||
@@ -527,10 +527,10 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
|
||||
|
||||
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
|
||||
|
||||
-int hv_ringbuffer_write(struct hv_ring_buffer_info *ring_info,
|
||||
+int hv_ringbuffer_write(struct vmbus_channel *channel,
|
||||
struct kvec *kv_list,
|
||||
- u32 kv_count, bool *signal, bool lock,
|
||||
- enum hv_signal_policy policy);
|
||||
+ u32 kv_count, bool lock,
|
||||
+ bool kick_q);
|
||||
|
||||
int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
|
||||
void *buffer, u32 buflen, u32 *buffer_actual_len,
|
||||
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
|
||||
index 5d11d93eedf4..4af71306d0ff 100644
|
||||
--- a/drivers/hv/ring_buffer.c
|
||||
+++ b/drivers/hv/ring_buffer.c
|
||||
@@ -66,14 +66,25 @@ u32 hv_end_read(struct hv_ring_buffer_info *rbi)
|
||||
* once the ring buffer is empty, it will clear the
|
||||
* interrupt_mask and re-check to see if new data has
|
||||
* arrived.
|
||||
+ *
|
||||
+ * KYS: Oct. 30, 2016:
|
||||
+ * It looks like Windows hosts have logic to deal with DOS attacks that
|
||||
+ * can be triggered if it receives interrupts when it is not expecting
|
||||
+ * the interrupt. The host expects interrupts only when the ring
|
||||
+ * transitions from empty to non-empty (or full to non full on the guest
|
||||
+ * to host ring).
|
||||
+ * So, base the signaling decision solely on the ring state until the
|
||||
+ * host logic is fixed.
|
||||
*/
|
||||
|
||||
-static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi,
|
||||
- enum hv_signal_policy policy)
|
||||
+static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel,
|
||||
+ bool kick_q)
|
||||
{
|
||||
+ struct hv_ring_buffer_info *rbi = &channel->outbound;
|
||||
+
|
||||
virt_mb();
|
||||
if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
|
||||
- return false;
|
||||
+ return;
|
||||
|
||||
/* check interrupt_mask before read_index */
|
||||
virt_rmb();
|
||||
@@ -82,9 +93,9 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi,
|
||||
* ring transitions from being empty to non-empty.
|
||||
*/
|
||||
if (old_write == READ_ONCE(rbi->ring_buffer->read_index))
|
||||
- return true;
|
||||
+ vmbus_setevent(channel);
|
||||
|
||||
- return false;
|
||||
+ return;
|
||||
}
|
||||
|
||||
/* Get the next write location for the specified ring buffer. */
|
||||
@@ -273,9 +284,9 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
|
||||
}
|
||||
|
||||
/* Write to the ring buffer. */
|
||||
-int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info,
|
||||
- struct kvec *kv_list, u32 kv_count, bool *signal, bool lock,
|
||||
- enum hv_signal_policy policy)
|
||||
+int hv_ringbuffer_write(struct vmbus_channel *channel,
|
||||
+ struct kvec *kv_list, u32 kv_count, bool lock,
|
||||
+ bool kick_q)
|
||||
{
|
||||
int i = 0;
|
||||
u32 bytes_avail_towrite;
|
||||
@@ -285,6 +296,7 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info,
|
||||
u32 old_write;
|
||||
u64 prev_indices = 0;
|
||||
unsigned long flags = 0;
|
||||
+ struct hv_ring_buffer_info *outring_info = &channel->outbound;
|
||||
|
||||
for (i = 0; i < kv_count; i++)
|
||||
totalbytes_towrite += kv_list[i].iov_len;
|
||||
@@ -337,7 +349,7 @@ int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info,
|
||||
if (lock)
|
||||
spin_unlock_irqrestore(&outring_info->ring_lock, flags);
|
||||
|
||||
- *signal = hv_need_to_signal(old_write, outring_info, policy);
|
||||
+ hv_signal_on_write(old_write, channel, kick_q);
|
||||
return 0;
|
||||
}
|
||||
|
||||
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
|
||||
index 468e15e29f5f..9a2d657a7fe7 100644
|
||||
--- a/include/linux/hyperv.h
|
||||
+++ b/include/linux/hyperv.h
|
||||
@@ -1447,6 +1447,7 @@ void hv_event_tasklet_enable(struct vmbus_channel *channel);
|
||||
|
||||
void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
|
||||
|
||||
+void vmbus_setevent(struct vmbus_channel *channel);
|
||||
/*
|
||||
* Negotiated version with the Host.
|
||||
*/
|
||||
--
|
||||
2.11.0
|
||||
|
@ -0,0 +1,141 @@
|
||||
From 9e0ab3ccac70147f6e0e04cf0872d6f477ec6b68 Mon Sep 17 00:00:00 2001
|
||||
From: "K. Y. Srinivasan" <kys@microsoft.com>
|
||||
Date: Sun, 6 Nov 2016 13:14:18 -0800
|
||||
Subject: [PATCH 10/11] Drivers: hv: vmbus: On the read path cleanup the logic
|
||||
to interrupt the host
|
||||
|
||||
Signal the host when we determine the host is to be signaled -
|
||||
on the read path. The current code determines the need to signal in the
|
||||
ringbuffer code and actually issues the signal elsewhere. This can result
|
||||
in the host viewing this interrupt as spurious since the host may also
|
||||
poll the channel. Make the necessary adjustments.
|
||||
|
||||
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
|
||||
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
---
|
||||
drivers/hv/channel.c | 11 ++---------
|
||||
drivers/hv/hyperv_vmbus.h | 4 ++--
|
||||
drivers/hv/ring_buffer.c | 7 ++++---
|
||||
include/linux/hyperv.h | 12 ++++++------
|
||||
4 files changed, 14 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
|
||||
index 8a8148f7b842..5fb4c6d9209b 100644
|
||||
--- a/drivers/hv/channel.c
|
||||
+++ b/drivers/hv/channel.c
|
||||
@@ -879,16 +879,9 @@ __vmbus_recvpacket(struct vmbus_channel *channel, void *buffer,
|
||||
u32 bufferlen, u32 *buffer_actual_len, u64 *requestid,
|
||||
bool raw)
|
||||
{
|
||||
- int ret;
|
||||
- bool signal = false;
|
||||
-
|
||||
- ret = hv_ringbuffer_read(&channel->inbound, buffer, bufferlen,
|
||||
- buffer_actual_len, requestid, &signal, raw);
|
||||
+ return hv_ringbuffer_read(channel, buffer, bufferlen,
|
||||
+ buffer_actual_len, requestid, raw);
|
||||
|
||||
- if (signal)
|
||||
- vmbus_setevent(channel);
|
||||
-
|
||||
- return ret;
|
||||
}
|
||||
|
||||
int vmbus_recvpacket(struct vmbus_channel *channel, void *buffer,
|
||||
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
|
||||
index fa782e13c8da..2b13f2a0a71e 100644
|
||||
--- a/drivers/hv/hyperv_vmbus.h
|
||||
+++ b/drivers/hv/hyperv_vmbus.h
|
||||
@@ -532,9 +532,9 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
|
||||
u32 kv_count, bool lock,
|
||||
bool kick_q);
|
||||
|
||||
-int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
|
||||
+int hv_ringbuffer_read(struct vmbus_channel *channel,
|
||||
void *buffer, u32 buflen, u32 *buffer_actual_len,
|
||||
- u64 *requestid, bool *signal, bool raw);
|
||||
+ u64 *requestid, bool raw);
|
||||
|
||||
void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
|
||||
struct hv_ring_buffer_debug_info *debug_info);
|
||||
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
|
||||
index 4af71306d0ff..cd49cb17eb7f 100644
|
||||
--- a/drivers/hv/ring_buffer.c
|
||||
+++ b/drivers/hv/ring_buffer.c
|
||||
@@ -353,9 +353,9 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
|
||||
return 0;
|
||||
}
|
||||
|
||||
-int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
|
||||
+int hv_ringbuffer_read(struct vmbus_channel *channel,
|
||||
void *buffer, u32 buflen, u32 *buffer_actual_len,
|
||||
- u64 *requestid, bool *signal, bool raw)
|
||||
+ u64 *requestid, bool raw)
|
||||
{
|
||||
u32 bytes_avail_toread;
|
||||
u32 next_read_location = 0;
|
||||
@@ -364,6 +364,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
|
||||
u32 offset;
|
||||
u32 packetlen;
|
||||
int ret = 0;
|
||||
+ struct hv_ring_buffer_info *inring_info = &channel->inbound;
|
||||
|
||||
if (buflen <= 0)
|
||||
return -EINVAL;
|
||||
@@ -421,7 +422,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info,
|
||||
/* Update the read index */
|
||||
hv_set_next_read_location(inring_info, next_read_location);
|
||||
|
||||
- *signal = hv_need_to_signal_on_read(inring_info);
|
||||
+ hv_signal_on_read(channel);
|
||||
|
||||
return ret;
|
||||
}
|
||||
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
|
||||
index 9a2d657a7fe7..5343b7406acc 100644
|
||||
--- a/include/linux/hyperv.h
|
||||
+++ b/include/linux/hyperv.h
|
||||
@@ -1480,10 +1480,11 @@ hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
|
||||
* there is room for the producer to send the pending packet.
|
||||
*/
|
||||
|
||||
-static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
|
||||
+static inline void hv_signal_on_read(struct vmbus_channel *channel)
|
||||
{
|
||||
u32 cur_write_sz;
|
||||
u32 pending_sz;
|
||||
+ struct hv_ring_buffer_info *rbi = &channel->inbound;
|
||||
|
||||
/*
|
||||
* Issue a full memory barrier before making the signaling decision.
|
||||
@@ -1501,14 +1502,14 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
|
||||
pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
|
||||
/* If the other end is not blocked on write don't bother. */
|
||||
if (pending_sz == 0)
|
||||
- return false;
|
||||
+ return;
|
||||
|
||||
cur_write_sz = hv_get_bytes_to_write(rbi);
|
||||
|
||||
if (cur_write_sz >= pending_sz)
|
||||
- return true;
|
||||
+ vmbus_setevent(channel);
|
||||
|
||||
- return false;
|
||||
+ return;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1590,8 +1591,7 @@ static inline void commit_rd_index(struct vmbus_channel *channel)
|
||||
virt_rmb();
|
||||
ring_info->ring_buffer->read_index = ring_info->priv_read_index;
|
||||
|
||||
- if (hv_need_to_signal_on_read(ring_info))
|
||||
- vmbus_set_event(channel);
|
||||
+ hv_signal_on_read(channel);
|
||||
}
|
||||
|
||||
struct vmpipe_proto_header {
|
||||
--
|
||||
2.11.0
|
||||
|
@ -0,0 +1,140 @@
|
||||
From c9dc13e8f77c09369dae49f265176999e3f327c8 Mon Sep 17 00:00:00 2001
|
||||
From: Dexuan Cui <decui@microsoft.com>
|
||||
Date: Mon, 16 Jan 2017 15:46:19 +0800
|
||||
Subject: [PATCH 11/11] Drivers: hv: vmbus: finally fix
|
||||
hv_need_to_signal_on_read()
|
||||
|
||||
Commit a389fcfd2cb5 ("Drivers: hv: vmbus: Fix signaling logic in hv_need_to_signal_on_read()")
|
||||
added the proper mb(), but removed the test "prev_write_sz < pending_sz"
|
||||
when making the signal decision.
|
||||
|
||||
As a result, the guest can signal the host unnecessarily,
|
||||
and then the host can throttle the guest because the host
|
||||
thinks the guest is buggy or malicious; finally the user
|
||||
running a stress test can perceive intermittent freezes of
|
||||
the guest.
|
||||
|
||||
This patch brings back the test, and properly handles the
|
||||
in-place consumption APIs used by NetVSC (see get_next_pkt_raw(),
|
||||
put_pkt_raw() and commit_rd_index()).
|
||||
|
||||
Fixes: a389fcfd2cb5 ("Drivers: hv: vmbus: Fix signaling logic in hv_need_to_signal_on_read()")
|
||||
Signed-off-by: Dexuan Cui <decui@microsoft.com>
|
||||
Reported-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
|
||||
Tested-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
|
||||
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
|
||||
Cc: Haiyang Zhang <haiyangz@microsoft.com>
|
||||
Cc: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
|
||||
Origin: Email from Dexuan
|
||||
---
|
||||
drivers/hv/ring_buffer.c | 1 +
|
||||
drivers/net/hyperv/netvsc.c | 2 ++
|
||||
include/linux/hyperv.h | 32 ++++++++++++++++++++++++++++++--
|
||||
3 files changed, 33 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
|
||||
index cd49cb17eb7f..308dbda700eb 100644
|
||||
--- a/drivers/hv/ring_buffer.c
|
||||
+++ b/drivers/hv/ring_buffer.c
|
||||
@@ -383,6 +383,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ init_cached_read_index(channel);
|
||||
next_read_location = hv_get_next_read_location(inring_info);
|
||||
next_read_location = hv_copyfrom_ringbuffer(inring_info, &desc,
|
||||
sizeof(desc),
|
||||
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
|
||||
index 720b5fa9e625..9cd74817b09c 100644
|
||||
--- a/drivers/net/hyperv/netvsc.c
|
||||
+++ b/drivers/net/hyperv/netvsc.c
|
||||
@@ -1288,6 +1288,8 @@ void netvsc_channel_cb(void *context)
|
||||
ndev = hv_get_drvdata(device);
|
||||
buffer = get_per_channel_state(channel);
|
||||
|
||||
+ init_cached_read_index(channel);
|
||||
+
|
||||
do {
|
||||
desc = get_next_pkt_raw(channel);
|
||||
if (desc != NULL) {
|
||||
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
|
||||
index 5343b7406acc..e34da6846348 100644
|
||||
--- a/include/linux/hyperv.h
|
||||
+++ b/include/linux/hyperv.h
|
||||
@@ -128,6 +128,7 @@ struct hv_ring_buffer_info {
|
||||
u32 ring_data_startoffset;
|
||||
u32 priv_write_index;
|
||||
u32 priv_read_index;
|
||||
+ u32 cached_read_index;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -180,6 +181,19 @@ static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi)
|
||||
return write;
|
||||
}
|
||||
|
||||
+static inline u32 hv_get_cached_bytes_to_write(
|
||||
+ const struct hv_ring_buffer_info *rbi)
|
||||
+{
|
||||
+ u32 read_loc, write_loc, dsize, write;
|
||||
+
|
||||
+ dsize = rbi->ring_datasize;
|
||||
+ read_loc = rbi->cached_read_index;
|
||||
+ write_loc = rbi->ring_buffer->write_index;
|
||||
+
|
||||
+ write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
|
||||
+ read_loc - write_loc;
|
||||
+ return write;
|
||||
+}
|
||||
/*
|
||||
* VMBUS version is 32 bit entity broken up into
|
||||
* two 16 bit quantities: major_number. minor_number.
|
||||
@@ -1482,7 +1496,7 @@ hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
|
||||
|
||||
static inline void hv_signal_on_read(struct vmbus_channel *channel)
|
||||
{
|
||||
- u32 cur_write_sz;
|
||||
+ u32 cur_write_sz, cached_write_sz;
|
||||
u32 pending_sz;
|
||||
struct hv_ring_buffer_info *rbi = &channel->inbound;
|
||||
|
||||
@@ -1506,12 +1520,24 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel)
|
||||
|
||||
cur_write_sz = hv_get_bytes_to_write(rbi);
|
||||
|
||||
- if (cur_write_sz >= pending_sz)
|
||||
+ if (cur_write_sz < pending_sz)
|
||||
+ return;
|
||||
+
|
||||
+ cached_write_sz = hv_get_cached_bytes_to_write(rbi);
|
||||
+ if (cached_write_sz < pending_sz)
|
||||
vmbus_setevent(channel);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
+static inline void
|
||||
+init_cached_read_index(struct vmbus_channel *channel)
|
||||
+{
|
||||
+ struct hv_ring_buffer_info *rbi = &channel->inbound;
|
||||
+
|
||||
+ rbi->cached_read_index = rbi->ring_buffer->read_index;
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* An API to support in-place processing of incoming VMBUS packets.
|
||||
*/
|
||||
@@ -1573,6 +1599,8 @@ static inline void put_pkt_raw(struct vmbus_channel *channel,
|
||||
* This call commits the read index and potentially signals the host.
|
||||
* Here is the pattern for using the "in-place" consumption APIs:
|
||||
*
|
||||
+ * init_cached_read_index();
|
||||
+ *
|
||||
* while (get_next_pkt_raw() {
|
||||
* process the packet "in-place";
|
||||
* put_pkt_raw();
|
||||
--
|
||||
2.11.0
|
||||
|
Loading…
Reference in New Issue
Block a user