versions: qemu-experimental: 6.0~rc 470dd6

Move to the next 6.0 dev tree for qemu-experimental.
The qemu version uses the same base as:

https://gitlab.com/virtio-fs/qemu/-/commits/virtio-fs-dev/

Using qemu 6.0-rc1, some patches do not apply.

Fixes: #1624

Signed-off-by: Carlos Venegas <jos.c.venegas.munoz@intel.com>
This commit is contained in:
Carlos Venegas 2021-04-05 18:38:30 +00:00
parent 6491b9d7aa
commit f365bdb7cf
32 changed files with 4545 additions and 2 deletions

View File

@ -0,0 +1,270 @@
From d14a6cb000d0a5f9e382e5e5de0021756034d0cb Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Thu, 2 May 2019 18:04:04 +0100
Subject: [PATCH 01/29] DAX: libvhost-user: Allow popping a queue element with
bad pointers
Allow a daemon implemented with libvhost-user to accept an
element with pointers to memory that aren't in the mapping table.
The daemon might have some special way to deal with some special
cases of this.
The default behaviour doesn't change.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
contrib/vhost-user-blk/vhost-user-blk.c | 3 +-
contrib/vhost-user-gpu/vhost-user-gpu.c | 5 ++-
contrib/vhost-user-input/main.c | 4 +-
contrib/vhost-user-scsi/vhost-user-scsi.c | 2 +-
subprojects/libvhost-user/libvhost-user.c | 51 ++++++++++++++++++-----
subprojects/libvhost-user/libvhost-user.h | 8 +++-
tests/vhost-user-bridge.c | 4 +-
tools/virtiofsd/fuse_virtio.c | 3 +-
8 files changed, 59 insertions(+), 21 deletions(-)
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index d14b2896bf..01193552e9 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -235,7 +235,8 @@ static int vub_virtio_process_req(VubDev *vdev_blk,
unsigned out_num;
VubReq *req;
- elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
+ elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq),
+ NULL, NULL);
if (!elem) {
return -1;
}
diff --git a/contrib/vhost-user-gpu/vhost-user-gpu.c b/contrib/vhost-user-gpu/vhost-user-gpu.c
index f445ef28ec..58161a4378 100644
--- a/contrib/vhost-user-gpu/vhost-user-gpu.c
+++ b/contrib/vhost-user-gpu/vhost-user-gpu.c
@@ -819,7 +819,8 @@ vg_handle_ctrl(VuDev *dev, int qidx)
return;
}
- cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command));
+ cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command),
+ NULL, NULL);
if (!cmd) {
break;
}
@@ -922,7 +923,7 @@ vg_handle_cursor(VuDev *dev, int qidx)
struct virtio_gpu_update_cursor cursor;
for (;;) {
- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
break;
}
diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c
index c15d18c33f..d5c435605c 100644
--- a/contrib/vhost-user-input/main.c
+++ b/contrib/vhost-user-input/main.c
@@ -57,7 +57,7 @@ static void vi_input_send(VuInput *vi, struct virtio_input_event *event)
/* ... then check available space ... */
for (i = 0; i < vi->qindex; i++) {
- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
while (--i >= 0) {
vu_queue_unpop(dev, vq, vi->queue[i].elem, 0);
@@ -141,7 +141,7 @@ static void vi_handle_sts(VuDev *dev, int qidx)
g_debug("%s", G_STRFUNC);
for (;;) {
- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
break;
}
diff --git a/contrib/vhost-user-scsi/vhost-user-scsi.c b/contrib/vhost-user-scsi/vhost-user-scsi.c
index 4f6e3e2a24..7564d6ab2d 100644
--- a/contrib/vhost-user-scsi/vhost-user-scsi.c
+++ b/contrib/vhost-user-scsi/vhost-user-scsi.c
@@ -252,7 +252,7 @@ static void vus_proc_req(VuDev *vu_dev, int idx)
VirtIOSCSICmdReq *req;
VirtIOSCSICmdResp *rsp;
- elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
g_debug("No more elements pending on vq[%d]@%p", idx, vq);
break;
diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c
index fab7ca17ee..3b1b5c385f 100644
--- a/subprojects/libvhost-user/libvhost-user.c
+++ b/subprojects/libvhost-user/libvhost-user.c
@@ -2461,7 +2461,8 @@ vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
static bool
virtqueue_map_desc(VuDev *dev,
- unsigned int *p_num_sg, struct iovec *iov,
+ unsigned int *p_num_sg, unsigned int *p_bad_sg,
+ struct iovec *iov,
unsigned int max_num_sg, bool is_write,
uint64_t pa, size_t sz)
{
@@ -2482,10 +2483,35 @@ virtqueue_map_desc(VuDev *dev,
return false;
}
- iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
- if (iov[num_sg].iov_base == NULL) {
- vu_panic(dev, "virtio: invalid address for buffers");
- return false;
+ if (p_bad_sg && *p_bad_sg) {
+ /* A previous mapping was bad, we won't try and map this either */
+ *p_bad_sg = *p_bad_sg + 1;
+ }
+ if (!p_bad_sg || !*p_bad_sg) {
+ /* No bad mappings so far, lets try mapping this one */
+ iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
+ if (iov[num_sg].iov_base == NULL) {
+ /*
+ * OK, it won't map, either panic or if the caller can handle
+ * it, then count it.
+ */
+ if (!p_bad_sg) {
+ vu_panic(dev, "virtio: invalid address for buffers");
+ return false;
+ } else {
+ *p_bad_sg = *p_bad_sg + 1;
+ }
+ }
+ }
+ if (p_bad_sg && *p_bad_sg) {
+ /*
+ * There was a bad mapping, either now or previously, since
+ * the caller set p_bad_sg it means it's prepared to deal with
+ * it, so give it the pa in the iov
+ * Note: In this case len will be the whole sz, so we won't
+ * go around again for this descriptor
+ */
+ iov[num_sg].iov_base = (void *)(uintptr_t)pa;
}
iov[num_sg].iov_len = len;
num_sg++;
@@ -2516,7 +2542,8 @@ virtqueue_alloc_element(size_t sz,
}
static void *
-vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
+vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz,
+ unsigned int *p_bad_in, unsigned int *p_bad_out)
{
struct vring_desc *desc = vq->vring.desc;
uint64_t desc_addr, read_len;
@@ -2560,7 +2587,7 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
/* Collect all the descriptors */
do {
if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
- if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
+ if (!virtqueue_map_desc(dev, &in_num, p_bad_in, iov + out_num,
VIRTQUEUE_MAX_SIZE - out_num, true,
le64toh(desc[i].addr),
le32toh(desc[i].len))) {
@@ -2571,7 +2598,7 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
vu_panic(dev, "Incorrect order for descriptors");
return NULL;
}
- if (!virtqueue_map_desc(dev, &out_num, iov,
+ if (!virtqueue_map_desc(dev, &out_num, p_bad_out, iov,
VIRTQUEUE_MAX_SIZE, false,
le64toh(desc[i].addr),
le32toh(desc[i].len))) {
@@ -2661,7 +2688,8 @@ vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
}
void *
-vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz,
+ unsigned int *p_bad_in, unsigned int *p_bad_out)
{
int i;
unsigned int head;
@@ -2674,7 +2702,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
i = (--vq->resubmit_num);
- elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
+ elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz,
+ p_bad_in, p_bad_out);
if (!vq->resubmit_num) {
free(vq->resubmit_list);
@@ -2706,7 +2735,7 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
vring_set_avail_event(vq, vq->last_avail_idx);
}
- elem = vu_queue_map_desc(dev, vq, head, sz);
+ elem = vu_queue_map_desc(dev, vq, head, sz, p_bad_in, p_bad_out);
if (!elem) {
return NULL;
diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h
index 7d47f1364a..f0aca2b216 100644
--- a/subprojects/libvhost-user/libvhost-user.h
+++ b/subprojects/libvhost-user/libvhost-user.h
@@ -589,11 +589,17 @@ void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq);
* @dev: a VuDev context
* @vq: a VuVirtq queue
* @sz: the size of struct to return (must be >= VuVirtqElement)
+ * @p_bad_in: If non-NULL, a pointer to an integer count of
+ * unmappable regions in input descriptors
+ * @p_bad_out: If non-NULL, a pointer to an integer count of
+ * unmappable regions in output descriptors
+ *
*
* Returns: a VuVirtqElement filled from the queue or NULL. The
* returned element must be free()-d by the caller.
*/
-void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz);
+void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz,
+ unsigned int *p_bad_in, unsigned int *p_bad_out);
/**
diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
index 24815920b2..4f6829e6c3 100644
--- a/tests/vhost-user-bridge.c
+++ b/tests/vhost-user-bridge.c
@@ -184,7 +184,7 @@ vubr_handle_tx(VuDev *dev, int qidx)
unsigned int out_num;
struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg;
- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
break;
}
@@ -299,7 +299,7 @@ vubr_backend_recv_cb(int sock, void *ctx)
ssize_t ret, total = 0;
unsigned int num;
- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement), NULL, NULL);
if (!elem) {
break;
}
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index ddcefee427..bd19358437 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -657,7 +657,8 @@ static void *fv_queue_thread(void *opaque)
__func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
while (1) {
- FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
+ FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest),
+ NULL, NULL);
if (!req) {
break;
}
--
2.25.1

View File

@ -0,0 +1,36 @@
From da5d60ab13c9e31f775b34d7afe6d82fca7f2336 Mon Sep 17 00:00:00 2001
From: Wainer dos Santos Moschetta <wainersm@redhat.com>
Date: Tue, 2 Feb 2021 13:46:24 -0500
Subject: [PATCH] virtiofsd: Allow to build it without the tools
This changes the Meson build script to allow virtiofsd to be built even
when the tools build is disabled, thus honoring the --enable-virtiofsd
option.
(Backport of commit xxxxxx)
Signed-off-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
---
tools/meson.build | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tools/meson.build b/tools/meson.build
index fdce66857d..3e5a0abfa2 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -10,8 +10,11 @@ if get_option('virtiofsd').enabled()
error('virtiofsd requires Linux')
elif not seccomp.found() or not libcap_ng.found()
error('virtiofsd requires libcap-ng-devel and seccomp-devel')
- elif not have_tools or 'CONFIG_VHOST_USER' not in config_host
- error('virtiofsd needs tools and vhost-user support')
+ elif 'CONFIG_VHOST_USER' not in config_host
+ error('virtiofsd needs vhost-user support')
+ else
+ # Disabled all the tools but virtiofsd.
+ have_virtiofsd = true
endif
endif
elif get_option('virtiofsd').disabled() or not have_system
--
2.25.1

View File

@ -0,0 +1,155 @@
From bb506adc3bc3e3c0cad695b3bab126afdc3f0536 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 16 May 2019 15:11:35 +0100
Subject: [PATCH 02/29] virtiofsd: add security guide document
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Many people want to know: what's up with virtiofsd and security? This
document provides the answers!
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
---
docs/tools/index.rst | 1 +
docs/tools/virtiofsd-security.rst | 118 ++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+)
create mode 100644 docs/tools/virtiofsd-security.rst
diff --git a/docs/tools/index.rst b/docs/tools/index.rst
index 3a5829c17a..d5b65f803b 100644
--- a/docs/tools/index.rst
+++ b/docs/tools/index.rst
@@ -17,3 +17,4 @@ Contents:
qemu-trace-stap
virtfs-proxy-helper
virtiofsd
+ virtiofsd-security
diff --git a/docs/tools/virtiofsd-security.rst b/docs/tools/virtiofsd-security.rst
new file mode 100644
index 0000000000..61ce551344
--- /dev/null
+++ b/docs/tools/virtiofsd-security.rst
@@ -0,0 +1,118 @@
+========================
+Virtiofsd Security Guide
+========================
+
+Introduction
+============
+This document covers security topics for users of virtiofsd, the daemon that
+implements host<->guest file system sharing. Sharing files between one or more
+guests and the host raises questions about the trust relationships between
+these entities. By understanding these topics users can safely deploy
+virtiofsd and control access to their data.
+
+Architecture
+============
+The virtiofsd daemon process acts as a vhost-user device backend, implementing
+the virtio-fs device that the corresponding device driver inside the guest
+interacts with.
+
+There is one virtiofsd process per virtio-fs device instance. For example,
+when two guests have access to the same shared directory there are still two
+virtiofsd processes since there are two virtio-fs device instances. Similarly,
+if one guest has access to two shared directories, there are two virtiofsd
+processes since there are two virtio-fs device instances.
+
+Files are created on the host with uid/gid values provided by the guest.
+Furthermore, virtiofsd is unable to enforce file permissions since guests have
+the ability to access any file within the shared directory. File permissions
+are implemented in the guest, just like with traditional local file systems.
+
+Security Requirements
+=====================
+Guests have root access to the shared directory. This is necessary for root
+file systems on virtio-fs and similar use cases.
+
+When multiple guests have access to the same shared directory, the guests have
+a trust relationship. A broken or malicious guest could delete or corrupt
+files. It could exploit symlink or time-of-check-to-time-of-use (TOCTOU) race
+conditions against applications in other guests. It could plant device nodes
+or setuid executables to gain privileges in other guests. It could perform
+denial-of-service (DoS) attacks by consuming available space or making the file
+system unavailable to other guests.
+
+Guests are restricted to the shared directory and cannot access other files on
+the host.
+
+Guests should not be able to gain arbitrary code execution inside the virtiofsd
+process. If they do, the process is sandboxed to prevent escaping into other
+parts of the host.
+
+Daemon Sandboxing
+=================
+The virtiofsd process handles virtio-fs FUSE requests from the untrusted guest.
+This attack surface could give the guest access to host resources and must
+therefore be protected. Sandboxing mechanisms are integrated into virtiofsd to
+reduce the impact in the event that an attacker gains control of the process.
+
+As a general rule, virtiofsd does not trust inputs from the guest, aside from
+uid/gid values. Input validation is performed so that the guest cannot corrupt
+memory or otherwise gain arbitrary code execution in the virtiofsd process.
+
+Sandboxing adds restrictions on the virtiofsd so that even if an attacker is
+able to exploit a bug, they will be constrained to the virtiofsd process and
+unable to cause damage on the host.
+
+Seccomp Whitelist
+-----------------
+Many system calls are not required by virtiofsd to perform its function. For
+example, ptrace(2) and execve(2) are not necessary and attackers are likely to
+use them to further compromise the system. This is prevented using a seccomp
+whitelist in virtiofsd.
+
+During startup virtiofsd installs a whitelist of allowed system calls. All
+other system calls are forbidden for the remaining lifetime of the process.
+This list has been built through experience of running virtiofsd on several
+flavors of Linux and observing which system calls were encountered.
+
+It is possible that previously unexplored code paths or newer library versions
+will invoke system calls that have not been whitelisted yet. In this case the
+process terminates and a seccomp error is captured in the audit log. The log
+can typically be viewed using ``journalctl -xe`` and searching for ``SECCOMP``.
+
+Should it be necessary to extend the whitelist, system call numbers from the
+audit log can be translated to names through a CPU architecture-specific
+``.tbl`` file in the Linux source tree. They can then be added to the
+whitelist in ``seccomp.c`` in the virtiofsd source tree.
+
+Mount Namespace
+---------------
+During startup virtiofsd enters a new mount namespace and releases all mounts
+except for the shared directory. This makes the file system root `/` the
+shared directory. It is impossible to access files outside the shared
+directory since they cannot be looked up by path resolution.
+
+Several attacks, including `..` traversal and symlink escapes, are prevented by
+the mount namespace.
+
+The current virtiofsd implementation keeps a directory file descriptor to
+/proc/self/fd open in order to implement several FUSE requests. This file
+descriptor could be used by attackers to access files outside the shared
+directory. This limitation will be addressed in a future release of virtiofsd.
+
+Other Namespaces
+----------------
+Virtiofsd enters new pid and network namespaces during startup. The pid
+namespace prevents the process from seeing other processes running on the host.
+The network namespace removes network connectivity from the process.
+
+Deployment Best Practices
+=========================
+The shared directory should be a separate file system so that untrusted guests
+cannot cause a denial-of-service by using up all available inodes or exhausting
+free space.
+
+If the shared directory is also accessible from a host mount namespace, it is
+recommended to keep a parent directory with rwx------ permissions so that other
+users on the host are unable to access any setuid executables or device nodes
+in the shared directory. The `nosuid` and `nodev` mount options can also be
+used to prevent this issue.
--
2.25.1

View File

@ -0,0 +1,110 @@
From 800ce0d08e09320ac2f1bd9125cb07d14a2689fe Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Thu, 7 Feb 2019 18:39:31 +0000
Subject: [PATCH 03/29] DAX contrib/libvhost-user: Add virtio-fs slave types
Add virtio-fs definitions to libvhost-user
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
subprojects/libvhost-user/libvhost-user.c | 32 +++++++++++++++++++++++
subprojects/libvhost-user/libvhost-user.h | 31 ++++++++++++++++++++++
2 files changed, 63 insertions(+)
diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c
index 3b1b5c385f..9b8223b5d5 100644
--- a/subprojects/libvhost-user/libvhost-user.c
+++ b/subprojects/libvhost-user/libvhost-user.c
@@ -2910,3 +2910,35 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
vu_queue_flush(dev, vq, 1);
vu_queue_inflight_post_put(dev, vq, elem->index);
}
+
+bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
+ VhostUserFSSlaveMsg *fsm)
+{
+ int fd_num = 0;
+ VhostUserMsg vmsg = {
+ .request = req,
+ .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+ .size = sizeof(vmsg.payload.fs),
+ .payload.fs = *fsm,
+ };
+
+ if (fd != -1) {
+ vmsg.fds[fd_num++] = fd;
+ }
+
+ vmsg.fd_num = fd_num;
+
+ if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) {
+ return false;
+ }
+
+ pthread_mutex_lock(&dev->slave_mutex);
+ if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
+ pthread_mutex_unlock(&dev->slave_mutex);
+ return false;
+ }
+
+ /* Also unlocks the slave_mutex */
+ return vu_process_message_reply(dev, &vmsg);
+}
+
diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h
index f0aca2b216..f3b0998eea 100644
--- a/subprojects/libvhost-user/libvhost-user.h
+++ b/subprojects/libvhost-user/libvhost-user.h
@@ -122,6 +122,24 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
+/* Structures carried over the slave channel back to QEMU */
+#define VHOST_USER_FS_SLAVE_ENTRIES 8
+
+/* For the flags field of VhostUserFSSlaveMsg */
+#define VHOST_USER_FS_FLAG_MAP_R (1ull << 0)
+#define VHOST_USER_FS_FLAG_MAP_W (1ull << 1)
+
+typedef struct {
+ /* Offsets within the file being mapped */
+ uint64_t fd_offset[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Offsets within the cache */
+ uint64_t c_offset[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Lengths of sections */
+ uint64_t len[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Flags, from VHOST_USER_FS_FLAG_* */
+ uint64_t flags[VHOST_USER_FS_SLAVE_ENTRIES];
+} VhostUserFSSlaveMsg;
+
typedef struct VhostUserMemoryRegion {
uint64_t guest_phys_addr;
uint64_t memory_size;
@@ -197,6 +215,7 @@ typedef struct VhostUserMsg {
VhostUserConfig config;
VhostUserVringArea area;
VhostUserInflight inflight;
+ VhostUserFSSlaveMsg fs;
} payload;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
@@ -693,4 +712,16 @@ void vu_queue_get_avail_bytes(VuDev *vdev, VuVirtq *vq, unsigned int *in_bytes,
bool vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
unsigned int out_bytes);
+/**
+ * vu_fs_cache_request: Send a slave message for an fs client
+ * @dev: a VuDev context
+ * @req: The request type (map, unmap, sync)
+ * @fd: an fd (only required for map, else must be -1)
+ * @fsm: The body of the message
+ *
+ * Returns: true if the reply was 0
+ */
+bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
+ VhostUserFSSlaveMsg *fsm);
+
#endif /* LIBVHOST_USER_H */
--
2.25.1

View File

@ -0,0 +1,65 @@
From 71c89288b97c92ecb3a67ca8aa73619719dcfe9e Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 27 Jul 2018 12:38:03 +0100
Subject: [PATCH 04/29] DAX: virtio: Add shared memory capability
Define a new capability type 'VIRTIO_PCI_CAP_SHARED_MEMORY_CFG'
and the data structure 'virtio_pci_cap64' to go with it.
They allow defining shared memory regions with sizes and offsets
of 2^32 and more.
Multiple instances of the capability are allowed and distinguished
by the 'id' field in the base capability.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
hw/virtio/virtio-pci.c | 20 ++++++++++++++++++++
hw/virtio/virtio-pci.h | 4 ++++
2 files changed, 24 insertions(+)
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index f863f69ede..f17ea5a6e8 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1136,6 +1136,26 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
return offset;
}
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
+ uint8_t bar, uint64_t offset, uint64_t length,
+ uint8_t id)
+{
+ struct virtio_pci_cap64 cap = {
+ .cap.cap_len = sizeof cap,
+ .cap.cfg_type = VIRTIO_PCI_CAP_SHARED_MEMORY_CFG,
+ };
+ uint32_t mask32 = ~0;
+
+ cap.cap.bar = bar;
+ cap.cap.id = id;
+ cap.cap.length = cpu_to_le32(length & mask32);
+ cap.length_hi = cpu_to_le32((length >> 32) & mask32);
+ cap.cap.offset = cpu_to_le32(offset & mask32);
+ cap.offset_hi = cpu_to_le32((offset >> 32) & mask32);
+
+ return virtio_pci_add_mem_cap(proxy, &cap.cap);
+}
+
static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
unsigned size)
{
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index d7d5d403a9..31ca339099 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -247,4 +247,8 @@ void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t);
*/
unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues);
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
+ uint8_t bar, uint64_t offset, uint64_t length,
+ uint8_t id);
+
#endif
--
2.25.1

View File

@ -0,0 +1,171 @@
From 3996e9086ddd591494f9cb7f0eb7048a1b52200c Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 3 Jul 2018 16:33:52 +0100
Subject: [PATCH 05/29] DAX: virtio-fs: Add cache BAR
Add a cache BAR into which files will be directly mapped.
The size can be set with the cache-size= property, e.g.
-device vhost-user-fs-pci,chardev=char0,tag=myfs,cache-size=16G
The default is no cache.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
with PPC fixes by:
Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
hw/virtio/vhost-user-fs-pci.c | 23 ++++++++++++++++
hw/virtio/vhost-user-fs.c | 32 ++++++++++++++++++++++
include/hw/virtio/vhost-user-fs.h | 2 ++
include/standard-headers/linux/virtio_fs.h | 2 ++
4 files changed, 59 insertions(+)
diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c
index 8bb389bd28..19aaa8d722 100644
--- a/hw/virtio/vhost-user-fs-pci.c
+++ b/hw/virtio/vhost-user-fs-pci.c
@@ -16,10 +16,12 @@
#include "hw/virtio/vhost-user-fs.h"
#include "virtio-pci.h"
#include "qom/object.h"
+#include "standard-headers/linux/virtio_fs.h"
struct VHostUserFSPCI {
VirtIOPCIProxy parent_obj;
VHostUserFS vdev;
+ MemoryRegion cachebar;
};
typedef struct VHostUserFSPCI VHostUserFSPCI;
@@ -39,6 +41,7 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
{
VHostUserFSPCI *dev = VHOST_USER_FS_PCI(vpci_dev);
DeviceState *vdev = DEVICE(&dev->vdev);
+ uint64_t cachesize;
if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
/* Also reserve config change and hiprio queue vectors */
@@ -46,6 +49,26 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
}
qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+ cachesize = dev->vdev.conf.cache_size;
+
+ /*
+ * The bar starts with the data/DAX cache
+ * Others will be added later.
+ */
+ memory_region_init(&dev->cachebar, OBJECT(vpci_dev),
+ "vhost-fs-pci-cachebar", cachesize);
+ if (cachesize) {
+ memory_region_add_subregion(&dev->cachebar, 0, &dev->vdev.cache);
+ virtio_pci_add_shm_cap(vpci_dev, VIRTIO_FS_PCI_CACHE_BAR, 0, cachesize,
+ VIRTIO_FS_SHMCAP_ID_CACHE);
+ }
+
+ /* After 'realized' so the memory region exists */
+ pci_register_bar(&vpci_dev->pci_dev, VIRTIO_FS_PCI_CACHE_BAR,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &dev->cachebar);
}
static void vhost_user_fs_pci_class_init(ObjectClass *klass, void *data)
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index ed036ad9c1..d111bf2af3 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -23,6 +23,16 @@
#include "hw/virtio/vhost-user-fs.h"
#include "monitor/monitor.h"
+/*
+ * The powerpc kernel code expects the memory to be accessible during
+ * addition/removal.
+ */
+#if defined(TARGET_PPC64) && defined(CONFIG_LINUX)
+#define DAX_WINDOW_PROT PROT_READ
+#else
+#define DAX_WINDOW_PROT PROT_NONE
+#endif
+
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
{
VHostUserFS *fs = VHOST_USER_FS(vdev);
@@ -162,6 +172,7 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VHostUserFS *fs = VHOST_USER_FS(dev);
+ void *cache_ptr;
unsigned int i;
size_t len;
int ret;
@@ -201,6 +212,26 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
VIRTQUEUE_MAX_SIZE);
return;
}
+ if (fs->conf.cache_size &&
+ (!is_power_of_2(fs->conf.cache_size) ||
+ fs->conf.cache_size < sysconf(_SC_PAGESIZE))) {
+ error_setg(errp, "cache-size property must be a power of 2 "
+ "no smaller than the page size");
+ return;
+ }
+ if (fs->conf.cache_size) {
+ /* Anonymous, private memory is not counted as overcommit */
+ cache_ptr = mmap(NULL, fs->conf.cache_size, DAX_WINDOW_PROT,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (cache_ptr == MAP_FAILED) {
+ error_setg(errp, "Unable to mmap blank cache");
+ return;
+ }
+
+ memory_region_init_ram_ptr(&fs->cache, OBJECT(vdev),
+ "virtio-fs-cache",
+ fs->conf.cache_size, cache_ptr);
+ }
if (!vhost_user_init(&fs->vhost_user, &fs->conf.chardev, errp)) {
return;
@@ -276,6 +307,7 @@ static Property vuf_properties[] = {
DEFINE_PROP_UINT16("num-request-queues", VHostUserFS,
conf.num_request_queues, 1),
DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128),
+ DEFINE_PROP_SIZE("cache-size", VHostUserFS, conf.cache_size, 0),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h
index 6985752771..df6bf2a926 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -28,6 +28,7 @@ typedef struct {
char *tag;
uint16_t num_request_queues;
uint16_t queue_size;
+ uint64_t cache_size;
} VHostUserFSConf;
struct VHostUserFS {
@@ -41,6 +42,7 @@ struct VHostUserFS {
VirtQueue *hiprio_vq;
/*< public >*/
+ MemoryRegion cache;
};
#endif /* _QEMU_VHOST_USER_FS_H */
diff --git a/include/standard-headers/linux/virtio_fs.h b/include/standard-headers/linux/virtio_fs.h
index a32fe8a64c..808aa3a402 100644
--- a/include/standard-headers/linux/virtio_fs.h
+++ b/include/standard-headers/linux/virtio_fs.h
@@ -19,4 +19,6 @@ struct virtio_fs_config {
/* For the id field in virtio_pci_shm_cap */
#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+#define VIRTIO_FS_PCI_CACHE_BAR 2
+
#endif /* _LINUX_VIRTIO_FS_H */
--
2.25.1

View File

@ -0,0 +1,191 @@
From 27ccc5e4aecbffd590199bae897a8359889fd54d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 4 Jul 2018 18:51:42 +0100
Subject: [PATCH 06/29] DAX: virtio-fs: Add vhost-user slave commands for
mapping
The daemon may request that fd's be mapped into the virtio-fs cache
visible to the guest.
These mappings are triggered by commands sent over the slave fd
from the daemon.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
docs/interop/vhost-user.rst | 23 ++++++++++++++++++++++
hw/virtio/vhost-user-fs.c | 19 ++++++++++++++++++
hw/virtio/vhost-user.c | 18 +++++++++++++++++
include/hw/virtio/vhost-user-fs.h | 24 +++++++++++++++++++++++
subprojects/libvhost-user/libvhost-user.h | 3 +++
5 files changed, 87 insertions(+)
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index d6085f7045..056f94c6fb 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1432,6 +1432,29 @@ Slave message types
The state.num field is currently reserved and must be set to 0.
+``VHOST_USER_SLAVE_FS_MAP``
+ :id: 6
+ :equivalent ioctl: N/A
+ :slave payload: fd + n * (offset + address + len)
+ :master payload: N/A
+
+ Requests that the QEMU mmap the given fd into the virtio-fs cache;
+ multiple chunks can be mapped in one command.
+ A reply is generated indicating whether mapping succeeded.
+
+``VHOST_USER_SLAVE_FS_UNMAP``
+ :id: 7
+ :equivalent ioctl: N/A
+ :slave payload: n * (address + len)
+ :master payload: N/A
+
+ Requests that the QEMU un-mmap the given range in the virtio-fs cache;
+ multiple chunks can be unmapped in one command.
+ A reply is generated indicating whether unmapping succeeded.
+
+``VHOST_USER_SLAVE_FS_SYNC``
+ [Semantic details TBD]
+
.. _reply_ack:
VHOST_USER_PROTOCOL_F_REPLY_ACK
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index d111bf2af3..9c35fdbeab 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -33,6 +33,25 @@
#define DAX_WINDOW_PROT PROT_NONE
#endif
+int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+ int fd)
+{
+ /* TODO */
+ return -1;
+}
+
+int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
+{
+ /* TODO */
+ return -1;
+}
+
+int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
+{
+ /* TODO */
+ return -1;
+}
+
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
{
VHostUserFS *fs = VHOST_USER_FS(vdev);
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 2fdd5daf74..757dee0d1e 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -12,6 +12,7 @@
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-user.h"
+#include "hw/virtio/vhost-user-fs.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-net.h"
@@ -132,6 +133,11 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_IOTLB_MSG = 1,
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_VRING_CALL = 4,
+ VHOST_USER_SLAVE_VRING_ERR = 5,
+ VHOST_USER_SLAVE_FS_MAP = 6,
+ VHOST_USER_SLAVE_FS_UNMAP = 7,
+ VHOST_USER_SLAVE_FS_SYNC = 8,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
@@ -218,6 +224,7 @@ typedef union {
VhostUserCryptoSession session;
VhostUserVringArea area;
VhostUserInflight inflight;
+ VhostUserFSSlaveMsg fs;
} VhostUserPayload;
typedef struct VhostUserMsg {
@@ -1470,6 +1477,17 @@ static void slave_read(void *opaque)
ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
fd[0]);
break;
+#ifdef CONFIG_VHOST_USER_FS
+ case VHOST_USER_SLAVE_FS_MAP:
+ ret = vhost_user_fs_slave_map(dev, &payload.fs, fd[0]);
+ break;
+ case VHOST_USER_SLAVE_FS_UNMAP:
+ ret = vhost_user_fs_slave_unmap(dev, &payload.fs);
+ break;
+ case VHOST_USER_SLAVE_FS_SYNC:
+ ret = vhost_user_fs_slave_sync(dev, &payload.fs);
+ break;
+#endif
default:
error_report("Received unexpected msg type: %d.", hdr.request);
ret = -EINVAL;
diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h
index df6bf2a926..69cc6340ed 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -23,6 +23,24 @@
#define TYPE_VHOST_USER_FS "vhost-user-fs-device"
OBJECT_DECLARE_SIMPLE_TYPE(VHostUserFS, VHOST_USER_FS)
+/* Structures carried over the slave channel back to QEMU */
+#define VHOST_USER_FS_SLAVE_ENTRIES 8
+
+/* For the flags field of VhostUserFSSlaveMsg */
+#define VHOST_USER_FS_FLAG_MAP_R (1ull << 0)
+#define VHOST_USER_FS_FLAG_MAP_W (1ull << 1)
+
+typedef struct {
+ /* Offsets within the file being mapped */
+ uint64_t fd_offset[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Offsets within the cache */
+ uint64_t c_offset[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Lengths of sections */
+ uint64_t len[VHOST_USER_FS_SLAVE_ENTRIES];
+ /* Flags, from VHOST_USER_FS_FLAG_* */
+ uint64_t flags[VHOST_USER_FS_SLAVE_ENTRIES];
+} VhostUserFSSlaveMsg;
+
typedef struct {
CharBackend chardev;
char *tag;
@@ -45,4 +63,10 @@ struct VHostUserFS {
MemoryRegion cache;
};
+/* Callbacks from the vhost-user code for slave commands */
+int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+ int fd);
+int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
+int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
+
#endif /* _QEMU_VHOST_USER_FS_H */
diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h
index f3b0998eea..c63a590069 100644
--- a/subprojects/libvhost-user/libvhost-user.h
+++ b/subprojects/libvhost-user/libvhost-user.h
@@ -119,6 +119,9 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
VHOST_USER_SLAVE_VRING_CALL = 4,
VHOST_USER_SLAVE_VRING_ERR = 5,
+ VHOST_USER_SLAVE_FS_MAP = 6,
+ VHOST_USER_SLAVE_FS_UNMAP = 7,
+ VHOST_USER_SLAVE_FS_SYNC = 8,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
--
2.25.1

View File

@ -0,0 +1,98 @@
From 3de89ce9fb5eda46f7cefd70e9090cb7cd7ec803 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Wed, 28 Mar 2018 20:14:53 +0800
Subject: [PATCH 1/2] 9p: removing coroutines of 9p to increase the I/O
performance
This is a quick workaround; it needs to be fixed.
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
hw/9pfs/9p.c | 12 +++++-------
hw/9pfs/9p.h | 6 +++---
hw/9pfs/coth.h | 3 +++
3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 9e046f7acb..11c8ee08d9 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -1082,10 +1082,7 @@ static void coroutine_fn pdu_complete(V9fsPDU *pdu, ssize_t len)
out_notify:
pdu->s->transport->push_and_notify(pdu);
- /* Now wakeup anybody waiting in flush for this request */
- if (!qemu_co_queue_next(&pdu->complete)) {
- pdu_free(pdu);
- }
+ pdu_free(pdu);
}
static mode_t v9mode_to_mode(uint32_t mode, V9fsString *extension)
@@ -3997,7 +3994,7 @@ static inline bool is_read_only_op(V9fsPDU *pdu)
void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
{
- Coroutine *co;
+// Coroutine *co;
CoroutineEntry *handler;
V9fsState *s = pdu->s;
@@ -4015,8 +4012,9 @@ void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
}
qemu_co_queue_init(&pdu->complete);
- co = qemu_coroutine_create(handler, pdu);
- qemu_coroutine_enter(co);
+ handler(pdu);
+ //co = qemu_coroutine_create(handler, pdu);
+ //qemu_coroutine_enter(co);
}
/* Returns 0 on success, 1 on failure. */
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index b8f72a3bd9..d16bf9d05e 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -391,21 +391,21 @@ extern int total_open_fd;
static inline void v9fs_path_write_lock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_wrlock(&s->rename_lock);
+ // qemu_co_rwlock_wrlock(&s->rename_lock);
}
}
static inline void v9fs_path_read_lock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_rdlock(&s->rename_lock);
+ // qemu_co_rwlock_rdlock(&s->rename_lock);
}
}
static inline void v9fs_path_unlock(V9fsState *s)
{
if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
- qemu_co_rwlock_unlock(&s->rename_lock);
+ // qemu_co_rwlock_unlock(&s->rename_lock);
}
}
diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index c2cdc7a9ea..0fe971d1f5 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -46,6 +46,9 @@
qemu_coroutine_yield(); \
} while (0)
+#undef v9fs_co_run_in_worker
+#define v9fs_co_run_in_worker(code_block) do {code_block} while(0);
+
void co_run_in_worker_bh(void *);
int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
--
2.21.0

View File

@ -0,0 +1,196 @@
From a0d09868a25b9b15b8ef49402b035597ef889f85 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 4 Jul 2018 20:01:51 +0100
Subject: [PATCH 07/29] DAX: virtio-fs: Fill in slave commands for mapping
Fill in definitions for map, unmap and sync commands.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
with fix by misono.tomohiro@fujitsu.com
---
hw/virtio/vhost-user-fs.c | 161 ++++++++++++++++++++++++++++++++++++--
1 file changed, 155 insertions(+), 6 deletions(-)
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index 9c35fdbeab..98cec993f7 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -36,20 +36,169 @@
int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
int fd)
{
- /* TODO */
- return -1;
+ VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
+ if (!fs) {
+ /* Shouldn't happen - but seen on error path */
+ fprintf(stderr, "%s: Bad fs ptr\n", __func__);
+ return -1;
+ }
+ size_t cache_size = fs->conf.cache_size;
+ if (!cache_size) {
+ fprintf(stderr, "%s: map when DAX cache not present\n", __func__);
+ return -1;
+ }
+ void *cache_host = memory_region_get_ram_ptr(&fs->cache);
+
+ unsigned int i;
+ int res = 0;
+
+ if (fd < 0) {
+ fprintf(stderr, "%s: Bad fd for map\n", __func__);
+ return -1;
+ }
+
+ for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES; i++) {
+ if (sm->len[i] == 0) {
+ continue;
+ }
+
+ if ((sm->c_offset[i] + sm->len[i]) < sm->len[i] ||
+ (sm->c_offset[i] + sm->len[i]) > cache_size) {
+ fprintf(stderr, "%s: Bad offset/len for map [%d] %"
+ PRIx64 "+%" PRIx64 "\n", __func__,
+ i, sm->c_offset[i], sm->len[i]);
+ res = -1;
+ break;
+ }
+
+ if (mmap(cache_host + sm->c_offset[i], sm->len[i],
+ ((sm->flags[i] & VHOST_USER_FS_FLAG_MAP_R) ? PROT_READ : 0) |
+ ((sm->flags[i] & VHOST_USER_FS_FLAG_MAP_W) ? PROT_WRITE : 0),
+ MAP_SHARED | MAP_FIXED,
+ fd, sm->fd_offset[i]) != (cache_host + sm->c_offset[i])) {
+ fprintf(stderr, "%s: map failed err %d [%d] %"
+ PRIx64 "+%" PRIx64 " from %" PRIx64 "\n", __func__,
+ errno, i, sm->c_offset[i], sm->len[i],
+ sm->fd_offset[i]);
+ res = -1;
+ break;
+ }
+ }
+
+ if (res) {
+ /* Something went wrong, unmap them all */
+ vhost_user_fs_slave_unmap(dev, sm);
+ }
+ return res;
}
int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
{
- /* TODO */
- return -1;
+ VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
+ if (!fs) {
+ /* Shouldn't happen - but seen on error path */
+ fprintf(stderr, "%s: Bad fs ptr\n", __func__);
+ return -1;
+ }
+ size_t cache_size = fs->conf.cache_size;
+ if (!cache_size) {
+ /*
+ * Since dax cache is disabled, there should be no unmap request.
+ * However, we still receive a whole-range unmap request during umount
+ * for cleanup. Ignore it.
+ */
+ if (sm->len[0] == ~(uint64_t)0) {
+ return 0;
+ }
+
+ fprintf(stderr, "%s: unmap when DAX cache not present\n", __func__);
+ return -1;
+ }
+ void *cache_host = memory_region_get_ram_ptr(&fs->cache);
+
+ unsigned int i;
+ int res = 0;
+
+ /*
+ * Note even if one unmap fails we try the rest, since the effect
+ * is to clean up as much as possible.
+ */
+ for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES; i++) {
+ void *ptr;
+ if (sm->len[i] == 0) {
+ continue;
+ }
+
+ if (sm->len[i] == ~(uint64_t)0) {
+ /* Special case meaning the whole arena */
+ sm->len[i] = cache_size;
+ }
+
+ if ((sm->c_offset[i] + sm->len[i]) < sm->len[i] ||
+ (sm->c_offset[i] + sm->len[i]) > cache_size) {
+ fprintf(stderr, "%s: Bad offset/len for unmap [%d] %"
+ PRIx64 "+%" PRIx64 "\n", __func__,
+ i, sm->c_offset[i], sm->len[i]);
+ res = -1;
+ continue;
+ }
+
+ ptr = mmap(cache_host + sm->c_offset[i], sm->len[i], DAX_WINDOW_PROT,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (ptr != (cache_host + sm->c_offset[i])) {
+ fprintf(stderr, "%s: mmap failed (%s) [%d] %"
+ PRIx64 "+%" PRIx64 " from %" PRIx64 " res: %p\n",
+ __func__,
+ strerror(errno),
+ i, sm->c_offset[i], sm->len[i],
+ sm->fd_offset[i], ptr);
+ res = -1;
+ }
+ }
+
+ return res;
}
int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
{
- /* TODO */
- return -1;
+ VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
+ size_t cache_size = fs->conf.cache_size;
+ if (!cache_size) {
+ fprintf(stderr, "%s: sync when DAX cache not present\n", __func__);
+ return -1;
+ }
+ void *cache_host = memory_region_get_ram_ptr(&fs->cache);
+
+ unsigned int i;
+ int res = 0;
+
+ /* Note even if one sync fails we try the rest */
+ for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES; i++) {
+ if (sm->len[i] == 0) {
+ continue;
+ }
+
+ if ((sm->c_offset[i] + sm->len[i]) < sm->len[i] ||
+ (sm->c_offset[i] + sm->len[i]) > cache_size) {
+ fprintf(stderr, "%s: Bad offset/len for sync [%d] %"
+ PRIx64 "+%" PRIx64 "\n", __func__,
+ i, sm->c_offset[i], sm->len[i]);
+ res = -1;
+ continue;
+ }
+
+ if (msync(cache_host + sm->c_offset[i], sm->len[i],
+ MS_SYNC /* ?? */)) {
+ fprintf(stderr, "%s: msync failed (%s) [%d] %"
+ PRIx64 "+%" PRIx64 " from %" PRIx64 "\n", __func__,
+ strerror(errno),
+ i, sm->c_offset[i], sm->len[i],
+ sm->fd_offset[i]);
+ res = -1;
+ }
+ }
+
+ return res;
}
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
--
2.25.1

View File

@ -0,0 +1,99 @@
From b341b9541023b0a9f0a315ef24e81522b273e552 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Thu, 5 Jul 2018 18:20:34 +0100
Subject: [PATCH 08/29] DAX: virtiofsd Add cache accessor functions
Add low level functions that the clients can use to map/unmap/sync cache
areas.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/fuse_lowlevel.h | 31 +++++++++++++++++++++++++++++++
tools/virtiofsd/fuse_virtio.c | 27 +++++++++++++++++++++++++++
2 files changed, 58 insertions(+)
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 0e10a14bc9..b0d111bcb2 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -29,6 +29,8 @@
#include <sys/uio.h>
#include <utime.h>
+#include "subprojects/libvhost-user/libvhost-user.h"
+
/*
* Miscellaneous definitions
*/
@@ -1970,4 +1972,33 @@ void fuse_session_process_buf(struct fuse_session *se,
*/
int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf);
+/**
+ * For use with virtio-fs; request an fd be mapped into the cache
+ *
+ * @param req The request that triggered this action
+ * @param msg A set of mapping requests
+ * @param fd The fd to map
+ * @return Zero on success
+ */
+int fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd);
+
+/**
+ * For use with virtio-fs; request unmapping of part of the cache
+ *
+ * @param se The session this request is on
+ * @param msg A set of unmapping requests
+ * @return Zero on success
+ */
+int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg);
+
+/**
+ * For use with virtio-fs; request synchronisation of part of the cache
+ * [Semantics TBD]
+ *
+ * @param req The request that triggered this action
+ * @param msg A set of syncing requests
+ * @return Zero on success
+ */
+int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg);
+
#endif /* FUSE_LOWLEVEL_H_ */
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index bd19358437..24d9323665 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -1044,3 +1044,30 @@ void virtio_session_close(struct fuse_session *se)
free(se->virtio_dev);
se->virtio_dev = NULL;
}
+
+int fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd)
+{
+ if (!req->se->virtio_dev) {
+ return -ENODEV;
+ }
+ return !vu_fs_cache_request(&req->se->virtio_dev->dev,
+ VHOST_USER_SLAVE_FS_MAP, fd, msg);
+}
+
+int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg)
+{
+ if (!se->virtio_dev) {
+ return -ENODEV;
+ }
+ return !vu_fs_cache_request(&se->virtio_dev->dev, VHOST_USER_SLAVE_FS_UNMAP,
+ -1, msg);
+}
+
+int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg)
+{
+ if (!req->se->virtio_dev) {
+ return -ENODEV;
+ }
+ return !vu_fs_cache_request(&req->se->virtio_dev->dev,
+ VHOST_USER_SLAVE_FS_SYNC, -1, msg);
+}
--
2.25.1

View File

@ -0,0 +1,152 @@
From c3273cefbec6f5637189ad1cb9a8b7722cc01294 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 6 Jul 2018 18:03:49 +0100
Subject: [PATCH 09/29] DAX: virtiofsd: Add setup/remove mappings fuse commands
Add commands so that the guest kernel can ask the daemon to map file
sections into a guest kernel visible cache.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
---
tools/virtiofsd/fuse_lowlevel.c | 67 +++++++++++++++++++++++++++++++++
tools/virtiofsd/fuse_lowlevel.h | 23 ++++++++++-
2 files changed, 89 insertions(+), 1 deletion(-)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index e94b71110b..1c3790130a 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1868,6 +1868,71 @@ static void do_lseek(fuse_req_t req, fuse_ino_t nodeid,
}
}
+static void do_setupmapping(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_setupmapping_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ /*
+ * TODO: Need to come up with a better definition of flags here; it can't
+ * be the kernel view of the flags, since that's abstracted from the client;
+ * similarly, it's not the vhost-user set.
+ * For now just use O_ flags.
+ */
+ uint64_t genflags;
+
+ genflags = O_RDONLY;
+ if (arg->flags & FUSE_SETUPMAPPING_FLAG_WRITE) {
+ genflags = O_RDWR;
+ }
+
+ if (req->se->op.setupmapping) {
+ req->se->op.setupmapping(req, nodeid, arg->foffset, arg->len,
+ arg->moffset, genflags, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_removemapping(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_removemapping_in *arg;
+ struct fuse_removemapping_one *one;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ one = fuse_mbuf_iter_advance(iter, sizeof(*one));
+ if (!one) {
+ fuse_log(
+ FUSE_LOG_ERR,
+ "do_removemapping: invalid in, expected %d * %ld, has %ld - %ld\n",
+ arg->count, sizeof(*one), iter->size, iter->pos);
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.removemapping) {
+ req->se->op.removemapping(req, req->se, nodeid, arg->count, one);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
static void do_init(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_mbuf_iter *iter)
{
@@ -2258,6 +2323,8 @@ static struct {
[FUSE_RENAME2] = { do_rename2, "RENAME2" },
[FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" },
[FUSE_LSEEK] = { do_lseek, "LSEEK" },
+ [FUSE_SETUPMAPPING] = { do_setupmapping, "SETUPMAPPING" },
+ [FUSE_REMOVEMAPPING] = { do_removemapping, "REMOVEMAPPING" },
};
#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0]))
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index b0d111bcb2..2851840cc2 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -24,6 +24,7 @@
#endif
#include "fuse_common.h"
+#include "standard-headers/linux/fuse.h"
#include <sys/statvfs.h>
#include <sys/uio.h>
@@ -1170,7 +1171,6 @@ struct fuse_lowlevel_ops {
*/
void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
struct fuse_file_info *fi);
-
/**
* Copy a range of data from one file to another
*
@@ -1226,6 +1226,27 @@ struct fuse_lowlevel_ops {
*/
void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
struct fuse_file_info *fi);
+
+ /*
+ * Map file sections into kernel visible cache
+ *
+ * Map a section of the file into address space visible to the kernel
+ * mounting the filesystem.
+ * TODO
+ */
+ void (*setupmapping)(fuse_req_t req, fuse_ino_t ino, uint64_t foffset,
+ uint64_t len, uint64_t moffset, uint64_t flags,
+ struct fuse_file_info *fi);
+
+ /*
+ * Unmap file sections in kernel visible cache
+ *
+ * Unmap sections previously mapped by setupmapping
+ * TODO
+ */
+ void (*removemapping)(fuse_req_t req, struct fuse_session *se,
+ fuse_ino_t ino, unsigned num,
+ struct fuse_removemapping_one *argp);
};
/**
--
2.25.1

View File

@ -0,0 +1,50 @@
From 7029506e6b23fc15f2b7c4a6a62aa3a0ee58fb02 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 6 Jul 2018 19:52:49 +0100
Subject: [PATCH 10/29] DAX: virtiofsd: Add setup/remove mapping handlers to
passthrough_ll
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/passthrough_ll.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 5fb36d9407..784bdcff34 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2891,6 +2891,22 @@ static void lo_destroy(void *userdata)
pthread_mutex_unlock(&lo->mutex);
}
+static void lo_setupmapping(fuse_req_t req, fuse_ino_t ino, uint64_t foffset,
+ uint64_t len, uint64_t moffset, uint64_t flags,
+ struct fuse_file_info *fi)
+{
+ // TODO
+ fuse_reply_err(req, ENOSYS);
+}
+
+static void lo_removemapping(fuse_req_t req, struct fuse_session *se,
+ fuse_ino_t ino, unsigned num,
+ struct fuse_removemapping_one *argp)
+{
+ // TODO
+ fuse_reply_err(req, ENOSYS);
+}
+
static struct fuse_lowlevel_ops lo_oper = {
.init = lo_init,
.lookup = lo_lookup,
@@ -2932,6 +2948,8 @@ static struct fuse_lowlevel_ops lo_oper = {
#endif
.lseek = lo_lseek,
.destroy = lo_destroy,
+ .setupmapping = lo_setupmapping,
+ .removemapping = lo_removemapping,
};
/* Print vhost-user.json backend program capabilities */
--
2.25.1

View File

@ -0,0 +1,53 @@
From 15fb0e84e38c2681e855e69b58414ba831b399bf Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 9 Jul 2018 19:57:16 +0100
Subject: [PATCH 11/29] DAX: virtiofsd: Wire up passthrough_ll's
lo_setupmapping
Wire up passthrough_ll's setupmapping to allocate, send to virtio
and then reply OK.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
tools/virtiofsd/passthrough_ll.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 784bdcff34..b57cb4079e 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2895,8 +2895,28 @@ static void lo_setupmapping(fuse_req_t req, fuse_ino_t ino, uint64_t foffset,
uint64_t len, uint64_t moffset, uint64_t flags,
struct fuse_file_info *fi)
{
- // TODO
- fuse_reply_err(req, ENOSYS);
+ int ret = 0;
+ VhostUserFSSlaveMsg msg = { 0 };
+ uint64_t vhu_flags;
+ bool writable = flags & O_RDWR;
+
+ vhu_flags = VHOST_USER_FS_FLAG_MAP_R;
+ if (writable) {
+ vhu_flags |= VHOST_USER_FS_FLAG_MAP_W;
+ }
+
+ msg.fd_offset[0] = foffset;
+ msg.len[0] = len;
+ msg.c_offset[0] = moffset;
+ msg.flags[0] = vhu_flags;
+
+ if (fuse_virtio_map(req, &msg, lo_fi_fd(req, fi))) {
+ fprintf(stderr, "%s: map over virtio failed (fd=%d)\n", __func__,
+ (int)fi->fh);
+ ret = EINVAL;
+ }
+
+ fuse_reply_err(req, ret);
}
static void lo_removemapping(fuse_req_t req, struct fuse_session *se,
--
2.25.1

View File

@ -0,0 +1,43 @@
From 17cf13d652885b2c3a09fbbab1cb503f53c27d96 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 13 Aug 2018 11:52:43 -0400
Subject: [PATCH 12/29] DAX: virtiofsd: Make lo_removemapping() work
Let the guest pass in the offset in the DAX window at which the mapping
to be removed is currently mapped.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
tools/virtiofsd/passthrough_ll.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index b57cb4079e..056b395574 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2923,8 +2923,20 @@ static void lo_removemapping(fuse_req_t req, struct fuse_session *se,
fuse_ino_t ino, unsigned num,
struct fuse_removemapping_one *argp)
{
- // TODO
- fuse_reply_err(req, ENOSYS);
+ VhostUserFSSlaveMsg msg = { 0 };
+ int ret = 0;
+
+ msg.len[0] = argp->len;
+ msg.c_offset[0] = argp->moffset;
+ if (fuse_virtio_unmap(se, &msg)) {
+ fprintf(stderr,
+ "%s: unmap over virtio failed "
+ "(offset=0x%lx, len=0x%lx)\n",
+ __func__, argp->moffset, argp->len);
+ ret = EINVAL;
+ }
+
+ fuse_reply_err(req, ret);
}
static struct fuse_lowlevel_ops lo_oper = {
--
2.25.1

View File

@ -0,0 +1,104 @@
From a3f692a36307054148e7db640dc7a64158a98250 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 30 Aug 2018 14:22:10 -0400
Subject: [PATCH 13/29] DAX: virtiofsd: Make setupmapping work only with inode
The guest might not pass a file handle. In that case, use the inode info
to open the file again, mmap() it and close the fd.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
With fix from:
Signed-off-by: Fotis Xenakis <foxen@windowslive.com>
---
tools/virtiofsd/fuse_lowlevel.c | 13 ++++++++++--
tools/virtiofsd/passthrough_ll.c | 36 ++++++++++++++++++++++++++++----
2 files changed, 43 insertions(+), 6 deletions(-)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 1c3790130a..4cfd4c3547 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1897,8 +1897,17 @@ static void do_setupmapping(fuse_req_t req, fuse_ino_t nodeid,
}
if (req->se->op.setupmapping) {
- req->se->op.setupmapping(req, nodeid, arg->foffset, arg->len,
- arg->moffset, genflags, &fi);
+ /*
+ * TODO: Add a flag to request which tells if arg->fh is
+ * valid or not.
+ */
+ if (fi.fh == (uint64_t)-1) {
+ req->se->op.setupmapping(req, nodeid, arg->foffset, arg->len,
+ arg->moffset, genflags, NULL);
+ } else {
+ req->se->op.setupmapping(req, nodeid, arg->foffset, arg->len,
+ arg->moffset, genflags, &fi);
+ }
} else {
fuse_reply_err(req, ENOSYS);
}
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 056b395574..ebd5a9b215 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2895,11 +2895,19 @@ static void lo_setupmapping(fuse_req_t req, fuse_ino_t ino, uint64_t foffset,
uint64_t len, uint64_t moffset, uint64_t flags,
struct fuse_file_info *fi)
{
- int ret = 0;
+ struct lo_data *lo = lo_data(req);
+ int ret = 0, fd, res;
VhostUserFSSlaveMsg msg = { 0 };
uint64_t vhu_flags;
+ char *buf;
bool writable = flags & O_RDWR;
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_setupmapping(ino=%" PRIu64 ", fi=0x%p,"
+ " foffset=%" PRIu64 ", len=%" PRIu64 ", moffset=%" PRIu64
+ ", flags=%" PRIu64 ")\n",
+ ino, (void *)fi, foffset, len, moffset, flags);
+
vhu_flags = VHOST_USER_FS_FLAG_MAP_R;
if (writable) {
vhu_flags |= VHOST_USER_FS_FLAG_MAP_W;
@@ -2910,12 +2918,32 @@ static void lo_setupmapping(fuse_req_t req, fuse_ino_t ino, uint64_t foffset,
msg.c_offset[0] = moffset;
msg.flags[0] = vhu_flags;
- if (fuse_virtio_map(req, &msg, lo_fi_fd(req, fi))) {
- fprintf(stderr, "%s: map over virtio failed (fd=%d)\n", __func__,
- (int)fi->fh);
+ if (fi) {
+ fd = lo_fi_fd(req, fi);
+ } else {
+ res = asprintf(&buf, "%i", lo_fd(req, ino));
+ if (res == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+
+ fd = openat(lo->proc_self_fd, buf, flags);
+ free(buf);
+ if (fd == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+ }
+
+ if (fuse_virtio_map(req, &msg, fd)) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: map over virtio failed (ino=%" PRId64
+ "fd=%d moffset=0x%" PRIx64 ")\n",
+ __func__, ino, fi ? (int)fi->fh : lo_fd(req, ino), moffset);
ret = EINVAL;
}
+ if (!fi) {
+ close(fd);
+ }
fuse_reply_err(req, ret);
}
--
2.25.1

View File

@ -0,0 +1,75 @@
From 7c14a24ad467b9404b95345c64e8c5ef5e6d209c Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 30 Nov 2018 11:47:36 +0000
Subject: [PATCH 14/29] DAX: virtiofsd: route se down to destroy method
We're going to need to pass the session down to destroy so that it can
pass it back to do the remove mapping.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/fuse_lowlevel.c | 6 +++---
tools/virtiofsd/fuse_lowlevel.h | 2 +-
tools/virtiofsd/passthrough_ll.c | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 4cfd4c3547..a2480d4aa1 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -2211,7 +2211,7 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid,
se->got_destroy = 1;
se->got_init = 0;
if (se->op.destroy) {
- se->op.destroy(se->userdata);
+ se->op.destroy(se->userdata, se);
}
send_reply_ok(req, NULL, 0);
@@ -2438,7 +2438,7 @@ void fuse_session_process_buf_int(struct fuse_session *se,
se->got_destroy = 1;
se->got_init = 0;
if (se->op.destroy) {
- se->op.destroy(se->userdata);
+ se->op.destroy(se->userdata, se);
}
} else {
goto reply_err;
@@ -2526,7 +2526,7 @@ void fuse_session_destroy(struct fuse_session *se)
{
if (se->got_init && !se->got_destroy) {
if (se->op.destroy) {
- se->op.destroy(se->userdata);
+ se->op.destroy(se->userdata, se);
}
}
pthread_rwlock_destroy(&se->init_rwlock);
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 2851840cc2..2259623776 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -208,7 +208,7 @@ struct fuse_lowlevel_ops {
*
* @param userdata the user data passed to fuse_session_new()
*/
- void (*destroy)(void *userdata);
+ void (*destroy)(void *userdata, struct fuse_session *se);
/**
* Look up a directory entry by name and get its attributes.
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index ebd5a9b215..0d3cda8d2f 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2871,7 +2871,7 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
}
}
-static void lo_destroy(void *userdata)
+static void lo_destroy(void *userdata, struct fuse_session *se)
{
struct lo_data *lo = (struct lo_data *)userdata;
--
2.25.1

View File

@ -0,0 +1,37 @@
From 72bccc497aeb9057e36477c327e0ac58bc154e6f Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 30 Nov 2018 11:50:25 +0000
Subject: [PATCH 15/29] DAX: virtiofsd: Perform an unmap on destroy
Force unmap all remaining dax cache entries on a destroy.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/passthrough_ll.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 0d3cda8d2f..56a4b9404a 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2875,6 +2875,17 @@ static void lo_destroy(void *userdata, struct fuse_session *se)
{
struct lo_data *lo = (struct lo_data *)userdata;
+ if (fuse_lowlevel_is_virtio(se)) {
+ VhostUserFSSlaveMsg msg = { 0 };
+
+ msg.len[0] = ~(uint64_t)0; /* Special: means 'all' */
+ msg.c_offset[0] = 0;
+ if (fuse_virtio_unmap(se, &msg)) {
+ fuse_log(FUSE_LOG_ERR, "%s: unmap during destroy failed\n",
+ __func__);
+ }
+ }
+
pthread_mutex_lock(&lo->mutex);
while (true) {
GHashTableIter iter;
--
2.25.1

View File

@ -0,0 +1,34 @@
From c05795e129152533d66f131dd019ae903d1eb39a Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Thu, 2 May 2019 18:04:04 +0100
Subject: [PATCH 16/29] DAX: libvhost-user: Allow popping a queue element with
bad pointers
Allow a daemon implemented with libvhost-user to accept an
element with pointers to memory that aren't in the mapping table.
The daemon might have some special way to deal with some special
cases of this.
The default behaviour doesn't change.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
block/export/vhost-user-blk-server.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index ab2c4d44c4..ea2d302e33 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -205,7 +205,7 @@ static void vu_blk_process_vq(VuDev *vu_dev, int idx)
while (1) {
VuBlkReq *req;
- req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
+ req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq), NULL, NULL);
if (!req) {
break;
}
--
2.25.1

View File

@ -0,0 +1,211 @@
From a238faf5a53668aac037f7ce026d1bf785ee4186 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 11:54:02 +0100
Subject: [PATCH 17/29] DAX/unmap: virtiofsd: Add VHOST_USER_SLAVE_FS_IO
Define a new slave command 'VHOST_USER_SLAVE_FS_IO' for a
client to ask qemu to perform a read/write from an fd directly
to GPA.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
docs/interop/vhost-user.rst | 11 +++
hw/virtio/trace-events | 6 ++
hw/virtio/vhost-user-fs.c | 87 +++++++++++++++++++++++
hw/virtio/vhost-user.c | 4 ++
include/hw/virtio/vhost-user-fs.h | 1 +
subprojects/libvhost-user/libvhost-user.h | 1 +
6 files changed, 110 insertions(+)
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 056f94c6fb..8d6ec92881 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1455,6 +1455,17 @@ Slave message types
``VHOST_USER_SLAVE_FS_SYNC``
[Semantic details TBD]
+``VHOST_USER_SLAVE_FS_IO``
+ :id: 9
+ :equivalent ioctl: N/A
+ :slave payload: fd + n * (offset + address + len)
+ :master payload: N/A
+
+ Requests that QEMU perform IO directly from an fd to guest memory
+ on behalf of the daemon; this is normally for a case where a memory region
+ isn't visible to the daemon.
+ [Semantic details TBD]
+
.. _reply_ack:
VHOST_USER_PROTOCOL_F_REPLY_ACK
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 2060a144a2..a35adf5caf 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -53,6 +53,12 @@ vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRI
vhost_vdpa_set_owner(void *dev) "dev: %p"
vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
+# vhost-user-fs.c
+
+vhost_user_fs_slave_io_loop(const char *name, uint64_t owr, int is_ram, int is_romd, size_t size) "region %s with internal offset 0x%"PRIx64 " ram=%d romd=%d mrs.size=%zd"
+vhost_user_fs_slave_io_loop_res(ssize_t transferred) "%zd"
+vhost_user_fs_slave_io_exit(int res, size_t done) "res: %d done: %zd"
+
# virtio.c
virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) "vq %p elem %p len %u idx %u"
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index 98cec993f7..82a32492a7 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -22,6 +22,8 @@
#include "qemu/error-report.h"
#include "hw/virtio/vhost-user-fs.h"
#include "monitor/monitor.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
/*
* The powerpc kernel code expects the memory to be accessible during
@@ -201,6 +203,102 @@ int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
return res;
}
+int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+                           int fd)
+{
+    VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
+    if (!fs) {
+        /* Shouldn't happen - but seen it in error paths */
+        fprintf(stderr, "%s: Bad fs ptr\n", __func__);
+        return -1;
+    }
+
+    unsigned int i;
+    int res = 0;
+    size_t done = 0;
+
+    if (fd < 0) {
+        fprintf(stderr, "%s: Bad fd for map\n", __func__);
+        return -1;
+    }
+
+    for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES && !res; i++) {
+        if (sm->len[i] == 0) {
+            continue;
+        }
+
+        size_t len = sm->len[i];
+        hwaddr gpa = sm->c_offset[i];
+        uint64_t fd_offset = sm->fd_offset[i];
+
+        /*
+         * One entry may span several memory regions and pread/pwrite may
+         * transfer less than asked for, so walk the range chunk by chunk,
+         * advancing both the guest address and the file offset.
+         */
+        while (len && !res) {
+            MemoryRegionSection mrs = memory_region_find(get_system_memory(),
+                                                         gpa, len);
+            size_t mrs_size = (size_t)int128_get64(mrs.size);
+
+            if (!mrs_size) {
+                fprintf(stderr,
+                        "%s: No guest region found for 0x%" HWADDR_PRIx "\n",
+                        __func__, gpa);
+                res = -EFAULT;
+                break;
+            }
+
+            trace_vhost_user_fs_slave_io_loop(mrs.mr->name,
+                                          (uint64_t)mrs.offset_within_region,
+                                          memory_region_is_ram(mrs.mr),
+                                          memory_region_is_romd(mrs.mr),
+                                          (size_t)mrs_size);
+
+            void *hostptr = qemu_map_ram_ptr(mrs.mr->ram_block,
+                                             mrs.offset_within_region);
+            ssize_t transferred;
+            if (sm->flags[i] & VHOST_USER_FS_FLAG_MAP_R) {
+                /* Read from file into RAM */
+                if (mrs.mr->readonly) {
+                    res = -EFAULT;
+                    memory_region_unref(mrs.mr);
+                    break;
+                }
+                transferred = pread(fd, hostptr, mrs_size, fd_offset);
+            } else {
+                /* Write into file from RAM */
+                assert((sm->flags[i] & VHOST_USER_FS_FLAG_MAP_W));
+                transferred = pwrite(fd, hostptr, mrs_size, fd_offset);
+            }
+            if (transferred < 0) {
+                /* Latch errno before any other call can overwrite it */
+                res = -errno;
+            }
+            /* memory_region_find() took a reference on mrs.mr; drop it */
+            memory_region_unref(mrs.mr);
+            trace_vhost_user_fs_slave_io_loop_res(transferred);
+            if (transferred <= 0) {
+                /* Error (res is set) or EOF */
+                break;
+            }
+
+            done += transferred;
+            fd_offset += transferred;
+            gpa += transferred;
+            len -= transferred;
+        }
+    }
+    close(fd);
+
+    trace_vhost_user_fs_slave_io_exit(res, done);
+    /*
+     * TODO! We should be returning 'done' if possible but our error handling
+     * doesn't know about that yet.
+     */
+    return res;
+}
+
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
{
VHostUserFS *fs = VHOST_USER_FS(vdev);
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 757dee0d1e..b4ef0102ad 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -138,6 +138,7 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_FS_MAP = 6,
VHOST_USER_SLAVE_FS_UNMAP = 7,
VHOST_USER_SLAVE_FS_SYNC = 8,
+ VHOST_USER_SLAVE_FS_IO = 9,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
@@ -1487,6 +1488,9 @@ static void slave_read(void *opaque)
case VHOST_USER_SLAVE_FS_SYNC:
ret = vhost_user_fs_slave_sync(dev, &payload.fs);
break;
+ case VHOST_USER_SLAVE_FS_IO:
+ ret = vhost_user_fs_slave_io(dev, &payload.fs, fd[0]);
+ break;
#endif
default:
error_report("Received unexpected msg type: %d.", hdr.request);
diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h
index 69cc6340ed..0750687463 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -68,5 +68,6 @@ int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
int fd);
int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
+int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm, int fd);
#endif /* _QEMU_VHOST_USER_FS_H */
diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h
index c63a590069..4b6e681a3e 100644
--- a/subprojects/libvhost-user/libvhost-user.h
+++ b/subprojects/libvhost-user/libvhost-user.h
@@ -122,6 +122,7 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_FS_MAP = 6,
VHOST_USER_SLAVE_FS_UNMAP = 7,
VHOST_USER_SLAVE_FS_SYNC = 8,
+ VHOST_USER_SLAVE_FS_IO = 9,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
--
2.25.1

View File

@ -0,0 +1,98 @@
From 5e3aff71f01f41254cdc7ecefc98a31be002dda0 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 12:17:36 +0100
Subject: [PATCH 18/29] DAX/unmap virtiofsd: Add wrappers for
VHOST_USER_SLAVE_FS_IO
Add a wrapper to send VHOST_USER_SLAVE_FS_IO commands and a
further wrapper for sending a fuse_buf write using the FS_IO
slave command.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/fuse_lowlevel.h | 24 +++++++++++++++++++++
tools/virtiofsd/fuse_virtio.c | 38 +++++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+)
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 2259623776..866d122352 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -2022,4 +2022,28 @@ int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg);
*/
int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg);
+/**
+ * For use with virtio-fs; request IO directly to memory
+ *
+ * @param se The current session
+ * @param msg A set of IO requests
+ * @param fd The fd to perform the IO on
+ * @return Zero on success
+ */
+int fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg, int fd);
+
+/**
+ * For use with virtio-fs; wrapper for fuse_virtio_io for writes
+ * from memory to an fd
+ * @param req The request that triggered this action
+ * @param dst The destination (file) buffer
+ * @param dst_off Byte offset in the file
+ * @param src The source (memory) buffer
+ * @param src_off Byte offset added to the guest address held in src
+ * @param len Length in bytes
+ */
+ssize_t fuse_virtio_write(fuse_req_t req, const struct fuse_buf *dst,
+ size_t dst_off, const struct fuse_buf *src,
+ size_t src_off, size_t len);
+
#endif /* FUSE_LOWLEVEL_H_ */
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 24d9323665..abac0d0d2e 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -1071,3 +1071,41 @@ int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg)
return !vu_fs_cache_request(&req->se->virtio_dev->dev,
VHOST_USER_SLAVE_FS_SYNC, -1, msg);
}
+
+int fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg, int fd)
+{
+ if (!se->virtio_dev) {
+ return -ENODEV;
+ }
+ return !vu_fs_cache_request(&se->virtio_dev->dev,
+ VHOST_USER_SLAVE_FS_IO, fd, msg);
+}
+
+/*
+ * Write to a file (dst) from an area of guest GPA (src) that probably
+ * isn't visible to the daemon.
+ */
+ssize_t fuse_virtio_write(fuse_req_t req, const struct fuse_buf *dst,
+ size_t dst_off, const struct fuse_buf *src,
+ size_t src_off, size_t len)
+{
+ VhostUserFSSlaveMsg msg = { 0 };
+
+ if (dst->flags & FUSE_BUF_FD_SEEK) {
+ msg.fd_offset[0] = dst->pos + dst_off;
+ } else {
+ off_t cur = lseek(dst->fd, 0, SEEK_CUR);
+ if (cur == (off_t)-1) {
+ return -errno;
+ }
+ msg.fd_offset[0] = cur;
+ }
+ msg.c_offset[0] = (uintptr_t)src->mem + src_off;
+ msg.len[0] = len;
+ msg.flags[0] = VHOST_USER_FS_FLAG_MAP_W;
+
+ bool result = !fuse_virtio_io(req->se, &msg, dst->fd);
+ /* TODO: Rework the result path to actually get length/error */
+ fuse_log(FUSE_LOG_DEBUG, "%s: result=%d\n", __func__, result);
+ return result ? len : -EIO;
+}
--
2.25.1

View File

@ -0,0 +1,335 @@
From 1586d4a5525f44c51cbcbd5004b9a79bfc8c495c Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 13:26:09 +0100
Subject: [PATCH 19/29] DAX/unmap virtiofsd: Parse unmappable elements
For some read/writes the virtio queue elements are unmappable by
the daemon; these are cases where the data is to be read/written
from non-RAM. In virtiofs's case this is typically a direct read/write
into an mmap'd DAX file also on virtiofs (possibly on another instance).
When we receive a virtio queue element, check that we have enough
mappable data to handle the headers. Make a note of the number of
unmappable 'in' entries (ie. for read data back to the VMM),
and flag the fuse_bufvec for 'out' entries with a new flag
FUSE_BUF_PHYS_ADDR.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
with fix by:
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
tools/virtiofsd/buffer.c | 4 +-
tools/virtiofsd/fuse_common.h | 7 ++
tools/virtiofsd/fuse_virtio.c | 191 ++++++++++++++++++++++++----------
3 files changed, 145 insertions(+), 57 deletions(-)
diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c
index 874f01c488..1a050aa441 100644
--- a/tools/virtiofsd/buffer.c
+++ b/tools/virtiofsd/buffer.c
@@ -77,6 +77,7 @@ static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off,
ssize_t res = 0;
size_t copied = 0;
+ assert(!(src->flags & FUSE_BUF_PHYS_ADDR));
while (len) {
if (dst->flags & FUSE_BUF_FD_SEEK) {
res = pwrite(dst->fd, (char *)src->mem + src_off, len,
@@ -272,7 +273,8 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv)
* process
*/
for (i = 0; i < srcv->count; i++) {
- if (srcv->buf[i].flags & FUSE_BUF_IS_FD) {
+ if ((srcv->buf[i].flags & FUSE_BUF_PHYS_ADDR) ||
+ (srcv->buf[i].flags & FUSE_BUF_IS_FD)) {
break;
}
}
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
index a090040bb2..ed9280de91 100644
--- a/tools/virtiofsd/fuse_common.h
+++ b/tools/virtiofsd/fuse_common.h
@@ -611,6 +611,13 @@ enum fuse_buf_flags {
* detected.
*/
FUSE_BUF_FD_RETRY = (1 << 3),
+
+ /**
+ * The addresses in the iovec represent guest physical addresses
+ * that can't be mapped by the daemon process.
+ * IO must be bounced back to the VMM to do it.
+ */
+ FUSE_BUF_PHYS_ADDR = (1 << 4),
};
/**
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index abac0d0d2e..31f17ab043 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -49,6 +49,10 @@ typedef struct {
VuVirtqElement elem;
struct fuse_chan ch;
+ /* Number of unmappable iovecs */
+ unsigned bad_in_num;
+ unsigned bad_out_num;
+
/* Used to complete requests that involve no reply */
bool reply_sent;
} FVRequest;
@@ -291,8 +295,10 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
/* The 'in' part of the elem is to qemu */
unsigned int in_num = elem->in_num;
+ unsigned int bad_in_num = req->bad_in_num;
struct iovec *in_sg = elem->in_sg;
size_t in_len = iov_size(in_sg, in_num);
+ size_t in_len_writeable = iov_size(in_sg, in_num - bad_in_num);
fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
__func__, elem->index, in_num, in_len);
@@ -300,7 +306,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
* The elem should have room for a 'fuse_out_header' (out from fuse)
* plus the data based on the len in the header.
*/
- if (in_len < sizeof(struct fuse_out_header)) {
+ if (in_len_writeable < sizeof(struct fuse_out_header)) {
fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
__func__, elem->index);
ret = E2BIG;
@@ -327,7 +333,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
/* These get updated as we skip */
struct iovec *in_sg_ptr = in_sg_cpy;
- int in_sg_cpy_count = in_num;
+ int in_sg_cpy_count = in_num - bad_in_num;
/* skip over parts of in_sg that contained the header iov */
size_t skip_size = iov_len;
@@ -460,17 +466,21 @@ static void fv_queue_worker(gpointer data, gpointer user_data)
/* The 'out' part of the elem is from qemu */
unsigned int out_num = elem->out_num;
+ unsigned int out_num_readable = out_num - req->bad_out_num;
struct iovec *out_sg = elem->out_sg;
size_t out_len = iov_size(out_sg, out_num);
+ size_t out_len_readable = iov_size(out_sg, out_num_readable);
fuse_log(FUSE_LOG_DEBUG,
- "%s: elem %d: with %d out desc of length %zd\n",
- __func__, elem->index, out_num, out_len);
+ "%s: elem %d: with %d out desc of length %zd"
+ " bad_in_num=%u bad_out_num=%u\n",
+ __func__, elem->index, out_num, out_len, req->bad_in_num,
+ req->bad_out_num);
/*
* The elem should contain a 'fuse_in_header' (in to fuse)
* plus the data based on the len in the header.
*/
- if (out_len < sizeof(struct fuse_in_header)) {
+ if (out_len_readable < sizeof(struct fuse_in_header)) {
fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
__func__, elem->index);
assert(0); /* TODO */
@@ -484,63 +494,129 @@ static void fv_queue_worker(gpointer data, gpointer user_data)
copy_from_iov(&fbuf, 1, out_sg);
pbufv = NULL; /* Compiler thinks an unitialised path */
- if (out_num > 2 &&
- out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
- ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
- out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
- /*
- * For a write we don't actually need to copy the
- * data, we can just do it straight out of guest memory
- * but we must still copy the headers in case the guest
- * was nasty and changed them while we were using them.
- */
- fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
-
- /* copy the fuse_write_in header afte rthe fuse_in_header */
- fbuf.mem += out_sg->iov_len;
- copy_from_iov(&fbuf, 1, out_sg + 1);
- fbuf.mem -= out_sg->iov_len;
- fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
-
- /* Allocate the bufv, with space for the rest of the iov */
- pbufv = malloc(sizeof(struct fuse_bufvec) +
- sizeof(struct fuse_buf) * (out_num - 2));
- if (!pbufv) {
- fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
- __func__);
- goto out;
- }
+ if (req->bad_in_num || req->bad_out_num) {
+ bool handled_unmappable = false;
+
+ if (out_num > 2 && out_num_readable >= 2 && !req->bad_in_num &&
+ out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
+ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
+ out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
+ handled_unmappable = true;
+
+ /* copy the fuse_write_in header after fuse_in_header */
+ fbuf.mem += out_sg->iov_len;
+ copy_from_iov(&fbuf, 1, out_sg + 1);
+ fbuf.mem -= out_sg->iov_len;
+ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
+
+ /* Allocate the bufv, with space for the rest of the iov */
+ pbufv = malloc(sizeof(struct fuse_bufvec) +
+ sizeof(struct fuse_buf) * (out_num - 2));
+ if (!pbufv) {
+ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
+ __func__);
+ goto out;
+ }
- allocated_bufv = true;
- pbufv->count = 1;
- pbufv->buf[0] = fbuf;
+ allocated_bufv = true;
+ pbufv->count = 1;
+ pbufv->buf[0] = fbuf;
+
+ size_t iovindex, pbufvindex;
+ iovindex = 2; /* 2 headers, separate iovs */
+ pbufvindex = 1; /* 2 headers, 1 fusebuf */
+
+ for (; iovindex < out_num; iovindex++, pbufvindex++) {
+ pbufv->count++;
+ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
+ pbufv->buf[pbufvindex].flags =
+ (iovindex < out_num_readable) ? 0 :
+ FUSE_BUF_PHYS_ADDR;
+ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
+ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
+ }
+ }
- size_t iovindex, pbufvindex;
- iovindex = 2; /* 2 headers, separate iovs */
- pbufvindex = 1; /* 2 headers, 1 fusebuf */
+ if (out_num == 2 && out_num_readable == 2 && req->bad_in_num &&
+ out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
+ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_READ &&
+ out_sg[1].iov_len == sizeof(struct fuse_read_in)) {
+ fuse_log(FUSE_LOG_DEBUG,
+ "Unmappable read case "
+ "in_num=%d bad_in_num=%d\n",
+ elem->in_num, req->bad_in_num);
+ handled_unmappable = true;
+ }
- for (; iovindex < out_num; iovindex++, pbufvindex++) {
- pbufv->count++;
- pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
- pbufv->buf[pbufvindex].flags = 0;
- pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
- pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
+        if (!handled_unmappable) {
+            fuse_log(FUSE_LOG_ERR,
+                     "Unhandled unmappable element: out: %d(b:%d) in: "
+                     "%d(b:%d)\n",
+                     out_num, req->bad_out_num, elem->in_num, req->bad_in_num);
+            fv_panic(dev, "Unhandled unmappable element");
+        }
- } else {
- /* Normal (non fast write) path */
+ }
+
+ if (!req->bad_out_num) {
+ if (out_num > 2 &&
+ out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
+ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
+ out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
+ /*
+ * For a write we don't actually need to copy the
+ * data, we can just do it straight out of guest memory
+ * but we must still copy the headers in case the guest
+ * was nasty and changed them while we were using them.
+ */
+ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n",
+ __func__);
+
+ /* copy the fuse_write_in header after fuse_in_header */
+ fbuf.mem += out_sg->iov_len;
+ copy_from_iov(&fbuf, 1, out_sg + 1);
+ fbuf.mem -= out_sg->iov_len;
+ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
+
+ /* Allocate the bufv, with space for the rest of the iov */
+ pbufv = malloc(sizeof(struct fuse_bufvec) +
+ sizeof(struct fuse_buf) * (out_num - 2));
+ if (!pbufv) {
+ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
+ __func__);
+ goto out;
+ }
- /* Copy the rest of the buffer */
- fbuf.mem += out_sg->iov_len;
- copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
- fbuf.mem -= out_sg->iov_len;
- fbuf.size = out_len;
+ allocated_bufv = true;
+ pbufv->count = 1;
+ pbufv->buf[0] = fbuf;
- /* TODO! Endianness of header */
+ size_t iovindex, pbufvindex;
+ iovindex = 2; /* 2 headers, separate iovs */
+ pbufvindex = 1; /* 2 headers, 1 fusebuf */
- /* TODO: Add checks for fuse_session_exited */
- bufv.buf[0] = fbuf;
- bufv.count = 1;
- pbufv = &bufv;
+ for (; iovindex < out_num; iovindex++, pbufvindex++) {
+ pbufv->count++;
+ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
+ pbufv->buf[pbufvindex].flags = 0;
+ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
+ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
+ }
+ } else {
+ /* Normal (non fast write) path */
+
+ /* Copy the rest of the buffer */
+ fbuf.mem += out_sg->iov_len;
+ copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
+ fbuf.mem -= out_sg->iov_len;
+ fbuf.size = out_len;
+
+ /* TODO! Endianness of header */
+
+ /* TODO: Add checks for fuse_session_exited */
+ bufv.buf[0] = fbuf;
+ bufv.count = 1;
+ pbufv = &bufv;
+ }
}
pbufv->idx = 0;
pbufv->off = 0;
@@ -657,13 +733,16 @@ static void *fv_queue_thread(void *opaque)
__func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
while (1) {
+ unsigned int bad_in_num = 0, bad_out_num = 0;
FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest),
- NULL, NULL);
+ &bad_in_num, &bad_out_num);
if (!req) {
break;
}
req->reply_sent = false;
+ req->bad_in_num = bad_in_num;
+ req->bad_out_num = bad_out_num;
if (!se->thread_pool_size) {
req_list = g_list_prepend(req_list, req);
--
2.25.1

View File

@ -0,0 +1,56 @@
From 1f6a9f8567bdf2be00d217abac33a71248541a4a Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 13:26:51 +0100
Subject: [PATCH 20/29] DAX/unmap virtiofsd: Route unmappable reads
When a read with unmappable buffers is found, map it to a slave
read command.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/fuse_virtio.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 31f17ab043..1f4c7fff35 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -397,6 +397,35 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
in_sg_left -= ret;
len -= ret;
} while (in_sg_left);
+
+ if (bad_in_num) {
+ while (len && bad_in_num) {
+ VhostUserFSSlaveMsg msg = { 0 };
+ msg.flags[0] = VHOST_USER_FS_FLAG_MAP_R;
+ msg.fd_offset[0] = buf->buf[0].pos;
+ msg.c_offset[0] = (uint64_t)(uintptr_t)in_sg_ptr[0].iov_base;
+ msg.len[0] = in_sg_ptr[0].iov_len;
+ if (len < msg.len[0]) {
+ msg.len[0] = len;
+ }
+ bool req_res = !fuse_virtio_io(se, &msg, buf->buf[0].fd);
+ fuse_log(FUSE_LOG_DEBUG,
+ "%s: bad loop; len=%zd bad_in_num=%d fd_offset=%zd "
+ "c_offset=%p req_res=%d\n",
+ __func__, len, bad_in_num, buf->buf[0].pos,
+ in_sg_ptr[0].iov_base, req_res);
+ if (req_res) {
+ len -= msg.len[0];
+ buf->buf[0].pos += msg.len[0];
+ in_sg_ptr++;
+ bad_in_num--;
+ } else {
+ ret = EIO;
+ free(in_sg_cpy);
+ goto err;
+ }
+ }
+ }
free(in_sg_cpy);
/* Need to fix out->len on EOF */
--
2.25.1

View File

@ -0,0 +1,121 @@
From e291b7766f49b06933afed374b6476416d951517 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 13:18:42 +0100
Subject: [PATCH 21/29] DAX/unmap virtiofsd: route unmappable write to slave
command
When a fuse_buf_copy is performed on an element with FUSE_BUF_PHYS_ADDR
route it to a fuse_virtio_write request that does a slave command to
perform the write.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/buffer.c | 14 +++++++++++---
tools/virtiofsd/fuse_common.h | 6 +++++-
tools/virtiofsd/fuse_lowlevel.h | 3 ---
tools/virtiofsd/passthrough_ll.c | 2 +-
4 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c
index 1a050aa441..8135d52d2a 100644
--- a/tools/virtiofsd/buffer.c
+++ b/tools/virtiofsd/buffer.c
@@ -200,13 +200,20 @@ static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off,
return copied;
}
-static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off,
+static ssize_t fuse_buf_copy_one(fuse_req_t req,
+ const struct fuse_buf *dst, size_t dst_off,
const struct fuse_buf *src, size_t src_off,
size_t len)
{
int src_is_fd = src->flags & FUSE_BUF_IS_FD;
int dst_is_fd = dst->flags & FUSE_BUF_IS_FD;
+    int src_is_phys = src->flags & FUSE_BUF_PHYS_ADDR; /* src holds a GPA */
+    int dst_is_phys = dst->flags & FUSE_BUF_PHYS_ADDR; /* dst holds a GPA */
+    if (src_is_phys && !src_is_fd && dst_is_fd) {
+        return fuse_virtio_write(req, dst, dst_off, src, src_off, len);
+    }
+    assert(!src_is_phys && !dst_is_phys);
if (!src_is_fd && !dst_is_fd) {
char *dstmem = (char *)dst->mem + dst_off;
char *srcmem = (char *)src->mem + src_off;
@@ -259,7 +266,8 @@ static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len)
return 1;
}
-ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv)
+ssize_t fuse_buf_copy(fuse_req_t req, struct fuse_bufvec *dstv,
+ struct fuse_bufvec *srcv)
{
size_t copied = 0, i;
@@ -301,7 +309,7 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv)
dst_len = dst->size - dstv->off;
len = min_size(src_len, dst_len);
- res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len);
+ res = fuse_buf_copy_one(req, dst, dstv->off, src, srcv->off, len);
if (res < 0) {
if (!copied) {
return res;
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
index ed9280de91..05d56883dd 100644
--- a/tools/virtiofsd/fuse_common.h
+++ b/tools/virtiofsd/fuse_common.h
@@ -495,6 +495,8 @@ struct fuse_conn_info {
struct fuse_session;
struct fuse_pollhandle;
struct fuse_conn_info_opts;
+struct fuse_req;
+typedef struct fuse_req *fuse_req_t;
/**
* This function parses several command-line options that can be used
@@ -713,11 +715,13 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv);
/**
* Copy data from one buffer vector to another
*
+ * @param req The request this copy is part of
* @param dst destination buffer vector
* @param src source buffer vector
* @return actual number of bytes copied or -errno on error
*/
-ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src);
+ssize_t fuse_buf_copy(fuse_req_t req,
+ struct fuse_bufvec *dst, struct fuse_bufvec *src);
/**
* Memory buffer iterator
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 866d122352..e543f64177 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -42,9 +42,6 @@
/** Inode number type */
typedef uint64_t fuse_ino_t;
-/** Request pointer type */
-typedef struct fuse_req *fuse_req_t;
-
/**
* Session
*
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 56a4b9404a..ab33fabcda 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2063,7 +2063,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
}
}
- res = fuse_buf_copy(&out_buf, in_buf);
+ res = fuse_buf_copy(req, &out_buf, in_buf);
if (res < 0) {
fuse_reply_err(req, -res);
} else {
--
2.25.1

View File

@ -0,0 +1,350 @@
From 2a64df420827ff0b127a30f2ac877a7b1ded925b Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 18:08:41 +0100
Subject: [PATCH 22/29] DAX: vhost-user: Rework slave return values
All the current slave handlers on the qemu side generate an 'int'
return value that's squashed down to a bool (!!ret) and stuffed into
a uint64_t (field of a union) to be returned.
Move the uint64_t type back up through the individual handlers so
that we can make one actually return a full uint64_t.
Note that the definition in the interop spec says most of these
cases are defined as returning 0 on success and non-0 for failure,
so it's OK to change from a bool to another non-0.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
hw/virtio/vhost-backend.c | 4 +--
hw/virtio/vhost-user-fs.c | 42 ++++++++++++++++---------------
hw/virtio/vhost-user.c | 32 ++++++++++++-----------
include/hw/virtio/vhost-backend.h | 2 +-
include/hw/virtio/vhost-user-fs.h | 13 ++++++----
5 files changed, 50 insertions(+), 43 deletions(-)
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 222bbcc62d..e81083ddda 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -401,7 +401,7 @@ int vhost_backend_invalidate_device_iotlb(struct vhost_dev *dev,
return -ENODEV;
}
-int vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
+uint64_t vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
struct vhost_iotlb_msg *imsg)
{
int ret = 0;
@@ -424,5 +424,5 @@ int vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
break;
}
- return ret;
+ return !!ret;
}
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index 82a32492a7..c02dcaeca7 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -35,19 +35,19 @@
#define DAX_WINDOW_PROT PROT_NONE
#endif
-int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
- int fd)
+uint64_t vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+ int fd)
{
VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
if (!fs) {
/* Shouldn't happen - but seen on error path */
fprintf(stderr, "%s: Bad fs ptr\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
size_t cache_size = fs->conf.cache_size;
if (!cache_size) {
fprintf(stderr, "%s: map when DAX cache not present\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
void *cache_host = memory_region_get_ram_ptr(&fs->cache);
@@ -56,7 +56,7 @@ int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
if (fd < 0) {
fprintf(stderr, "%s: Bad fd for map\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES; i++) {
@@ -78,11 +78,11 @@ int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
((sm->flags[i] & VHOST_USER_FS_FLAG_MAP_W) ? PROT_WRITE : 0),
MAP_SHARED | MAP_FIXED,
fd, sm->fd_offset[i]) != (cache_host + sm->c_offset[i])) {
+ res = -errno;
fprintf(stderr, "%s: map failed err %d [%d] %"
PRIx64 "+%" PRIx64 " from %" PRIx64 "\n", __func__,
errno, i, sm->c_offset[i], sm->len[i],
sm->fd_offset[i]);
- res = -1;
break;
}
}
@@ -91,10 +91,11 @@ int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
/* Something went wrong, unmap them all */
vhost_user_fs_slave_unmap(dev, sm);
}
- return res;
+ return (uint64_t)res;
}
-int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
+uint64_t vhost_user_fs_slave_unmap(struct vhost_dev *dev,
+ VhostUserFSSlaveMsg *sm)
{
VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
if (!fs) {
@@ -114,7 +115,7 @@ int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
}
fprintf(stderr, "%s: unmap when DAX cache not present\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
void *cache_host = memory_region_get_ram_ptr(&fs->cache);
@@ -148,26 +149,27 @@ int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
ptr = mmap(cache_host + sm->c_offset[i], sm->len[i], DAX_WINDOW_PROT,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
if (ptr != (cache_host + sm->c_offset[i])) {
+ res = -errno;
fprintf(stderr, "%s: mmap failed (%s) [%d] %"
PRIx64 "+%" PRIx64 " from %" PRIx64 " res: %p\n",
__func__,
strerror(errno),
i, sm->c_offset[i], sm->len[i],
sm->fd_offset[i], ptr);
- res = -1;
}
}
- return res;
+ return (uint64_t)res;
}
-int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
+uint64_t vhost_user_fs_slave_sync(struct vhost_dev *dev,
+ VhostUserFSSlaveMsg *sm)
{
VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
size_t cache_size = fs->conf.cache_size;
if (!cache_size) {
fprintf(stderr, "%s: sync when DAX cache not present\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
void *cache_host = memory_region_get_ram_ptr(&fs->cache);
@@ -191,26 +193,26 @@ int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm)
if (msync(cache_host + sm->c_offset[i], sm->len[i],
MS_SYNC /* ?? */)) {
+ res = -errno;
fprintf(stderr, "%s: msync failed (%s) [%d] %"
PRIx64 "+%" PRIx64 " from %" PRIx64 "\n", __func__,
strerror(errno),
i, sm->c_offset[i], sm->len[i],
sm->fd_offset[i]);
- res = -1;
}
}
- return res;
+ return (uint64_t)res;
}
-int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
- int fd)
+uint64_t vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+ int fd)
{
VHostUserFS *fs = VHOST_USER_FS(dev->vdev);
if (!fs) {
/* Shouldn't happen - but seen it in error paths */
fprintf(stderr, "%s: Bad fs ptr\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
unsigned int i;
@@ -219,7 +221,7 @@ int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
if (fd < 0) {
fprintf(stderr, "%s: Bad fd for map\n", __func__);
- return -1;
+ return (uint64_t)-1;
}
for (i = 0; i < VHOST_USER_FS_SLAVE_ENTRIES && !res; i++) {
@@ -285,7 +287,7 @@ int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
* TODO! We should be returning 'done' if possible but our error handling
* doesn't know about that yet.
*/
- return res;
+ return (uint64_t)res;
}
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index b4ef0102ad..d95dbc39e3 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1325,24 +1325,25 @@ static int vhost_user_reset_device(struct vhost_dev *dev)
return 0;
}
-static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
+static uint64_t vhost_user_slave_handle_config_change(struct vhost_dev *dev)
{
int ret = -1;
if (!dev->config_ops) {
- return -1;
+ return true;
}
if (dev->config_ops->vhost_dev_config_notifier) {
ret = dev->config_ops->vhost_dev_config_notifier(dev);
}
- return ret;
+ return !!ret;
}
-static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
- VhostUserVringArea *area,
- int fd)
+static uint64_t vhost_user_slave_handle_vring_host_notifier(
+ struct vhost_dev *dev,
+ VhostUserVringArea *area,
+ int fd)
{
int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
size_t page_size = qemu_real_host_page_size;
@@ -1356,7 +1357,7 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
- return -1;
+ return true;
}
n = &user->notifier[queue_idx];
@@ -1369,18 +1370,18 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
}
if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
- return 0;
+ return false;
}
/* Sanity check. */
if (area->size != page_size) {
- return -1;
+ return true;
}
addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
fd, area->offset);
if (addr == MAP_FAILED) {
- return -1;
+ return true;
}
name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
@@ -1391,13 +1392,13 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
munmap(addr, page_size);
- return -1;
+ return true;
}
n->addr = addr;
n->set = true;
- return 0;
+ return false;
}
static void slave_read(void *opaque)
@@ -1406,7 +1407,8 @@ static void slave_read(void *opaque)
struct vhost_user *u = dev->opaque;
VhostUserHeader hdr = { 0, };
VhostUserPayload payload = { 0, };
- int size, ret = 0;
+ int size;
+ uint64_t ret = 0;
struct iovec iov;
struct msghdr msgh;
int fd[VHOST_USER_SLAVE_MAX_FDS];
@@ -1494,7 +1496,7 @@ static void slave_read(void *opaque)
#endif
default:
error_report("Received unexpected msg type: %d.", hdr.request);
- ret = -EINVAL;
+ ret = (uint64_t)-EINVAL;
}
/* Close the remaining file descriptors. */
@@ -1515,7 +1517,7 @@ static void slave_read(void *opaque)
hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
hdr.flags |= VHOST_USER_REPLY_MASK;
- payload.u64 = !!ret;
+ payload.u64 = ret;
hdr.size = sizeof(payload.u64);
iovec[0].iov_base = &hdr;
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 8a6f8e2a7a..64ac6b6444 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -186,7 +186,7 @@ int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
int vhost_backend_invalidate_device_iotlb(struct vhost_dev *dev,
uint64_t iova, uint64_t len);
-int vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
+uint64_t vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
struct vhost_iotlb_msg *imsg);
int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd);
diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h
index 0750687463..845cdb0177 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -64,10 +64,13 @@ struct VHostUserFS {
};
/* Callbacks from the vhost-user code for slave commands */
-int vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
- int fd);
-int vhost_user_fs_slave_unmap(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
-int vhost_user_fs_slave_sync(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm);
-int vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm, int fd);
+uint64_t vhost_user_fs_slave_map(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
+ int fd);
+uint64_t vhost_user_fs_slave_unmap(struct vhost_dev *dev,
+ VhostUserFSSlaveMsg *sm);
+uint64_t vhost_user_fs_slave_sync(struct vhost_dev *dev,
+ VhostUserFSSlaveMsg *sm);
+uint64_t vhost_user_fs_slave_io(struct vhost_dev *dev,
+ VhostUserFSSlaveMsg *sm, int fd);
#endif /* _QEMU_VHOST_USER_FS_H */
--
2.25.1

View File

@ -0,0 +1,97 @@
From 55b6372e1b893e77c6c4d5e87bd1a0765126399c Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Mon, 20 May 2019 20:02:29 +0100
Subject: [PATCH 23/29] DAX: libvhost-user: Route slave message payload
Route the uint64 payload from message replies on the slave back up
through vu_process_message_reply and to the callers.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
subprojects/libvhost-user/libvhost-user.c | 20 ++++++++++++++++----
tools/virtiofsd/fuse_virtio.c | 2 ++
2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c
index 9b8223b5d5..a1cbb626d2 100644
--- a/subprojects/libvhost-user/libvhost-user.c
+++ b/subprojects/libvhost-user/libvhost-user.c
@@ -403,9 +403,11 @@ vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
* Processes a reply on the slave channel.
* Entered with slave_mutex held and releases it before exit.
* Returns true on success.
+ * *payload is written on success
*/
static bool
-vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
+vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg,
+ uint64_t *payload)
{
VhostUserMsg msg_reply;
bool result = false;
@@ -425,7 +427,8 @@ vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
goto out;
}
- result = msg_reply.payload.u64 == 0;
+ *payload = msg_reply.payload.u64;
+ result = true;
out:
pthread_mutex_unlock(&dev->slave_mutex);
@@ -1312,6 +1315,8 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
{
int qidx = vq - dev->vq;
int fd_num = 0;
+ bool res;
+ uint64_t payload = 0;
VhostUserMsg vmsg = {
.request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
@@ -1342,7 +1347,10 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
}
/* Also unlocks the slave_mutex */
- return vu_process_message_reply(dev, &vmsg);
+ res = vu_process_message_reply(dev, &vmsg, &payload);
+ res = res && (payload == 0);
+
+ return res;
}
static bool
@@ -2915,6 +2923,8 @@ bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
VhostUserFSSlaveMsg *fsm)
{
int fd_num = 0;
+ bool res;
+ uint64_t payload = 0;
VhostUserMsg vmsg = {
.request = req,
.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
@@ -2939,6 +2949,8 @@ bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
}
/* Also unlocks the slave_mutex */
- return vu_process_message_reply(dev, &vmsg);
+ res = vu_process_message_reply(dev, &vmsg, &payload);
+ res = res && (payload == 0);
+ return res;
}
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 1f4c7fff35..416d285844 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -419,6 +419,8 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
buf->buf[0].pos += msg.len[0];
in_sg_ptr++;
bad_in_num--;
+ } else if (req_res == 0) {
+ break;
} else {
ret = EIO;
free(in_sg_cpy);
--
2.25.1

View File

@ -0,0 +1,240 @@
From 5e0e90706b03fa71072b6b17779e0a66cb14aa64 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 21 May 2019 15:10:05 +0100
Subject: [PATCH 24/29] DAX: virtiofsd: Rework fs-cache-request error path
Rework error values all the way back to the guest for IO requests.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
hw/virtio/vhost-user-fs.c | 9 +++--
subprojects/libvhost-user/libvhost-user.c | 18 ++++++----
subprojects/libvhost-user/libvhost-user.h | 6 ++--
tools/virtiofsd/fuse_lowlevel.h | 11 ++++---
tools/virtiofsd/fuse_virtio.c | 40 +++++++++++------------
5 files changed, 45 insertions(+), 39 deletions(-)
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index c02dcaeca7..b43725824f 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -283,11 +283,10 @@ uint64_t vhost_user_fs_slave_io(struct vhost_dev *dev, VhostUserFSSlaveMsg *sm,
close(fd);
trace_vhost_user_fs_slave_io_exit(res, done);
- /*
- * TODO! We should be returning 'done' if possible but our error handling
- * doesn't know about that yet.
- */
- return (uint64_t)res;
+ if (res < 0) {
+ return (uint64_t)res;
+ }
+ return (uint64_t)done;
}
static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c
index a1cbb626d2..4cf4aef63d 100644
--- a/subprojects/libvhost-user/libvhost-user.c
+++ b/subprojects/libvhost-user/libvhost-user.c
@@ -2919,8 +2919,8 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
vu_queue_inflight_post_put(dev, vq, elem->index);
}
-bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
- VhostUserFSSlaveMsg *fsm)
+int64_t vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
+ VhostUserFSSlaveMsg *fsm)
{
int fd_num = 0;
bool res;
@@ -2939,18 +2939,24 @@ bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
vmsg.fd_num = fd_num;
if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) {
- return false;
+ return -EINVAL;
}
pthread_mutex_lock(&dev->slave_mutex);
if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
pthread_mutex_unlock(&dev->slave_mutex);
- return false;
+ return -EIO;
}
/* Also unlocks the slave_mutex */
res = vu_process_message_reply(dev, &vmsg, &payload);
- res = res && (payload == 0);
- return res;
+ if (!res) {
+ return -EIO;
+ }
+ /*
+ * Payload is delivered as uint64_t but is actually signed for
+ * errors.
+ */
+ return (int64_t)payload;
}
diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h
index 4b6e681a3e..ee75d4931f 100644
--- a/subprojects/libvhost-user/libvhost-user.h
+++ b/subprojects/libvhost-user/libvhost-user.h
@@ -723,9 +723,9 @@ bool vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
* @fd: an fd (only required for map, else must be -1)
* @fsm: The body of the message
*
- * Returns: true if the reply was 0
+ * Returns: 0 or above for success, negative errno on error
*/
-bool vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
- VhostUserFSSlaveMsg *fsm);
+int64_t vu_fs_cache_request(VuDev *dev, VhostUserSlaveRequest req, int fd,
+ VhostUserFSSlaveMsg *fsm);
#endif /* LIBVHOST_USER_H */
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index e543f64177..a36a893871 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1998,7 +1998,7 @@ int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf);
* @param fd The fd to map
* @return Zero on success
*/
-int fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd);
+int64_t fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd);
/**
* For use with virtio-fs; request unmapping of part of the cache
@@ -2007,7 +2007,7 @@ int fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd);
* @param msg A set of unmapping requests
* @return Zero on success
*/
-int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg);
+int64_t fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg);
/**
* For use with virtio-fs; request synchronisation of part of the cache
@@ -2017,7 +2017,7 @@ int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg);
* @param msg A set of syncing requests
* @return Zero on success
*/
-int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg);
+int64_t fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg);
/**
* For use with virtio-fs; request IO directly to memory
@@ -2025,9 +2025,10 @@ int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg);
* @param se The current session
* @param msg A set of IO requests
* @param fd The fd to map
- * @return Zero on success
+ * @return Length on success, negative errno on error
*/
-int fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg, int fd);
+int64_t fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg,
+ int fd);
/**
* For use with virtio-fs; wrapper for fuse_virtio_io for writes
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 416d285844..9577eaa68d 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -408,13 +408,13 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
if (len < msg.len[0]) {
msg.len[0] = len;
}
- bool req_res = !fuse_virtio_io(se, &msg, buf->buf[0].fd);
+ int64_t req_res = fuse_virtio_io(se, &msg, buf->buf[0].fd);
fuse_log(FUSE_LOG_DEBUG,
"%s: bad loop; len=%zd bad_in_num=%d fd_offset=%zd "
- "c_offset=%p req_res=%d\n",
+ "c_offset=%p req_res=%ld\n",
__func__, len, bad_in_num, buf->buf[0].pos,
in_sg_ptr[0].iov_base, req_res);
- if (req_res) {
+ if (req_res > 0) {
len -= msg.len[0];
buf->buf[0].pos += msg.len[0];
in_sg_ptr++;
@@ -422,7 +422,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
} else if (req_res == 0) {
break;
} else {
- ret = EIO;
+ ret = req_res;
free(in_sg_cpy);
goto err;
}
@@ -1155,40 +1155,41 @@ void virtio_session_close(struct fuse_session *se)
se->virtio_dev = NULL;
}
-int fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd)
+int64_t fuse_virtio_map(fuse_req_t req, VhostUserFSSlaveMsg *msg, int fd)
{
if (!req->se->virtio_dev) {
return -ENODEV;
}
- return !vu_fs_cache_request(&req->se->virtio_dev->dev,
- VHOST_USER_SLAVE_FS_MAP, fd, msg);
+ return vu_fs_cache_request(&req->se->virtio_dev->dev,
+ VHOST_USER_SLAVE_FS_MAP, fd, msg);
}
-int fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg)
+int64_t fuse_virtio_unmap(struct fuse_session *se, VhostUserFSSlaveMsg *msg)
{
if (!se->virtio_dev) {
return -ENODEV;
}
- return !vu_fs_cache_request(&se->virtio_dev->dev, VHOST_USER_SLAVE_FS_UNMAP,
- -1, msg);
+ return vu_fs_cache_request(&se->virtio_dev->dev, VHOST_USER_SLAVE_FS_UNMAP,
+ -1, msg);
}
-int fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg)
+int64_t fuse_virtio_sync(fuse_req_t req, VhostUserFSSlaveMsg *msg)
{
if (!req->se->virtio_dev) {
return -ENODEV;
}
- return !vu_fs_cache_request(&req->se->virtio_dev->dev,
- VHOST_USER_SLAVE_FS_SYNC, -1, msg);
+ return vu_fs_cache_request(&req->se->virtio_dev->dev,
+ VHOST_USER_SLAVE_FS_SYNC, -1, msg);
}
-int fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg, int fd)
+int64_t fuse_virtio_io(struct fuse_session *se, VhostUserFSSlaveMsg *msg,
+ int fd)
{
if (!se->virtio_dev) {
return -ENODEV;
}
- return !vu_fs_cache_request(&se->virtio_dev->dev,
- VHOST_USER_SLAVE_FS_IO, fd, msg);
+ return vu_fs_cache_request(&se->virtio_dev->dev, VHOST_USER_SLAVE_FS_IO, fd,
+ msg);
}
/*
@@ -1214,8 +1215,7 @@ ssize_t fuse_virtio_write(fuse_req_t req, const struct fuse_buf *dst,
msg.len[0] = len;
msg.flags[0] = VHOST_USER_FS_FLAG_MAP_W;
- bool result = !fuse_virtio_io(req->se, &msg, dst->fd);
- /* TODO: Rework the result path to actually get length/error */
- fuse_log(FUSE_LOG_DEBUG, "%s: result=%d\n", __func__, result);
- return result ? len : -EIO;
+ int64_t result = fuse_virtio_io(req->se, &msg, dst->fd);
+ fuse_log(FUSE_LOG_DEBUG, "%s: result=%ld\n", __func__, result);
+ return result;
}
--
2.25.1

View File

@ -0,0 +1,76 @@
From 0946e9a802943443333eb7e8c6a0989f37c236a5 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@linux.alibaba.com>
Date: Mon, 3 Jun 2019 10:47:19 +0800
Subject: [PATCH 25/29] DAX: virtiofsd: make FUSE_REMOVEMAPPING support
multiple entries
The fuse wire protocol is changed so that we can unmap multiple
mappings in a single call.
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
fix by: Catherine Ho <catherine.hecx@gmail.com>
---
tools/virtiofsd/fuse_lowlevel.c | 5 +++--
tools/virtiofsd/passthrough_ll.c | 26 ++++++++++++++++++--------
2 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index a2480d4aa1..99ba000c2e 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1920,12 +1920,13 @@ static void do_removemapping(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_removemapping_one *one;
arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
- if (!arg) {
+ if (!arg || arg->count <= 0) {
+ fuse_log(FUSE_LOG_ERR, "do_removemapping: invalid arg %p\n", arg);
fuse_reply_err(req, EINVAL);
return;
}
- one = fuse_mbuf_iter_advance(iter, sizeof(*one));
+ one = fuse_mbuf_iter_advance(iter, arg->count * sizeof(*one));
if (!one) {
fuse_log(
FUSE_LOG_ERR,
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index ab33fabcda..3af55ffb8a 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2965,14 +2965,24 @@ static void lo_removemapping(fuse_req_t req, struct fuse_session *se,
VhostUserFSSlaveMsg msg = { 0 };
int ret = 0;
- msg.len[0] = argp->len;
- msg.c_offset[0] = argp->moffset;
- if (fuse_virtio_unmap(se, &msg)) {
- fprintf(stderr,
- "%s: unmap over virtio failed "
- "(offset=0x%lx, len=0x%lx)\n",
- __func__, argp->moffset, argp->len);
- ret = EINVAL;
+ for (int i = 0; num > 0; i++, argp++) {
+ msg.len[i] = argp->len;
+ msg.c_offset[i] = argp->moffset;
+
+ if (--num == 0 || i == VHOST_USER_FS_SLAVE_ENTRIES - 1) {
+ if (fuse_virtio_unmap(se, &msg)) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: unmap over virtio failed "
+ "(offset=0x%lx, len=0x%lx)\n",
+ __func__, argp->moffset, argp->len);
+ ret = EINVAL;
+ break;
+ }
+ if (num > 0) {
+ i = 0;
+ memset(&msg, 0, sizeof(msg));
+ }
+ }
}
fuse_reply_err(req, ret);
--
2.25.1

View File

@ -0,0 +1,42 @@
From e684fffcaf21baf0f4341091303ce3c2dcbf822d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Fri, 26 Jul 2019 09:33:22 +0100
Subject: [PATCH 26/29] DAX:virtiofsd: implement FUSE_INIT map_alignment field
Communicate the host page size to the FUSE client so that
FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING requests are aware of our alignment
constraints.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
tools/virtiofsd/fuse_lowlevel.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 99ba000c2e..d6256f571b 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -2188,6 +2188,12 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
outarg.max_background = se->conn.max_background;
outarg.congestion_threshold = se->conn.congestion_threshold;
outarg.time_gran = se->conn.time_gran;
+ if (arg->flags & FUSE_MAP_ALIGNMENT) {
+ outarg.flags |= FUSE_MAP_ALIGNMENT;
+
+ /* This constraint comes from mmap(2) and munmap(2) */
+ outarg.map_alignment = ffsl(sysconf(_SC_PAGE_SIZE)) - 1;
+ }
fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
@@ -2197,6 +2203,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n",
outarg.congestion_threshold);
fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran);
+ fuse_log(FUSE_LOG_DEBUG, " map_alignment=%u\n", outarg.map_alignment);
send_reply_ok(req, &outarg, outargsize);
}
--
2.25.1

View File

@ -0,0 +1,776 @@
From a0cbb60bb58ffaf2ae771c7822f0cb25762076fa Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 20 Nov 2019 14:27:19 +0000
Subject: [PATCH 27/29] virtiofsd: add initial support for shared versions
Not backward compatible with previous kernels, so please only use with a
kernel that has version table support (this will need to be cleaned up).
No READDIRPLUS support in the kernel for versioned entries, so disable for
now.
Attribute timeout is set to "infinity", so changes to the underlying filesystem
won't be visible. This also needs to be fixed, but is best for testing the
versioning since the shared version is the only thing that will force
refreshing metadata and dcache lookups.
No caching of metadata modifications yet.
Start "ireg" daemon before starting any fuse servers.
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fix by:
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Only send entryver_out when shared is enabled by:
With help message update from:
Signed-off-by: Xiao Yang <yangx.jy@cn.fujitsu.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
include/standard-headers/linux/fuse.h | 5 +
tools/virtiofsd/fuse_lowlevel.c | 36 ++-
tools/virtiofsd/fuse_lowlevel.h | 9 +-
tools/virtiofsd/helper.c | 4 +
tools/virtiofsd/ireg.h | 33 +++
tools/virtiofsd/passthrough_ll.c | 321 +++++++++++++++++++++++++-
6 files changed, 387 insertions(+), 21 deletions(-)
create mode 100644 tools/virtiofsd/ireg.h
diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h
index 82c0a38b59..fbced7caef 100644
--- a/include/standard-headers/linux/fuse.h
+++ b/include/standard-headers/linux/fuse.h
@@ -510,6 +510,11 @@ struct fuse_entry_out {
struct fuse_attr attr;
};
+struct fuse_entryver_out {
+ uint64_t version_index;
+ int64_t initial_version;
+};
+
struct fuse_forget_in {
uint64_t nlookup;
};
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index d6256f571b..47231378db 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -389,28 +389,46 @@ static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f)
}
}
-int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e)
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e,
+ bool shared)
{
- struct fuse_entry_out arg;
- size_t size = sizeof(arg);
+ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_entryver_out)];
+ struct fuse_entry_out *earg = (struct fuse_entry_out *)buf;
+ struct fuse_entryver_out *ever =
+ (struct fuse_entryver_out *)(buf + sizeof(struct fuse_entry_out));
+ size_t size = sizeof(buf);
- memset(&arg, 0, sizeof(arg));
- fill_entry(&arg, e);
- return send_reply_ok(req, &arg, size);
+ if ((req->se->conn.proto_minor >= 9) && !shared) {
+ size -= sizeof(struct fuse_entryver_out);
+ }
+
+ memset(buf, 0, sizeof(buf));
+ fill_entry(earg, e);
+ ever->initial_version = e->initial_version;
+ ever->version_index = e->version_offset;
+ return send_reply_ok(req, buf, size);
}
int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
- const struct fuse_file_info *f)
+ const struct fuse_file_info *f, bool shared)
{
- char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)];
+ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out) +
+ sizeof(struct fuse_entryver_out)];
size_t entrysize = sizeof(struct fuse_entry_out);
struct fuse_entry_out *earg = (struct fuse_entry_out *)buf;
struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize);
+ struct fuse_entryver_out *ever =
+ (struct fuse_entryver_out *)(buf + entrysize +
+ sizeof(struct fuse_open_out));
memset(buf, 0, sizeof(buf));
fill_entry(earg, e);
fill_open(oarg, f);
- return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out));
+ ever->initial_version = e->initial_version;
+ ever->version_index = e->version_offset;
+ return send_reply_ok(req, buf,
+ entrysize + sizeof(struct fuse_open_out) +
+ (shared ? sizeof(struct fuse_entryver_out) : 0));
}
int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index a36a893871..5f60e3fd2c 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -26,6 +26,7 @@
#include "fuse_common.h"
#include "standard-headers/linux/fuse.h"
+#include <stdbool.h>
#include <sys/statvfs.h>
#include <sys/uio.h>
#include <utime.h>
@@ -104,6 +105,9 @@ struct fuse_entry_param {
* Flags for fuse_attr.flags that do not fit into attr.
*/
uint32_t attr_flags;
+
+ uint64_t version_offset;
+ int64_t initial_version;
};
/**
@@ -1294,7 +1298,8 @@ void fuse_reply_none(fuse_req_t req);
* @param e the entry parameters
* @return zero for success, -errno for failure to send reply
*/
-int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e);
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e,
+ bool shared);
/**
* Reply with a directory entry and open parameters
@@ -1314,7 +1319,7 @@ int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e);
* @return zero for success, -errno for failure to send reply
*/
int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
- const struct fuse_file_info *fi);
+ const struct fuse_file_info *fi, bool shared);
/**
* Reply with attributes
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 28243b51b2..29331ec2fc 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -174,6 +174,10 @@ void fuse_cmdline_help(void)
" default: no_xattr\n"
" -o modcaps=CAPLIST Modify the list of capabilities\n"
" e.g. -o modcaps=+sys_admin:-chown\n"
+ " -o shared|no_shared enable/disable shared cache\n"
+ " default: no_shared\n"
+ " please start 'ireg' daemon before "
+ " using shared cache\n"
" --rlimit-nofile=<num> set maximum number of file descriptors\n"
" (0 leaves rlimit unchanged)\n"
" default: min(1000000, fs.file-max - 16384)\n"
diff --git a/tools/virtiofsd/ireg.h b/tools/virtiofsd/ireg.h
new file mode 100644
index 0000000000..91c0f386d7
--- /dev/null
+++ b/tools/virtiofsd/ireg.h
@@ -0,0 +1,33 @@
+#define VERSION_TABLE_MAGIC 0x7265566465726853
+
+enum ireg_op {
+ IREG_GET,
+ IREG_PUT,
+};
+
+struct ireg_msg {
+ enum ireg_op op;
+ uint64_t handle;
+ union {
+ struct {
+ uint64_t ino;
+ uint64_t dev;
+ } get;
+ struct {
+ uint64_t refid;
+ } put;
+ };
+};
+
+enum srv_op {
+ SRV_VERSION,
+};
+
+struct srv_msg {
+ enum srv_op op;
+ uint64_t handle;
+ struct {
+ uint64_t refid;
+ uint64_t offset;
+ } version;
+};
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 3af55ffb8a..52a52b2dd7 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -44,16 +44,21 @@
#include <cap-ng.h>
#include <dirent.h>
#include <pthread.h>
+#include <semaphore.h>
#include <sys/file.h>
+#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
+#include <sys/socket.h>
#include <sys/syscall.h>
+#include <sys/un.h>
#include <sys/wait.h>
#include <sys/xattr.h>
#include <syslog.h>
#include "qemu/cutils.h"
+#include "ireg.h"
#include "passthrough_helpers.h"
#include "passthrough_seccomp.h"
@@ -110,6 +115,8 @@ struct lo_inode {
*/
uint64_t nlookup;
+ uint64_t version_offset;
+ uint64_t ireg_refid;
fuse_ino_t fuse_ino;
pthread_mutex_t plock_mutex;
GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
@@ -152,12 +159,16 @@ struct lo_data {
char *modcaps;
double timeout;
int cache;
+ int shared;
int timeout_set;
int readdirplus_set;
int readdirplus_clear;
int allow_direct_io;
int announce_submounts;
bool use_statx;
+ int ireg_sock;
+ int64_t *version_table;
+ uint64_t version_table_size;
struct lo_inode root;
GHashTable *inodes; /* protected by lo->mutex */
struct lo_map ino_map; /* protected by lo->mutex */
@@ -193,6 +204,8 @@ static const struct fuse_opt lo_opts[] = {
{ "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
{ "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
{ "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
+ { "shared", offsetof(struct lo_data, shared), 1 },
+ { "no_shared", offsetof(struct lo_data, shared), 0 },
{ "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
{ "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
{ "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
@@ -204,6 +217,7 @@ static bool use_syslog = false;
static int current_log_level;
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
uint64_t n);
+static void put_shared(struct lo_data *lo, struct lo_inode *inode);
static struct {
pthread_mutex_t mutex;
@@ -512,6 +526,7 @@ static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
if (g_atomic_int_dec_and_test(&inode->refcount)) {
close(inode->fd);
+ put_shared(lo, inode);
free(inode);
}
}
@@ -587,8 +602,9 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
}
}
+ /* TODO: shared version support for readdirplus */
if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
- lo->readdirplus_clear) {
+ lo->readdirplus_clear || lo->shared) {
fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
conn->want &= ~FUSE_CAP_READDIRPLUS;
}
@@ -600,6 +616,29 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
}
}
+static int64_t *version_ptr(struct lo_data *lo, struct lo_inode *inode)
+{
+ return lo->version_table + inode->version_offset;
+}
+
+static int64_t get_version(struct lo_data *lo, struct lo_inode *inode)
+{
+ if (!inode->version_offset) {
+ return 0;
+ }
+
+ return __atomic_load_8(version_ptr(lo, inode), __ATOMIC_SEQ_CST);
+}
+
+static void update_version(struct lo_data *lo, struct lo_inode *inode)
+{
+ if (!inode->version_offset) {
+ return;
+ }
+
+ __atomic_add_fetch(version_ptr(lo, inode), 1, __ATOMIC_SEQ_CST);
+}
+
static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
@@ -731,6 +770,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
goto out_err;
}
}
+ update_version(lo, inode);
lo_inode_put(lo, &inode);
return lo_getattr(req, ino, fi);
@@ -763,6 +803,74 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
return p;
}
+struct msgreply {
+ struct lo_inode *inode;
+ sem_t ready;
+};
+
+static void get_shared(struct lo_data *lo, struct lo_inode *inode)
+{
+ int res;
+ struct msgreply rep = {
+ .inode = inode,
+ };
+ struct ireg_msg msg = {
+ .op = IREG_GET,
+ .handle = (uintptr_t) &rep,
+ .get = {
+ .ino = inode->key.ino,
+ .dev = inode->key.dev,
+ },
+ };
+
+ if (lo->ireg_sock == -1) {
+ inode->version_offset = 0;
+ return;
+ }
+
+ sem_init(&rep.ready, 0, 0);
+
+ res = write(lo->ireg_sock, &msg, sizeof(msg));
+ if (res != sizeof(msg)) {
+ if (res == -1) {
+ fuse_log(FUSE_LOG_WARNING,
+ "write(lo->ireg_sock, {IREG_GET, ...}): %m\n");
+ } else {
+ fuse_log(FUSE_LOG_WARNING, "short write to ireg_sock: %i\n", res);
+ }
+ return;
+ }
+
+ while (sem_wait(&rep.ready)) {
+ ;
+ }
+ sem_destroy(&rep.ready);
+}
+
+static void put_shared(struct lo_data *lo, struct lo_inode *inode)
+{
+ int res;
+ struct ireg_msg msg = {
+ .op = IREG_PUT,
+ .put.refid = inode->ireg_refid,
+ };
+
+ if (lo->ireg_sock == -1) {
+ return;
+ }
+
+ res = write(lo->ireg_sock, &msg, sizeof(msg));
+ if (res != sizeof(msg)) {
+ if (res == -1) {
+ fuse_log(FUSE_LOG_WARNING,
+ "write(lo->ireg_sock, {IREG_PUT, ...}): %m\n");
+ } else {
+ fuse_log(FUSE_LOG_WARNING, "short write to ireg_sock: %i\n", res);
+ }
+ return;
+ }
+}
+
/* value_destroy_func for posix_locks GHashTable */
static void posix_locks_value_destroy(gpointer data)
{
@@ -908,16 +1016,30 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
}
pthread_mutex_lock(&lo->mutex);
+ get_shared(lo, inode);
inode->fuse_ino = lo_add_inode_mapping(req, inode);
g_hash_table_insert(lo->inodes, &inode->key, inode);
pthread_mutex_unlock(&lo->mutex);
}
+
+ e->initial_version = get_version(lo, inode);
+ res = fstatat(inode->fd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ saverr = errno;
+ unref_inode_lolocked(lo, inode, 1);
+ errno = saverr;
+ goto out_err;
+ }
+
e->ino = inode->fuse_ino;
+ e->version_offset = inode->version_offset;
lo_inode_put(lo, &inode);
lo_inode_put(lo, &dir);
- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
- name, (unsigned long long)e->ino);
+ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli (version_table[%lli]=%lli)\n",
+ (unsigned long long)parent, name, (unsigned long long)e->ino,
+ (unsigned long long)e->version_offset,
+ (unsigned long long)e->initial_version);
return 0;
@@ -952,7 +1074,7 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
if (err) {
fuse_reply_err(req, err);
} else {
- fuse_reply_entry(req, &e);
+ fuse_reply_entry(req, &e, lo_data(req)->shared);
}
}
@@ -1056,6 +1178,8 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
goto out;
}
+ update_version(lo, dir);
+
saverr = lo_do_lookup(req, parent, name, &e);
if (saverr) {
goto out;
@@ -1064,7 +1188,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
name, (unsigned long long)e.ino);
- fuse_reply_entry(req, &e);
+ fuse_reply_entry(req, &e, lo->shared);
lo_inode_put(lo, &dir);
return;
@@ -1134,11 +1258,13 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
inode->nlookup++;
pthread_mutex_unlock(&lo->mutex);
e.ino = inode->fuse_ino;
+ update_version(lo, inode);
+ update_version(lo, parent_inode);
fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
name, (unsigned long long)e.ino);
- fuse_reply_entry(req, &e);
+ fuse_reply_entry(req, &e, lo->shared);
lo_inode_put(lo, &parent_inode);
lo_inode_put(lo, &inode);
return;
@@ -1192,8 +1318,21 @@ static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
}
res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
+ if (res == -1) {
+ fuse_reply_err(req, errno);
+ } else {
+ struct lo_inode *parent_inode;
- fuse_reply_err(req, res == -1 ? errno : 0);
+ update_version(lo, inode);
+
+ parent_inode = lo_inode(req, parent);
+ if (parent_inode) {
+ update_version(lo, parent_inode);
+ lo_inode_put(lo, &parent_inode);
+ }
+
+ fuse_reply_err(req, 0);
+ }
unref_inode_lolocked(lo, inode, 1);
lo_inode_put(lo, &inode);
}
@@ -1245,8 +1384,18 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
}
res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
+ if (res == -1) {
+ fuse_reply_err(req, errno);
+ } else {
+ update_version(lo, oldinode);
+ if (newinode) {
+ update_version(lo, newinode);
+ }
+ update_version(lo, parent_inode);
+ update_version(lo, newparent_inode);
+ fuse_reply_err(req, 0);
+ }
- fuse_reply_err(req, res == -1 ? errno : 0);
out:
unref_inode_lolocked(lo, oldinode, 1);
unref_inode_lolocked(lo, newinode, 1);
@@ -1274,8 +1423,21 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
}
res = unlinkat(lo_fd(req, parent), name, 0);
+ if (res == -1) {
+ fuse_reply_err(req, errno);
+ } else {
+ struct lo_inode *parent_inode;
- fuse_reply_err(req, res == -1 ? errno : 0);
+ update_version(lo, inode);
+
+ parent_inode = lo_inode(req, parent);
+ if (parent_inode) {
+ update_version(lo, parent_inode);
+ lo_inode_put(lo, &parent_inode);
+ }
+
+ fuse_reply_err(req, 0);
+ }
unref_inode_lolocked(lo, inode, 1);
lo_inode_put(lo, &inode);
}
@@ -1690,6 +1852,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
if (!err) {
ssize_t fh;
+ update_version(lo, parent_inode);
+
pthread_mutex_lock(&lo->mutex);
fh = lo_add_fd_mapping(req, fd);
pthread_mutex_unlock(&lo->mutex);
@@ -1714,7 +1878,7 @@ out:
if (err) {
fuse_reply_err(req, err);
} else {
- fuse_reply_create(req, &e, fi);
+ fuse_reply_create(req, &e, fi, lo->shared);
}
}
@@ -2041,6 +2205,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
(void)ino;
ssize_t res;
struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
+ struct lo_data *lo = lo_data(req);
bool cap_fsetid_dropped = false;
out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
@@ -2067,6 +2232,14 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
if (res < 0) {
fuse_reply_err(req, -res);
} else {
+ struct lo_inode *inode;
+
+ inode = lo_inode(req, ino);
+ if (inode) {
+ update_version(lo, inode);
+ lo_inode_put(lo, &inode);
+ }
+
fuse_reply_write(req, (size_t)res);
}
@@ -2095,6 +2268,7 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
off_t length, struct fuse_file_info *fi)
{
int err = EOPNOTSUPP;
+ struct lo_data *lo = lo_data(req);
(void)ino;
#ifdef CONFIG_FALLOCATE
@@ -2112,6 +2286,16 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
#endif
+ if (!err) {
+ struct lo_inode *inode;
+
+ inode = lo_inode(req, ino);
+ if (inode) {
+ update_version(lo, inode);
+ lo_inode_put(lo, &inode);
+ }
+ }
+
fuse_reply_err(req, err);
}
@@ -2754,6 +2938,9 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
saverr = ret == -1 ? errno : 0;
+ if (!saverr) {
+ update_version(lo, inode);
+ }
out:
if (fd >= 0) {
close(fd);
@@ -2820,6 +3007,9 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
saverr = ret == -1 ? errno : 0;
+ if (!saverr) {
+ update_version(lo, inode);
+ }
out:
if (fd >= 0) {
close(fd);
@@ -3474,6 +3664,101 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
}
}
+/*
+ * Thread entry point: receive replies from the ireg service on
+ * lo->ireg_sock (@data is the struct lo_data).
+ *
+ * For every SRV_VERSION reply the handle is cast back to the
+ * struct msgreply it refers to, the reply's version offset and refid are
+ * stored in that request's inode, and the request's semaphore is posted.
+ * Messages with an unexpected size or op are logged and skipped.
+ *
+ * Returns NULL once read() fails or reports end-of-file.
+ */
+static void *ireg_do(void *data)
+{
+    struct lo_data *lo = data;
+    char buf[100];
+
+    for (;;) {
+        struct srv_msg reply;
+        struct msgreply *rep;
+        int res = read(lo->ireg_sock, buf, sizeof(buf));
+
+        if (res == -1) {
+            fuse_log(FUSE_LOG_WARNING, "read(lo->ireg_sock, ...): %m\n");
+            return NULL;
+        }
+        if (res <= 0) {
+            fuse_log(FUSE_LOG_WARNING, "disconnected from ireg\n");
+            return NULL;
+        }
+
+        if (res != sizeof(reply)) {
+            fuse_log(FUSE_LOG_WARNING, "bad size message: %i\n", res);
+            continue;
+        }
+
+        memcpy(&reply, buf, sizeof(reply));
+        if (reply.op != SRV_VERSION) {
+            fuse_log(FUSE_LOG_WARNING, "bad reply to IREG_GET: %i\n", reply.op);
+            continue;
+        }
+
+        /* The handle is the address of the waiting request's msgreply */
+        rep = (struct msgreply *)(uintptr_t)reply.handle;
+        rep->inode->version_offset = reply.version.offset;
+        rep->inode->ireg_refid = reply.version.refid;
+        sem_post(&rep->ready);
+    }
+}
+
+/*
+ * Connect to the ireg service and map the shared version table.
+ *
+ * lo->ireg_sock is always reset to -1 first; nothing else is done unless
+ * lo->shared is set.
+ *
+ * Failing to connect to the ireg socket is not fatal: a warning is logged
+ * and lo->ireg_sock stays -1. Failing to create the socket, or to open,
+ * stat or map the version table file, terminates the process with exit(1).
+ *
+ * On success lo->ireg_sock holds the connected socket, lo->version_table
+ * points at the shared read/write mapping of the file and
+ * lo->version_table_size is the number of table entries the file holds.
+ */
+static void setup_shared_versions(struct lo_data *lo)
+{
+    int fd, sock, res;
+    const char *version_path = "/dev/shm/fuse_shared_versions";
+    struct stat stat;
+    struct sockaddr_un name = { .sun_family = AF_UNIX };
+    const char *socket_name = "/tmp/ireg.sock";
+    void *addr;
+
+    lo->ireg_sock = -1;
+    if (!lo->shared) {
+        return;
+    }
+
+    sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+    if (sock == -1) {
+        fuse_log(FUSE_LOG_ERR, "socket(AF_UNIX, SOCK_SEQPACKET, 0): %m\n");
+        exit(1);
+    }
+
+    /* name is zero-initialised, so copying size - 1 bytes keeps the NUL */
+    strncpy(name.sun_path, socket_name, sizeof(name.sun_path) - 1);
+
+    res = connect(sock, (const struct sockaddr *)&name,
+                  sizeof(struct sockaddr_un));
+    if (res == -1) {
+        fuse_log(FUSE_LOG_WARNING, "connect to ireg: %m\n");
+        close(sock);
+        lo->ireg_sock = -1;
+        return;
+    }
+
+    lo->ireg_sock = sock;
+
+    fd = open(version_path, O_RDWR);
+    /* Check the descriptor just opened, not the already-validated socket */
+    if (fd == -1) {
+        fuse_log(FUSE_LOG_ERR, "open(%s, O_RDWR): %m\n", version_path);
+        exit(1);
+    }
+
+    res = fstat(fd, &stat);
+    if (res == -1) {
+        fuse_log(FUSE_LOG_ERR, "fstat(%i, &stat): %m\n", fd);
+        exit(1);
+    }
+
+    lo->version_table_size = stat.st_size / sizeof(lo->version_table[0]);
+
+    addr = mmap(NULL, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (addr == MAP_FAILED) {
+        fuse_log(
+            FUSE_LOG_ERR,
+            "mmap(NULL, %li, PROT_READ | PROT_WRITE, MAP_SHARED, %i, 0): %m\n",
+            stat.st_size, fd);
+        exit(1);
+    }
+
+    /* The mapping remains valid after the descriptor is closed */
+    close(fd);
+
+    lo->version_table = addr;
+}
+
static void setup_root(struct lo_data *lo, struct lo_inode *root)
{
int fd, res;
@@ -3688,6 +3973,7 @@ int main(int argc, char *argv[])
lo.use_statx = true;
+ setup_shared_versions(&lo);
se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
if (se == NULL) {
goto err_out1;
@@ -3711,9 +3997,24 @@ int main(int argc, char *argv[])
setup_sandbox(&lo, se, opts.syslog);
setup_root(&lo, &lo.root);
+
+ if (lo.ireg_sock != -1) {
+ pthread_t ireg_thread;
+
+ ret = pthread_create(&ireg_thread, NULL, ireg_do, &lo);
+ if (ret) {
+ fuse_log(FUSE_LOG_WARNING, "pthread_create: %s\n", strerror(ret));
+ ret = 1;
+ goto err_out4;
+ }
+
+ get_shared(&lo, &lo.root);
+ }
+
/* Block until ctrl+c or fusermount -u */
ret = virtio_loop(se);
+err_out4:
fuse_session_unmount(se);
cleanup_capng();
err_out3:
--
2.25.1

View File

@ -0,0 +1,167 @@
From 119990ab3a30564c7e44f4e39344be48fc998f26 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Fri, 27 Jul 2018 10:36:41 +0100
Subject: [PATCH 28/29] virtio-fs: Allow mapping of meta data version table
The 'meta data version table' is a block of shared memory mapped between
multiple QEMUs and fuse daemons, so that they can be informed
of metadata updates. It's typically a shmfs file, and
it's specified as :
-device vhost-user-fs-pci,chardev=char0,tag=myfs,cache-size=1G,versiontable=/dev/shm/mdvt1
It gets mapped into the PCI bar after the data cache; it's read only.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
hw/virtio/vhost-user-fs-pci.c | 16 +++++++++--
hw/virtio/vhost-user-fs.c | 32 ++++++++++++++++++++++
include/hw/virtio/vhost-user-fs.h | 4 +++
include/standard-headers/linux/virtio_fs.h | 1 +
4 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c
index 19aaa8d722..aad0128fa5 100644
--- a/hw/virtio/vhost-user-fs-pci.c
+++ b/hw/virtio/vhost-user-fs-pci.c
@@ -42,6 +42,7 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
VHostUserFSPCI *dev = VHOST_USER_FS_PCI(vpci_dev);
DeviceState *vdev = DEVICE(&dev->vdev);
uint64_t cachesize;
+ uint64_t totalsize;
if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
/* Also reserve config change and hiprio queue vectors */
@@ -51,18 +52,29 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
cachesize = dev->vdev.conf.cache_size;
+ /* PCIe bar needs to be a power of 2 */
+ totalsize = pow2ceil(cachesize + dev->vdev.mdvt_size);
+
/*
* The bar starts with the data/DAX cache
- * Others will be added later.
+ * followed by the metadata cache.
*/
memory_region_init(&dev->cachebar, OBJECT(vpci_dev),
- "vhost-fs-pci-cachebar", cachesize);
+ "vhost-fs-pci-cachebar", totalsize);
if (cachesize) {
memory_region_add_subregion(&dev->cachebar, 0, &dev->vdev.cache);
virtio_pci_add_shm_cap(vpci_dev, VIRTIO_FS_PCI_CACHE_BAR, 0, cachesize,
VIRTIO_FS_SHMCAP_ID_CACHE);
}
+ if (dev->vdev.mdvt_size) {
+ memory_region_add_subregion(&dev->cachebar, cachesize,
+ &dev->vdev.mdvt);
+ virtio_pci_add_shm_cap(vpci_dev, VIRTIO_FS_PCI_CACHE_BAR,
+ cachesize, dev->vdev.mdvt_size,
+ VIRTIO_FS_SHMCAP_ID_VERTAB);
+ }
+
/* After 'realized' so the memory region exists */
pci_register_bar(&vpci_dev->pci_dev, VIRTIO_FS_PCI_CACHE_BAR,
PCI_BASE_ADDRESS_SPACE_MEMORY |
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index b43725824f..fb16db7e0d 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -432,6 +432,7 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
unsigned int i;
size_t len;
int ret;
+ int mdvtfd = -1;
if (!fs->conf.chardev.chr) {
error_setg(errp, "missing chardev");
@@ -475,6 +476,28 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
"no smaller than the page size");
return;
}
+ if (fs->conf.mdvtpath) {
+ struct stat statbuf;
+
+ mdvtfd = open(fs->conf.mdvtpath, O_RDWR);
+ if (mdvtfd < 0) {
+ error_setg_errno(errp, errno,
+ "Failed to open meta-data version table '%s'",
+ fs->conf.mdvtpath);
+
+ return;
+ }
+ if (fstat(mdvtfd, &statbuf) == -1) {
+ error_setg_errno(errp, errno,
+ "Failed to stat meta-data version table '%s'",
+ fs->conf.mdvtpath);
+ close(mdvtfd);
+ return;
+ }
+
+ fs->mdvt_size = statbuf.st_size;
+ }
+
if (fs->conf.cache_size) {
/* Anonymous, private memory is not counted as overcommit */
cache_ptr = mmap(NULL, fs->conf.cache_size, DAX_WINDOW_PROT,
@@ -489,6 +512,14 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
fs->conf.cache_size, cache_ptr);
}
+    if (mdvtfd >= 0) { /* stays -1 when no versiontable path was given */
+        memory_region_init_ram_from_fd(&fs->mdvt, OBJECT(vdev),
+                                       "virtio-fs-mdvt",
+                                       fs->mdvt_size, true, mdvtfd, NULL);
+        /* The version table is read-only by the guest */
+        memory_region_set_readonly(&fs->mdvt, true);
+    }
+
if (!vhost_user_init(&fs->vhost_user, &fs->conf.chardev, errp)) {
return;
}
@@ -564,6 +595,7 @@ static Property vuf_properties[] = {
conf.num_request_queues, 1),
DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128),
DEFINE_PROP_SIZE("cache-size", VHostUserFS, conf.cache_size, 0),
+ DEFINE_PROP_STRING("versiontable", VHostUserFS, conf.mdvtpath),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h
index 845cdb0177..83015ac0fc 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -47,6 +47,7 @@ typedef struct {
uint16_t num_request_queues;
uint16_t queue_size;
uint64_t cache_size;
+ char *mdvtpath;
} VHostUserFSConf;
struct VHostUserFS {
@@ -61,6 +62,9 @@ struct VHostUserFS {
/*< public >*/
MemoryRegion cache;
+ /* Metadata version table */
+ size_t mdvt_size;
+ MemoryRegion mdvt;
};
/* Callbacks from the vhost-user code for slave commands */
diff --git a/include/standard-headers/linux/virtio_fs.h b/include/standard-headers/linux/virtio_fs.h
index 808aa3a402..a17b5172a8 100644
--- a/include/standard-headers/linux/virtio_fs.h
+++ b/include/standard-headers/linux/virtio_fs.h
@@ -18,6 +18,7 @@ struct virtio_fs_config {
/* For the id field in virtio_pci_shm_cap */
#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+#define VIRTIO_FS_SHMCAP_ID_VERTAB 1
#define VIRTIO_FS_PCI_CACHE_BAR 2
--
2.25.1

View File

@ -0,0 +1,35 @@
From e2a3c273639368221dae39a7f230a46d0a580e4d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 21 Jan 2020 10:20:14 +0000
Subject: [PATCH 29/29] virtiofsd: Add printf checking to fuse_log
Use qemu's GCC_FMT_ATTR to add printf style checking to fuse_log.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tools/virtiofsd/fuse_log.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h
index 8d7091bd4d..5c2df71603 100644
--- a/tools/virtiofsd/fuse_log.h
+++ b/tools/virtiofsd/fuse_log.h
@@ -14,6 +14,7 @@
* This file defines the logging interface of FUSE
*/
+#include "qemu/compiler.h"
/**
* Log severity level
@@ -68,6 +69,7 @@ void fuse_set_log_func(fuse_log_func_t func);
* @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc)
* @param fmt sprintf-style format string including newline
*/
+GCC_FMT_ATTR(2,3)
void fuse_log(enum fuse_log_level level, const char *fmt, ...);
#endif /* FUSE_LOG_H_ */
--
2.25.1

View File

@ -107,8 +107,8 @@ assets:
qemu-experimental:
description: "QEMU with virtiofs support"
url: "https://gitlab.com/virtio-fs/qemu"
version: "qemu5.0-virtiofs-with51bits-dax"
url: "https://github.com/qemu/qemu"
version: "470dd6bd360782f5137f7e3376af6a44658eb1d3"
image:
description: |