diff --git a/kernel/configs/x86_64_kata_kvm_4.19.x b/kernel/configs/x86_64_kata_kvm_4.19.x index 02dde38246..bf61ec9705 100644 --- a/kernel/configs/x86_64_kata_kvm_4.19.x +++ b/kernel/configs/x86_64_kata_kvm_4.19.x @@ -1,13 +1,13 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.19.24 Kernel Configuration +# Linux/x86 4.19.28 Kernel Configuration # # -# Compiler: gcc (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609 +# Compiler: gcc (Ubuntu 7.4.0-1ubuntu1~16.04~ppa1) 7.4.0 # CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=50400 +CONFIG_GCC_VERSION=70400 CONFIG_CLANG_VERSION=0 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_EXTABLE_SORT=y @@ -2597,7 +2597,9 @@ CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set CONFIG_AUTOFS4_FS=y CONFIG_AUTOFS_FS=y -# CONFIG_FUSE_FS is not set +CONFIG_FUSE_FS=y +# CONFIG_CUSE is not set +CONFIG_VIRTIO_FS=y # CONFIG_OVERLAY_FS is not set # diff --git a/kernel/kata_config_version b/kernel/kata_config_version index 7facc89938..81b5c5d06c 100644 --- a/kernel/kata_config_version +++ b/kernel/kata_config_version @@ -1 +1 @@ -36 +37 diff --git a/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch b/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch new file mode 100644 index 0000000000..0d6d631e80 --- /dev/null +++ b/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch @@ -0,0 +1,4604 @@ +From e480fb43fda5d90a6277e969ac74b9a5a60c3f05 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Tue, 12 Jun 2018 09:41:17 +0100 +Subject: [PATCH] fuse: add skeleton virtio_fs.ko module + +Add a basic file system module for virtio-fs. + +Signed-off-by: Stefan Hajnoczi + +fuse: add probe/remove virtio driver + +Add basic probe/remove functionality for the new virtio-fs device. + +Signed-off-by: Stefan Hajnoczi + +fuse: extract fuse_fill_super_common() + +fuse_fill_super() includes code to process the fd= option and link the +struct fuse_dev to the fd's struct file. In virtio-fs there is no file +descriptor because /dev/fuse is not used. + +This patch extracts fuse_fill_super_common() so that both classic fuse +and virtio-fs can share the code to initialize a mount. + +parse_fuse_opt() is also extracted so that the fuse_fill_super_common() +caller has access to the mount options. This allows classic fuse to +handle the fd= option outside fuse_fill_super_common(). + +Signed-off-by: Stefan Hajnoczi + +virtio_fs: get mount working + +Provide definitions of ->mount and ->kill_sb. This is still WIP. + +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_end_request() + +virtio-fs will need to complete requests from outside fs/fuse/dev.c. +Make the symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_len_args() + +virtio-fs will need to query the length of fuse_arg lists. Make the +symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: Export fuse_send_init_request() + +This will be used by virtio-fs to send init request to fuse server after +initialization of virt queues. + +Signed-off-by: Vivek Goyal + +fuse: add fuse_iqueue_ops callbacks + +The /dev/fuse device uses fiq->waitq and fasync to signal that requests +are available. These mechanisms do not apply to virtio-fs. This patch +introduces callbacks so alternative behavior can be used. 
+ +Note that queue_interrupt() changes along these lines: + + spin_lock(&fiq->waitq.lock); + wake_up_locked(&fiq->waitq); ++ kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->waitq.lock); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + +Since queue_request() and queue_forget() also call kill_fasync() inside +the spinlock this should be safe. + +Signed-off-by: Stefan Hajnoczi + +fuse: Separate fuse device allocation and installation in fuse_conn + +As of now fuse_dev_alloc() both allocates a fuse device and installs it +in the fuse_conn list. fuse_dev_alloc() can fail if fuse_device allocation +fails. + +virtio-fs needs to initialize multiple fuse devices (one per virtio +queue). It initializes one fuse device as part of the call to +fuse_fill_super_common() and the rest of the devices are allocated and +installed after that. + +But we can't afford to fail after calling fuse_fill_super_common() as +we don't have a way to undo all the actions done by fuse_fill_super_common(). +So to avoid failures after the call to fuse_fill_super_common(), +pre-allocate all fuse devices early and install them into the fuse connection +later. + +This patch provides two separate helpers for fuse device allocation and +fuse device installation in fuse_conn. + +Signed-off-by: Vivek Goyal + +fuse: process request queues + +Send normal requests to the device and handle completions. + +This is enough to get mount and basic I/O working. The hiprio and +notification queues still need to be implemented for full FUSE +functionality. + +Signed-off-by: Vivek Goyal +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_get_unique() + +virtio-fs will need unique IDs for FORGET requests from outside +fs/fuse/dev.c. Make the symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: implement FUSE_FORGET for virtio-fs + +Send single FUSE_FORGET requests on the hiprio queue. In the future it +may be possible to do FUSE_BATCH_FORGET but that is tricky since +virtio-fs gets called synchronously when forgets are queued. + +Signed-off-by: Stefan Hajnoczi + +virtio_fs: Set up dax_device + +Set up a dax device. + +Signed-off-by: Stefan Hajnoczi + +dax: remove block device dependencies + +Although struct dax_device itself is not tied to a block device, some +DAX code assumes there is a block device. Make block devices optional +by allowing bdev to be NULL in commonly used DAX APIs. + +When there is no block device: + * Skip the partition offset calculation in bdev_dax_pgoff() + * Skip the blkdev_issue_zeroout() optimization + +Note that more block device assumptions remain but I haven't reached those +code paths yet. + +Signed-off-by: Stefan Hajnoczi + +dax: Pass dax_dev to dax_writeback_mapping_range() + +Right now dax_writeback_mapping_range() is passed a bdev and the dax_dev +is looked up from that bdev name. + +virtio-fs does not have a bdev. So pass in dax_dev also to +dax_writeback_mapping_range(). If dax_dev is passed in, bdev is not +used; otherwise dax_dev is looked up using bdev. + +Signed-off-by: Vivek Goyal + +fuse: add fuse_conn->dax_dev field + +A struct dax_device instance is a prerequisite for the DAX filesystem +APIs. Let virtio_fs associate a dax_device with a fuse_conn. Classic +FUSE and CUSE set the pointer to NULL, disabling DAX. + +Signed-off-by: Stefan Hajnoczi + +virtio: Add get_shm_region method + +Virtio defines 'shared memory regions' that provide a continuously +shared region between the host and guest. + +Provide a method to find a particular region on a device. + +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio: Implement get_shm_region for PCI transport + +On PCI the shm regions are found using capability entries; +find a region by searching for the capability. + +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio: Implement get_shm_region for MMIO transport + +On MMIO a new set of registers is defined for finding SHM +regions. Add their definitions and use them to find the region. + +Signed-off-by: Sebastien Boeuf + +fuse: map virtio_fs DAX window + +Use the shm capability to find the cache entry and map it. + +The DAX window is accessed by the fs/dax.c infrastructure and must have +struct pages (at least on x86). Use devm_memremap_pages() to map the +DAX window PCI BAR and allocate struct page. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio-fs: Make dax optional + +Add a 'dax' option and only enable dax when it's on. + +Also show "dax" in the mount options if the filesystem was mounted with dax +enabled. + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal + +Limit number of pages returned by direct_access() + +Truncate the number of pages mapped by direct_access() so that it remains within the window +size. A user might request mapping pages beyond the window size. + +Signed-off-by: Vivek Goyal + +fuse: Introduce fuse_dax_mapping + +Introduce fuse_dax_mapping. This type will be used to keep track of +per-inode dax mappings. + +Signed-off-by: Vivek Goyal + +Create a list of free memory ranges + +Divide the dax memory range into fixed size ranges (2MB for now) and put +them in a list. This will track free ranges. Once an inode requires a +free range, we will take one from here and put it in the interval tree +of ranges assigned to the inode. + +Signed-off-by: Vivek Goyal + +fuse: simplify fuse_fill_super_common() calling + +Add more fields to "struct fuse_mount_data" so that fewer parameters +have to be passed to fuse_fill_super_common(). + +Signed-off-by: Miklos Szeredi + +fuse: Introduce setupmapping/removemapping commands + +Introduce two new fuse commands to set up/remove memory mappings. + +Signed-off-by: Vivek Goyal + +Introduce interval tree basic data structures + +We want to use an interval tree to keep track of per-inode dax mappings. +Introduce the basic data structures. + +Signed-off-by: Vivek Goyal + +fuse: Implement basic DAX read/write support commands + +This patch implements basic DAX support. mmap() is not implemented +yet and will come in later patches. This patch implements +read/write. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal + +fuse: Maintain a list of busy elements + +This list will be used for selecting a fuse_dax_mapping to free when the number of +free mappings drops below a threshold. + +Signed-off-by: Vivek Goyal + +Do fallocate() to grow file before mapping for file growing writes + +How should file growing writes be handled? For now, this patch does fallocate() to +grow the file and then maps it using dax. We need to figure out what's the best +way to handle it. + +This patch does the fallocate() and setup mapping operations in +fuse_dax_write_iter(), instead of iomap_begin(). I don't have access to the file +pointer needed to send a message to the fuse daemon in iomap_begin(). + +Dave Chinner has expressed concerns with this approach as it is not +atomic. If the guest crashes after fallocate() but before the data was written, +the user will think that the filesystem lost its data. So this is still an +outstanding issue. 
+ +Signed-off-by: Vivek Goyal + +fuse: add DAX mmap support + +Add DAX mmap() support. + +Signed-off-by: Stefan Hajnoczi + +fuse: delete dentry if timeout is zero + +Don't hold onto a dentry in the lru list if it needs to be re-looked up anyway at the next +access. + +A more advanced version of this patch would periodically flush out dentries +from the lru which have gone stale. + +Signed-off-by: Miklos Szeredi + +fuse: Define dax address space operations + +This is done along the lines of ext4 and xfs. I primarily wanted the ->writepages +hook at this time so that I could call into dax_writeback_mapping_range(). +This in turn will decide which pfns need to be written back and call +dax_flush() on those. + +Signed-off-by: Vivek Goyal + +fuse, dax: Take ->i_mmap_sem lock during dax page fault + +We need some kind of locking mechanism here. Normal file systems like +ext4 and xfs seem to take their own semaphore to protect against +truncate while a fault is going on. + +We have an additional requirement to protect against fuse dax memory range +reclaim. When a range has been selected for reclaim, we need to make sure +no other read/write/fault can try to access that memory range while +reclaim is in progress. Once reclaim is complete, the lock will be released +and read/write/fault will trigger allocation of a fresh dax range. + +Taking inode_lock() is not an option in the fault path as lockdep complains +about circular dependencies. So define a new fuse_inode->i_mmap_sem. + +Signed-off-by: Vivek Goyal + +fuse: Add logic to free up a memory range + +Add logic to free up a busy memory range. The freed memory range will be +returned to the free pool. Add a worker which can be started to select +and free some busy memory ranges. + +Signed-off-by: Vivek Goyal + +fuse: Add logic to do direct reclaim of memory + +This can be done only from the same inode. Also, it can be done only for +the read/write case and not for the fault case. The reason is that, as of now, reclaim requires +holding the inode_lock, fuse_inode->i_mmap_sem and fuse_inode->dmap_tree +locks in that order, and only the read/write path allows that (and not +the fault path). + +Signed-off-by: Vivek Goyal + +fuse: Kick worker when free memory drops below 20% of total ranges + +Kick the worker to free up some memory when the number of free ranges drops below +20% of the total free ranges at the time of initialization. + +Signed-off-by: Vivek Goyal + +fuse: multiplex cached/direct_io/dax file operations + +Dispatch FORGET requests later instead of dropping them + +If the virtio queue is full, then don't drop FORGET requests. Instead, wait +a bit and try to dispatch them a little later using a worker thread. + +Signed-off-by: Vivek Goyal + +Release file in process context + +fuse_file_put(sync) can be called with sync=true/false. If sync=true, +it waits for the release request response and then calls iput() in the +caller's context. If sync=false, it does not wait for the release request +response, frees the fuse_file struct immediately and the req->end function +does the iput(). + +iput() can be a problem with DAX if called in req->end context. If this +is the last reference to the inode (VFS has let go of its reference already), then +iput() will clean up DAX mappings as well, send REMOVEMAPPING requests +and wait for completion. (All of this happens in the worker thread context which is +processing fuse replies from the daemon on the host.) + +That means it blocks the worker thread, which stops processing further +replies, and the system deadlocks. + +So for now, force a sync release of the file in the case of DAX inodes. 
+ +Signed-off-by: Vivek Goyal + +fuse: Do not block on inode lock while freeing memory range + +Once we select a memory range to free, we currently block on the inode +lock. Do not block; use trylock instead, and move on to the next memory +range if trylock fails. + +The reason is that in the next few patches I want to enable waiting for +memory ranges to become free in fuse_iomap_begin(). So instead of +returning -EBUSY, a process will wait for a memory range to become +free. + +We don't want to end up in a situation where a process is sleeping in +iomap_begin() with the inode lock held and the worker is trying to free +memory from the same inode, resulting in deadlock. + +To avoid deadlock, use trylock instead. + +Signed-off-by: Vivek Goyal + +fuse: Reschedule dax free work if too many EAGAIN attempts + +fuse_dax_free_memory() can be very cpu intensive in corner cases. For example, +if one inode has consumed all the memory and a setupmapping request is +pending, that means the inode lock is held by the request and the worker thread will +not get the lock for a while. And given there is only one inode consuming all +the dax ranges, all the attempts to acquire the lock will fail. + +So if there are too many inode lock failures (-EAGAIN), reschedule the +worker with a 10ms delay. + +Signed-off-by: Vivek Goyal + +fuse: Wait for memory ranges to become free + +Sometimes we run out of memory ranges. So in that case, wait for memory +ranges to become free, instead of returning -EBUSY. + +The dax fault path is holding fuse_inode->i_mmap_sem, and once that is being +held, memory reclaim can't be done. It's not safe to wait while holding +fuse_inode->i_mmap_sem for two reasons. + +- The worker thread that frees memory might block on fuse_inode->i_mmap_sem as well. +- This inode is holding all the memory and more memory can't be freed. + +In both cases, deadlock will ensue. So return -ENOSPC from iomap_begin() +in the fault path if memory can't be allocated. Drop fuse_inode->i_mmap_sem, +wait for a free range to become available, and retry. + +The read/write path is a different story. We hold the inode lock, and lock ordering +allows us to grab fuse_inode->i_mmap_sem if needed. That means we can do direct +reclaim in that path. But if there is no memory allocated to this inode, +then direct reclaim will not work and we need to wait for a memory range +to become free. So try the following order. + +A. Try to get a free range. +B. If not, try direct reclaim. +C. If not, wait for a memory range to become free. + +Here sleeping with locks held should be fine because in step B we made +sure this inode is not holding any ranges. That means other inodes are +holding ranges and somebody should be able to free memory. Also, the worker +thread does a trylock() on the inode lock. That means the worker thread will not +wait on this inode and will move on to the next memory range. Hence the above sequence +should be deadlock-free. + +Signed-off-by: Vivek Goyal + +fuse: Take inode lock for dax inode truncation + +When a file is opened with O_TRUNC, we need to make sure that any other +DAX operation is not in progress. DAX expects i_size to be stable. + +In fuse_iomap_begin() we check i_size at multiple places and we expect +i_size not to change. + +Another problem is that if we set up a mapping in fuse_iomap_begin(), and the +file gets truncated and a dax read/write happens, KVM currently hangs. +It tries to fault in a page which does not exist on the host (the file got +truncated). This probably requires fixing in KVM. + +So for now, take the inode lock. Once KVM is fixed, we might have to +have a look at it again. 
+ +Signed-off-by: Vivek Goyal + +fuse: Clear setuid bit even in direct I/O path + +With cache=never, we fall back to direct IO. pjdfstest chmod test 12.t was +failing because if a file has setuid bit, it should be cleared if an +unpriviledged user opens it for write and writes to it. + +Call fuse_remove_privs() even for direct I/O path. + +Signed-off-by: Vivek Goyal + +virtio: Free fuse devices on umount + +When unmounting the fs close all the fuse devices. +This includes making sure the daemon gets a FUSE_DESTROY to +tell it. + +Signed-off-by: Dr. David Alan Gilbert + +virtio-fs: Fix a race in range reclaim + +We have the notion of doing inline dax range reclaim where caller does not +have to drop inode lock and reclaim one of it's dax ranges. It assumed +there is no other reader/writer using that inode (hence not using dax +range being reclaimed). + +But fuse read path takes shared inode lock. That means there could be other +readers while we need to do reclaim. If we try to reclaim now, it is possible +we end up reclaiming the range used by another process. + +To remove that race, do not try to do inline reclaim for read path. Instead +return -ENOSPC and fuse read path will try again when a free range is +available. + +Reported-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +--- + drivers/dax/super.c | 3 +- + drivers/virtio/virtio_mmio.c | 32 + + drivers/virtio/virtio_pci_modern.c | 108 +++ + fs/dax.c | 23 +- + fs/ext2/inode.c | 2 +- + fs/ext4/inode.c | 2 +- + fs/fuse/Kconfig | 11 + + fs/fuse/Makefile | 1 + + fs/fuse/cuse.c | 5 +- + fs/fuse/dev.c | 80 +- + fs/fuse/dir.c | 28 +- + fs/fuse/file.c | 1001 +++++++++++++++++++++++-- + fs/fuse/fuse_i.h | 202 ++++- + fs/fuse/inode.c | 316 +++++--- + fs/fuse/virtio_fs.c | 1121 ++++++++++++++++++++++++++++ + fs/splice.c | 3 +- + fs/xfs/xfs_aops.c | 2 +- + include/linux/dax.h | 6 +- + include/linux/fs.h | 2 + + include/linux/virtio_config.h | 17 + + include/uapi/linux/fuse.h | 34 + + include/uapi/linux/virtio_fs.h | 44 ++ + include/uapi/linux/virtio_ids.h | 1 + + include/uapi/linux/virtio_mmio.h | 11 + + include/uapi/linux/virtio_pci.h | 10 + + 25 files changed, 2883 insertions(+), 182 deletions(-) + create mode 100644 fs/fuse/virtio_fs.c + create mode 100644 include/uapi/linux/virtio_fs.h + +diff --git a/drivers/dax/super.c b/drivers/dax/super.c +index 6e928f37d084..74f3bf7ae822 100644 +--- a/drivers/dax/super.c ++++ b/drivers/dax/super.c +@@ -52,7 +52,8 @@ EXPORT_SYMBOL_GPL(dax_read_unlock); + int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) + { +- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; ++ sector_t start_sect = bdev ? 
get_start_sect(bdev) : 0; ++ phys_addr_t phys_off = (start_sect + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); +diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c +index 4cd9ea5c75be..9642fa8dbeb0 100644 +--- a/drivers/virtio/virtio_mmio.c ++++ b/drivers/virtio/virtio_mmio.c +@@ -494,6 +494,37 @@ static const char *vm_bus_name(struct virtio_device *vdev) + return vm_dev->pdev->name; + } + ++static bool vm_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); ++ u64 len, addr; ++ ++ /* Select the region we're interested in */ ++ writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); ++ ++ /* Read the region size */ ++ len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); ++ len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; ++ ++ region->len = len; ++ ++ /* Check if region length is -1. If that's the case, the shared memory ++ * region does not exist and there is no need to proceed further. ++ */ ++ if (len == ~(u64)0) { ++ return false; ++ } ++ ++ /* Read the region base address */ ++ addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); ++ addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; ++ ++ region->addr = addr; ++ ++ return true; ++} ++ + static const struct virtio_config_ops virtio_mmio_config_ops = { + .get = vm_get, + .set = vm_set, +@@ -506,6 +537,7 @@ static const struct virtio_config_ops virtio_mmio_config_ops = { + .get_features = vm_get_features, + .finalize_features = vm_finalize_features, + .bus_name = vm_bus_name, ++ .get_shm_region = vm_get_shm_region, + }; + + +diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c +index 07571daccfec..51c9e6eca5ac 100644 +--- a/drivers/virtio/virtio_pci_modern.c ++++ b/drivers/virtio/virtio_pci_modern.c +@@ -446,6 +446,112 @@ static void del_vq(struct virtio_pci_vq_info *info) + vring_del_virtqueue(vq); + } + ++static int virtio_pci_find_shm_cap(struct pci_dev *dev, ++ u8 required_id, ++ u8 *bar, u64 *offset, u64 *len) ++{ ++ int pos; ++ ++ for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); ++ pos > 0; ++ pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { ++ u8 type, cap_len, id; ++ u32 tmp32; ++ u64 res_offset, res_length; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ cfg_type), ++ &type); ++ if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) ++ continue; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ cap_len), ++ &cap_len); ++ if (cap_len != sizeof(struct virtio_pci_shm_cap)) { ++ printk(KERN_ERR "%s: shm cap with bad size offset: %d size: %d\n", ++ __func__, pos, cap_len); ++ continue; ++ }; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_shm_cap, ++ id), ++ &id); ++ if (id != required_id) ++ continue; ++ ++ /* Type, and ID match, looks good */ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ bar), ++ bar); ++ ++ /* Read the lower 32bit of length and offset */ ++ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, offset), ++ &tmp32); ++ res_offset = tmp32; ++ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, length), ++ &tmp32); ++ res_length = tmp32; ++ ++ /* and now the top half */ ++ pci_read_config_dword(dev, ++ pos + offsetof(struct virtio_pci_shm_cap, ++ offset_hi), ++ &tmp32); ++ res_offset |= ((u64)tmp32) << 32; ++ pci_read_config_dword(dev, ++ pos + offsetof(struct virtio_pci_shm_cap, ++ 
length_hi), ++ &tmp32); ++ res_length |= ((u64)tmp32) << 32; ++ ++ *offset = res_offset; ++ *len = res_length; ++ ++ return pos; ++ } ++ return 0; ++} ++ ++static bool vp_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ struct virtio_pci_device *vp_dev = to_vp_device(vdev); ++ struct pci_dev *pci_dev = vp_dev->pci_dev; ++ u8 bar; ++ u64 offset, len; ++ phys_addr_t phys_addr; ++ size_t bar_len; ++ char *bar_name; ++ int ret; ++ ++ if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len)) { ++ return false; ++ } ++ ++ ret = pci_request_region(pci_dev, bar, "virtio-pci-shm"); ++ if (ret < 0) { ++ dev_err(&pci_dev->dev, "%s: failed to request BAR\n", ++ __func__); ++ return false; ++ } ++ ++ phys_addr = pci_resource_start(pci_dev, bar); ++ bar_len = pci_resource_len(pci_dev, bar); ++ ++ if (offset + len > bar_len) { ++ dev_err(&pci_dev->dev, ++ "%s: bar shorter than cap offset+len\n", ++ __func__); ++ return false; ++ } ++ ++ region->len = len; ++ region->addr = (u64) phys_addr + offset; ++ ++ return true; ++} ++ + static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .get = NULL, + .set = NULL, +@@ -460,6 +566,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, ++ .get_shm_region = vp_get_shm_region, + }; + + static const struct virtio_config_ops virtio_pci_config_ops = { +@@ -476,6 +583,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = { + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, ++ .get_shm_region = vp_get_shm_region, + }; + + /** +diff --git a/fs/dax.c b/fs/dax.c +index 09fa70683c41..408a06b74335 100644 +--- a/fs/dax.c ++++ b/fs/dax.c +@@ -1021,12 +1021,12 @@ static int dax_writeback_one(struct dax_device *dax_dev, + * on persistent storage prior to completion of the operation. + */ + int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc) ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc) + { + struct inode *inode = mapping->host; + pgoff_t start_index, end_index; + pgoff_t indices[PAGEVEC_SIZE]; +- struct dax_device *dax_dev; + struct pagevec pvec; + bool done = false; + int i, ret = 0; +@@ -1037,9 +1037,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + +- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); +- if (!dax_dev) +- return -EIO; ++ if (bdev) { ++ WARN_ON(dax_dev); ++ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); ++ if (!dax_dev) ++ return -EIO; ++ } + + start_index = wbc->range_start >> PAGE_SHIFT; + end_index = wbc->range_end >> PAGE_SHIFT; +@@ -1073,7 +1076,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, + start_index = indices[pvec.nr - 1] + 1; + } + out: +- put_dax(dax_dev); ++ if (bdev) ++ put_dax(dax_dev); + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? 
ret : 0); + } +@@ -1141,7 +1145,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, + static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) + { +- unsigned short sector_size = bdev_logical_block_size(bdev); ++ unsigned short sector_size; ++ ++ if (!bdev) ++ return false; ++ ++ sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; +diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c +index e4bb9386c045..c9b024dafe09 100644 +--- a/fs/ext2/inode.c ++++ b/fs/ext2/inode.c +@@ -956,7 +956,7 @@ static int + ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) + { + return dax_writeback_mapping_range(mapping, +- mapping->host->i_sb->s_bdev, wbc); ++ mapping->host->i_sb->s_bdev, NULL, wbc); + } + + const struct address_space_operations ext2_aops = { +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 2c43c5b92229..a94aff38cda4 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2949,7 +2949,7 @@ static int ext4_dax_writepages(struct address_space *mapping, + percpu_down_read(&sbi->s_journal_flag_rwsem); + trace_ext4_writepages(inode, wbc); + +- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); ++ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, NULL, wbc); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); +diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig +index 76f09ce7e5b2..46e9a8ff9f7a 100644 +--- a/fs/fuse/Kconfig ++++ b/fs/fuse/Kconfig +@@ -26,3 +26,14 @@ config CUSE + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. ++ ++config VIRTIO_FS ++ tristate "Virtio Filesystem" ++ depends on FUSE_FS ++ select VIRTIO ++ help ++ The Virtio Filesystem allows guests to mount file systems from the ++ host. ++ ++ If you want to share files between guests or with the host, answer Y ++ or M. +diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile +index 60da84a86dab..d125ff826f2d 100644 +--- a/fs/fuse/Makefile ++++ b/fs/fuse/Makefile +@@ -4,5 +4,6 @@ + + obj-$(CONFIG_FUSE_FS) += fuse.o + obj-$(CONFIG_CUSE) += cuse.o ++obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o + + fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c +index 8f68181256c0..d49d64f42768 100644 +--- a/fs/fuse/cuse.c ++++ b/fs/fuse/cuse.c +@@ -503,9 +503,10 @@ static int cuse_channel_open(struct inode *inode, struct file *file) + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. 
+ */ +- fuse_conn_init(&cc->fc, file->f_cred->user_ns); ++ fuse_conn_init(&cc->fc, file->f_cred->user_ns, NULL, &fuse_dev_fiq_ops, ++ NULL); + +- fud = fuse_dev_alloc(&cc->fc); ++ fud = fuse_dev_alloc_install(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index baaed4d05b22..24d4a9b93fb6 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -103,6 +103,7 @@ void fuse_request_free(struct fuse_req *req) + } + kmem_cache_free(fuse_req_cachep, req); + } ++EXPORT_SYMBOL_GPL(fuse_request_free); + + void __fuse_get_request(struct fuse_req *req) + { +@@ -310,7 +311,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) + } + EXPORT_SYMBOL_GPL(fuse_put_request); + +-static unsigned len_args(unsigned numargs, struct fuse_arg *args) ++unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args) + { + unsigned nbytes = 0; + unsigned i; +@@ -320,19 +321,41 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) + + return nbytes; + } ++EXPORT_SYMBOL_GPL(fuse_len_args); + +-static u64 fuse_get_unique(struct fuse_iqueue *fiq) ++u64 fuse_get_unique(struct fuse_iqueue *fiq) + { + return ++fiq->reqctr; + } ++EXPORT_SYMBOL_GPL(fuse_get_unique); + +-static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) ++/** ++ * A new request is available, wake fiq->waitq ++ */ ++static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) + { +- req->in.h.len = sizeof(struct fuse_in_header) + +- len_args(req->in.numargs, (struct fuse_arg *) req->in.args); +- list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); ++ spin_unlock(&fiq->waitq.lock); ++} ++ ++const struct fuse_iqueue_ops fuse_dev_fiq_ops = { ++ .wake_forget_and_unlock = fuse_dev_wake_and_unlock, ++ .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, ++ .wake_pending_and_unlock = fuse_dev_wake_and_unlock, ++}; ++EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); ++ ++static void queue_request_and_unlock(struct fuse_iqueue *fiq, ++ struct fuse_req *req) ++__releases(fiq->waitq.lock) ++{ ++ req->in.h.len = sizeof(struct fuse_in_header) + ++ fuse_len_args(req->in.numargs, ++ (struct fuse_arg *) req->in.args); ++ list_add_tail(&req->list, &fiq->pending); ++ fiq->ops->wake_pending_and_unlock(fiq); + } + + void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, +@@ -347,12 +370,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; +- wake_up_locked(&fiq->waitq); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); ++ fiq->ops->wake_forget_and_unlock(fiq); + } else { + kfree(forget); ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); + } + + static void flush_bg_queue(struct fuse_conn *fc) +@@ -367,8 +389,7 @@ static void flush_bg_queue(struct fuse_conn *fc) + fc->active_background++; + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); +- queue_request(fiq, req); +- spin_unlock(&fiq->waitq.lock); ++ queue_request_and_unlock(fiq, req); + } + } + +@@ -380,7 +401,7 @@ static void flush_bg_queue(struct fuse_conn *fc) + * the 'end' callback is called if given, else the reference to the + * request is released + */ +-static void request_end(struct fuse_conn *fc, struct fuse_req *req) ++void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) + { + struct fuse_iqueue *fiq = &fc->iq; + +@@ -424,6 +445,7 
@@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) + put_request: + fuse_put_request(fc, req); + } ++EXPORT_SYMBOL_GPL(fuse_request_end); + + static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) + { +@@ -434,10 +456,10 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) + } + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); +- wake_up_locked(&fiq->waitq); ++ fiq->ops->wake_interrupt_and_unlock(fiq); ++ } else { ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + } + + static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +@@ -496,14 +518,13 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) + req->out.h.error = -ENOTCONN; + } else { + req->in.h.unique = fuse_get_unique(fiq); +- queue_request(fiq, req); + /* acquire extra reference, since request is still needed +- after request_end() */ ++ after fuse_request_end() */ + __fuse_get_request(req); +- spin_unlock(&fiq->waitq.lock); ++ queue_request_and_unlock(fiq, req); + + request_wait_answer(fc, req); +- /* Pairs with smp_wmb() in request_end() */ ++ /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); + } + } +@@ -635,10 +656,11 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, + req->in.h.unique = unique; + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { +- queue_request(fiq, req); ++ queue_request_and_unlock(fiq, req); + err = 0; ++ } else { ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); + + return err; + } +@@ -1236,7 +1258,7 @@ __releases(fiq->waitq.lock) + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling +- * request_end(). Otherwise add it to the processing list, and set ++ * fuse_request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ + static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, +@@ -1295,7 +1317,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; +- request_end(fc, req); ++ fuse_request_end(fc, req); + goto restart; + } + spin_lock(&fpq->lock); +@@ -1337,7 +1359,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); +- request_end(fc, req); ++ fuse_request_end(fc, req); + return err; + + err_unlock: +@@ -1824,7 +1846,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + +- reqsize += len_args(out->numargs, out->args); ++ reqsize += fuse_len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; +@@ -1844,7 +1866,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. +- * The request is finished by calling request_end() ++ * The request is finished by calling fuse_request_end(). 
+ */ + static ssize_t fuse_dev_do_write(struct fuse_dev *fud, + struct fuse_copy_state *cs, size_t nbytes) +@@ -1931,7 +1953,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, + list_del_init(&req->list); + spin_unlock(&fpq->lock); + +- request_end(fc, req); ++ fuse_request_end(fc, req); + + return err ? err : nbytes; + +@@ -2077,7 +2099,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) + req->out.h.error = -ECONNABORTED; + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); +- request_end(fc, req); ++ fuse_request_end(fc, req); + } + } + +@@ -2223,7 +2245,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) + if (new->private_data) + return -EINVAL; + +- fud = fuse_dev_alloc(fc); ++ fud = fuse_dev_alloc_install(fc); + if (!fud) + return -ENOMEM; + +diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c +index 82a13221775e..3f923fe7841a 100644 +--- a/fs/fuse/dir.c ++++ b/fs/fuse/dir.c +@@ -44,12 +44,26 @@ union fuse_dentry { + struct rcu_head rcu; + }; + +-static inline void fuse_dentry_settime(struct dentry *entry, u64 time) ++static void fuse_dentry_settime(struct dentry *dentry, u64 time) + { +- ((union fuse_dentry *) entry->d_fsdata)->time = time; ++ /* ++ * Mess with DCACHE_OP_DELETE because dput() will be faster without it. ++ * Don't care about races, either way it's just an optimization ++ */ ++ if ((time && (dentry->d_flags & DCACHE_OP_DELETE)) || ++ (!time && !(dentry->d_flags & DCACHE_OP_DELETE))) { ++ spin_lock(&dentry->d_lock); ++ if (time) ++ dentry->d_flags &= ~DCACHE_OP_DELETE; ++ else ++ dentry->d_flags |= DCACHE_OP_DELETE; ++ spin_unlock(&dentry->d_lock); ++ } ++ ++ ((union fuse_dentry *) dentry->d_fsdata)->time = time; + } + +-static inline u64 fuse_dentry_time(struct dentry *entry) ++static inline u64 fuse_dentry_time(const struct dentry *entry) + { + return ((union fuse_dentry *) entry->d_fsdata)->time; + } +@@ -280,8 +294,14 @@ static void fuse_dentry_release(struct dentry *dentry) + kfree_rcu(fd, rcu); + } + ++static int fuse_dentry_delete(const struct dentry *dentry) ++{ ++ return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); ++} ++ + const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, ++ .d_delete = fuse_dentry_delete, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, + }; +@@ -1728,8 +1748,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, + */ + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { ++ down_write(&fi->i_mmap_sem); + truncate_pagecache(inode, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); ++ up_write(&fi->i_mmap_sem); + } + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index bd500c3b7858..51faed351c7c 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -18,8 +18,18 @@ + #include + #include + #include ++#include ++#include ++#include + +-static const struct file_operations fuse_direct_io_file_operations; ++INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, ++ rb, __u64, __subtree_last, ++ START, LAST, static inline, fuse_dax_interval_tree); ++ ++static long __fuse_file_fallocate(struct file *file, int mode, ++ loff_t offset, loff_t length); ++static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc, ++ struct inode *inode); + + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +@@ -170,13 +180,222 @@ static void 
fuse_link_write_file(struct file *file) + spin_unlock(&fc->lock); + } + ++static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc) ++{ ++ unsigned long free_threshold; ++ struct fuse_dax_mapping *dmap = NULL; ++ ++ spin_lock(&fc->lock); ++ ++ /* TODO: Add logic to try to free up memory if wait is allowed */ ++ if (fc->nr_free_ranges <= 0) { ++ spin_unlock(&fc->lock); ++ goto out_kick; ++ } ++ ++ WARN_ON(list_empty(&fc->free_ranges)); ++ ++ /* Take a free range */ ++ dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping, ++ list); ++ list_del_init(&dmap->list); ++ fc->nr_free_ranges--; ++ spin_unlock(&fc->lock); ++ ++out_kick: ++ /* If number of free ranges are below threshold, start reclaim */ ++ free_threshold = max((fc->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD)/100, ++ (unsigned long)1); ++ if (fc->nr_free_ranges < free_threshold) { ++ pr_debug("fuse: Kicking dax memory reclaim worker. nr_free_ranges=0x%ld nr_total_ranges=%ld\n", fc->nr_free_ranges, fc->nr_ranges); ++ queue_delayed_work(system_long_wq, &fc->dax_free_work, 0); ++ } ++ return dmap; ++} ++ ++/* This assumes fc->lock is held */ ++static void __dmap_remove_busy_list(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ list_del_init(&dmap->busy_list); ++ WARN_ON(fc->nr_busy_ranges == 0); ++ fc->nr_busy_ranges--; ++} ++ ++static void dmap_remove_busy_list(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ spin_lock(&fc->lock); ++ __dmap_remove_busy_list(fc, dmap); ++ spin_unlock(&fc->lock); ++} ++ ++/* This assumes fc->lock is held */ ++static void __free_dax_mapping(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ list_add_tail(&dmap->list, &fc->free_ranges); ++ fc->nr_free_ranges++; ++ /* TODO: Wake up only when needed */ ++ wake_up(&fc->dax_range_waitq); ++} ++ ++static void free_dax_mapping(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ /* Return fuse_dax_mapping to free list */ ++ spin_lock(&fc->lock); ++ __free_dax_mapping(fc, dmap); ++ spin_unlock(&fc->lock); ++} ++ ++/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */ ++static int fuse_setup_one_mapping(struct inode *inode, ++ struct file *file, loff_t offset, ++ struct fuse_dax_mapping *dmap) ++{ ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_file *ff = NULL; ++ struct fuse_setupmapping_in inarg; ++ FUSE_ARGS(args); ++ ssize_t err; ++ ++ if (file) ++ ff = file->private_data; ++ ++ WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ); ++ WARN_ON(fc->nr_free_ranges < 0); ++ ++ /* Ask fuse daemon to setup mapping */ ++ memset(&inarg, 0, sizeof(inarg)); ++ inarg.foffset = offset; ++ if (ff) ++ inarg.fh = ff->fh; ++ else ++ inarg.fh = -1; ++ inarg.moffset = dmap->window_offset; ++ inarg.len = FUSE_DAX_MEM_RANGE_SZ; ++ if (file) { ++ inarg.flags |= (file->f_mode & FMODE_WRITE) ? ++ FUSE_SETUPMAPPING_FLAG_WRITE : 0; ++ inarg.flags |= (file->f_mode & FMODE_READ) ? ++ FUSE_SETUPMAPPING_FLAG_READ : 0; ++ } else { ++ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; ++ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; ++ } ++ args.in.h.opcode = FUSE_SETUPMAPPING; ++ args.in.h.nodeid = fi->nodeid; ++ args.in.numargs = 1; ++ args.in.args[0].size = sizeof(inarg); ++ args.in.args[0].value = &inarg; ++ err = fuse_simple_request(fc, &args); ++ if (err < 0) { ++ printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n", ++ __func__, dmap->window_offset, err); ++ return err; ++ } ++ ++ pr_debug("fuse_setup_one_mapping() succeeded. 
offset=0x%llx err=%zd\n", offset, err); ++ ++ /* ++ * We don't take a refernce on inode. inode is valid right now and ++ * when inode is going away, cleanup logic should first cleanup ++ * dmap entries. ++ * ++ * TODO: Do we need to ensure that we are holding inode lock ++ * as well. ++ */ ++ dmap->inode = inode; ++ dmap->start = offset; ++ dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1; ++ /* Protected by fi->i_dmap_sem */ ++ fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree); ++ fi->nr_dmaps++; ++ spin_lock(&fc->lock); ++ list_add_tail(&dmap->busy_list, &fc->busy_ranges); ++ fc->nr_busy_ranges++; ++ spin_unlock(&fc->lock); ++ return 0; ++} ++ ++static int fuse_removemapping_one(struct inode *inode, ++ struct fuse_dax_mapping *dmap) ++{ ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_removemapping_in inarg; ++ FUSE_ARGS(args); ++ ++ memset(&inarg, 0, sizeof(inarg)); ++ inarg.moffset = dmap->window_offset; ++ inarg.len = dmap->length; ++ args.in.h.opcode = FUSE_REMOVEMAPPING; ++ args.in.h.nodeid = fi->nodeid; ++ args.in.numargs = 1; ++ args.in.args[0].size = sizeof(inarg); ++ args.in.args[0].value = &inarg; ++ return fuse_simple_request(fc, &args); ++} ++ ++/* ++ * It is called from evict_inode() and by that time inode is going away. So ++ * this function does not take any locks like fi->i_dmap_sem for traversing ++ * that fuse inode interval tree. If that lock is taken then lock validator ++ * complains of deadlock situation w.r.t fs_reclaim lock. ++ */ ++void fuse_removemapping(struct inode *inode) ++{ ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ssize_t err; ++ struct fuse_dax_mapping *dmap; ++ ++ /* Clear the mappings list */ ++ while (true) { ++ WARN_ON(fi->nr_dmaps < 0); ++ ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, ++ -1); ++ if (dmap) { ++ fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree); ++ fi->nr_dmaps--; ++ dmap_remove_busy_list(fc, dmap); ++ } ++ ++ if (!dmap) ++ break; ++ ++ /* ++ * During umount/shutdown, fuse connection is dropped first ++ * and later evict_inode() is called later. That means any ++ * removemapping messages are going to fail. Send messages ++ * only if connection is up. Otherwise fuse daemon is ++ * responsible for cleaning up any leftover references and ++ * mappings. ++ */ ++ if (fc->connected) { ++ err = fuse_removemapping_one(inode, dmap); ++ if (err) { ++ pr_warn("Failed to removemapping. 
offset=0x%llx" ++ " len=0x%llx\n", dmap->window_offset, ++ dmap->length); ++ } ++ } ++ ++ dmap->inode = NULL; ++ ++ /* Add it back to free ranges list */ ++ free_dax_mapping(fc, dmap); ++ } ++} ++ + void fuse_finish_open(struct inode *inode, struct file *file) + { + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + +- if (ff->open_flags & FOPEN_DIRECT_IO) +- file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) +@@ -202,7 +421,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) + int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && +- fc->writeback_cache; ++ (fc->writeback_cache || IS_DAX(inode)); + + err = generic_file_open(inode, file); + if (err) +@@ -250,6 +469,7 @@ void fuse_release_common(struct file *file, bool isdir) + struct fuse_file *ff = file->private_data; + struct fuse_req *req = ff->reserved_req; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; ++ bool sync = false; + + fuse_prepare_release(ff, file->f_flags, opcode); + +@@ -270,8 +490,20 @@ void fuse_release_common(struct file *file, bool isdir) + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. ++ * ++ * For DAX, fuse server is trusted. So it should be fine to ++ * do a sync file put. Doing async file put is creating ++ * problems right now because when request finish, iput() ++ * can lead to freeing of inode. That means it tears down ++ * mappings backing DAX memory and sends REMOVEMAPPING message ++ * to server and blocks for completion. Currently, waiting ++ * in req->end context deadlocks the system as same worker thread ++ * can't process REMOVEMAPPING reply it is waiting for. + */ +- fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); ++ if (IS_DAX(req->misc.release.inode) || ff->fc->destroy_req != NULL) ++ sync = true; ++ ++ fuse_file_put(ff, sync, isdir); + } + + static int fuse_open(struct inode *inode, struct file *file) +@@ -916,11 +1148,23 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, + return err; + } + ++ ++static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to); ++static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ++ + static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + { +- struct inode *inode = iocb->ki_filp->f_mapping->host; ++ struct file *file = iocb->ki_filp; ++ struct fuse_file *ff = file->private_data; ++ struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_read_iter(iocb, to); ++ ++ if (IS_DAX(inode)) ++ return fuse_dax_read_iter(iocb, to); ++ + /* + * In auto invalidate mode, always update attributes on read. + * Otherwise, only update if we attempt to read past EOF (to ensure +@@ -1168,9 +1412,14 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, + return res > 0 ? 
res : err; + } + ++static ssize_t fuse_direct_write_iter(struct kiocb *iocb, ++ struct iov_iter *from); ++static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); ++ + static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { + struct file *file = iocb->ki_filp; ++ struct fuse_file *ff = file->private_data; + struct address_space *mapping = file->f_mapping; + ssize_t written = 0; + ssize_t written_buffered = 0; +@@ -1178,6 +1427,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + ssize_t err; + loff_t endbyte = 0; + ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_write_iter(iocb, from); ++ if (IS_DAX(inode)) ++ return fuse_dax_write_iter(iocb, from); ++ + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, file); +@@ -1442,16 +1696,279 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) + /* Don't allow parallel writes to the same file */ + inode_lock(inode); + res = generic_write_checks(iocb, from); +- if (res > 0) +- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); ++ if (res < 0) ++ goto out_invalidate; ++ ++ res = file_remove_privs(iocb->ki_filp); ++ if (res) ++ goto out_invalidate; ++ ++ res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); ++ if (res < 0) ++ goto out_invalidate; ++ + fuse_invalidate_attr(inode); +- if (res > 0) +- fuse_write_update_size(inode, iocb->ki_pos); ++ fuse_write_update_size(inode, iocb->ki_pos); + inode_unlock(inode); ++ return res; + ++out_invalidate: ++ fuse_invalidate_attr(inode); ++ inode_unlock(inode); + return res; + } + ++static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) ++{ ++ iomap->addr = IOMAP_NULL_ADDR; ++ iomap->length = length; ++ iomap->type = IOMAP_HOLE; ++} ++ ++static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, ++ struct iomap *iomap, struct fuse_dax_mapping *dmap, ++ unsigned flags) ++{ ++ loff_t offset, len; ++ loff_t i_size = i_size_read(inode); ++ ++ offset = pos - dmap->start; ++ len = min(length, dmap->length - offset); ++ ++ /* If length is beyond end of file, truncate further */ ++ if (pos + len > i_size) ++ len = i_size - pos; ++ ++ if (len > 0) { ++ iomap->addr = dmap->window_offset + offset; ++ iomap->length = len; ++ if (flags & IOMAP_FAULT) ++ iomap->length = ALIGN(len, PAGE_SIZE); ++ iomap->type = IOMAP_MAPPED; ++ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" ++ " length 0x%llx\n", __func__, iomap->addr, ++ iomap->offset, iomap->length); ++ } else { ++ /* Mapping beyond end of file is hole */ ++ fuse_fill_iomap_hole(iomap, length); ++ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" ++ "length 0x%llx\n", __func__, iomap->addr, ++ iomap->offset, iomap->length); ++ } ++} ++ ++/* This is just for DAX and the mapping is ephemeral, do not use it for other ++ * purposes since there is no block device with a permanent mapping. ++ */ ++static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, ++ unsigned flags, struct iomap *iomap) ++{ ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; ++ int ret; ++ ++ /* We don't support FIEMAP */ ++ BUG_ON(flags & IOMAP_REPORT); ++ ++ pr_debug("fuse_iomap_begin() called. 
pos=0x%llx length=0x%llx\n", ++ pos, length); ++ ++ iomap->offset = pos; ++ iomap->flags = 0; ++ iomap->bdev = NULL; ++ iomap->dax_dev = fc->dax_dev; ++ ++ /* ++ * Both read/write and mmap path can race here. So we need something ++ * to make sure if we are setting up mapping, then other path waits ++ * ++ * For now, use a semaphore for this. It probably needs to be ++ * optimized later. ++ */ ++ down_read(&fi->i_dmap_sem); ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos); ++ ++ if (dmap) { ++ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); ++ up_read(&fi->i_dmap_sem); ++ return 0; ++ } else { ++ up_read(&fi->i_dmap_sem); ++ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", ++ __func__, pos, length); ++ if (pos >= i_size_read(inode)) ++ goto iomap_hole; ++ ++ /* Can't do reclaim in fault path yet due to lock ordering. ++ * Read path takes shared inode lock and that's not sufficient ++ * for inline range reclaim. Caller needs to drop lock, wait ++ * and retry. ++ */ ++ if (flags & IOMAP_FAULT || !(flags & IOMAP_WRITE)) { ++ alloc_dmap = alloc_dax_mapping(fc); ++ if (!alloc_dmap) ++ return -ENOSPC; ++ } else { ++ alloc_dmap = alloc_dax_mapping_reclaim(fc, inode); ++ if (IS_ERR(alloc_dmap)) ++ return PTR_ERR(alloc_dmap); ++ } ++ ++ /* If we are here, we should have memory allocated */ ++ if (WARN_ON(!alloc_dmap)) ++ return -EBUSY; ++ ++ /* ++ * Drop read lock and take write lock so that only one ++ * caller can try to setup mapping and other waits ++ */ ++ down_write(&fi->i_dmap_sem); ++ /* ++ * We dropped lock. Check again if somebody else setup ++ * mapping already. ++ */ ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, ++ pos); ++ if (dmap) { ++ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); ++ free_dax_mapping(fc, alloc_dmap); ++ up_write(&fi->i_dmap_sem); ++ return 0; ++ } ++ ++ /* Setup one mapping */ ++ ret = fuse_setup_one_mapping(inode, NULL, ++ ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), ++ alloc_dmap); ++ if (ret < 0) { ++ printk("fuse_setup_one_mapping() failed. err=%d" ++ " pos=0x%llx\n", ret, pos); ++ free_dax_mapping(fc, alloc_dmap); ++ up_write(&fi->i_dmap_sem); ++ return ret; ++ } ++ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); ++ up_write(&fi->i_dmap_sem); ++ return 0; ++ } ++ ++ /* ++ * If read beyond end of file happnes, fs code seems to return ++ * it as hole ++ */ ++iomap_hole: ++ fuse_fill_iomap_hole(iomap, length); ++ pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length); ++ return 0; ++} ++ ++static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, ++ ssize_t written, unsigned flags, ++ struct iomap *iomap) ++{ ++ /* DAX writes beyond end-of-file aren't handled using iomap, so the ++ * file size is unchanged and there is nothing to do here. 
++ */ ++ return 0; ++} ++ ++static const struct iomap_ops fuse_iomap_ops = { ++ .iomap_begin = fuse_iomap_begin, ++ .iomap_end = fuse_iomap_end, ++}; ++ ++static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ ssize_t ret; ++ bool retry = false; ++ ++retry: ++ if (retry && !(fc->nr_free_ranges > 0)) { ++ ret = -EINTR; ++ if (wait_event_killable_exclusive(fc->dax_range_waitq, ++ (fc->nr_free_ranges > 0))) { ++ goto out; ++ } ++ } ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock_shared(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock_shared(inode); ++ } ++ ++ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); ++ inode_unlock_shared(inode); ++ ++ /* If a dax range could not be allocated and it can't be reclaimed ++ * inline, then drop inode lock and retry. Range reclaim logic ++ * requires exclusive access to inode lock. ++ * ++ * TODO: What if -ENOSPC needs to be returned to user space. Fix it. ++ */ ++ if (ret == -ENOSPC) { ++ retry = true; ++ goto retry; ++ } ++ /* TODO file_accessed(iocb->f_filp) */ ++ ++out: ++ return ret; ++} ++ ++static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto out; ++ ++ ret = file_remove_privs(iocb->ki_filp); ++ if (ret) ++ goto out; ++ /* TODO file_update_time() but we don't want metadata I/O */ ++ ++ /* TODO handle growing the file */ ++ /* Grow file here if need be. iomap_begin() does not have access ++ * to file pointer ++ */ ++ if (iov_iter_rw(from) == WRITE && ++ ((iocb->ki_pos + iov_iter_count(from)) > i_size_read(inode))) { ++ ret = __fuse_file_fallocate(iocb->ki_filp, 0, iocb->ki_pos, ++ iov_iter_count(from)); ++ if (ret < 0) { ++ printk("fallocate(offset=0x%llx length=0x%zx)" ++ " failed. err=%zd\n", iocb->ki_pos, ++ iov_iter_count(from), ret); ++ goto out; ++ } ++ pr_debug("fallocate(offset=0x%llx length=0x%zx)" ++ " succeed. 
ret=%zd\n", iocb->ki_pos, iov_iter_count(from), ret); ++ } ++ ++ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); ++ ++out: ++ inode_unlock(inode); ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ return ret; ++} ++ + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) + { + int i; +@@ -1901,6 +2418,17 @@ static int fuse_writepages_fill(struct page *page, + return err; + } + ++static int fuse_dax_writepages(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ ++ struct inode *inode = mapping->host; ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ ++ return dax_writeback_mapping_range(mapping, ++ NULL, fc->dax_dev, wbc); ++} ++ + static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) + { +@@ -2074,8 +2602,20 @@ static const struct vm_operations_struct fuse_file_vm_ops = { + .page_mkwrite = fuse_page_mkwrite, + }; + ++static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma); ++static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); ++ + static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) + { ++ struct fuse_file *ff = file->private_data; ++ ++ /* DAX mmap is superior to direct_io mmap */ ++ if (IS_DAX(file_inode(file))) ++ return fuse_dax_mmap(file, vma); ++ ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_mmap(file, vma); ++ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + +@@ -2095,6 +2635,103 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) + return generic_file_mmap(file, vma); + } + ++static ssize_t fuse_file_splice_read(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ struct fuse_file *ff = in->private_data; ++ ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return default_file_splice_read(in, ppos, pipe, len, flags); ++ else ++ return generic_file_splice_read(in, ppos, pipe, len, flags); ++ ++} ++static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size, ++ bool write) ++{ ++ int ret, error = 0; ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct super_block *sb = inode->i_sb; ++ pfn_t pfn; ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ bool retry = false; ++ ++ if (write) ++ sb_start_pagefault(sb); ++ ++retry: ++ if (retry && !(fc->nr_free_ranges > 0)) { ++ ret = -EINTR; ++ if (wait_event_killable_exclusive(fc->dax_range_waitq, ++ (fc->nr_free_ranges > 0))) ++ goto out; ++ } ++ ++ /* ++ * We need to serialize against not only truncate but also against ++ * fuse dax memory range reclaim. While a range is being reclaimed, ++ * we do not want any read/write/mmap to make progress and try ++ * to populate page cache or access memory we are trying to free. 
++ */ ++ down_read(&get_fuse_inode(inode)->i_mmap_sem); ++ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); ++ if ((ret & VM_FAULT_ERROR) && error == -ENOSPC) { ++ error = 0; ++ retry = true; ++ up_read(&get_fuse_inode(inode)->i_mmap_sem); ++ goto retry; ++ } ++ ++ if (ret & VM_FAULT_NEEDDSYNC) ++ ret = dax_finish_sync_fault(vmf, pe_size, pfn); ++ ++ up_read(&get_fuse_inode(inode)->i_mmap_sem); ++ ++out: ++ if (write) ++ sb_end_pagefault(sb); ++ ++ return ret; ++} ++ ++static int fuse_dax_fault(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, ++ vmf->flags & FAULT_FLAG_WRITE); ++} ++ ++static int fuse_dax_huge_fault(struct vm_fault *vmf, ++ enum page_entry_size pe_size) ++{ ++ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); ++} ++ ++static int fuse_dax_page_mkwrite(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); ++} ++ ++static int fuse_dax_pfn_mkwrite(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); ++} ++ ++static const struct vm_operations_struct fuse_dax_vm_ops = { ++ .fault = fuse_dax_fault, ++ .huge_fault = fuse_dax_huge_fault, ++ .page_mkwrite = fuse_dax_page_mkwrite, ++ .pfn_mkwrite = fuse_dax_pfn_mkwrite, ++}; ++ ++static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ vma->vm_ops = &fuse_dax_vm_ops; ++ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; ++ return 0; ++} ++ + static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, + struct file_lock *fl) +@@ -2938,8 +3575,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) + return ret; + } + +-static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, +- loff_t length) ++/* ++ * This variant does not take any inode lock and if locking is required, ++ * caller is supposed to hold lock ++ */ ++static long __fuse_file_fallocate(struct file *file, int mode, ++ loff_t offset, loff_t length) + { + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); +@@ -2953,8 +3594,6 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + .mode = mode + }; + int err; +- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || +- (mode & FALLOC_FL_PUNCH_HOLE); + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; +@@ -2962,17 +3601,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + if (fc->no_fallocate) + return -EOPNOTSUPP; + +- if (lock_inode) { +- inode_lock(inode); +- if (mode & FALLOC_FL_PUNCH_HOLE) { +- loff_t endbyte = offset + length - 1; +- err = filemap_write_and_wait_range(inode->i_mapping, +- offset, endbyte); +- if (err) +- goto out; +- +- fuse_sync_writes(inode); +- } ++ if (mode & FALLOC_FL_PUNCH_HOLE) { ++ loff_t endbyte = offset + length - 1; ++ err = filemap_write_and_wait_range(inode->i_mapping, offset, ++ endbyte); ++ if (err) ++ goto out; ++ fuse_sync_writes(inode); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE)) +@@ -2999,18 +3634,42 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + file_update_time(file); + } + +- if (mode & FALLOC_FL_PUNCH_HOLE) ++ if (mode & FALLOC_FL_PUNCH_HOLE) { ++ down_write(&fi->i_mmap_sem); + truncate_pagecache_range(inode, offset, offset + length - 1); +- ++ up_write(&fi->i_mmap_sem); ++ } + fuse_invalidate_attr(inode); + + out: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + ++ return err; ++} ++ ++static 
long fuse_file_fallocate(struct file *file, int mode, loff_t offset, ++ loff_t length) ++{ ++ struct fuse_file *ff = file->private_data; ++ struct inode *inode = file_inode(file); ++ struct fuse_conn *fc = ff->fc; ++ int err; ++ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || ++ (mode & FALLOC_FL_PUNCH_HOLE); ++ ++ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) ++ return -EOPNOTSUPP; ++ ++ if (fc->no_fallocate) ++ return -EOPNOTSUPP; ++ + if (lock_inode) +- inode_unlock(inode); ++ inode_lock(inode); + ++ err = __fuse_file_fallocate(file, mode, offset, length); ++ if (lock_inode) ++ inode_unlock(inode); + return err; + } + +@@ -3018,38 +3677,21 @@ static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read_iter = fuse_file_read_iter, + .write_iter = fuse_file_write_iter, +- .mmap = fuse_file_mmap, ++ .mmap = fuse_file_mmap, ++ .splice_read = fuse_file_splice_read, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, ++ .get_unmapped_area = thp_get_unmapped_area, + .flock = fuse_file_flock, +- .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, + }; + +-static const struct file_operations fuse_direct_io_file_operations = { +- .llseek = fuse_file_llseek, +- .read_iter = fuse_direct_read_iter, +- .write_iter = fuse_direct_write_iter, +- .mmap = fuse_direct_mmap, +- .open = fuse_open, +- .flush = fuse_flush, +- .release = fuse_release, +- .fsync = fuse_fsync, +- .lock = fuse_file_lock, +- .flock = fuse_file_flock, +- .unlocked_ioctl = fuse_file_ioctl, +- .compat_ioctl = fuse_file_compat_ioctl, +- .poll = fuse_file_poll, +- .fallocate = fuse_file_fallocate, +- /* no splice_read */ +-}; +- + static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, +@@ -3063,8 +3705,271 @@ static const struct address_space_operations fuse_file_aops = { + .write_end = fuse_write_end, + }; + ++static const struct address_space_operations fuse_dax_file_aops = { ++ .writepages = fuse_dax_writepages, ++ .direct_IO = noop_direct_IO, ++ .set_page_dirty = noop_set_page_dirty, ++ .invalidatepage = noop_invalidatepage, ++}; ++ + void fuse_init_file_inode(struct inode *inode) + { ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; ++ fi->dmap_tree = RB_ROOT_CACHED; ++ ++ if (fc->dax_dev) { ++ inode->i_flags |= S_DAX; ++ inode->i_data.a_ops = &fuse_dax_file_aops; ++ } ++} ++ ++int fuse_dax_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode, ++ struct fuse_dax_mapping *dmap) ++{ ++ int ret; ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ++ ret = filemap_fdatawrite_range(inode->i_mapping, dmap->start, ++ dmap->end); ++ if (ret) { ++ printk("filemap_fdatawrite_range() failed. err=%d start=0x%llx," ++ " end=0x%llx\n", ret, dmap->start, dmap->end); ++ return ret; ++ } ++ ++ ret = invalidate_inode_pages2_range(inode->i_mapping, ++ dmap->start >> PAGE_SHIFT, ++ dmap->end >> PAGE_SHIFT); ++ /* TODO: What to do if above fails? For now, ++ * leave the range in place. 
++	 */
++	if (ret) {
++		printk("invalidate_inode_pages2_range() failed err=%d\n", ret);
++		return ret;
++	}
++
++	/* Remove dax mapping from inode interval tree now */
++	fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
++	fi->nr_dmaps--;
++	return 0;
++}
++
++/* Find first mapping in the tree and free it. */
++struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping_locked(
++				struct fuse_conn *fc, struct inode *inode)
++{
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++	int ret;
++
++	/* Find fuse dax mapping at file offset inode. */
++	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, -1);
++	if (!dmap)
++		return NULL;
++
++	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
++	if (ret < 0)
++		return ERR_PTR(ret);
++
++	/* Clean up dmap. Do not add back to free list */
++	dmap_remove_busy_list(fc, dmap);
++	dmap->inode = NULL;
++	dmap->start = dmap->end = 0;
++
++	pr_debug("fuse: reclaimed memory range window_offset=0x%llx,"
++		" length=0x%llx\n", dmap->window_offset,
++		dmap->length);
++	return dmap;
++}
++
++/*
++ * Find first mapping in the tree, free it and return it. Do not add
++ * it back to free pool.
++ *
++ * This is called with inode lock held.
++ */
++struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping(struct fuse_conn *fc,
++					struct inode *inode)
++{
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++
++	down_write(&fi->i_mmap_sem);
++	down_write(&fi->i_dmap_sem);
++	dmap = fuse_dax_reclaim_first_mapping_locked(fc, inode);
++	up_write(&fi->i_dmap_sem);
++	up_write(&fi->i_mmap_sem);
++	return dmap;
++}
++
++static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
++					struct inode *inode)
++{
++	struct fuse_dax_mapping *dmap;
++	struct fuse_inode *fi = get_fuse_inode(inode);
++
++	while(1) {
++		dmap = alloc_dax_mapping(fc);
++		if (dmap)
++			return dmap;
++
++		if (fi->nr_dmaps)
++			return fuse_dax_reclaim_first_mapping(fc, inode);
++		/*
++		 * There are no mappings which can be reclaimed.
++		 * Wait for one.
++		 */
++		if (!(fc->nr_free_ranges > 0)) {
++			if (wait_event_killable_exclusive(fc->dax_range_waitq,
++					(fc->nr_free_ranges > 0)))
++				return ERR_PTR(-EINTR);
++		}
++	}
++}
++
++int fuse_dax_free_one_mapping_locked(struct fuse_conn *fc, struct inode *inode,
++				u64 dmap_start)
++{
++	int ret;
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++
++	WARN_ON(!inode_is_locked(inode));
++
++	/* Find fuse dax mapping at file offset inode. */
++	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
++						dmap_start);
++
++	/* Range already got cleaned up by somebody else */
++	if (!dmap)
++		return 0;
++
++	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
++	if (ret < 0)
++		return ret;
++
++	/* Cleanup dmap entry and add back to free list */
++	spin_lock(&fc->lock);
++	__dmap_remove_busy_list(fc, dmap);
++	dmap->inode = NULL;
++	dmap->start = dmap->end = 0;
++	__free_dax_mapping(fc, dmap);
++	spin_unlock(&fc->lock);
++
++	pr_debug("fuse: freed memory range window_offset=0x%llx,"
++		" length=0x%llx\n", dmap->window_offset,
++		dmap->length);
++	return ret;
++}
++
++/*
++ * Free a range of memory.
++ * Locking.
++ * 1. Take inode->i_rwsem to prevent further read/write.
++ * 2. Take fuse_inode->i_mmap_sem to block dax faults.
++ * 3. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
++ *    be strictly necessary as lock 1 and 2 seem sufficient.
++ */ ++int fuse_dax_free_one_mapping(struct fuse_conn *fc, struct inode *inode, ++ u64 dmap_start) ++{ ++ int ret; ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ++ /* ++ * If process is blocked waiting for memory while holding inode ++ * lock, we will deadlock. So continue to free next range. ++ */ ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ down_write(&fi->i_mmap_sem); ++ down_write(&fi->i_dmap_sem); ++ ret = fuse_dax_free_one_mapping_locked(fc, inode, dmap_start); ++ up_write(&fi->i_dmap_sem); ++ up_write(&fi->i_mmap_sem); ++ inode_unlock(inode); ++ return ret; ++} ++ ++int fuse_dax_free_memory(struct fuse_conn *fc, unsigned long nr_to_free) ++{ ++ struct fuse_dax_mapping *dmap, *pos, *temp; ++ int ret, nr_freed = 0, nr_eagain = 0; ++ u64 dmap_start = 0, window_offset = 0; ++ struct inode *inode = NULL; ++ ++ /* Pick first busy range and free it for now*/ ++ while(1) { ++ if (nr_freed >= nr_to_free) ++ break; ++ ++ if (nr_eagain > 20) { ++ queue_delayed_work(system_long_wq, &fc->dax_free_work, ++ msecs_to_jiffies(10)); ++ return 0; ++ } ++ ++ dmap = NULL; ++ spin_lock(&fc->lock); ++ ++ list_for_each_entry_safe(pos, temp, &fc->busy_ranges, ++ busy_list) { ++ inode = igrab(pos->inode); ++ /* ++ * This inode is going away. That will free ++ * up all the ranges anyway, continue to ++ * next range. ++ */ ++ if (!inode) ++ continue; ++ /* ++ * Take this element off list and add it tail. If ++ * inode lock can't be obtained, this will help with ++ * selecting new element ++ */ ++ dmap = pos; ++ list_move_tail(&dmap->busy_list, &fc->busy_ranges); ++ dmap_start = dmap->start; ++ window_offset = dmap->window_offset; ++ break; ++ } ++ spin_unlock(&fc->lock); ++ if (!dmap) ++ return 0; ++ ++ ret = fuse_dax_free_one_mapping(fc, inode, dmap_start); ++ iput(inode); ++ if (ret && ret != -EAGAIN) { ++ printk("%s(window_offset=0x%llx) failed. err=%d\n", ++ __func__, window_offset, ret); ++ return ret; ++ } ++ ++ /* Could not get inode lock. Try next element */ ++ if (ret == -EAGAIN) { ++ nr_eagain++; ++ continue; ++ } ++ nr_freed++; ++ } ++ return 0; ++} ++ ++/* TODO: This probably should go in inode.c */ ++void fuse_dax_free_mem_worker(struct work_struct *work) ++{ ++ int ret; ++ struct fuse_conn *fc = container_of(work, struct fuse_conn, ++ dax_free_work.work); ++ pr_debug("fuse: Worker to free memory called.\n"); ++ pr_debug("fuse: Worker to free memory called. nr_free_ranges=%lu" ++ " nr_busy_ranges=%lu\n", fc->nr_free_ranges, ++ fc->nr_busy_ranges); ++ ret = fuse_dax_free_memory(fc, FUSE_DAX_RECLAIM_CHUNK); ++ if (ret) ++ pr_debug("fuse: fuse_dax_free_memory() failed with err=%d\n", ret); + } +diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h +index cec8b8e74969..1149281ab1e8 100644 +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -43,6 +43,20 @@ + /** Number of page pointers embedded in fuse_req */ + #define FUSE_REQ_INLINE_PAGES 1 + ++/* Default memory range size, 2MB */ ++#define FUSE_DAX_MEM_RANGE_SZ (2*1024*1024) ++#define FUSE_DAX_MEM_RANGE_PAGES (FUSE_DAX_MEM_RANGE_SZ/PAGE_SIZE) ++ ++/* Number of ranges reclaimer will try to free in one invocation */ ++#define FUSE_DAX_RECLAIM_CHUNK (10) ++ ++/* ++ * Dax memory reclaim threshold in percetage of total ranges. 
When free ++ * number of free ranges drops below this threshold, reclaim can trigger ++ * Default is 20% ++ * */ ++#define FUSE_DAX_RECLAIM_THRESHOLD (20) ++ + /** List of active connections */ + extern struct list_head fuse_conn_list; + +@@ -53,12 +67,73 @@ extern struct mutex fuse_mutex; + extern unsigned max_user_bgreq; + extern unsigned max_user_congthresh; + ++/** Mount options */ ++struct fuse_mount_data { ++ int fd; ++ const char *tag; /* lifetime: .fill_super() data argument */ ++ unsigned rootmode; ++ kuid_t user_id; ++ kgid_t group_id; ++ unsigned fd_present:1; ++ unsigned tag_present:1; ++ unsigned rootmode_present:1; ++ unsigned user_id_present:1; ++ unsigned group_id_present:1; ++ unsigned default_permissions:1; ++ unsigned allow_other:1; ++ unsigned dax:1; ++ unsigned destroy:1; ++ unsigned max_read; ++ unsigned blksize; ++ ++ /* DAX device, may be NULL */ ++ struct dax_device *dax_dev; ++ ++ /* fuse input queue operations */ ++ const struct fuse_iqueue_ops *fiq_ops; ++ ++ /* device-specific state for fuse_iqueue */ ++ void *fiq_priv; ++ ++ /* fuse_dev pointer to fill in, should contain NULL on entry */ ++ void **fudptr; ++}; ++ + /* One forget request */ + struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; + }; + ++#define START(node) ((node)->start) ++#define LAST(node) ((node)->end) ++ ++/** Translation information for file offsets to DAX window offsets */ ++struct fuse_dax_mapping { ++ /* Pointer to inode where this memory range is mapped */ ++ struct inode *inode; ++ ++ /* Will connect in fc->free_ranges to keep track of free memory */ ++ struct list_head list; ++ ++ /* For interval tree in file/inode */ ++ struct rb_node rb; ++ /** Start Position in file */ ++ __u64 start; ++ /** End Position in file */ ++ __u64 end; ++ __u64 __subtree_last; ++ ++ /* Will connect in fc->busy_ranges to keep track busy memory */ ++ struct list_head busy_list; ++ ++ /** Position in DAX window */ ++ u64 window_offset; ++ ++ /** Length of mapping, in bytes */ ++ loff_t length; ++}; ++ + /** FUSE inode */ + struct fuse_inode { + /** Inode data */ +@@ -108,6 +183,22 @@ struct fuse_inode { + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; ++ ++ /* ++ * Semaphore to protect modifications to dmap_tree ++ */ ++ struct rw_semaphore i_dmap_sem; ++ ++ /** ++ * Can't take inode lock in fault path (leads to circular dependency). ++ * So take this in fuse dax fault path to make sure truncate and ++ * punch hole etc. can't make progress in parallel. ++ */ ++ struct rw_semaphore i_mmap_sem; ++ ++ /** Sorted rb tree of struct fuse_dax_mapping elements */ ++ struct rb_root_cached dmap_tree; ++ unsigned long nr_dmaps; + }; + + /** FUSE inode state bits */ +@@ -382,8 +473,44 @@ struct fuse_req { + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; ++ ++ /** virtio-fs's physically contiguous buffer for in and out args */ ++ void *argbuf; + }; + ++struct fuse_iqueue; ++ ++/** ++ * Input queue callbacks ++ * ++ * Input queue signalling is device-specific. For example, the /dev/fuse file ++ * uses fiq->waitq and fasync to wake processes that are waiting on queue ++ * readiness. These callbacks allow other device types to respond to input ++ * queue activity. 
++ */
++struct fuse_iqueue_ops {
++	/**
++	 * Signal that a forget has been queued
++	 */
++	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++
++	/**
++	 * Signal that an INTERRUPT request has been queued
++	 */
++	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++
++	/**
++	 * Signal that a request has been queued
++	 */
++	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++};
++
++/** /dev/fuse input queue operations */
++extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
++
+ struct fuse_iqueue {
+ 	/** Connection established */
+ 	unsigned connected;
+@@ -409,6 +536,12 @@ struct fuse_iqueue {
+ 
+ 	/** O_ASYNC requests */
+ 	struct fasync_struct *fasync;
++
++	/** Device-specific callbacks */
++	const struct fuse_iqueue_ops *ops;
++
++	/** Device-specific state */
++	void *priv;
+ };
+ 
+ struct fuse_pqueue {
+@@ -675,6 +808,28 @@ struct fuse_conn {
+ 
+ 	/** List of device instances belonging to this connection */
+ 	struct list_head devices;
++
++	/** DAX device, non-NULL if DAX is supported */
++	struct dax_device *dax_dev;
++
++	/* List of memory ranges which are busy */
++	unsigned long nr_busy_ranges;
++	struct list_head busy_ranges;
++
++	/* Worker to free up memory ranges */
++	struct delayed_work dax_free_work;
++
++	/* Wait queue for a dax range to become free */
++	wait_queue_head_t dax_range_waitq;
++
++	/*
++	 * DAX Window Free Ranges. TODO: This might not be best place to store
++	 * this free list
++	 */
++	unsigned long nr_free_ranges;
++	struct list_head free_ranges;
++
++	unsigned long nr_ranges;
+ };
+ 
+ static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+@@ -860,6 +1015,11 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+ void fuse_request_send_background_locked(struct fuse_conn *fc,
+ 					struct fuse_req *req);
+ 
++/**
++ * End a finished request
++ */
++void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
++
+ /* Abort all requests */
+ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+ void fuse_wait_aborted(struct fuse_conn *fc);
+@@ -881,16 +1041,42 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+ /**
+  * Initialize fuse_conn
+  */
+-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
++void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
++		struct dax_device *dax_dev,
++		const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
+ 
+ /**
+  * Release reference to fuse_conn
+  */
+ void fuse_conn_put(struct fuse_conn *fc);
+ 
+-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
++struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
++struct fuse_dev *fuse_dev_alloc(void);
++void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
+ void fuse_dev_free(struct fuse_dev *fud);
+ 
++/**
++ * Parse a mount options string
++ */
++int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
++		struct user_namespace *user_ns);
++
++/**
++ * Fill in superblock and initialize fuse connection
++ * @sb: partially-initialized superblock to fill in
++ * @mount_data: mount parameters
++ */
++int fuse_fill_super_common(struct super_block *sb,
++			struct fuse_mount_data *mount_data);
++void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
++
++/**
++ * Disassociate fuse connection from superblock and kill the superblock
++ *
++ * Calls kill_anon_super(), do not use with bdev mounts.
++ */ ++void fuse_kill_sb_anon(struct super_block *sb); ++ + /** + * Add connection to control filesystem + */ +@@ -992,4 +1178,16 @@ struct posix_acl; + struct posix_acl *fuse_get_acl(struct inode *inode, int type); + int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + ++/** ++ * Return the number of bytes in an arguments list ++ */ ++unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args); ++ ++/** ++ * Get the next unique ID for a request ++ */ ++u64 fuse_get_unique(struct fuse_iqueue *fiq); ++void fuse_dax_free_mem_worker(struct work_struct *work); ++void fuse_removemapping(struct inode *inode); ++ + #endif /* _FS_FUSE_I_H */ +diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c +index db9e60b7eb69..dd16c7f6a561 100644 +--- a/fs/fuse/inode.c ++++ b/fs/fuse/inode.c +@@ -22,6 +22,8 @@ + #include + #include + #include ++#include ++#include + + MODULE_AUTHOR("Miklos Szeredi "); + MODULE_DESCRIPTION("Filesystem in Userspace"); +@@ -59,21 +61,6 @@ MODULE_PARM_DESC(max_user_congthresh, + /** Congestion starts at 75% of maximum */ + #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +-struct fuse_mount_data { +- int fd; +- unsigned rootmode; +- kuid_t user_id; +- kgid_t group_id; +- unsigned fd_present:1; +- unsigned rootmode_present:1; +- unsigned user_id_present:1; +- unsigned group_id_present:1; +- unsigned default_permissions:1; +- unsigned allow_other:1; +- unsigned max_read; +- unsigned blksize; +-}; +- + struct fuse_forget_link *fuse_alloc_forget(void) + { + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +@@ -96,11 +83,14 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) + fi->writectr = 0; + fi->orig_ino = 0; + fi->state = 0; ++ fi->nr_dmaps = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + mutex_init(&fi->mutex); ++ init_rwsem(&fi->i_mmap_sem); ++ init_rwsem(&fi->i_dmap_sem); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); +@@ -133,6 +123,10 @@ static void fuse_evict_inode(struct inode *inode) + if (inode->i_sb->s_flags & SB_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); ++ if (IS_DAX(inode)) { ++ fuse_removemapping(inode); ++ WARN_ON(fi->nr_dmaps); ++ } + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +@@ -447,6 +441,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) + + enum { + OPT_FD, ++ OPT_TAG, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, +@@ -454,11 +449,13 @@ enum { + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, ++ OPT_DAX, + OPT_ERR + }; + + static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, ++ {OPT_TAG, "tag=%s"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, +@@ -466,6 +463,7 @@ static const match_table_t tokens = { + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, ++ {OPT_DAX, "dax"}, + {OPT_ERR, NULL} + }; + +@@ -480,7 +478,7 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) + return err; + } + +-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, ++int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + struct user_namespace *user_ns) + { + char *p; +@@ -505,6 +503,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + 
d->fd_present = 1; + break; + ++ case OPT_TAG: ++ d->tag = args[0].from; ++ d->tag_present = 1; ++ break; ++ + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; +@@ -552,17 +555,22 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + d->blksize = value; + break; + ++ case OPT_DAX: ++ d->dax = 1; ++ break; ++ + default: + return 0; + } + } + +- if (!d->fd_present || !d->rootmode_present || +- !d->user_id_present || !d->group_id_present) ++ if (!d->rootmode_present || !d->user_id_present || ++ !d->group_id_present) + return 0; + + return 1; + } ++EXPORT_SYMBOL_GPL(parse_fuse_opt); + + static int fuse_show_options(struct seq_file *m, struct dentry *root) + { +@@ -579,10 +587,14 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); ++ if (fc->dax_dev) ++ seq_printf(m, ",dax"); + return 0; + } + +-static void fuse_iqueue_init(struct fuse_iqueue *fiq) ++static void fuse_iqueue_init(struct fuse_iqueue *fiq, ++ const struct fuse_iqueue_ops *ops, ++ void *priv) + { + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); +@@ -590,6 +602,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; ++ fiq->ops = ops; ++ fiq->priv = priv; + } + + static void fuse_pqueue_init(struct fuse_pqueue *fpq) +@@ -601,7 +615,84 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) + fpq->connected = 1; + } + +-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) ++static void fuse_free_dax_mem_ranges(struct list_head *mem_list) ++{ ++ struct fuse_dax_mapping *range, *temp; ++ ++ /* Free All allocated elements */ ++ list_for_each_entry_safe(range, temp, mem_list, list) { ++ list_del(&range->list); ++ if (!list_empty(&range->busy_list)) ++ list_del(&range->busy_list); ++ kfree(range); ++ } ++} ++ ++#ifdef CONFIG_FS_DAX ++static int fuse_dax_mem_range_init(struct fuse_conn *fc, ++ struct dax_device *dax_dev) ++{ ++ long nr_pages, nr_ranges; ++ void *kaddr; ++ pfn_t pfn; ++ struct fuse_dax_mapping *range; ++ LIST_HEAD(mem_ranges); ++ phys_addr_t phys_addr; ++ int ret = 0, id; ++ size_t dax_size = -1; ++ unsigned long allocated_ranges = 0, i; ++ ++ id = dax_read_lock(); ++ nr_pages = dax_direct_access(dax_dev, 0, PHYS_PFN(dax_size), &kaddr, ++ &pfn); ++ dax_read_unlock(id); ++ if (nr_pages < 0) { ++ pr_debug("dax_direct_access() returned %ld\n", nr_pages); ++ return nr_pages; ++ } ++ ++ phys_addr = pfn_t_to_phys(pfn); ++ nr_ranges = nr_pages/FUSE_DAX_MEM_RANGE_PAGES; ++ printk("fuse_dax_mem_range_init(): dax mapped %ld pages. nr_ranges=%ld\n", nr_pages, nr_ranges); ++ ++ for (i = 0; i < nr_ranges; i++) { ++ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); ++ if (!range) { ++ pr_debug("memory allocation for mem_range failed.\n"); ++ ret = -ENOMEM; ++ goto out_err; ++ } ++ /* TODO: This offset only works if virtio-fs driver is not ++ * having some memory hidden at the beginning. 
This needs ++ * better handling ++ */ ++ range->window_offset = i * FUSE_DAX_MEM_RANGE_SZ; ++ range->length = FUSE_DAX_MEM_RANGE_SZ; ++ list_add_tail(&range->list, &mem_ranges); ++ INIT_LIST_HEAD(&range->busy_list); ++ allocated_ranges++; ++ } ++ ++ list_replace_init(&mem_ranges, &fc->free_ranges); ++ fc->nr_free_ranges = allocated_ranges; ++ fc->nr_ranges = allocated_ranges; ++ return 0; ++out_err: ++ /* Free All allocated elements */ ++ fuse_free_dax_mem_ranges(&mem_ranges); ++ return ret; ++} ++#else /* !CONFIG_FS_DAX */ ++static inline int fuse_dax_mem_range_init(struct fuse_conn *fc, ++ struct dax_device *dax_dev) ++{ ++ return 0; ++} ++#endif /* CONFIG_FS_DAX */ ++ ++void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, ++ struct dax_device *dax_dev, ++ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) + { + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); +@@ -610,7 +701,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) + atomic_set(&fc->dev_count, 1); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); +- fuse_iqueue_init(&fc->iq); ++ init_waitqueue_head(&fc->dax_range_waitq); ++ fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + INIT_LIST_HEAD(&fc->devices); +@@ -625,7 +717,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); ++ fc->dax_dev = dax_dev; + fc->user_ns = get_user_ns(user_ns); ++ INIT_LIST_HEAD(&fc->free_ranges); ++ INIT_LIST_HEAD(&fc->busy_ranges); ++ INIT_DELAYED_WORK(&fc->dax_free_work, fuse_dax_free_mem_worker); + } + EXPORT_SYMBOL_GPL(fuse_conn_init); + +@@ -634,6 +730,9 @@ void fuse_conn_put(struct fuse_conn *fc) + if (refcount_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); ++ flush_delayed_work(&fc->dax_free_work); ++ if (fc->dax_dev) ++ fuse_free_dax_mem_ranges(&fc->free_ranges); + put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); + fc->release(fc); +@@ -943,7 +1042,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) + wake_up_all(&fc->blocked_waitq); + } + +-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) ++void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) + { + struct fuse_init_in *arg = &req->misc.init_in; + +@@ -972,6 +1071,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) + req->end = process_init_reply; + fuse_request_send_background(fc, req); + } ++EXPORT_SYMBOL_GPL(fuse_send_init); + + static void fuse_free_conn(struct fuse_conn *fc) + { +@@ -1019,24 +1119,38 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) + return 0; + } + +-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +-{ ++struct fuse_dev *fuse_dev_alloc(void) { + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); +- if (fud) { +- fud->fc = fuse_conn_get(fc); ++ if (fud) + fuse_pqueue_init(&fud->pq); + +- spin_lock(&fc->lock); +- list_add_tail(&fud->entry, &fc->devices); +- spin_unlock(&fc->lock); +- } +- + return fud; + } + EXPORT_SYMBOL_GPL(fuse_dev_alloc); + ++void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) { ++ fud->fc = fuse_conn_get(fc); ++ spin_lock(&fc->lock); ++ list_add_tail(&fud->entry, &fc->devices); ++ spin_unlock(&fc->lock); ++} 
++EXPORT_SYMBOL_GPL(fuse_dev_install); ++ ++struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) ++{ ++ struct fuse_dev *fud; ++ ++ fud = fuse_dev_alloc(); ++ if (!fud) ++ return NULL; ++ ++ fuse_dev_install(fud, fc); ++ return fud; ++} ++EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); ++ + void fuse_dev_free(struct fuse_dev *fud) + { + struct fuse_conn *fc = fud->fc; +@@ -1052,15 +1166,13 @@ void fuse_dev_free(struct fuse_dev *fud) + } + EXPORT_SYMBOL_GPL(fuse_dev_free); + +-static int fuse_fill_super(struct super_block *sb, void *data, int silent) ++int fuse_fill_super_common(struct super_block *sb, ++ struct fuse_mount_data *mount_data) + { + struct fuse_dev *fud; + struct fuse_conn *fc; + struct inode *root; +- struct fuse_mount_data d; +- struct file *file; + struct dentry *root_dentry; +- struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + +@@ -1070,13 +1182,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + +- if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) +- goto err; +- + if (is_bdev) { + #ifdef CONFIG_BLOCK + err = -EINVAL; +- if (!sb_set_blocksize(sb, d.blksize)) ++ if (!sb_set_blocksize(sb, mount_data->blksize)) + goto err; + #endif + } else { +@@ -1093,19 +1202,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + +- file = fget(d.fd); +- err = -EINVAL; +- if (!file) +- goto err; +- +- /* +- * Require mount to happen from the same user namespace which +- * opened /dev/fuse to prevent potential attacks. +- */ +- if (file->f_op != &fuse_dev_operations || +- file->f_cred->user_ns != sb->s_user_ns) +- goto err_fput; +- + /* + * If we are not in the initial user namespace posix + * acls must be translated. 
+@@ -1116,12 +1212,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) +- goto err_fput; ++ goto err; + +- fuse_conn_init(fc, sb->s_user_ns); ++ fuse_conn_init(fc, sb->s_user_ns, mount_data->dax_dev, ++ mount_data->fiq_ops, mount_data->fiq_priv); + fc->release = fuse_free_conn; + +- fud = fuse_dev_alloc(fc); ++ if (mount_data->dax_dev) { ++ err = fuse_dax_mem_range_init(fc, mount_data->dax_dev); ++ if (err) { ++ pr_debug("fuse_dax_mem_range_init() returned %d\n", err); ++ goto err_free_ranges; ++ } ++ } ++ ++ fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err_put_conn; + +@@ -1136,17 +1241,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + fc->dont_mask = 1; + sb->s_flags |= SB_POSIXACL; + +- fc->default_permissions = d.default_permissions; +- fc->allow_other = d.allow_other; +- fc->user_id = d.user_id; +- fc->group_id = d.group_id; +- fc->max_read = max_t(unsigned, 4096, d.max_read); ++ fc->default_permissions = mount_data->default_permissions; ++ fc->allow_other = mount_data->allow_other; ++ fc->user_id = mount_data->user_id; ++ fc->group_id = mount_data->group_id; ++ fc->max_read = max_t(unsigned, 4096, mount_data->max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; +- root = fuse_get_root_inode(sb, d.rootmode); ++ root = fuse_get_root_inode(sb, mount_data->rootmode); + sb->s_d_op = &fuse_root_dentry_operations; + root_dentry = d_make_root(root); + if (!root_dentry) +@@ -1154,20 +1259,15 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + /* Root dentry doesn't have .d_revalidate */ + sb->s_d_op = &fuse_dentry_operations; + +- init_req = fuse_request_alloc(0); +- if (!init_req) +- goto err_put_root; +- __set_bit(FR_BACKGROUND, &init_req->flags); +- +- if (is_bdev) { ++ if (mount_data->destroy) { + fc->destroy_req = fuse_request_alloc(0); + if (!fc->destroy_req) +- goto err_free_init_req; ++ goto err_put_root; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; +- if (file->private_data) ++ if (*mount_data->fudptr) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); +@@ -1176,35 +1276,82 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; +- file->private_data = fud; ++ *mount_data->fudptr = fud; + mutex_unlock(&fuse_mutex); +- /* +- * atomic_dec_and_test() in fput() provides the necessary +- * memory barrier for file->private_data to be visible on all +- * CPUs after this +- */ +- fput(file); +- +- fuse_send_init(fc, init_req); +- + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); +- err_free_init_req: +- fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); ++ err_free_ranges: ++ if (mount_data->dax_dev) ++ fuse_free_dax_mem_ranges(&fc->free_ranges); + err_put_conn: + fuse_conn_put(fc); + sb->s_fs_info = NULL; +- err_fput: +- fput(file); + err: + return err; + } ++EXPORT_SYMBOL_GPL(fuse_fill_super_common); ++ ++static int fuse_fill_super(struct super_block *sb, void *data, int silent) ++{ ++ struct fuse_mount_data d; ++ struct file *file; ++ int is_bdev = sb->s_bdev != NULL; ++ int err; ++ struct fuse_req *init_req; ++ ++ err = -EINVAL; ++ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) ++ goto err; ++ if (!d.fd_present || d.tag_present) ++ goto err; ++ ++ file = fget(d.fd); ++ if (!file) ++ goto err; ++ ++ /* ++ * Require mount 
to happen from the same user namespace which ++ * opened /dev/fuse to prevent potential attacks. ++ */ ++ if ((file->f_op != &fuse_dev_operations) || ++ (file->f_cred->user_ns != sb->s_user_ns)) ++ goto err_fput; ++ ++ init_req = fuse_request_alloc(0); ++ if (!init_req) ++ goto err_fput; ++ __set_bit(FR_BACKGROUND, &init_req->flags); ++ ++ d.dax_dev = NULL; ++ d.fiq_ops = &fuse_dev_fiq_ops; ++ d.fiq_priv = NULL; ++ d.fudptr = &file->private_data; ++ d.destroy = is_bdev; ++ err = fuse_fill_super_common(sb, &d); ++ if (err < 0) ++ goto err_free_init_req; ++ /* ++ * atomic_dec_and_test() in fput() provides the necessary ++ * memory barrier for file->private_data to be visible on all ++ * CPUs after this ++ */ ++ fput(file); ++ fuse_send_init(get_fuse_conn_super(sb), init_req); ++ return 0; ++ ++err_free_init_req: ++ fuse_request_free(init_req); ++err_fput: ++ fput(file); ++err: ++ return err; ++} + + static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, +@@ -1229,11 +1376,12 @@ static void fuse_sb_destroy(struct super_block *sb) + } + } + +-static void fuse_kill_sb_anon(struct super_block *sb) ++void fuse_kill_sb_anon(struct super_block *sb) + { + fuse_sb_destroy(sb); + kill_anon_super(sb); + } ++EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); + + static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, +diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c +new file mode 100644 +index 000000000000..a0a2cd1cefc7 +--- /dev/null ++++ b/fs/fuse/virtio_fs.c +@@ -0,0 +1,1121 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * virtio-fs: Virtio Filesystem ++ * Copyright (C) 2018 Red Hat, Inc. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fuse_i.h" ++ ++/* List of virtio-fs device instances and a lock for the list */ ++static DEFINE_MUTEX(virtio_fs_mutex); ++static LIST_HEAD(virtio_fs_instances); ++ ++enum { ++ VQ_HIPRIO, ++ VQ_REQUEST ++}; ++ ++/* Per-virtqueue state */ ++struct virtio_fs_vq { ++ struct virtqueue *vq; /* protected by fpq->lock */ ++ struct work_struct done_work; ++ struct list_head queued_reqs; ++ struct delayed_work dispatch_work; ++ struct fuse_dev *fud; ++ char name[24]; ++} ____cacheline_aligned_in_smp; ++ ++/* State needed for devm_memremap_pages(). This API is called on the ++ * underlying pci_dev instead of struct virtio_fs (layering violation). Since ++ * the memremap release function only gets called when the pci_dev is released, ++ * keep the associated state separate from struct virtio_fs (it has a different ++ * lifecycle from pci_dev). 
++ */ ++struct virtio_fs_memremap_info { ++ struct dev_pagemap pgmap; ++ struct percpu_ref ref; ++ struct completion completion; ++}; ++ ++/* A virtio-fs device instance */ ++struct virtio_fs { ++ struct list_head list; /* on virtio_fs_instances */ ++ char *tag; ++ struct virtio_fs_vq *vqs; ++ unsigned nvqs; /* number of virtqueues */ ++ unsigned num_queues; /* number of request queues */ ++ struct dax_device *dax_dev; ++ ++ /* DAX memory window where file contents are mapped */ ++ void *window_kaddr; ++ phys_addr_t window_phys_addr; ++ size_t window_len; ++}; ++ ++struct virtio_fs_forget { ++ struct fuse_in_header ih; ++ struct fuse_forget_in arg; ++ /* This request can be temporarily queued on virt queue */ ++ struct list_head list; ++}; ++ ++static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) ++{ ++ struct virtio_fs *fs = vq->vdev->priv; ++ ++ return &fs->vqs[vq->index]; ++} ++ ++static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) ++{ ++ return &vq_to_fsvq(vq)->fud->pq; ++} ++ ++/* Add a new instance to the list or return -EEXIST if tag name exists*/ ++static int virtio_fs_add_instance(struct virtio_fs *fs) ++{ ++ struct virtio_fs *fs2; ++ bool duplicate = false; ++ ++ mutex_lock(&virtio_fs_mutex); ++ ++ list_for_each_entry(fs2, &virtio_fs_instances, list) { ++ if (strcmp(fs->tag, fs2->tag) == 0) ++ duplicate = true; ++ } ++ ++ if (!duplicate) ++ list_add_tail(&fs->list, &virtio_fs_instances); ++ ++ mutex_unlock(&virtio_fs_mutex); ++ ++ if (duplicate) ++ return -EEXIST; ++ return 0; ++} ++ ++/* Return the virtio_fs with a given tag, or NULL */ ++static struct virtio_fs *virtio_fs_find_instance(const char *tag) ++{ ++ struct virtio_fs *fs; ++ ++ mutex_lock(&virtio_fs_mutex); ++ ++ list_for_each_entry(fs, &virtio_fs_instances, list) { ++ if (strcmp(fs->tag, tag) == 0) ++ goto found; ++ } ++ ++ fs = NULL; /* not found */ ++ ++found: ++ mutex_unlock(&virtio_fs_mutex); ++ ++ return fs; ++} ++ ++static void virtio_fs_free_devs(struct virtio_fs *fs) ++{ ++ unsigned int i; ++ ++ /* TODO lock */ ++ ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ if (!fsvq->fud) ++ continue; ++ ++ flush_work(&fsvq->done_work); ++ flush_delayed_work(&fsvq->dispatch_work); ++ ++ fuse_dev_free(fsvq->fud); /* TODO need to quiesce/end_requests/decrement dev_count */ ++ fsvq->fud = NULL; ++ } ++} ++ ++/* Read filesystem name from virtio config into fs->tag (must kfree()). 
*/ ++static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) ++{ ++ char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; ++ char *end; ++ size_t len; ++ ++ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), ++ &tag_buf, sizeof(tag_buf)); ++ end = memchr(tag_buf, '\0', sizeof(tag_buf)); ++ if (end == tag_buf) ++ return -EINVAL; /* empty tag */ ++ if (!end) ++ end = &tag_buf[sizeof(tag_buf)]; ++ ++ len = end - tag_buf; ++ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); ++ if (!fs->tag) ++ return -ENOMEM; ++ memcpy(fs->tag, tag_buf, len); ++ fs->tag[len] = '\0'; ++ return 0; ++} ++ ++/* Work function for hiprio completion */ ++static void virtio_fs_hiprio_done_work(struct work_struct *work) ++{ ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ done_work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct virtqueue *vq = fsvq->vq; ++ ++ /* Free completed FUSE_FORGET requests */ ++ spin_lock(&fpq->lock); ++ do { ++ unsigned len; ++ void *req; ++ ++ virtqueue_disable_cb(vq); ++ ++ while ((req = virtqueue_get_buf(vq, &len)) != NULL) ++ kfree(req); ++ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); ++ spin_unlock(&fpq->lock); ++} ++ ++static void virtio_fs_dummy_dispatch_work(struct work_struct *work) ++{ ++ return; ++} ++ ++static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) ++{ ++ struct virtio_fs_forget *forget; ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ dispatch_work.work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct virtqueue *vq = fsvq->vq; ++ struct scatterlist sg; ++ struct scatterlist *sgs[] = {&sg}; ++ bool notify; ++ int ret; ++ ++ pr_debug("worker virtio_fs_hiprio_dispatch_work() called.\n"); ++ while(1) { ++ spin_lock(&fpq->lock); ++ forget = list_first_entry_or_null(&fsvq->queued_reqs, ++ struct virtio_fs_forget, list); ++ if (!forget) { ++ spin_unlock(&fpq->lock); ++ return; ++ } ++ ++ list_del(&forget->list); ++ sg_init_one(&sg, forget, sizeof(*forget)); ++ ++ /* Enqueue the request */ ++ dev_dbg(&vq->vdev->dev, "%s\n", __func__); ++ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); ++ if (ret < 0) { ++ if (ret == -ENOMEM || ret == -ENOSPC) { ++ pr_debug("virtio-fs: Could not queue FORGET:" ++ " err=%d. Will try later\n", ret); ++ list_add_tail(&forget->list, ++ &fsvq->queued_reqs); ++ schedule_delayed_work(&fsvq->dispatch_work, ++ msecs_to_jiffies(1)); ++ } else { ++ pr_debug("virtio-fs: Could not queue FORGET:" ++ " err=%d. 
Dropping it.\n", ret); ++ kfree(forget); ++ } ++ spin_unlock(&fpq->lock); ++ return; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++ pr_debug("worker virtio_fs_hiprio_dispatch_work() dispatched one forget request.\n"); ++ } ++} ++ ++/* Allocate and copy args into req->argbuf */ ++static int copy_args_to_argbuf(struct fuse_req *req) ++{ ++ unsigned offset = 0; ++ unsigned num_in; ++ unsigned num_out; ++ unsigned len; ++ unsigned i; ++ ++ num_in = req->in.numargs - req->in.argpages; ++ num_out = req->out.numargs - req->out.argpages; ++ len = fuse_len_args(num_in, (struct fuse_arg *)req->in.args) + ++ fuse_len_args(num_out, req->out.args); ++ ++ req->argbuf = kmalloc(len, GFP_ATOMIC); ++ if (!req->argbuf) ++ return -ENOMEM; ++ ++ for (i = 0; i < num_in; i++) { ++ memcpy(req->argbuf + offset, ++ req->in.args[i].value, ++ req->in.args[i].size); ++ offset += req->in.args[i].size; ++ } ++ ++ return 0; ++} ++ ++/* Copy args out of and free req->argbuf */ ++static void copy_args_from_argbuf(struct fuse_req *req) ++{ ++ unsigned remaining; ++ unsigned offset; ++ unsigned num_in; ++ unsigned num_out; ++ unsigned i; ++ ++ remaining = req->out.h.len - sizeof(req->out.h); ++ num_in = req->in.numargs - req->in.argpages; ++ num_out = req->out.numargs - req->out.argpages; ++ offset = fuse_len_args(num_in, (struct fuse_arg *)req->in.args); ++ ++ for (i = 0; i < num_out; i++) { ++ unsigned argsize = req->out.args[i].size; ++ ++ if (req->out.argvar && ++ i == req->out.numargs - 1 && ++ argsize > remaining) { ++ argsize = remaining; ++ } ++ ++ memcpy(req->out.args[i].value, req->argbuf + offset, argsize); ++ offset += argsize; ++ ++ if (i != req->out.numargs - 1) ++ remaining -= argsize; ++ } ++ ++ /* Store the actual size of the variable-length arg */ ++ if (req->out.argvar) ++ req->out.args[req->out.numargs - 1].size = remaining; ++ ++ kfree(req->argbuf); ++ req->argbuf = NULL; ++} ++ ++/* Work function for request completion */ ++static void virtio_fs_requests_done_work(struct work_struct *work) ++{ ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ done_work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct fuse_conn *fc = fsvq->fud->fc; ++ struct virtqueue *vq = fsvq->vq; ++ struct fuse_req *req; ++ struct fuse_req *next; ++ LIST_HEAD(reqs); ++ ++ /* Collect completed requests off the virtqueue */ ++ spin_lock(&fpq->lock); ++ do { ++ unsigned len; ++ ++ virtqueue_disable_cb(vq); ++ ++ while ((req = virtqueue_get_buf(vq, &len)) != NULL) ++ list_move_tail(&req->list, &reqs); ++ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); ++ spin_unlock(&fpq->lock); ++ ++ /* End requests */ ++ list_for_each_entry_safe(req, next, &reqs, list) { ++ /* TODO check unique */ ++ /* TODO fuse_len_args(out) against oh.len */ ++ ++ copy_args_from_argbuf(req); ++ ++ /* TODO zeroing? 
*/ ++ ++ spin_lock(&fpq->lock); ++ clear_bit(FR_SENT, &req->flags); ++ list_del_init(&req->list); ++ spin_unlock(&fpq->lock); ++ ++ fuse_request_end(fc, req); ++ } ++} ++ ++/* Virtqueue interrupt handler */ ++static void virtio_fs_vq_done(struct virtqueue *vq) ++{ ++ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); ++ ++ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); ++ ++ schedule_work(&fsvq->done_work); ++} ++ ++/* Initialize virtqueues */ ++static int virtio_fs_setup_vqs(struct virtio_device *vdev, ++ struct virtio_fs *fs) ++{ ++ struct virtqueue **vqs; ++ vq_callback_t **callbacks; ++ const char **names; ++ unsigned i; ++ int ret; ++ ++ virtio_cread(vdev, struct virtio_fs_config, num_queues, ++ &fs->num_queues); ++ if (fs->num_queues == 0) ++ return -EINVAL; ++ ++ fs->nvqs = 1 + fs->num_queues; ++ ++ fs->vqs = devm_kcalloc(&vdev->dev, fs->nvqs, ++ sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); ++ if (!fs->vqs) ++ return -ENOMEM; ++ ++ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); ++ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), ++ GFP_KERNEL); ++ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); ++ if (!vqs || !callbacks || !names) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; ++ snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), ++ "hiprio"); ++ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; ++ INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); ++ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); ++ INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, ++ virtio_fs_hiprio_dispatch_work); ++ ++ /* Initialize the requests virtqueues */ ++ for (i = VQ_REQUEST; i < fs->nvqs; i++) { ++ INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); ++ INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, ++ virtio_fs_dummy_dispatch_work); ++ INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); ++ snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), ++ "requests.%u", i - VQ_REQUEST); ++ callbacks[i] = virtio_fs_vq_done; ++ names[i] = fs->vqs[i].name; ++ } ++ ++ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); ++ if (ret < 0) ++ goto out; ++ ++ for (i = 0; i < fs->nvqs; i++) ++ fs->vqs[i].vq = vqs[i]; ++ ++out: ++ kfree(names); ++ kfree(callbacks); ++ kfree(vqs); ++ return ret; ++} ++ ++/* Free virtqueues (device must already be reset) */ ++static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, ++ struct virtio_fs *fs) ++{ ++ vdev->config->del_vqs(vdev); ++} ++ ++/* Map a window offset to a page frame number. The window offset will have ++ * been produced by .iomap_begin(), which maps a file offset to a window ++ * offset. ++ */ ++static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, ++ long nr_pages, void **kaddr, pfn_t *pfn) ++{ ++ struct virtio_fs *fs = dax_get_private(dax_dev); ++ phys_addr_t offset = PFN_PHYS(pgoff); ++ size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; ++ ++ pr_debug("virtio_fs_direct_access(): called. nr_pages=%ld max_nr_pages=%zu\n", nr_pages, max_nr_pages); ++ ++ if (kaddr) ++ *kaddr = fs->window_kaddr + offset; ++ if (pfn) ++ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, ++ PFN_DEV | PFN_MAP); ++ return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; ++} ++ ++static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, ++ pgoff_t pgoff, void *addr, ++ size_t bytes, struct iov_iter *i) ++{ ++ return copy_from_iter(addr, bytes, i); ++} ++ ++static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, ++ pgoff_t pgoff, void *addr, ++ size_t bytes, struct iov_iter *i) ++{ ++ return copy_to_iter(addr, bytes, i); ++} ++ ++static const struct dax_operations virtio_fs_dax_ops = { ++ .direct_access = virtio_fs_direct_access, ++ .copy_from_iter = virtio_fs_copy_from_iter, ++ .copy_to_iter = virtio_fs_copy_to_iter, ++}; ++ ++static void virtio_fs_percpu_release(struct percpu_ref *ref) ++{ ++ struct virtio_fs_memremap_info *mi = ++ container_of(ref, struct virtio_fs_memremap_info, ref); ++ ++ complete(&mi->completion); ++} ++ ++static void virtio_fs_percpu_exit(void *data) ++{ ++ struct virtio_fs_memremap_info *mi = data; ++ ++ wait_for_completion(&mi->completion); ++ percpu_ref_exit(&mi->ref); ++} ++ ++static void virtio_fs_percpu_kill(struct percpu_ref *ref) ++{ ++ percpu_ref_kill(ref); ++} ++ ++static void virtio_fs_cleanup_dax(void *data) ++{ ++ struct virtio_fs *fs = data; ++ ++ kill_dax(fs->dax_dev); ++ put_dax(fs->dax_dev); ++} ++ ++static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) ++{ ++ struct virtio_shm_region cache_reg; ++ struct virtio_fs_memremap_info *mi; ++ struct dev_pagemap *pgmap; ++ bool have_cache; ++ int ret; ++ ++ if (!IS_ENABLED(CONFIG_DAX_DRIVER)) ++ return 0; ++ ++ /* Get cache region */ ++ have_cache = virtio_get_shm_region(vdev, ++ &cache_reg, ++ (u8)VIRTIO_FS_SHMCAP_ID_CACHE); ++ if (!have_cache) { ++ dev_err(&vdev->dev, "%s: No cache capability\n", __func__); ++ return -ENXIO; ++ } else { ++ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", ++ cache_reg.len, cache_reg.addr); ++ } ++ ++ mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL); ++ if (!mi) ++ return -ENOMEM; ++ ++ init_completion(&mi->completion); ++ ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0, ++ GFP_KERNEL); ++ if (ret < 0) { ++ dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n", ++ __func__, ret); ++ return ret; ++ } ++ ++ ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi); ++ if (ret < 0) { ++ percpu_ref_exit(&mi->ref); ++ return ret; ++ } ++ ++ pgmap = &mi->pgmap; ++ pgmap->altmap_valid = false; ++ pgmap->ref = &mi->ref; ++ pgmap->kill = virtio_fs_percpu_kill; ++ pgmap->type = MEMORY_DEVICE_FS_DAX; ++ ++ /* Ideally we would directly use the PCI BAR resource but ++ * devm_memremap_pages() wants its own copy in pgmap. So ++ * initialize a struct resource from scratch (only the start ++ * and end fields will be used). 
++ */ ++ pgmap->res = (struct resource){ ++ .name = "virtio-fs dax window", ++ .start = (phys_addr_t) cache_reg.addr, ++ .end = (phys_addr_t) cache_reg.addr + cache_reg.len, ++ }; ++ ++ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); ++ if (IS_ERR(fs->window_kaddr)) ++ return PTR_ERR(fs->window_kaddr); ++ ++ fs->window_phys_addr = (phys_addr_t) cache_reg.addr; ++ fs->window_len = (phys_addr_t) cache_reg.len; ++ ++ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx" ++ " len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, ++ cache_reg.len); ++ ++ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops); ++ if (!fs->dax_dev) ++ return -ENOMEM; ++ ++ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs); ++} ++ ++static int virtio_fs_probe(struct virtio_device *vdev) ++{ ++ struct virtio_fs *fs; ++ int ret; ++ ++ fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL); ++ if (!fs) ++ return -ENOMEM; ++ vdev->priv = fs; ++ ++ ret = virtio_fs_read_tag(vdev, fs); ++ if (ret < 0) ++ goto out; ++ ++ ret = virtio_fs_setup_vqs(vdev, fs); ++ if (ret < 0) ++ goto out; ++ ++ /* TODO vq affinity */ ++ /* TODO populate notifications vq */ ++ ++ ret = virtio_fs_setup_dax(vdev, fs); ++ if (ret < 0) ++ goto out_vqs; ++ ++ /* Bring the device online in case the filesystem is mounted and ++ * requests need to be sent before we return. ++ */ ++ virtio_device_ready(vdev); ++ ++ ret = virtio_fs_add_instance(fs); ++ if (ret < 0) ++ goto out_vqs; ++ ++ return 0; ++ ++out_vqs: ++ vdev->config->reset(vdev); ++ virtio_fs_cleanup_vqs(vdev, fs); ++out: ++ vdev->priv = NULL; ++ return ret; ++} ++ ++static void virtio_fs_remove(struct virtio_device *vdev) ++{ ++ struct virtio_fs *fs = vdev->priv; ++ ++ virtio_fs_free_devs(fs); ++ ++ vdev->config->reset(vdev); ++ virtio_fs_cleanup_vqs(vdev, fs); ++ ++ mutex_lock(&virtio_fs_mutex); ++ list_del(&fs->list); ++ mutex_unlock(&virtio_fs_mutex); ++ ++ vdev->priv = NULL; ++} ++ ++#ifdef CONFIG_PM ++static int virtio_fs_freeze(struct virtio_device *vdev) ++{ ++ return 0; /* TODO */ ++} ++ ++static int virtio_fs_restore(struct virtio_device *vdev) ++{ ++ return 0; /* TODO */ ++} ++#endif /* CONFIG_PM */ ++ ++const static struct virtio_device_id id_table[] = { ++ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, ++ {}, ++}; ++ ++const static unsigned int feature_table[] = {}; ++ ++static struct virtio_driver virtio_fs_driver = { ++ .driver.name = KBUILD_MODNAME, ++ .driver.owner = THIS_MODULE, ++ .id_table = id_table, ++ .feature_table = feature_table, ++ .feature_table_size = ARRAY_SIZE(feature_table), ++ /* TODO validate config_get != NULL */ ++ .probe = virtio_fs_probe, ++ .remove = virtio_fs_remove, ++#ifdef CONFIG_PM_SLEEP ++ .freeze = virtio_fs_freeze, ++ .restore = virtio_fs_restore, ++#endif ++}; ++ ++static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ struct fuse_forget_link *link; ++ struct virtio_fs_forget *forget; ++ struct fuse_pqueue *fpq; ++ struct scatterlist sg; ++ struct scatterlist *sgs[] = {&sg}; ++ struct virtio_fs *fs; ++ struct virtqueue *vq; ++ struct virtio_fs_vq *fsvq; ++ bool notify; ++ u64 unique; ++ int ret; ++ ++ BUG_ON(!fiq->forget_list_head.next); ++ link = fiq->forget_list_head.next; ++ BUG_ON(link->next); ++ fiq->forget_list_head.next = NULL; ++ fiq->forget_list_tail = &fiq->forget_list_head; ++ ++ unique = fuse_get_unique(fiq); ++ ++ fs = fiq->priv; ++ fsvq = &fs->vqs[VQ_HIPRIO]; ++ spin_unlock(&fiq->waitq.lock); ++ ++ /* Allocate a buffer for the request */ ++ 
forget = kmalloc(sizeof(*forget), GFP_ATOMIC); ++ if (!forget) { ++ pr_err("virtio-fs: dropped FORGET: kmalloc failed\n"); ++ goto out; /* TODO avoid dropping it? */ ++ } ++ ++ forget->ih = (struct fuse_in_header){ ++ .opcode = FUSE_FORGET, ++ .nodeid = link->forget_one.nodeid, ++ .unique = unique, ++ .len = sizeof(*forget), ++ }; ++ forget->arg = (struct fuse_forget_in){ ++ .nlookup = link->forget_one.nlookup, ++ }; ++ ++ sg_init_one(&sg, forget, sizeof(*forget)); ++ ++ /* Enqueue the request */ ++ vq = fsvq->vq; ++ dev_dbg(&vq->vdev->dev, "%s\n", __func__); ++ fpq = vq_to_fpq(vq); ++ spin_lock(&fpq->lock); ++ ++ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); ++ if (ret < 0) { ++ if (ret == -ENOMEM || ret == -ENOSPC) { ++ pr_debug("virtio-fs: Could not queue FORGET: err=%d." ++ " Will try later.\n", ret); ++ list_add_tail(&forget->list, &fsvq->queued_reqs); ++ schedule_delayed_work(&fsvq->dispatch_work, ++ msecs_to_jiffies(1)); ++ } else { ++ pr_debug("virtio-fs: Could not queue FORGET: err=%d." ++ " Dropping it.\n", ret); ++ kfree(forget); ++ } ++ spin_unlock(&fpq->lock); ++ goto out; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++out: ++ kfree(link); ++} ++ ++static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ /* TODO */ ++ spin_unlock(&fiq->waitq.lock); ++} ++ ++/* Return the number of scatter-gather list elements required */ ++static unsigned sg_count_fuse_req(struct fuse_req *req) ++{ ++ unsigned total_sgs = 1 /* fuse_in_header */; ++ ++ if (req->in.numargs - req->in.argpages) ++ total_sgs += 1; ++ ++ if (req->in.argpages) ++ total_sgs += req->num_pages; ++ ++ if (!test_bit(FR_ISREPLY, &req->flags)) ++ return total_sgs; ++ ++ total_sgs += 1 /* fuse_out_header */; ++ ++ if (req->out.numargs - req->out.argpages) ++ total_sgs += 1; ++ ++ if (req->out.argpages) ++ total_sgs += req->num_pages; ++ ++ return total_sgs; ++} ++ ++/* Add pages to scatter-gather list and return number of elements used */ ++static unsigned sg_init_fuse_pages(struct scatterlist *sg, ++ struct page **pages, ++ struct fuse_page_desc *page_descs, ++ unsigned num_pages) ++{ ++ unsigned i; ++ ++ for (i = 0; i < num_pages; i++) { ++ sg_init_table(&sg[i], 1); ++ sg_set_page(&sg[i], pages[i], ++ page_descs[i].length, ++ page_descs[i].offset); ++ } ++ ++ return i; ++} ++ ++/* Add args to scatter-gather list and return number of elements used */ ++static unsigned sg_init_fuse_args(struct scatterlist *sg, ++ struct fuse_req *req, ++ struct fuse_arg *args, ++ unsigned numargs, ++ bool argpages, ++ void *argbuf, ++ unsigned *len_used) ++{ ++ unsigned total_sgs = 0; ++ unsigned len; ++ ++ len = fuse_len_args(numargs - argpages, args); ++ if (len) ++ sg_init_one(&sg[total_sgs++], argbuf, len); ++ ++ if (argpages) ++ total_sgs += sg_init_fuse_pages(&sg[total_sgs], ++ req->pages, ++ req->page_descs, ++ req->num_pages); ++ ++ if (len_used) ++ *len_used = len; ++ ++ return total_sgs; ++} ++ ++/* Add a request to a virtqueue and kick the device */ ++static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req) ++{ ++ struct scatterlist *stack_sgs[6 /* requests need at least 4 elements */]; ++ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; ++ struct scatterlist **sgs = stack_sgs; ++ struct scatterlist *sg = stack_sg; ++ struct fuse_pqueue *fpq; ++ unsigned argbuf_used = 0; ++ unsigned out_sgs = 0; ++ unsigned in_sgs = 0; ++ unsigned total_sgs; ++ unsigned i; ++ int 
ret; ++ bool notify; ++ ++ /* Does the sglist fit on the stack? */ ++ total_sgs = sg_count_fuse_req(req); ++ if (total_sgs > ARRAY_SIZE(stack_sgs)) { ++ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); ++ sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); ++ if (!sgs || !sg) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ /* Use a bounce buffer since stack args cannot be mapped */ ++ ret = copy_args_to_argbuf(req); ++ if (ret < 0) ++ goto out; ++ ++ /* Request elements */ ++ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); ++ out_sgs += sg_init_fuse_args(&sg[out_sgs], req, ++ (struct fuse_arg *)req->in.args, ++ req->in.numargs, req->in.argpages, ++ req->argbuf, &argbuf_used); ++ ++ /* Reply elements */ ++ if (test_bit(FR_ISREPLY, &req->flags)) { ++ sg_init_one(&sg[out_sgs + in_sgs++], ++ &req->out.h, sizeof(req->out.h)); ++ in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, ++ req->out.args, req->out.numargs, ++ req->out.argpages, ++ req->argbuf + argbuf_used, NULL); ++ } ++ ++ BUG_ON(out_sgs + in_sgs != total_sgs); ++ ++ for (i = 0; i < total_sgs; i++) ++ sgs[i] = &sg[i]; ++ ++ fpq = vq_to_fpq(vq); ++ spin_lock(&fpq->lock); ++ ++ ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); ++ if (ret < 0) { ++ /* TODO handle full virtqueue */ ++ spin_unlock(&fpq->lock); ++ goto out; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++ ++out: ++ if (ret < 0 && req->argbuf) { ++ kfree(req->argbuf); ++ req->argbuf = NULL; ++ } ++ if (sgs != stack_sgs) { ++ kfree(sgs); ++ kfree(sg); ++ } ++ ++ return ret; ++} ++ ++static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */ ++ struct virtio_fs *fs; ++ struct fuse_conn *fc; ++ struct fuse_req *req; ++ struct fuse_pqueue *fpq; ++ int ret; ++ ++ BUG_ON(list_empty(&fiq->pending)); ++ req = list_last_entry(&fiq->pending, struct fuse_req, list); ++ clear_bit(FR_PENDING, &req->flags); ++ list_del_init(&req->list); ++ BUG_ON(!list_empty(&fiq->pending)); ++ spin_unlock(&fiq->waitq.lock); ++ ++ fs = fiq->priv; ++ fc = fs->vqs[queue_id].fud->fc; ++ ++ dev_dbg(&fs->vqs[queue_id].vq->vdev->dev, ++ "%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", ++ __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, ++ req->in.h.len, fuse_len_args(req->out.numargs, req->out.args)); ++ ++ /* TODO put request onto fpq->io list? */ ++ ++ fpq = &fs->vqs[queue_id].fud->pq; ++ spin_lock(&fpq->lock); ++ if (!fpq->connected) { ++ spin_unlock(&fpq->lock); ++ req->out.h.error = -ENODEV; ++ printk(KERN_ERR "%s: disconnected\n", __func__); ++ fuse_request_end(fc, req); ++ return; ++ } ++ list_add_tail(&req->list, &fpq->processing); ++ spin_unlock(&fpq->lock); ++ set_bit(FR_SENT, &req->flags); ++ /* matches barrier in request_wait_answer() */ ++ smp_mb__after_atomic(); ++ /* TODO check for FR_INTERRUPTED? 
*/ ++ ++ ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req); ++ if (ret < 0) { ++ req->out.h.error = ret; ++ printk(KERN_ERR "%s: virtio_fs_enqueue_req failed %d\n", ++ __func__, ret); ++ fuse_request_end(fc, req); ++ return; ++ } ++} ++ ++const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { ++ .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, ++ .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, ++ .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, ++}; ++ ++static int virtio_fs_fill_super(struct super_block *sb, void *data, ++ int silent) ++{ ++ struct fuse_mount_data d; ++ struct fuse_conn *fc; ++ struct virtio_fs *fs; ++ int is_bdev = sb->s_bdev != NULL; ++ unsigned int i; ++ int err; ++ struct fuse_req *init_req; ++ ++ err = -EINVAL; ++ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) ++ goto err; ++ if (d.fd_present) { ++ printk(KERN_ERR "virtio-fs: fd option cannot be used\n"); ++ goto err; ++ } ++ if (!d.tag_present) { ++ printk(KERN_ERR "virtio-fs: missing tag option\n"); ++ goto err; ++ } ++ ++ fs = virtio_fs_find_instance(d.tag); ++ if (!fs) { ++ printk(KERN_ERR "virtio-fs: tag not found\n"); ++ err = -ENOENT; ++ goto err; ++ } ++ ++ /* TODO lock */ ++ if (fs->vqs[VQ_REQUEST].fud) { ++ printk(KERN_ERR "virtio-fs: device already in use\n"); ++ err = -EBUSY; ++ goto err; ++ } ++ ++ err = -ENOMEM; ++ /* Allocate fuse_dev for hiprio and notification queues */ ++ for (i = 0; i < VQ_REQUEST; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ fsvq->fud = fuse_dev_alloc(); ++ if (!fsvq->fud) ++ goto err_free_fuse_devs; ++ } ++ ++ init_req = fuse_request_alloc(0); ++ if (!init_req) ++ goto err; ++ __set_bit(FR_BACKGROUND, &init_req->flags); ++ ++ d.dax_dev = d.dax ? fs->dax_dev : NULL; ++ d.fiq_ops = &virtio_fs_fiq_ops; ++ d.fiq_priv = fs; ++ d.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; ++ d.destroy = true; /* Send destroy request on unmount */ ++ err = fuse_fill_super_common(sb, &d); ++ if (err < 0) ++ goto err_free_init_req; ++ ++ fc = fs->vqs[VQ_REQUEST].fud->fc; ++ ++ /* TODO take fuse_mutex around this loop? 
*/ ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ if (i == VQ_REQUEST) ++ continue; /* already initialized */ ++ fuse_dev_install(fsvq->fud, fc); ++ atomic_inc(&fc->dev_count); ++ } ++ ++ fuse_send_init(fc, init_req); ++ return 0; ++ ++err_free_init_req: ++ fuse_request_free(init_req); ++err_free_fuse_devs: ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ fuse_dev_free(fsvq->fud); ++ } ++err: ++ return err; ++} ++ ++static void virtio_kill_sb(struct super_block *sb) ++{ ++ struct fuse_conn *fc = get_fuse_conn_super(sb); ++ fuse_kill_sb_anon(sb); ++ if (fc) { ++ struct virtio_fs *vfs = fc->iq.priv; ++ virtio_fs_free_devs(vfs); ++ } ++} ++ ++static struct dentry *virtio_fs_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, ++ void *raw_data) ++{ ++ return mount_nodev(fs_type, flags, raw_data, virtio_fs_fill_super); ++} ++ ++static struct file_system_type virtio_fs_type = { ++ .owner = THIS_MODULE, ++ .name = KBUILD_MODNAME, ++ .mount = virtio_fs_mount, ++ .kill_sb = virtio_kill_sb, ++}; ++ ++static int __init virtio_fs_init(void) ++{ ++ int ret; ++ ++ ret = register_virtio_driver(&virtio_fs_driver); ++ if (ret < 0) ++ return ret; ++ ++ ret = register_filesystem(&virtio_fs_type); ++ if (ret < 0) { ++ unregister_virtio_driver(&virtio_fs_driver); ++ return ret; ++ } ++ ++ return 0; ++} ++module_init(virtio_fs_init); ++ ++static void __exit virtio_fs_exit(void) ++{ ++ unregister_filesystem(&virtio_fs_type); ++ unregister_virtio_driver(&virtio_fs_driver); ++} ++module_exit(virtio_fs_exit); ++ ++MODULE_AUTHOR("Stefan Hajnoczi "); ++MODULE_DESCRIPTION("Virtio Filesystem"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_FS(KBUILD_MODNAME); ++MODULE_DEVICE_TABLE(virtio, id_table); +diff --git a/fs/splice.c b/fs/splice.c +index b3daa971f597..d0bfbc13a417 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -365,7 +365,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec, + return res; + } + +-static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, ++ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) + { +@@ -429,6 +429,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + iov_iter_advance(&to, copied); /* truncates and discards */ + return res; + } ++EXPORT_SYMBOL(default_file_splice_read); + + /* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' +diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c +index b697866946d2..c97f8a0cb47b 100644 +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -953,7 +953,7 @@ xfs_dax_writepages( + { + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, +- xfs_find_bdev_for_inode(mapping->host), wbc); ++ xfs_find_bdev_for_inode(mapping->host), NULL, wbc); + } + + STATIC int +diff --git a/include/linux/dax.h b/include/linux/dax.h +index 450b28db9533..a8461841f148 100644 +--- a/include/linux/dax.h ++++ b/include/linux/dax.h +@@ -85,7 +85,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) + + struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); + int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc); ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc); + + struct page *dax_layout_busy_page(struct address_space *mapping); + bool 
dax_lock_mapping_entry(struct page *page); +@@ -117,7 +118,8 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping) + } + + static inline int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc) ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc) + { + return -EOPNOTSUPP; + } +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7b6084854bfe..1c5ef6bf46e5 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2991,6 +2991,8 @@ extern void block_sync_page(struct page *page); + /* fs/splice.c */ + extern ssize_t generic_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); ++extern ssize_t default_file_splice_read(struct file *, loff_t *, ++ struct pipe_inode_info *, size_t, unsigned int); + extern ssize_t iter_file_splice_write(struct pipe_inode_info *, + struct file *, loff_t *, size_t, unsigned int); + extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, +diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h +index 32baf8e26735..8f85d1d8a895 100644 +--- a/include/linux/virtio_config.h ++++ b/include/linux/virtio_config.h +@@ -10,6 +10,11 @@ + + struct irq_affinity; + ++struct virtio_shm_region { ++ u64 addr; ++ u64 len; ++}; ++ + /** + * virtio_config_ops - operations for configuring a virtio device + * @get: read the value of a configuration field +@@ -60,6 +65,7 @@ struct irq_affinity; + * the caller can then copy. + * @set_vq_affinity: set the affinity for a virtqueue. + * @get_vq_affinity: get the affinity for a virtqueue (optional). ++ * @get_shm_region: get a shared memory region based on the index. + */ + typedef void vq_callback_t(struct virtqueue *); + struct virtio_config_ops { +@@ -83,6 +89,8 @@ struct virtio_config_ops { + const struct cpumask *cpu_mask); + const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev, + int index); ++ bool (*get_shm_region)(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id); + }; + + /* If driver didn't advertise the feature, it will never appear. 
*/ +@@ -245,6 +253,15 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) + return 0; + } + ++static inline ++bool virtio_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ if (!vdev->config->get_shm_region) ++ return false; ++ return vdev->config->get_shm_region(vdev, region, id); ++} ++ + static inline bool virtio_is_little_endian(struct virtio_device *vdev) + { + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) || +diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h +index 92fa24c24c92..dbc5013ad747 100644 +--- a/include/uapi/linux/fuse.h ++++ b/include/uapi/linux/fuse.h +@@ -381,6 +381,8 @@ enum fuse_opcode { + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + FUSE_LSEEK = 46, ++ FUSE_SETUPMAPPING = 48, ++ FUSE_REMOVEMAPPING = 49, + + /* CUSE specific operations */ + CUSE_INIT = 4096, +@@ -792,4 +794,36 @@ struct fuse_lseek_out { + uint64_t offset; + }; + ++#define FUSE_SETUPMAPPING_ENTRIES 8 ++#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) ++#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) ++struct fuse_setupmapping_in { ++ /* An already open handle */ ++ uint64_t fh; ++ /* Offset into the file to start the mapping */ ++ uint64_t foffset; ++ /* Length of mapping required */ ++ uint64_t len; ++ /* Flags, FUSE_SETUPMAPPING_FLAG_* */ ++ uint64_t flags; ++ /* Offset in Memory Window */ ++ uint64_t moffset; ++}; ++ ++struct fuse_setupmapping_out { ++ /* Offsets into the cache of mappings */ ++ uint64_t coffset[FUSE_SETUPMAPPING_ENTRIES]; ++ /* Lengths of each mapping */ ++ uint64_t len[FUSE_SETUPMAPPING_ENTRIES]; ++}; ++ ++struct fuse_removemapping_in { ++ /* An already open handle */ ++ uint64_t fh; ++ /* Offset into the dax window start the unmapping */ ++ uint64_t moffset; ++ /* Length of mapping required */ ++ uint64_t len; ++}; ++ + #endif /* _LINUX_FUSE_H */ +diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h +new file mode 100644 +index 000000000000..d4bb549568eb +--- /dev/null ++++ b/include/uapi/linux/virtio_fs.h +@@ -0,0 +1,44 @@ ++#ifndef _UAPI_LINUX_VIRTIO_FS_H ++#define _UAPI_LINUX_VIRTIO_FS_H ++/* This header is BSD licensed so anyone can use the definitions to implement ++ * compatible drivers/servers. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of IBM nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. 
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. */ ++#include ++#include ++#include ++#include ++ ++struct virtio_fs_config { ++ /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */ ++ __u8 tag[36]; ++ ++ /* Number of request queues */ ++ __u32 num_queues; ++} __attribute__((packed)); ++ ++/* For the id field in virtio_pci_shm_cap */ ++#define VIRTIO_FS_SHMCAP_ID_CACHE 0 ++ ++#endif /* _UAPI_LINUX_VIRTIO_FS_H */ +diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h +index 6d5c3b2d4f4d..884b0e2734bb 100644 +--- a/include/uapi/linux/virtio_ids.h ++++ b/include/uapi/linux/virtio_ids.h +@@ -43,5 +43,6 @@ + #define VIRTIO_ID_INPUT 18 /* virtio input */ + #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ + #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ ++#define VIRTIO_ID_FS 26 /* virtio filesystem */ + + #endif /* _LINUX_VIRTIO_IDS_H */ +diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h +index c4b09689ab64..0650f91bea6c 100644 +--- a/include/uapi/linux/virtio_mmio.h ++++ b/include/uapi/linux/virtio_mmio.h +@@ -122,6 +122,17 @@ + #define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 + #define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 + ++/* Shared memory region id */ ++#define VIRTIO_MMIO_SHM_SEL 0x0ac ++ ++/* Shared memory region length, 64 bits in two halves */ ++#define VIRTIO_MMIO_SHM_LEN_LOW 0x0b0 ++#define VIRTIO_MMIO_SHM_LEN_HIGH 0x0b4 ++ ++/* Shared memory region base address, 64 bits in two halves */ ++#define VIRTIO_MMIO_SHM_BASE_LOW 0x0b8 ++#define VIRTIO_MMIO_SHM_BASE_HIGH 0x0bc ++ + /* Configuration atomicity value */ + #define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc + +diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h +index 90007a1abcab..31841a60a4ad 100644 +--- a/include/uapi/linux/virtio_pci.h ++++ b/include/uapi/linux/virtio_pci.h +@@ -113,6 +113,8 @@ + #define VIRTIO_PCI_CAP_DEVICE_CFG 4 + /* PCI configuration access */ + #define VIRTIO_PCI_CAP_PCI_CFG 5 ++/* Additional shared memory capability */ ++#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 + + /* This is the PCI capability header: */ + struct virtio_pci_cap { +@@ -163,6 +165,14 @@ struct virtio_pci_cfg_cap { + __u8 pci_cfg_data[4]; /* Data for BAR access. */ + }; + ++/* Fields in VIRTIO_PCI_CAP_SHARED_MEMORY_CFG */ ++struct virtio_pci_shm_cap { ++ struct virtio_pci_cap cap; ++ __le32 offset_hi; /* Most sig 32 bits of offset */ ++ __le32 length_hi; /* Most sig 32 bits of length */ ++ __u8 id; /* To distinguish shm chunks */ ++}; ++ + /* Macro versions of offsets for the Old Timers! 
*/ + #define VIRTIO_PCI_CAP_VNDR 0 + #define VIRTIO_PCI_CAP_NEXT 1 +-- +2.20.1 + diff --git a/kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch b/kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch similarity index 95% rename from kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch rename to kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch index 8d595a544a..6a4c7783a3 100644 --- a/kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch +++ b/kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch @@ -1,7 +1,7 @@ -From 074a6a9d83a4e790f892ef0fc91cdabbfbf26202 Mon Sep 17 00:00:00 2001 +From 33ffc9a93a1d9e72594d5eb3e4fc583a1a2911d1 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Tue, 19 Feb 2019 01:15:32 -0500 -Subject: [PATCH] Enable memory-hotplug using probe for arm64 +Subject: [PATCH 2/5] Enable memory-hotplug using probe for arm64 --- arch/arm64/Kconfig | 7 +++++++ @@ -94,5 +94,5 @@ index 146c04ceaa51..d276bd4d38b5 100644 + return 0; +} -- -2.17.1 +2.20.1 diff --git a/kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch b/kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch similarity index 80% rename from kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch rename to kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch index f2ada7eb85..86e587503e 100644 --- a/kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch +++ b/kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch @@ -1,7 +1,7 @@ -From 0a235af3130a0c40fe2198f18198c7ac4e799a03 Mon Sep 17 00:00:00 2001 +From cab495651e8f71c39e87a08abbe051916110b3ca Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 18 Sep 2017 11:46:59 -0500 -Subject: [PATCH 2/3] NO-UPSTREAM: 9P: always use cached inode to fill in +Subject: [PATCH 3/5] NO-UPSTREAM: 9P: always use cached inode to fill in v9fs_vfs_getattr So that if in cache=none mode, we don't have to lookup server that @@ -17,10 +17,10 @@ Signed-off-by: Peng Tao 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c -index bdabb27..30395e0 100644 +index 85ff859d3af5..efdc2a8f37bb 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c -@@ -1068,7 +1068,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, +@@ -1080,7 +1080,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); @@ -30,10 +30,10 @@ index bdabb27..30395e0 100644 return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c -index 7f6ae21..5d7e970 100644 +index 4823e1c46999..daa5e6a41864 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c -@@ -481,7 +481,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, +@@ -480,7 +480,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); @@ -43,5 +43,5 @@ index 7f6ae21..5d7e970 100644 return 0; } -- -2.9.5 +2.20.1 diff --git a/kernel/patches/4.19.x/0002-Compile-in-evged-always.patch b/kernel/patches/4.19.x/0004-Compile-in-evged-always.patch similarity index 87% rename from kernel/patches/4.19.x/0002-Compile-in-evged-always.patch rename to 
kernel/patches/4.19.x/0004-Compile-in-evged-always.patch index fbcabe67c7..c211adbe72 100644 --- a/kernel/patches/4.19.x/0002-Compile-in-evged-always.patch +++ b/kernel/patches/4.19.x/0004-Compile-in-evged-always.patch @@ -1,7 +1,7 @@ -From e35cb54fb8d07dd80fa8df44ff0de6eb5ff8d6cf Mon Sep 17 00:00:00 2001 +From d78297bf9d8e41711bddc6003f460e815340a214 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 10 Aug 2018 13:22:08 +0000 -Subject: [PATCH 108/108] Compile in evged always +Subject: [PATCH 4/5] Compile in evged always We need evged for NEMU (and in general for hw reduced) @@ -25,5 +25,5 @@ index 6d59aa109a91..97f2fbbd5014 100644 acpi-y += property.o acpi-$(CONFIG_X86) += acpi_cmos_rtc.o -- -2.18.0 +2.20.1 diff --git a/kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch b/kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch similarity index 99% rename from kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch rename to kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch index 17564efd2d..c51dd09425 100644 --- a/kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch +++ b/kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch @@ -1,8 +1,8 @@ -From 60a4fed76e63c36cd327c4b404ec163e93a4805e Mon Sep 17 00:00:00 2001 +From 6823b343a7c5f6fc3b93d4a00e919d14cb6a4adb Mon Sep 17 00:00:00 2001 From: Penny Zheng Date: Tue, 19 Feb 2019 16:05:44 +0800 -Subject: [PATCH] arm64: backport Arm64 KVM Dynamic IPA and 52bit IPA support - to 4.19.X +Subject: [PATCH 5/5] arm64: backport Arm64 KVM Dynamic IPA and 52bit IPA + support to 4.19.X This patch is based on Suzuki K Poulose's [v6,00/18] kvm: arm64: Dynamic IPA and 52bit IPA @@ -258,10 +258,10 @@ index 460d616bb2d6..f6a7ea805232 100644 #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 1b1a0e95c751..f9162da575a9 100644 +index 881bea194d53..d77da7a56eb5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig -@@ -1132,6 +1132,19 @@ config ARM64_RAS_EXTN +@@ -1139,6 +1139,19 @@ config ARM64_RAS_EXTN and access the new registers if the system supports the extension. Platform RAS features may additionally depend on firmware support. @@ -2273,5 +2273,5 @@ index a2a175b08b17..b3d1f0985117 100644 } -- -2.17.1 +2.20.1