diff --git a/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch b/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch deleted file mode 100644 index f218c9865c..0000000000 --- a/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch +++ /dev/null @@ -1,4604 +0,0 @@ -From 9a821958eb0b586b526af5490c811f28ec062d94 Mon Sep 17 00:00:00 2001 -From: Stefan Hajnoczi -Date: Tue, 12 Jun 2018 09:41:17 +0100 -Subject: [PATCH] fuse: add skeleton virtio_fs.ko module - -Add a basic file system module for virtio-fs. - -Signed-off-by: Stefan Hajnoczi - -fuse: add probe/remove virtio driver - -Add basic probe/remove functionality for the new virtio-fs device. - -Signed-off-by: Stefan Hajnoczi - -fuse: extract fuse_fill_super_common() - -fuse_fill_super() includes code to process the fd= option and link the -struct fuse_dev to the fd's struct file. In virtio-fs there is no file -descriptor because /dev/fuse is not used. - -This patch extracts fuse_fill_super_common() so that both classic fuse -and virtio-fs can share the code to initialize a mount. - -parse_fuse_opt() is also extracted so that the fuse_fill_super_common() -caller has access to the mount options. This allows classic fuse to -handle the fd= option outside fuse_fill_super_common(). - -Signed-off-by: Stefan Hajnoczi - -virtio_fs: get mount working - -Provide definitions of ->mount and ->kill_sb. This is still WIP. - -Signed-off-by: Stefan Hajnoczi - -fuse: export fuse_end_request() - -virtio-fs will need to complete requests from outside fs/fuse/dev.c. -Make the symbol visible. - -Signed-off-by: Stefan Hajnoczi - -fuse: export fuse_len_args() - -virtio-fs will need to query the length of fuse_arg lists. Make the -symbol visible. - -Signed-off-by: Stefan Hajnoczi - -fuse: Export fuse_send_init_request() - -This will be used by virtio-fs to send the init request to the fuse server -after initialization of the virtqueues. - -Signed-off-by: Vivek Goyal - -fuse: add fuse_iqueue_ops callbacks - -The /dev/fuse device uses fiq->waitq and fasync to signal that requests -are available. These mechanisms do not apply to virtio-fs. This patch -introduces callbacks so alternative behavior can be used. - -Note that queue_interrupt() changes along these lines: - - spin_lock(&fiq->waitq.lock); - wake_up_locked(&fiq->waitq); -+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN); - spin_unlock(&fiq->waitq.lock); -- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); - -Since queue_request() and queue_forget() also call kill_fasync() inside -the spinlock, this should be safe. - -Signed-off-by: Stefan Hajnoczi - -fuse: Separate fuse device allocation and installation in fuse_conn - -As of now, fuse_dev_alloc() both allocates a fuse device and installs it -in the fuse_conn list. fuse_dev_alloc() can fail if fuse_device allocation -fails. - -virtio-fs needs to initialize multiple fuse devices (one per virtio -queue). It initializes one fuse device as part of the call to -fuse_fill_super_common(), and the rest of the devices are allocated and -installed after that. - -But we can't afford to fail after calling fuse_fill_super_common(), as -we don't have a way to undo all the actions done by fuse_fill_super_common(). -So to avoid failures after the call to fuse_fill_super_common(), -pre-allocate all fuse devices early and install them into the fuse -connection later. - -This patch provides two separate helpers for fuse device allocation and -fuse device installation in fuse_conn.
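A minimal sketch of the allocate-then-install split this commit message describes. fuse_dev_alloc(), fuse_dev_install() and fuse_dev_free() follow the helpers the message names (signatures follow the upstream form and may differ slightly in this patch); struct virtio_fs_sketch and its fields are illustrative placeholders, not the patch's actual types.

struct virtio_fs_vq_sketch {
        struct fuse_dev *fud;   /* pre-allocated, installed later */
};

struct virtio_fs_sketch {
        unsigned int nvqs;
        struct virtio_fs_vq_sketch *vqs;
};

/* Allocation phase: may fail, and is easy to unwind because nothing
 * has been published to the fuse_conn yet. Runs before
 * fuse_fill_super_common(). */
static int virtio_fs_alloc_devs_sketch(struct virtio_fs_sketch *fs)
{
        unsigned int i;

        for (i = 0; i < fs->nvqs; i++) {
                fs->vqs[i].fud = fuse_dev_alloc();
                if (!fs->vqs[i].fud) {
                        while (i--)
                                fuse_dev_free(fs->vqs[i].fud);
                        return -ENOMEM;
                }
        }
        return 0;
}

/* Installation phase: cannot fail, so it is safe to run after
 * fuse_fill_super_common() has committed the mount. */
static void virtio_fs_install_devs_sketch(struct virtio_fs_sketch *fs,
                                          struct fuse_conn *fc)
{
        unsigned int i;

        for (i = 0; i < fs->nvqs; i++)
                fuse_dev_install(fs->vqs[i].fud, fc);
}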
- -Signed-off-by: Vivek Goyal - -fuse: process request queues - -Send normal requests to the device and handle completions. - -This is enough to get mount and basic I/O working. The hiprio and -notification queues still need to be implemented for full FUSE -functionality. - -Signed-off-by: Vivek Goyal -Signed-off-by: Stefan Hajnoczi - -fuse: export fuse_get_unique() - -virtio-fs will need unique IDs for FORGET requests from outside -fs/fuse/dev.c. Make the symbol visible. - -Signed-off-by: Stefan Hajnoczi - -fuse: implement FUSE_FORGET for virtio-fs - -Send single FUSE_FORGET requests on the hiprio queue. In the future it -may be possible to do FUSE_BATCH_FORGET, but that is tricky since -virtio-fs gets called synchronously when forgets are queued. - -Signed-off-by: Stefan Hajnoczi - -virtio_fs: Set up dax_device - -Set up a dax device. - -Signed-off-by: Stefan Hajnoczi - -dax: remove block device dependencies - -Although struct dax_device itself is not tied to a block device, some -DAX code assumes there is a block device. Make block devices optional -by allowing bdev to be NULL in commonly used DAX APIs. - -When there is no block device: - * Skip the partition offset calculation in bdev_dax_pgoff() - * Skip the blkdev_issue_zeroout() optimization - -Note that more block device assumptions remain, but I haven't reached those -code paths yet. - -Signed-off-by: Stefan Hajnoczi - -dax: Pass dax_dev to dax_writeback_mapping_range() - -Right now dax_writeback_mapping_range() is passed a bdev, and dax_dev -is looked up from that bdev's name. - -virtio-fs does not have a bdev. So pass dax_dev to -dax_writeback_mapping_range() as well. If dax_dev is passed in, bdev is not -used; otherwise dax_dev is looked up using bdev. - -Signed-off-by: Vivek Goyal - -fuse: add fuse_conn->dax_dev field - -A struct dax_device instance is a prerequisite for the DAX filesystem -APIs. Let virtio_fs associate a dax_device with a fuse_conn. Classic -FUSE and CUSE set the pointer to NULL, disabling DAX. - -Signed-off-by: Stefan Hajnoczi - -virtio: Add get_shm_region method - -Virtio defines 'shared memory regions' that provide a continuously -shared region between the host and guest. - -Provide a method to find a particular region on a device. - -Signed-off-by: Sebastien Boeuf -Signed-off-by: Dr. David Alan Gilbert - -virtio: Implement get_shm_region for PCI transport - -On PCI the shm regions are found using capability entries; -find a region by searching for the capability. - -Signed-off-by: Sebastien Boeuf -Signed-off-by: Dr. David Alan Gilbert - -virtio: Implement get_shm_region for MMIO transport - -On MMIO a new set of registers is defined for finding SHM -regions. Add their definitions and use them to find the region. - -Signed-off-by: Sebastien Boeuf - -fuse: map virtio_fs DAX window - -Use the shm capability to find the cache entry and map it. - -The DAX window is accessed by the fs/dax.c infrastructure and must have -struct pages (at least on x86). Use devm_memremap_pages() to map the -DAX window PCI BAR and allocate struct page. - -Signed-off-by: Stefan Hajnoczi -Signed-off-by: Sebastien Boeuf -Signed-off-by: Dr. David Alan Gilbert - -virtio-fs: Make dax optional - -Add a 'dax' option and only enable dax when it's on. - -Also show "dax" in the mount options if the filesystem was mounted with dax -enabled. - -Signed-off-by: Dr. David Alan Gilbert -Signed-off-by: Vivek Goyal - -Limit number of pages returned by direct_access() - -Truncate the number of pages mapped by direct_access() to remain within the -window size.
User might request mapping pages beyond the window size. - -Signed-off-by: Vivek Goyal - -fuse: Introduce fuse_dax_mapping - -Introduce fuse_dax_mapping. This type will be used to keep track of -per-inode dax mappings. - -Signed-off-by: Vivek Goyal - -Create a list of free memory ranges - -Divide the dax memory range into fixed-size ranges (2MB for now) and put -them in a list. This will track free ranges. Once an inode requires a -free range, we will take one from here and put it in the interval tree -of ranges assigned to the inode. - -Signed-off-by: Vivek Goyal - -fuse: simplify fuse_fill_super_common() calling - -Add more fields to "struct fuse_mount_data" so that fewer parameters -have to be passed to the function fuse_fill_super_common(). - -Signed-off-by: Miklos Szeredi - -fuse: Introduce setupmapping/removemapping commands - -Introduce two new fuse commands to set up/remove memory mappings. - -Signed-off-by: Vivek Goyal - -Introduce interval tree basic data structures - -We want to use an interval tree to keep track of per-inode dax mappings. -Introduce the basic data structures. - -Signed-off-by: Vivek Goyal - -fuse: Implement basic DAX read/write support commands - -This patch implements basic DAX support. mmap() is not implemented -yet and will come in later patches. This patch looks into implementing -read/write. - -Signed-off-by: Stefan Hajnoczi -Signed-off-by: Dr. David Alan Gilbert -Signed-off-by: Vivek Goyal - -fuse: Maintain a list of busy elements - -This list will be used for selecting a fuse_dax_mapping to free when the -number of free mappings drops below a threshold. - -Signed-off-by: Vivek Goyal - -Do fallocate() to grow file before mapping for file growing writes - -How should file-growing writes be handled? For now, this patch does -fallocate() to grow the file and then maps it using dax. We need to figure -out the best way to handle it. - -This patch does the fallocate() and setup-mapping operations in -fuse_dax_write_iter(), instead of iomap_begin(). I don't have access to the -file pointer needed to send a message to the fuse daemon in iomap_begin(). - -Dave Chinner has expressed concerns with this approach as it is not -atomic. If the guest crashes after the fallocate() but before the data was -written, the user will think that the filesystem lost their data. So this is -still an outstanding issue. - -Signed-off-by: Vivek Goyal - -fuse: add DAX mmap support - -Add DAX mmap() support. - -Signed-off-by: Stefan Hajnoczi - -fuse: delete dentry if timeout is zero - -Don't hold onto a dentry in the lru list if we need to re-lookup it anyway -at the next access. - -A more advanced version of this patch would periodically flush out dentries -from the lru which have gone stale. - -Signed-off-by: Miklos Szeredi - -fuse: Define dax address space operations - -This is done along the lines of ext4 and xfs. I primarily wanted the -->writepages hook at this time so that I could call into -dax_writeback_mapping_range(). This in turn will decide which pfns need to -be written back and call dax_flush() on those. - -Signed-off-by: Vivek Goyal - -fuse, dax: Take ->i_mmap_sem lock during dax page fault - -We need some kind of locking mechanism here. Normal file systems like -ext4 and xfs seem to take their own semaphore to protect against -truncate while a fault is going on. - -We have an additional requirement to protect against fuse dax memory range -reclaim. When a range has been selected for reclaim, we need to make sure -no other read/write/fault can try to access that memory range while -reclaim is in progress.
Once reclaim is complete, the lock will be released -and read/write/fault will trigger allocation of a fresh dax range. - -Taking inode_lock() is not an option in the fault path as lockdep complains -about circular dependencies. So define a new fuse_inode->i_mmap_sem. - -Signed-off-by: Vivek Goyal - -fuse: Add logic to free up a memory range - -Add logic to free up a busy memory range. The freed memory range will be -returned to the free pool. Add a worker which can be started to select -and free some busy memory ranges. - -Signed-off-by: Vivek Goyal - -fuse: Add logic to do direct reclaim of memory - -This can be done only from the same inode. Also it can be done only for -the read/write case and not for the fault case. The reason: as of now, -reclaim requires holding the inode_lock, fuse_inode->i_mmap_sem and -fuse_inode->dmap_tree locks in that order, and only the read/write path -allows that (not the fault path). - -Signed-off-by: Vivek Goyal - -fuse: Kick worker when free memory drops below 20% of total ranges - -Kick the worker to free up some memory when the number of free ranges drops -below 20% of the total free ranges at the time of initialization. - -Signed-off-by: Vivek Goyal - -fuse: multiplex cached/direct_io/dax file operations - -Dispatch FORGET requests later instead of dropping them - -If the virtio queue is full, don't drop FORGET requests. Instead, wait -a bit and try to dispatch them a little later using a worker thread. - -Signed-off-by: Vivek Goyal - -Release file in process context - -fuse_file_put(sync) can be called with sync=true/false. If sync=true, -it waits for the release request response and then calls iput() in the -caller's context. If sync=false, it does not wait for the release request -response, frees the fuse_file struct immediately, and the req->end function -does the iput(). - -iput() can be a problem with DAX if called in req->end context. If this -is the last reference to the inode (VFS has let go of its reference -already), then iput() will clean up the DAX mappings as well, send -REMOVEMAPPING requests, and wait for completion. (All in the worker thread -context which is processing fuse replies from the daemon on the host.) - -That means it blocks the worker thread, which stops processing further -replies, and the system deadlocks. - -So for now, force sync release of the file in case of DAX inodes. - -Signed-off-by: Vivek Goyal - -fuse: Do not block on inode lock while freeing memory range - -Once we select a memory range to free, we currently block on the inode -lock. Do not block; use trylock instead, and move on to the next memory -range if the trylock fails. - -The reason is that in the next few patches I want to enable waiting for -memory ranges to become free in fuse_iomap_begin(). So instead of -returning -EBUSY, a process will wait for a memory range to become -free. - -We don't want to end up in a situation where a process is sleeping in -iomap_begin() with the inode lock held and the worker is trying to free -memory from the same inode, resulting in deadlock. - -To avoid deadlock, use trylock instead. - -Signed-off-by: Vivek Goyal - -fuse: Reschedule dax free work if too many EAGAIN attempts - -fuse_dax_free_memory() can be very CPU intensive in corner cases. For -example, if one inode has consumed all the memory and a setupmapping request -is pending, that means the inode lock is held by the request and the worker -thread will not get the lock for a while. And given there is only one inode -consuming all the dax ranges, all the attempts to acquire the lock will -fail. - -So if there are too many inode lock failures (-EAGAIN), reschedule the -worker with a 10ms delay.
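A rough sketch of the back-off just described. queue_delayed_work(), to_delayed_work() and msecs_to_jiffies() are real kernel APIs, and fc->dax_free_work, fc->nr_free_ranges and fc->nr_ranges appear in this patch; the retry threshold and the try_to_free_one_range() helper are assumptions for illustration only.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

#define FUSE_DAX_RECLAIM_MAX_EAGAIN 10  /* assumed retry threshold */

/* Hypothetical stand-in for the real per-range reclaim logic;
 * returns -EAGAIN when the inode trylock fails. */
static int try_to_free_one_range(struct fuse_conn *fc);

static void fuse_dax_free_memory_sketch(struct work_struct *work)
{
        struct fuse_conn *fc = container_of(to_delayed_work(work),
                                            struct fuse_conn, dax_free_work);
        int nr_eagain = 0;

        /* Keep freeing until we are back above 20% free ranges */
        while (fc->nr_free_ranges < fc->nr_ranges / 5) {
                if (try_to_free_one_range(fc) == -EAGAIN &&
                    ++nr_eagain >= FUSE_DAX_RECLAIM_MAX_EAGAIN) {
                        /* Too many trylock failures: stop spinning and
                         * come back in 10ms instead of burning CPU on a
                         * lock we cannot get. */
                        queue_delayed_work(system_long_wq,
                                           &fc->dax_free_work,
                                           msecs_to_jiffies(10));
                        return;
                }
        }
}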
- -Signed-off-by: Vivek Goyal - -fuse: Wait for memory ranges to become free - -Sometimes we run out of memory ranges. In that case, wait for memory -ranges to become free instead of returning -EBUSY. - -The dax fault path holds fuse_inode->i_mmap_sem, and once that is -held, memory reclaim can't be done. It's not safe to wait while holding -fuse_inode->i_mmap_sem, for two reasons: - -- The worker thread that frees memory might block on fuse_inode->i_mmap_sem as well. -- This inode is holding all the memory, and more memory can't be freed. - -In both cases, deadlock will ensue. So return -ENOSPC from iomap_begin() -in the fault path if memory can't be allocated. Drop fuse_inode->i_mmap_sem, -wait for a free range to become available, and retry. - -The read/write path is a different story. We hold the inode lock, and lock -ordering allows grabbing fuse_inode->i_mmap_sem if needed. That means we can -do direct reclaim in that path. But if there is no memory allocated to this -inode, then direct reclaim will not work and we need to wait for a memory -range to become free. So try the following order: - -A. Try to get a free range. -B. If not, try direct reclaim. -C. If not, wait for a memory range to become free. - -Here sleeping with locks held should be fine, because in step B we made -sure this inode is not holding any ranges. That means other inodes are -holding ranges and somebody should be able to free memory. Also, the worker -thread does a trylock() on the inode lock. That means the worker thread will -not wait on this inode and will move on to the next memory range. Hence the -above sequence should be deadlock free. - -Signed-off-by: Vivek Goyal - -fuse: Take inode lock for dax inode truncation - -When a file is opened with O_TRUNC, we need to make sure that no other -DAX operation is in progress. DAX expects i_size to be stable. - -In fuse_iomap_begin() we check i_size at multiple places and we expect -i_size to not change. - -Another problem is that if we set up a mapping in fuse_iomap_begin(), and -the file gets truncated and a dax read/write happens, KVM currently hangs. -It tries to fault in a page which does not exist on the host (the file got -truncated). It probably requires fixing in KVM. - -So for now, take the inode lock. Once KVM is fixed, we might have to -have a look at it again. - -Signed-off-by: Vivek Goyal - -fuse: Clear setuid bit even in direct I/O path - -With cache=never, we fall back to direct IO. pjdfstest chmod test 12.t was -failing because if a file has the setuid bit, it should be cleared if an -unprivileged user opens it for write and writes to it. - -Call fuse_remove_privs() even for the direct I/O path. - -Signed-off-by: Vivek Goyal - -virtio: Free fuse devices on umount - -When unmounting the fs, close all the fuse devices. -This includes making sure the daemon gets a FUSE_DESTROY to -tell it. - -Signed-off-by: Dr. David Alan Gilbert - -virtio-fs: Fix a race in range reclaim - -We have the notion of doing inline dax range reclaim, where the caller does -not have to drop the inode lock to reclaim one of its dax ranges. It assumed -there is no other reader/writer using that inode (hence not using the dax -range being reclaimed). - -But the fuse read path takes a shared inode lock. That means there could be -other readers while we need to do reclaim. If we try to reclaim now, it is -possible we end up reclaiming a range used by another process. - -To remove that race, do not try to do inline reclaim for the read path. -Instead return -ENOSPC, and the fuse read path will try again when a free -range is available. - -Reported-by: Dr.
David Alan Gilbert -Signed-off-by: Vivek Goyal ---- - drivers/dax/super.c | 3 +- - drivers/virtio/virtio_mmio.c | 32 + - drivers/virtio/virtio_pci_modern.c | 108 +++ - fs/dax.c | 23 +- - fs/ext2/inode.c | 2 +- - fs/ext4/inode.c | 2 +- - fs/fuse/Kconfig | 11 + - fs/fuse/Makefile | 1 + - fs/fuse/cuse.c | 5 +- - fs/fuse/dev.c | 80 +- - fs/fuse/dir.c | 28 +- - fs/fuse/file.c | 1001 +++++++++++++++++++++++-- - fs/fuse/fuse_i.h | 202 ++++- - fs/fuse/inode.c | 316 +++++--- - fs/fuse/virtio_fs.c | 1121 ++++++++++++++++++++++++++++ - fs/splice.c | 3 +- - fs/xfs/xfs_aops.c | 2 +- - include/linux/dax.h | 6 +- - include/linux/fs.h | 2 + - include/linux/virtio_config.h | 17 + - include/uapi/linux/fuse.h | 34 + - include/uapi/linux/virtio_fs.h | 44 ++ - include/uapi/linux/virtio_ids.h | 1 + - include/uapi/linux/virtio_mmio.h | 11 + - include/uapi/linux/virtio_pci.h | 10 + - 25 files changed, 2883 insertions(+), 182 deletions(-) - create mode 100644 fs/fuse/virtio_fs.c - create mode 100644 include/uapi/linux/virtio_fs.h - -diff --git a/drivers/dax/super.c b/drivers/dax/super.c -index 6e928f37d..74f3bf7ae 100644 ---- a/drivers/dax/super.c -+++ b/drivers/dax/super.c -@@ -52,7 +52,8 @@ EXPORT_SYMBOL_GPL(dax_read_unlock); - int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) - { -- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; -+ sector_t start_sect = bdev ? get_start_sect(bdev) : 0; -+ phys_addr_t phys_off = (start_sect + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); -diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c -index 4cd9ea5c7..9642fa8db 100644 ---- a/drivers/virtio/virtio_mmio.c -+++ b/drivers/virtio/virtio_mmio.c -@@ -494,6 +494,37 @@ static const char *vm_bus_name(struct virtio_device *vdev) - return vm_dev->pdev->name; - } - -+static bool vm_get_shm_region(struct virtio_device *vdev, -+ struct virtio_shm_region *region, u8 id) -+{ -+ struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); -+ u64 len, addr; -+ -+ /* Select the region we're interested in */ -+ writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); -+ -+ /* Read the region size */ -+ len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); -+ len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; -+ -+ region->len = len; -+ -+ /* Check if region length is -1. If that's the case, the shared memory -+ * region does not exist and there is no need to proceed further. 
-+ */ -+ if (len == ~(u64)0) { -+ return false; -+ } -+ -+ /* Read the region base address */ -+ addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); -+ addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; -+ -+ region->addr = addr; -+ -+ return true; -+} -+ - static const struct virtio_config_ops virtio_mmio_config_ops = { - .get = vm_get, - .set = vm_set, -@@ -506,6 +537,7 @@ static const struct virtio_config_ops virtio_mmio_config_ops = { - .get_features = vm_get_features, - .finalize_features = vm_finalize_features, - .bus_name = vm_bus_name, -+ .get_shm_region = vm_get_shm_region, - }; - - -diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c -index 07571dacc..51c9e6eca 100644 ---- a/drivers/virtio/virtio_pci_modern.c -+++ b/drivers/virtio/virtio_pci_modern.c -@@ -446,6 +446,112 @@ static void del_vq(struct virtio_pci_vq_info *info) - vring_del_virtqueue(vq); - } - -+static int virtio_pci_find_shm_cap(struct pci_dev *dev, -+ u8 required_id, -+ u8 *bar, u64 *offset, u64 *len) -+{ -+ int pos; -+ -+ for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); -+ pos > 0; -+ pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { -+ u8 type, cap_len, id; -+ u32 tmp32; -+ u64 res_offset, res_length; -+ -+ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, -+ cfg_type), -+ &type); -+ if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) -+ continue; -+ -+ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, -+ cap_len), -+ &cap_len); -+ if (cap_len != sizeof(struct virtio_pci_shm_cap)) { -+ printk(KERN_ERR "%s: shm cap with bad size offset: %d size: %d\n", -+ __func__, pos, cap_len); -+ continue; -+ }; -+ -+ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_shm_cap, -+ id), -+ &id); -+ if (id != required_id) -+ continue; -+ -+ /* Type, and ID match, looks good */ -+ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, -+ bar), -+ bar); -+ -+ /* Read the lower 32bit of length and offset */ -+ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, offset), -+ &tmp32); -+ res_offset = tmp32; -+ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, length), -+ &tmp32); -+ res_length = tmp32; -+ -+ /* and now the top half */ -+ pci_read_config_dword(dev, -+ pos + offsetof(struct virtio_pci_shm_cap, -+ offset_hi), -+ &tmp32); -+ res_offset |= ((u64)tmp32) << 32; -+ pci_read_config_dword(dev, -+ pos + offsetof(struct virtio_pci_shm_cap, -+ length_hi), -+ &tmp32); -+ res_length |= ((u64)tmp32) << 32; -+ -+ *offset = res_offset; -+ *len = res_length; -+ -+ return pos; -+ } -+ return 0; -+} -+ -+static bool vp_get_shm_region(struct virtio_device *vdev, -+ struct virtio_shm_region *region, u8 id) -+{ -+ struct virtio_pci_device *vp_dev = to_vp_device(vdev); -+ struct pci_dev *pci_dev = vp_dev->pci_dev; -+ u8 bar; -+ u64 offset, len; -+ phys_addr_t phys_addr; -+ size_t bar_len; -+ char *bar_name; -+ int ret; -+ -+ if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len)) { -+ return false; -+ } -+ -+ ret = pci_request_region(pci_dev, bar, "virtio-pci-shm"); -+ if (ret < 0) { -+ dev_err(&pci_dev->dev, "%s: failed to request BAR\n", -+ __func__); -+ return false; -+ } -+ -+ phys_addr = pci_resource_start(pci_dev, bar); -+ bar_len = pci_resource_len(pci_dev, bar); -+ -+ if (offset + len > bar_len) { -+ dev_err(&pci_dev->dev, -+ "%s: bar shorter than cap offset+len\n", -+ __func__); -+ return false; -+ } -+ -+ region->len = len; -+ region->addr = (u64) phys_addr + offset; -+ -+ return true; 
-+} -+ - static const struct virtio_config_ops virtio_pci_config_nodev_ops = { - .get = NULL, - .set = NULL, -@@ -460,6 +566,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { - .bus_name = vp_bus_name, - .set_vq_affinity = vp_set_vq_affinity, - .get_vq_affinity = vp_get_vq_affinity, -+ .get_shm_region = vp_get_shm_region, - }; - - static const struct virtio_config_ops virtio_pci_config_ops = { -@@ -476,6 +583,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = { - .bus_name = vp_bus_name, - .set_vq_affinity = vp_set_vq_affinity, - .get_vq_affinity = vp_get_vq_affinity, -+ .get_shm_region = vp_get_shm_region, - }; - - /** -diff --git a/fs/dax.c b/fs/dax.c -index 75a289c31..8c55d4bdf 100644 ---- a/fs/dax.c -+++ b/fs/dax.c -@@ -1021,12 +1021,12 @@ static int dax_writeback_one(struct dax_device *dax_dev, - * on persistent storage prior to completion of the operation. - */ - int dax_writeback_mapping_range(struct address_space *mapping, -- struct block_device *bdev, struct writeback_control *wbc) -+ struct block_device *bdev, struct dax_device *dax_dev, -+ struct writeback_control *wbc) - { - struct inode *inode = mapping->host; - pgoff_t start_index, end_index; - pgoff_t indices[PAGEVEC_SIZE]; -- struct dax_device *dax_dev; - struct pagevec pvec; - bool done = false; - int i, ret = 0; -@@ -1037,9 +1037,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, - if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) - return 0; - -- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); -- if (!dax_dev) -- return -EIO; -+ if (bdev) { -+ WARN_ON(dax_dev); -+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); -+ if (!dax_dev) -+ return -EIO; -+ } - - start_index = wbc->range_start >> PAGE_SHIFT; - end_index = wbc->range_end >> PAGE_SHIFT; -@@ -1073,7 +1076,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, - start_index = indices[pvec.nr - 1] + 1; - } - out: -- put_dax(dax_dev); -+ if (bdev) -+ put_dax(dax_dev); - trace_dax_writeback_range_done(inode, start_index, end_index); - return (ret < 0 ? 
ret : 0); - } -@@ -1141,7 +1145,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, - static bool dax_range_is_aligned(struct block_device *bdev, - unsigned int offset, unsigned int length) - { -- unsigned short sector_size = bdev_logical_block_size(bdev); -+ unsigned short sector_size; -+ -+ if (!bdev) -+ return false; -+ -+ sector_size = bdev_logical_block_size(bdev); - - if (!IS_ALIGNED(offset, sector_size)) - return false; -diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c -index e4bb9386c..c9b024daf 100644 ---- a/fs/ext2/inode.c -+++ b/fs/ext2/inode.c -@@ -956,7 +956,7 @@ static int - ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) - { - return dax_writeback_mapping_range(mapping, -- mapping->host->i_sb->s_bdev, wbc); -+ mapping->host->i_sb->s_bdev, NULL, wbc); - } - - const struct address_space_operations ext2_aops = { -diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c -index 05dc5a4ba..221824742 100644 ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -2949,7 +2949,7 @@ static int ext4_dax_writepages(struct address_space *mapping, - percpu_down_read(&sbi->s_journal_flag_rwsem); - trace_ext4_writepages(inode, wbc); - -- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); -+ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, NULL, wbc); - trace_ext4_writepages_result(inode, wbc, ret, - nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_journal_flag_rwsem); -diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig -index 76f09ce7e..46e9a8ff9 100644 ---- a/fs/fuse/Kconfig -+++ b/fs/fuse/Kconfig -@@ -26,3 +26,14 @@ config CUSE - - If you want to develop or use a userspace character device - based on CUSE, answer Y or M. -+ -+config VIRTIO_FS -+ tristate "Virtio Filesystem" -+ depends on FUSE_FS -+ select VIRTIO -+ help -+ The Virtio Filesystem allows guests to mount file systems from the -+ host. -+ -+ If you want to share files between guests or with the host, answer Y -+ or M. -diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile -index 60da84a86..d125ff826 100644 ---- a/fs/fuse/Makefile -+++ b/fs/fuse/Makefile -@@ -4,5 +4,6 @@ - - obj-$(CONFIG_FUSE_FS) += fuse.o - obj-$(CONFIG_CUSE) += cuse.o -+obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o - - fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o -diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c -index 8f6818125..d49d64f42 100644 ---- a/fs/fuse/cuse.c -+++ b/fs/fuse/cuse.c -@@ -503,9 +503,10 @@ static int cuse_channel_open(struct inode *inode, struct file *file) - * Limit the cuse channel to requests that can - * be represented in file->f_cred->user_ns. 
- */ -- fuse_conn_init(&cc->fc, file->f_cred->user_ns); -+ fuse_conn_init(&cc->fc, file->f_cred->user_ns, NULL, &fuse_dev_fiq_ops, -+ NULL); - -- fud = fuse_dev_alloc(&cc->fc); -+ fud = fuse_dev_alloc_install(&cc->fc); - if (!fud) { - kfree(cc); - return -ENOMEM; -diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c -index 6ee471b72..601da8d58 100644 ---- a/fs/fuse/dev.c -+++ b/fs/fuse/dev.c -@@ -103,6 +103,7 @@ void fuse_request_free(struct fuse_req *req) - } - kmem_cache_free(fuse_req_cachep, req); - } -+EXPORT_SYMBOL_GPL(fuse_request_free); - - void __fuse_get_request(struct fuse_req *req) - { -@@ -310,7 +311,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) - } - EXPORT_SYMBOL_GPL(fuse_put_request); - --static unsigned len_args(unsigned numargs, struct fuse_arg *args) -+unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args) - { - unsigned nbytes = 0; - unsigned i; -@@ -320,19 +321,41 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) - - return nbytes; - } -+EXPORT_SYMBOL_GPL(fuse_len_args); - --static u64 fuse_get_unique(struct fuse_iqueue *fiq) -+u64 fuse_get_unique(struct fuse_iqueue *fiq) - { - return ++fiq->reqctr; - } -+EXPORT_SYMBOL_GPL(fuse_get_unique); - --static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) -+/** -+ * A new request is available, wake fiq->waitq -+ */ -+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) -+__releases(fiq->waitq.lock) - { -- req->in.h.len = sizeof(struct fuse_in_header) + -- len_args(req->in.numargs, (struct fuse_arg *) req->in.args); -- list_add_tail(&req->list, &fiq->pending); - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); -+ spin_unlock(&fiq->waitq.lock); -+} -+ -+const struct fuse_iqueue_ops fuse_dev_fiq_ops = { -+ .wake_forget_and_unlock = fuse_dev_wake_and_unlock, -+ .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, -+ .wake_pending_and_unlock = fuse_dev_wake_and_unlock, -+}; -+EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); -+ -+static void queue_request_and_unlock(struct fuse_iqueue *fiq, -+ struct fuse_req *req) -+__releases(fiq->waitq.lock) -+{ -+ req->in.h.len = sizeof(struct fuse_in_header) + -+ fuse_len_args(req->in.numargs, -+ (struct fuse_arg *) req->in.args); -+ list_add_tail(&req->list, &fiq->pending); -+ fiq->ops->wake_pending_and_unlock(fiq); - } - - void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, -@@ -347,12 +370,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, - if (fiq->connected) { - fiq->forget_list_tail->next = forget; - fiq->forget_list_tail = forget; -- wake_up_locked(&fiq->waitq); -- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); -+ fiq->ops->wake_forget_and_unlock(fiq); - } else { - kfree(forget); -+ spin_unlock(&fiq->waitq.lock); - } -- spin_unlock(&fiq->waitq.lock); - } - - static void flush_bg_queue(struct fuse_conn *fc) -@@ -367,8 +389,7 @@ static void flush_bg_queue(struct fuse_conn *fc) - fc->active_background++; - spin_lock(&fiq->waitq.lock); - req->in.h.unique = fuse_get_unique(fiq); -- queue_request(fiq, req); -- spin_unlock(&fiq->waitq.lock); -+ queue_request_and_unlock(fiq, req); - } - } - -@@ -380,7 +401,7 @@ static void flush_bg_queue(struct fuse_conn *fc) - * the 'end' callback is called if given, else the reference to the - * request is released - */ --static void request_end(struct fuse_conn *fc, struct fuse_req *req) -+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) - { - struct fuse_iqueue *fiq = &fc->iq; - -@@ -424,6 +445,7 @@ 
static void request_end(struct fuse_conn *fc, struct fuse_req *req) - put_request: - fuse_put_request(fc, req); - } -+EXPORT_SYMBOL_GPL(fuse_request_end); - - static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) - { -@@ -434,10 +456,10 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) - } - if (list_empty(&req->intr_entry)) { - list_add_tail(&req->intr_entry, &fiq->interrupts); -- wake_up_locked(&fiq->waitq); -+ fiq->ops->wake_interrupt_and_unlock(fiq); -+ } else { -+ spin_unlock(&fiq->waitq.lock); - } -- spin_unlock(&fiq->waitq.lock); -- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); - } - - static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -@@ -496,14 +518,13 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) - req->out.h.error = -ENOTCONN; - } else { - req->in.h.unique = fuse_get_unique(fiq); -- queue_request(fiq, req); - /* acquire extra reference, since request is still needed -- after request_end() */ -+ after fuse_request_end() */ - __fuse_get_request(req); -- spin_unlock(&fiq->waitq.lock); -+ queue_request_and_unlock(fiq, req); - - request_wait_answer(fc, req); -- /* Pairs with smp_wmb() in request_end() */ -+ /* Pairs with smp_wmb() in fuse_request_end() */ - smp_rmb(); - } - } -@@ -635,10 +656,11 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, - req->in.h.unique = unique; - spin_lock(&fiq->waitq.lock); - if (fiq->connected) { -- queue_request(fiq, req); -+ queue_request_and_unlock(fiq, req); - err = 0; -+ } else { -+ spin_unlock(&fiq->waitq.lock); - } -- spin_unlock(&fiq->waitq.lock); - - return err; - } -@@ -1236,7 +1258,7 @@ __releases(fiq->waitq.lock) - * the pending list and copies request data to userspace buffer. If - * no reply is needed (FORGET) or request has been aborted or there - * was an error during the copying then it's finished by calling -- * request_end(). Otherwise add it to the processing list, and set -+ * fuse_request_end(). Otherwise add it to the processing list, and set - * the 'sent' flag. - */ - static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, -@@ -1295,7 +1317,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, - /* SETXATTR is special, since it may contain too large data */ - if (in->h.opcode == FUSE_SETXATTR) - req->out.h.error = -E2BIG; -- request_end(fc, req); -+ fuse_request_end(fc, req); - goto restart; - } - spin_lock(&fpq->lock); -@@ -1337,7 +1359,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, - if (!test_bit(FR_PRIVATE, &req->flags)) - list_del_init(&req->list); - spin_unlock(&fpq->lock); -- request_end(fc, req); -+ fuse_request_end(fc, req); - return err; - - err_unlock: -@@ -1824,7 +1846,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, - if (out->h.error) - return nbytes != reqsize ? -EINVAL : 0; - -- reqsize += len_args(out->numargs, out->args); -+ reqsize += fuse_len_args(out->numargs, out->args); - - if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) - return -EINVAL; -@@ -1844,7 +1866,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, - * the write buffer. The request is then searched on the processing - * list by the unique ID found in the header. If found, then remove - * it from the list and copy the rest of the buffer to the request. -- * The request is finished by calling request_end() -+ * The request is finished by calling fuse_request_end(). 
- */ - static ssize_t fuse_dev_do_write(struct fuse_dev *fud, - struct fuse_copy_state *cs, size_t nbytes) -@@ -1931,7 +1953,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, - list_del_init(&req->list); - spin_unlock(&fpq->lock); - -- request_end(fc, req); -+ fuse_request_end(fc, req); - - return err ? err : nbytes; - -@@ -2077,7 +2099,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) - req->out.h.error = -ECONNABORTED; - clear_bit(FR_SENT, &req->flags); - list_del_init(&req->list); -- request_end(fc, req); -+ fuse_request_end(fc, req); - } - } - -@@ -2223,7 +2245,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) - if (new->private_data) - return -EINVAL; - -- fud = fuse_dev_alloc(fc); -+ fud = fuse_dev_alloc_install(fc); - if (!fud) - return -ENOMEM; - -diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c -index 82a132217..3f923fe78 100644 ---- a/fs/fuse/dir.c -+++ b/fs/fuse/dir.c -@@ -44,12 +44,26 @@ union fuse_dentry { - struct rcu_head rcu; - }; - --static inline void fuse_dentry_settime(struct dentry *entry, u64 time) -+static void fuse_dentry_settime(struct dentry *dentry, u64 time) - { -- ((union fuse_dentry *) entry->d_fsdata)->time = time; -+ /* -+ * Mess with DCACHE_OP_DELETE because dput() will be faster without it. -+ * Don't care about races, either way it's just an optimization -+ */ -+ if ((time && (dentry->d_flags & DCACHE_OP_DELETE)) || -+ (!time && !(dentry->d_flags & DCACHE_OP_DELETE))) { -+ spin_lock(&dentry->d_lock); -+ if (time) -+ dentry->d_flags &= ~DCACHE_OP_DELETE; -+ else -+ dentry->d_flags |= DCACHE_OP_DELETE; -+ spin_unlock(&dentry->d_lock); -+ } -+ -+ ((union fuse_dentry *) dentry->d_fsdata)->time = time; - } - --static inline u64 fuse_dentry_time(struct dentry *entry) -+static inline u64 fuse_dentry_time(const struct dentry *entry) - { - return ((union fuse_dentry *) entry->d_fsdata)->time; - } -@@ -280,8 +294,14 @@ static void fuse_dentry_release(struct dentry *dentry) - kfree_rcu(fd, rcu); - } - -+static int fuse_dentry_delete(const struct dentry *dentry) -+{ -+ return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); -+} -+ - const struct dentry_operations fuse_dentry_operations = { - .d_revalidate = fuse_dentry_revalidate, -+ .d_delete = fuse_dentry_delete, - .d_init = fuse_dentry_init, - .d_release = fuse_dentry_release, - }; -@@ -1728,8 +1748,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - */ - if ((is_truncate || !is_wb) && - S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { -+ down_write(&fi->i_mmap_sem); - truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); -+ up_write(&fi->i_mmap_sem); - } - - clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); -diff --git a/fs/fuse/file.c b/fs/fuse/file.c -index 9a22aa580..7333b449e 100644 ---- a/fs/fuse/file.c -+++ b/fs/fuse/file.c -@@ -18,8 +18,18 @@ - #include - #include - #include -+#include -+#include -+#include - --static const struct file_operations fuse_direct_io_file_operations; -+INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, -+ rb, __u64, __subtree_last, -+ START, LAST, static inline, fuse_dax_interval_tree); -+ -+static long __fuse_file_fallocate(struct file *file, int mode, -+ loff_t offset, loff_t length); -+static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc, -+ struct inode *inode); - - static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, - int opcode, struct fuse_open_out *outargp) -@@ -170,13 +180,222 @@ static void 
fuse_link_write_file(struct file *file) - spin_unlock(&fc->lock); - } - -+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc) -+{ -+ unsigned long free_threshold; -+ struct fuse_dax_mapping *dmap = NULL; -+ -+ spin_lock(&fc->lock); -+ -+ /* TODO: Add logic to try to free up memory if wait is allowed */ -+ if (fc->nr_free_ranges <= 0) { -+ spin_unlock(&fc->lock); -+ goto out_kick; -+ } -+ -+ WARN_ON(list_empty(&fc->free_ranges)); -+ -+ /* Take a free range */ -+ dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping, -+ list); -+ list_del_init(&dmap->list); -+ fc->nr_free_ranges--; -+ spin_unlock(&fc->lock); -+ -+out_kick: -+ /* If number of free ranges are below threshold, start reclaim */ -+ free_threshold = max((fc->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD)/100, -+ (unsigned long)1); -+ if (fc->nr_free_ranges < free_threshold) { -+ pr_debug("fuse: Kicking dax memory reclaim worker. nr_free_ranges=0x%ld nr_total_ranges=%ld\n", fc->nr_free_ranges, fc->nr_ranges); -+ queue_delayed_work(system_long_wq, &fc->dax_free_work, 0); -+ } -+ return dmap; -+} -+ -+/* This assumes fc->lock is held */ -+static void __dmap_remove_busy_list(struct fuse_conn *fc, -+ struct fuse_dax_mapping *dmap) -+{ -+ list_del_init(&dmap->busy_list); -+ WARN_ON(fc->nr_busy_ranges == 0); -+ fc->nr_busy_ranges--; -+} -+ -+static void dmap_remove_busy_list(struct fuse_conn *fc, -+ struct fuse_dax_mapping *dmap) -+{ -+ spin_lock(&fc->lock); -+ __dmap_remove_busy_list(fc, dmap); -+ spin_unlock(&fc->lock); -+} -+ -+/* This assumes fc->lock is held */ -+static void __free_dax_mapping(struct fuse_conn *fc, -+ struct fuse_dax_mapping *dmap) -+{ -+ list_add_tail(&dmap->list, &fc->free_ranges); -+ fc->nr_free_ranges++; -+ /* TODO: Wake up only when needed */ -+ wake_up(&fc->dax_range_waitq); -+} -+ -+static void free_dax_mapping(struct fuse_conn *fc, -+ struct fuse_dax_mapping *dmap) -+{ -+ /* Return fuse_dax_mapping to free list */ -+ spin_lock(&fc->lock); -+ __free_dax_mapping(fc, dmap); -+ spin_unlock(&fc->lock); -+} -+ -+/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */ -+static int fuse_setup_one_mapping(struct inode *inode, -+ struct file *file, loff_t offset, -+ struct fuse_dax_mapping *dmap) -+{ -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ struct fuse_file *ff = NULL; -+ struct fuse_setupmapping_in inarg; -+ FUSE_ARGS(args); -+ ssize_t err; -+ -+ if (file) -+ ff = file->private_data; -+ -+ WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ); -+ WARN_ON(fc->nr_free_ranges < 0); -+ -+ /* Ask fuse daemon to setup mapping */ -+ memset(&inarg, 0, sizeof(inarg)); -+ inarg.foffset = offset; -+ if (ff) -+ inarg.fh = ff->fh; -+ else -+ inarg.fh = -1; -+ inarg.moffset = dmap->window_offset; -+ inarg.len = FUSE_DAX_MEM_RANGE_SZ; -+ if (file) { -+ inarg.flags |= (file->f_mode & FMODE_WRITE) ? -+ FUSE_SETUPMAPPING_FLAG_WRITE : 0; -+ inarg.flags |= (file->f_mode & FMODE_READ) ? -+ FUSE_SETUPMAPPING_FLAG_READ : 0; -+ } else { -+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; -+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; -+ } -+ args.in.h.opcode = FUSE_SETUPMAPPING; -+ args.in.h.nodeid = fi->nodeid; -+ args.in.numargs = 1; -+ args.in.args[0].size = sizeof(inarg); -+ args.in.args[0].value = &inarg; -+ err = fuse_simple_request(fc, &args); -+ if (err < 0) { -+ printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n", -+ __func__, dmap->window_offset, err); -+ return err; -+ } -+ -+ pr_debug("fuse_setup_one_mapping() succeeded. 
offset=0x%llx err=%zd\n", offset, err); -+ -+ /* -+ * We don't take a refernce on inode. inode is valid right now and -+ * when inode is going away, cleanup logic should first cleanup -+ * dmap entries. -+ * -+ * TODO: Do we need to ensure that we are holding inode lock -+ * as well. -+ */ -+ dmap->inode = inode; -+ dmap->start = offset; -+ dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1; -+ /* Protected by fi->i_dmap_sem */ -+ fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree); -+ fi->nr_dmaps++; -+ spin_lock(&fc->lock); -+ list_add_tail(&dmap->busy_list, &fc->busy_ranges); -+ fc->nr_busy_ranges++; -+ spin_unlock(&fc->lock); -+ return 0; -+} -+ -+static int fuse_removemapping_one(struct inode *inode, -+ struct fuse_dax_mapping *dmap) -+{ -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ struct fuse_removemapping_in inarg; -+ FUSE_ARGS(args); -+ -+ memset(&inarg, 0, sizeof(inarg)); -+ inarg.moffset = dmap->window_offset; -+ inarg.len = dmap->length; -+ args.in.h.opcode = FUSE_REMOVEMAPPING; -+ args.in.h.nodeid = fi->nodeid; -+ args.in.numargs = 1; -+ args.in.args[0].size = sizeof(inarg); -+ args.in.args[0].value = &inarg; -+ return fuse_simple_request(fc, &args); -+} -+ -+/* -+ * It is called from evict_inode() and by that time inode is going away. So -+ * this function does not take any locks like fi->i_dmap_sem for traversing -+ * that fuse inode interval tree. If that lock is taken then lock validator -+ * complains of deadlock situation w.r.t fs_reclaim lock. -+ */ -+void fuse_removemapping(struct inode *inode) -+{ -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ ssize_t err; -+ struct fuse_dax_mapping *dmap; -+ -+ /* Clear the mappings list */ -+ while (true) { -+ WARN_ON(fi->nr_dmaps < 0); -+ -+ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, -+ -1); -+ if (dmap) { -+ fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree); -+ fi->nr_dmaps--; -+ dmap_remove_busy_list(fc, dmap); -+ } -+ -+ if (!dmap) -+ break; -+ -+ /* -+ * During umount/shutdown, fuse connection is dropped first -+ * and later evict_inode() is called later. That means any -+ * removemapping messages are going to fail. Send messages -+ * only if connection is up. Otherwise fuse daemon is -+ * responsible for cleaning up any leftover references and -+ * mappings. -+ */ -+ if (fc->connected) { -+ err = fuse_removemapping_one(inode, dmap); -+ if (err) { -+ pr_warn("Failed to removemapping. 
offset=0x%llx" -+ " len=0x%llx\n", dmap->window_offset, -+ dmap->length); -+ } -+ } -+ -+ dmap->inode = NULL; -+ -+ /* Add it back to free ranges list */ -+ free_dax_mapping(fc, dmap); -+ } -+} -+ - void fuse_finish_open(struct inode *inode, struct file *file) - { - struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = get_fuse_conn(inode); - -- if (ff->open_flags & FOPEN_DIRECT_IO) -- file->f_op = &fuse_direct_io_file_operations; - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); - if (ff->open_flags & FOPEN_STREAM) -@@ -204,7 +423,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) - int err; - bool lock_inode = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && -- fc->writeback_cache; -+ (fc->writeback_cache || IS_DAX(inode)); - - err = generic_file_open(inode, file); - if (err) -@@ -252,6 +471,7 @@ void fuse_release_common(struct file *file, bool isdir) - struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; - int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; -+ bool sync = false; - - fuse_prepare_release(ff, file->f_flags, opcode); - -@@ -272,8 +492,20 @@ void fuse_release_common(struct file *file, bool isdir) - * Make the release synchronous if this is a fuseblk mount, - * synchronous RELEASE is allowed (and desirable) in this case - * because the server can be trusted not to screw up. -+ * -+ * For DAX, fuse server is trusted. So it should be fine to -+ * do a sync file put. Doing async file put is creating -+ * problems right now because when request finish, iput() -+ * can lead to freeing of inode. That means it tears down -+ * mappings backing DAX memory and sends REMOVEMAPPING message -+ * to server and blocks for completion. Currently, waiting -+ * in req->end context deadlocks the system as same worker thread -+ * can't process REMOVEMAPPING reply it is waiting for. - */ -- fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); -+ if (IS_DAX(req->misc.release.inode) || ff->fc->destroy_req != NULL) -+ sync = true; -+ -+ fuse_file_put(ff, sync, isdir); - } - - static int fuse_open(struct inode *inode, struct file *file) -@@ -918,11 +1150,23 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, - return err; - } - -+ -+static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to); -+static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); -+ - static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) - { -- struct inode *inode = iocb->ki_filp->f_mapping->host; -+ struct file *file = iocb->ki_filp; -+ struct fuse_file *ff = file->private_data; -+ struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - -+ if (ff->open_flags & FOPEN_DIRECT_IO) -+ return fuse_direct_read_iter(iocb, to); -+ -+ if (IS_DAX(inode)) -+ return fuse_dax_read_iter(iocb, to); -+ - /* - * In auto invalidate mode, always update attributes on read. - * Otherwise, only update if we attempt to read past EOF (to ensure -@@ -1170,9 +1414,14 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, - return res > 0 ? 
res : err; - } - -+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, -+ struct iov_iter *from); -+static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); -+ - static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) - { - struct file *file = iocb->ki_filp; -+ struct fuse_file *ff = file->private_data; - struct address_space *mapping = file->f_mapping; - ssize_t written = 0; - ssize_t written_buffered = 0; -@@ -1180,6 +1429,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) - ssize_t err; - loff_t endbyte = 0; - -+ if (ff->open_flags & FOPEN_DIRECT_IO) -+ return fuse_direct_write_iter(iocb, from); -+ if (IS_DAX(inode)) -+ return fuse_dax_write_iter(iocb, from); -+ - if (get_fuse_conn(inode)->writeback_cache) { - /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); -@@ -1444,16 +1698,279 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) - /* Don't allow parallel writes to the same file */ - inode_lock(inode); - res = generic_write_checks(iocb, from); -- if (res > 0) -- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); -+ if (res < 0) -+ goto out_invalidate; -+ -+ res = file_remove_privs(iocb->ki_filp); -+ if (res) -+ goto out_invalidate; -+ -+ res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); -+ if (res < 0) -+ goto out_invalidate; -+ - fuse_invalidate_attr(inode); -- if (res > 0) -- fuse_write_update_size(inode, iocb->ki_pos); -+ fuse_write_update_size(inode, iocb->ki_pos); - inode_unlock(inode); -+ return res; - -+out_invalidate: -+ fuse_invalidate_attr(inode); -+ inode_unlock(inode); - return res; - } - -+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) -+{ -+ iomap->addr = IOMAP_NULL_ADDR; -+ iomap->length = length; -+ iomap->type = IOMAP_HOLE; -+} -+ -+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, -+ struct iomap *iomap, struct fuse_dax_mapping *dmap, -+ unsigned flags) -+{ -+ loff_t offset, len; -+ loff_t i_size = i_size_read(inode); -+ -+ offset = pos - dmap->start; -+ len = min(length, dmap->length - offset); -+ -+ /* If length is beyond end of file, truncate further */ -+ if (pos + len > i_size) -+ len = i_size - pos; -+ -+ if (len > 0) { -+ iomap->addr = dmap->window_offset + offset; -+ iomap->length = len; -+ if (flags & IOMAP_FAULT) -+ iomap->length = ALIGN(len, PAGE_SIZE); -+ iomap->type = IOMAP_MAPPED; -+ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" -+ " length 0x%llx\n", __func__, iomap->addr, -+ iomap->offset, iomap->length); -+ } else { -+ /* Mapping beyond end of file is hole */ -+ fuse_fill_iomap_hole(iomap, length); -+ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" -+ "length 0x%llx\n", __func__, iomap->addr, -+ iomap->offset, iomap->length); -+ } -+} -+ -+/* This is just for DAX and the mapping is ephemeral, do not use it for other -+ * purposes since there is no block device with a permanent mapping. -+ */ -+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, -+ unsigned flags, struct iomap *iomap) -+{ -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; -+ int ret; -+ -+ /* We don't support FIEMAP */ -+ BUG_ON(flags & IOMAP_REPORT); -+ -+ pr_debug("fuse_iomap_begin() called. 
pos=0x%llx length=0x%llx\n", -+ pos, length); -+ -+ iomap->offset = pos; -+ iomap->flags = 0; -+ iomap->bdev = NULL; -+ iomap->dax_dev = fc->dax_dev; -+ -+ /* -+ * Both read/write and mmap path can race here. So we need something -+ * to make sure if we are setting up mapping, then other path waits -+ * -+ * For now, use a semaphore for this. It probably needs to be -+ * optimized later. -+ */ -+ down_read(&fi->i_dmap_sem); -+ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos); -+ -+ if (dmap) { -+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); -+ up_read(&fi->i_dmap_sem); -+ return 0; -+ } else { -+ up_read(&fi->i_dmap_sem); -+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", -+ __func__, pos, length); -+ if (pos >= i_size_read(inode)) -+ goto iomap_hole; -+ -+ /* Can't do reclaim in fault path yet due to lock ordering. -+ * Read path takes shared inode lock and that's not sufficient -+ * for inline range reclaim. Caller needs to drop lock, wait -+ * and retry. -+ */ -+ if (flags & IOMAP_FAULT || !(flags & IOMAP_WRITE)) { -+ alloc_dmap = alloc_dax_mapping(fc); -+ if (!alloc_dmap) -+ return -ENOSPC; -+ } else { -+ alloc_dmap = alloc_dax_mapping_reclaim(fc, inode); -+ if (IS_ERR(alloc_dmap)) -+ return PTR_ERR(alloc_dmap); -+ } -+ -+ /* If we are here, we should have memory allocated */ -+ if (WARN_ON(!alloc_dmap)) -+ return -EBUSY; -+ -+ /* -+ * Drop read lock and take write lock so that only one -+ * caller can try to setup mapping and other waits -+ */ -+ down_write(&fi->i_dmap_sem); -+ /* -+ * We dropped lock. Check again if somebody else setup -+ * mapping already. -+ */ -+ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, -+ pos); -+ if (dmap) { -+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); -+ free_dax_mapping(fc, alloc_dmap); -+ up_write(&fi->i_dmap_sem); -+ return 0; -+ } -+ -+ /* Setup one mapping */ -+ ret = fuse_setup_one_mapping(inode, NULL, -+ ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), -+ alloc_dmap); -+ if (ret < 0) { -+ printk("fuse_setup_one_mapping() failed. err=%d" -+ " pos=0x%llx\n", ret, pos); -+ free_dax_mapping(fc, alloc_dmap); -+ up_write(&fi->i_dmap_sem); -+ return ret; -+ } -+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); -+ up_write(&fi->i_dmap_sem); -+ return 0; -+ } -+ -+ /* -+ * If read beyond end of file happnes, fs code seems to return -+ * it as hole -+ */ -+iomap_hole: -+ fuse_fill_iomap_hole(iomap, length); -+ pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length); -+ return 0; -+} -+ -+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, -+ ssize_t written, unsigned flags, -+ struct iomap *iomap) -+{ -+ /* DAX writes beyond end-of-file aren't handled using iomap, so the -+ * file size is unchanged and there is nothing to do here. 
-+ */ -+ return 0; -+} -+ -+static const struct iomap_ops fuse_iomap_ops = { -+ .iomap_begin = fuse_iomap_begin, -+ .iomap_end = fuse_iomap_end, -+}; -+ -+static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) -+{ -+ struct inode *inode = file_inode(iocb->ki_filp); -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ ssize_t ret; -+ bool retry = false; -+ -+retry: -+ if (retry && !(fc->nr_free_ranges > 0)) { -+ ret = -EINTR; -+ if (wait_event_killable_exclusive(fc->dax_range_waitq, -+ (fc->nr_free_ranges > 0))) { -+ goto out; -+ } -+ } -+ -+ if (iocb->ki_flags & IOCB_NOWAIT) { -+ if (!inode_trylock_shared(inode)) -+ return -EAGAIN; -+ } else { -+ inode_lock_shared(inode); -+ } -+ -+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); -+ inode_unlock_shared(inode); -+ -+ /* If a dax range could not be allocated and it can't be reclaimed -+ * inline, then drop inode lock and retry. Range reclaim logic -+ * requires exclusive access to inode lock. -+ * -+ * TODO: What if -ENOSPC needs to be returned to user space. Fix it. -+ */ -+ if (ret == -ENOSPC) { -+ retry = true; -+ goto retry; -+ } -+ /* TODO file_accessed(iocb->f_filp) */ -+ -+out: -+ return ret; -+} -+ -+static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) -+{ -+ struct inode *inode = file_inode(iocb->ki_filp); -+ ssize_t ret; -+ -+ if (iocb->ki_flags & IOCB_NOWAIT) { -+ if (!inode_trylock(inode)) -+ return -EAGAIN; -+ } else { -+ inode_lock(inode); -+ } -+ -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto out; -+ -+ ret = file_remove_privs(iocb->ki_filp); -+ if (ret) -+ goto out; -+ /* TODO file_update_time() but we don't want metadata I/O */ -+ -+ /* TODO handle growing the file */ -+ /* Grow file here if need be. iomap_begin() does not have access -+ * to file pointer -+ */ -+ if (iov_iter_rw(from) == WRITE && -+ ((iocb->ki_pos + iov_iter_count(from)) > i_size_read(inode))) { -+ ret = __fuse_file_fallocate(iocb->ki_filp, 0, iocb->ki_pos, -+ iov_iter_count(from)); -+ if (ret < 0) { -+ printk("fallocate(offset=0x%llx length=0x%zx)" -+ " failed. err=%zd\n", iocb->ki_pos, -+ iov_iter_count(from), ret); -+ goto out; -+ } -+ pr_debug("fallocate(offset=0x%llx length=0x%zx)" -+ " succeed. 
ret=%zd\n", iocb->ki_pos, iov_iter_count(from), ret); -+ } -+ -+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); -+ -+out: -+ inode_unlock(inode); -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); -+ return ret; -+} -+ - static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) - { - int i; -@@ -1903,6 +2420,17 @@ static int fuse_writepages_fill(struct page *page, - return err; - } - -+static int fuse_dax_writepages(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ -+ struct inode *inode = mapping->host; -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ -+ return dax_writeback_mapping_range(mapping, -+ NULL, fc->dax_dev, wbc); -+} -+ - static int fuse_writepages(struct address_space *mapping, - struct writeback_control *wbc) - { -@@ -2076,8 +2604,20 @@ static const struct vm_operations_struct fuse_file_vm_ops = { - .page_mkwrite = fuse_page_mkwrite, - }; - -+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma); -+static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); -+ - static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) - { -+ struct fuse_file *ff = file->private_data; -+ -+ /* DAX mmap is superior to direct_io mmap */ -+ if (IS_DAX(file_inode(file))) -+ return fuse_dax_mmap(file, vma); -+ -+ if (ff->open_flags & FOPEN_DIRECT_IO) -+ return fuse_direct_mmap(file, vma); -+ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - fuse_link_write_file(file); - -@@ -2097,6 +2637,103 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) - return generic_file_mmap(file, vma); - } - -+static ssize_t fuse_file_splice_read(struct file *in, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags) -+{ -+ struct fuse_file *ff = in->private_data; -+ -+ if (ff->open_flags & FOPEN_DIRECT_IO) -+ return default_file_splice_read(in, ppos, pipe, len, flags); -+ else -+ return generic_file_splice_read(in, ppos, pipe, len, flags); -+ -+} -+static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size, -+ bool write) -+{ -+ int ret, error = 0; -+ struct inode *inode = file_inode(vmf->vma->vm_file); -+ struct super_block *sb = inode->i_sb; -+ pfn_t pfn; -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ bool retry = false; -+ -+ if (write) -+ sb_start_pagefault(sb); -+ -+retry: -+ if (retry && !(fc->nr_free_ranges > 0)) { -+ ret = -EINTR; -+ if (wait_event_killable_exclusive(fc->dax_range_waitq, -+ (fc->nr_free_ranges > 0))) -+ goto out; -+ } -+ -+ /* -+ * We need to serialize against not only truncate but also against -+ * fuse dax memory range reclaim. While a range is being reclaimed, -+ * we do not want any read/write/mmap to make progress and try -+ * to populate page cache or access memory we are trying to free. 
-+ */ -+ down_read(&get_fuse_inode(inode)->i_mmap_sem); -+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); -+ if ((ret & VM_FAULT_ERROR) && error == -ENOSPC) { -+ error = 0; -+ retry = true; -+ up_read(&get_fuse_inode(inode)->i_mmap_sem); -+ goto retry; -+ } -+ -+ if (ret & VM_FAULT_NEEDDSYNC) -+ ret = dax_finish_sync_fault(vmf, pe_size, pfn); -+ -+ up_read(&get_fuse_inode(inode)->i_mmap_sem); -+ -+out: -+ if (write) -+ sb_end_pagefault(sb); -+ -+ return ret; -+} -+ -+static int fuse_dax_fault(struct vm_fault *vmf) -+{ -+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, -+ vmf->flags & FAULT_FLAG_WRITE); -+} -+ -+static int fuse_dax_huge_fault(struct vm_fault *vmf, -+ enum page_entry_size pe_size) -+{ -+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); -+} -+ -+static int fuse_dax_page_mkwrite(struct vm_fault *vmf) -+{ -+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); -+} -+ -+static int fuse_dax_pfn_mkwrite(struct vm_fault *vmf) -+{ -+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); -+} -+ -+static const struct vm_operations_struct fuse_dax_vm_ops = { -+ .fault = fuse_dax_fault, -+ .huge_fault = fuse_dax_huge_fault, -+ .page_mkwrite = fuse_dax_page_mkwrite, -+ .pfn_mkwrite = fuse_dax_pfn_mkwrite, -+}; -+ -+static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ file_accessed(file); -+ vma->vm_ops = &fuse_dax_vm_ops; -+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; -+ return 0; -+} -+ - static int convert_fuse_file_lock(struct fuse_conn *fc, - const struct fuse_file_lock *ffl, - struct file_lock *fl) -@@ -2940,8 +3577,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) - return ret; - } - --static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, -- loff_t length) -+/* -+ * This variant does not take any inode lock and if locking is required, -+ * caller is supposed to hold lock -+ */ -+static long __fuse_file_fallocate(struct file *file, int mode, -+ loff_t offset, loff_t length) - { - struct fuse_file *ff = file->private_data; - struct inode *inode = file_inode(file); -@@ -2955,8 +3596,6 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - .mode = mode - }; - int err; -- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || -- (mode & FALLOC_FL_PUNCH_HOLE); - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; -@@ -2964,17 +3603,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - if (fc->no_fallocate) - return -EOPNOTSUPP; - -- if (lock_inode) { -- inode_lock(inode); -- if (mode & FALLOC_FL_PUNCH_HOLE) { -- loff_t endbyte = offset + length - 1; -- err = filemap_write_and_wait_range(inode->i_mapping, -- offset, endbyte); -- if (err) -- goto out; -- -- fuse_sync_writes(inode); -- } -+ if (mode & FALLOC_FL_PUNCH_HOLE) { -+ loff_t endbyte = offset + length - 1; -+ err = filemap_write_and_wait_range(inode->i_mapping, offset, -+ endbyte); -+ if (err) -+ goto out; -+ fuse_sync_writes(inode); - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && -@@ -3008,18 +3643,42 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - file_update_time(file); - } - -- if (mode & FALLOC_FL_PUNCH_HOLE) -+ if (mode & FALLOC_FL_PUNCH_HOLE) { -+ down_write(&fi->i_mmap_sem); - truncate_pagecache_range(inode, offset, offset + length - 1); -- -+ up_write(&fi->i_mmap_sem); -+ } - fuse_invalidate_attr(inode); - - out: - if (!(mode & FALLOC_FL_KEEP_SIZE)) - clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - -+ return err; -+} -+ 
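-+/*
-+ * Note: __fuse_file_fallocate() above is the lock-free variant for
-+ * callers that already hold the inode lock. fuse_dax_write_iter() uses
-+ * it to grow the file while holding inode_lock(), roughly:
-+ *
-+ *	ret = __fuse_file_fallocate(iocb->ki_filp, 0, iocb->ki_pos,
-+ *				    iov_iter_count(from));
-+ */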
-+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, -+ loff_t length) -+{ -+ struct fuse_file *ff = file->private_data; -+ struct inode *inode = file_inode(file); -+ struct fuse_conn *fc = ff->fc; -+ int err; -+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || -+ (mode & FALLOC_FL_PUNCH_HOLE); -+ -+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) -+ return -EOPNOTSUPP; -+ -+ if (fc->no_fallocate) -+ return -EOPNOTSUPP; -+ - if (lock_inode) -- inode_unlock(inode); -+ inode_lock(inode); - -+ err = __fuse_file_fallocate(file, mode, offset, length); -+ if (lock_inode) -+ inode_unlock(inode); - return err; - } - -@@ -3027,38 +3686,21 @@ static const struct file_operations fuse_file_operations = { - .llseek = fuse_file_llseek, - .read_iter = fuse_file_read_iter, - .write_iter = fuse_file_write_iter, -- .mmap = fuse_file_mmap, -+ .mmap = fuse_file_mmap, -+ .splice_read = fuse_file_splice_read, - .open = fuse_open, - .flush = fuse_flush, - .release = fuse_release, - .fsync = fuse_fsync, - .lock = fuse_file_lock, -+ .get_unmapped_area = thp_get_unmapped_area, - .flock = fuse_file_flock, -- .splice_read = generic_file_splice_read, - .unlocked_ioctl = fuse_file_ioctl, - .compat_ioctl = fuse_file_compat_ioctl, - .poll = fuse_file_poll, - .fallocate = fuse_file_fallocate, - }; - --static const struct file_operations fuse_direct_io_file_operations = { -- .llseek = fuse_file_llseek, -- .read_iter = fuse_direct_read_iter, -- .write_iter = fuse_direct_write_iter, -- .mmap = fuse_direct_mmap, -- .open = fuse_open, -- .flush = fuse_flush, -- .release = fuse_release, -- .fsync = fuse_fsync, -- .lock = fuse_file_lock, -- .flock = fuse_file_flock, -- .unlocked_ioctl = fuse_file_ioctl, -- .compat_ioctl = fuse_file_compat_ioctl, -- .poll = fuse_file_poll, -- .fallocate = fuse_file_fallocate, -- /* no splice_read */ --}; -- - static const struct address_space_operations fuse_file_aops = { - .readpage = fuse_readpage, - .writepage = fuse_writepage, -@@ -3072,8 +3714,271 @@ static const struct address_space_operations fuse_file_aops = { - .write_end = fuse_write_end, - }; - -+static const struct address_space_operations fuse_dax_file_aops = { -+ .writepages = fuse_dax_writepages, -+ .direct_IO = noop_direct_IO, -+ .set_page_dirty = noop_set_page_dirty, -+ .invalidatepage = noop_invalidatepage, -+}; -+ - void fuse_init_file_inode(struct inode *inode) - { -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ struct fuse_conn *fc = get_fuse_conn(inode); -+ - inode->i_fop = &fuse_file_operations; - inode->i_data.a_ops = &fuse_file_aops; -+ fi->dmap_tree = RB_ROOT_CACHED; -+ -+ if (fc->dax_dev) { -+ inode->i_flags |= S_DAX; -+ inode->i_data.a_ops = &fuse_dax_file_aops; -+ } -+} -+ -+int fuse_dax_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode, -+ struct fuse_dax_mapping *dmap) -+{ -+ int ret; -+ struct fuse_inode *fi = get_fuse_inode(inode); -+ -+ ret = filemap_fdatawrite_range(inode->i_mapping, dmap->start, -+ dmap->end); -+ if (ret) { -+ printk("filemap_fdatawrite_range() failed. err=%d start=0x%llx," -+ " end=0x%llx\n", ret, dmap->start, dmap->end); -+ return ret; -+ } -+ -+ ret = invalidate_inode_pages2_range(inode->i_mapping, -+ dmap->start >> PAGE_SHIFT, -+ dmap->end >> PAGE_SHIFT); -+ /* TODO: What to do if above fails? For now, -+ * leave the range in place. 
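-+	 * Returning the error keeps the range in the interval tree and
-+	 * on the busy list, so a later reclaim pass can retry it.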
-+ */
-+	if (ret) {
-+		printk("invalidate_inode_pages2_range() failed err=%d\n", ret);
-+		return ret;
-+	}
-+
-+	/* Remove dax mapping from inode interval tree now */
-+	fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
-+	fi->nr_dmaps--;
-+	return 0;
-+}
-+
-+/* Find the first mapping in the tree and free it. */
-+struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping_locked(
-+				struct fuse_conn *fc, struct inode *inode)
-+{
-+	struct fuse_inode *fi = get_fuse_inode(inode);
-+	struct fuse_dax_mapping *dmap;
-+	int ret;
-+
-+	/* Find the first fuse dax mapping in the file. */
-+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, -1);
-+	if (!dmap)
-+		return NULL;
-+
-+	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
-+	if (ret < 0)
-+		return ERR_PTR(ret);
-+
-+	/* Clean up dmap. Do not add back to free list */
-+	dmap_remove_busy_list(fc, dmap);
-+	dmap->inode = NULL;
-+	dmap->start = dmap->end = 0;
-+
-+	pr_debug("fuse: reclaimed memory range window_offset=0x%llx,"
-+		 " length=0x%llx\n", dmap->window_offset,
-+		 dmap->length);
-+	return dmap;
-+}
-+
-+/*
-+ * Find the first mapping in the tree, free it and return it. Do not add
-+ * it back to the free pool.
-+ *
-+ * This is called with the inode lock held.
-+ */
-+struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping(struct fuse_conn *fc,
-+					struct inode *inode)
-+{
-+	struct fuse_inode *fi = get_fuse_inode(inode);
-+	struct fuse_dax_mapping *dmap;
-+
-+	down_write(&fi->i_mmap_sem);
-+	down_write(&fi->i_dmap_sem);
-+	dmap = fuse_dax_reclaim_first_mapping_locked(fc, inode);
-+	up_write(&fi->i_dmap_sem);
-+	up_write(&fi->i_mmap_sem);
-+	return dmap;
-+}
-+
-+static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
-+					struct inode *inode)
-+{
-+	struct fuse_dax_mapping *dmap;
-+	struct fuse_inode *fi = get_fuse_inode(inode);
-+
-+	while (1) {
-+		dmap = alloc_dax_mapping(fc);
-+		if (dmap)
-+			return dmap;
-+
-+		if (fi->nr_dmaps)
-+			return fuse_dax_reclaim_first_mapping(fc, inode);
-+		/*
-+		 * There are no mappings which can be reclaimed.
-+		 * Wait for one.
-+		 */
-+		if (!(fc->nr_free_ranges > 0)) {
-+			if (wait_event_killable_exclusive(fc->dax_range_waitq,
-+					(fc->nr_free_ranges > 0)))
-+				return ERR_PTR(-EINTR);
-+		}
-+	}
-+}
-+
-+int fuse_dax_free_one_mapping_locked(struct fuse_conn *fc, struct inode *inode,
-+				     u64 dmap_start)
-+{
-+	int ret;
-+	struct fuse_inode *fi = get_fuse_inode(inode);
-+	struct fuse_dax_mapping *dmap;
-+
-+	WARN_ON(!inode_is_locked(inode));
-+
-+	/* Find the fuse dax mapping at file offset dmap_start. */
-+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
-+						 dmap_start);
-+
-+	/* Range already got cleaned up by somebody else */
-+	if (!dmap)
-+		return 0;
-+
-+	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
-+	if (ret < 0)
-+		return ret;
-+
-+	/* Clean up dmap entry and add it back to the free list */
-+	spin_lock(&fc->lock);
-+	__dmap_remove_busy_list(fc, dmap);
-+	dmap->inode = NULL;
-+	dmap->start = dmap->end = 0;
-+	__free_dax_mapping(fc, dmap);
-+	spin_unlock(&fc->lock);
-+
-+	pr_debug("fuse: freed memory range window_offset=0x%llx,"
-+		 " length=0x%llx\n", dmap->window_offset,
-+		 dmap->length);
-+	return ret;
-+}
-+
-+/*
-+ * Free a range of memory.
-+ * Locking:
-+ * 1. Take inode->i_rwsem to prevent further read/write.
-+ * 2. Take fuse_inode->i_mmap_sem to block dax faults.
-+ * 3. Take fuse_inode->i_dmap_sem to protect the interval tree. It might not
-+ *    be strictly necessary as locks 1 and 2 seem sufficient. 
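-+ *
-+ * The inode lock is taken with trylock in fuse_dax_free_one_mapping()
-+ * below, so the reclaim worker cannot deadlock against a task that
-+ * holds the lock while waiting for a free range.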
-+ */
-+int fuse_dax_free_one_mapping(struct fuse_conn *fc, struct inode *inode,
-+			      u64 dmap_start)
-+{
-+	int ret;
-+	struct fuse_inode *fi = get_fuse_inode(inode);
-+
-+	/*
-+	 * If the process is blocked waiting for memory while holding the
-+	 * inode lock, we will deadlock. So continue to free the next range.
-+	 */
-+	if (!inode_trylock(inode))
-+		return -EAGAIN;
-+	down_write(&fi->i_mmap_sem);
-+	down_write(&fi->i_dmap_sem);
-+	ret = fuse_dax_free_one_mapping_locked(fc, inode, dmap_start);
-+	up_write(&fi->i_dmap_sem);
-+	up_write(&fi->i_mmap_sem);
-+	inode_unlock(inode);
-+	return ret;
-+}
-+
-+int fuse_dax_free_memory(struct fuse_conn *fc, unsigned long nr_to_free)
-+{
-+	struct fuse_dax_mapping *dmap, *pos, *temp;
-+	int ret, nr_freed = 0, nr_eagain = 0;
-+	u64 dmap_start = 0, window_offset = 0;
-+	struct inode *inode = NULL;
-+
-+	/* Pick the first busy range and free it for now */
-+	while (1) {
-+		if (nr_freed >= nr_to_free)
-+			break;
-+
-+		if (nr_eagain > 20) {
-+			queue_delayed_work(system_long_wq, &fc->dax_free_work,
-+					   msecs_to_jiffies(10));
-+			return 0;
-+		}
-+
-+		dmap = NULL;
-+		spin_lock(&fc->lock);
-+
-+		list_for_each_entry_safe(pos, temp, &fc->busy_ranges,
-+					 busy_list) {
-+			inode = igrab(pos->inode);
-+			/*
-+			 * This inode is going away. That will free
-+			 * up all the ranges anyway, continue to
-+			 * next range.
-+			 */
-+			if (!inode)
-+				continue;
-+			/*
-+			 * Take this element off the list and add it to the
-+			 * tail. If the inode lock can't be obtained, this
-+			 * helps with selecting a new element next time.
-+			 */
-+			dmap = pos;
-+			list_move_tail(&dmap->busy_list, &fc->busy_ranges);
-+			dmap_start = dmap->start;
-+			window_offset = dmap->window_offset;
-+			break;
-+		}
-+		spin_unlock(&fc->lock);
-+		if (!dmap)
-+			return 0;
-+
-+		ret = fuse_dax_free_one_mapping(fc, inode, dmap_start);
-+		iput(inode);
-+		if (ret && ret != -EAGAIN) {
-+			printk("%s(window_offset=0x%llx) failed. err=%d\n",
-+			       __func__, window_offset, ret);
-+			return ret;
-+		}
-+
-+		/* Could not get the inode lock. Try the next element */
-+		if (ret == -EAGAIN) {
-+			nr_eagain++;
-+			continue;
-+		}
-+		nr_freed++;
-+	}
-+	return 0;
-+}
-+
-+/* TODO: This probably should go in inode.c */
-+void fuse_dax_free_mem_worker(struct work_struct *work)
-+{
-+	int ret;
-+	struct fuse_conn *fc = container_of(work, struct fuse_conn,
-+					    dax_free_work.work);
-+	pr_debug("fuse: Worker to free memory called. nr_free_ranges=%lu"
-+		 " nr_busy_ranges=%lu\n", fc->nr_free_ranges,
-+		 fc->nr_busy_ranges);
-+	ret = fuse_dax_free_memory(fc, FUSE_DAX_RECLAIM_CHUNK);
-+	if (ret)
-+		pr_debug("fuse: fuse_dax_free_memory() failed with err=%d\n", ret);
- }
-diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
-index cec8b8e74..1149281ab 100644
---- a/fs/fuse/fuse_i.h
-+++ b/fs/fuse/fuse_i.h
-@@ -43,6 +43,20 @@
- /** Number of page pointers embedded in fuse_req */
- #define FUSE_REQ_INLINE_PAGES 1
-
-+/* Default memory range size, 2MB */
-+#define FUSE_DAX_MEM_RANGE_SZ	(2*1024*1024)
-+#define FUSE_DAX_MEM_RANGE_PAGES (FUSE_DAX_MEM_RANGE_SZ/PAGE_SIZE)
-+
-+/* Number of ranges reclaimer will try to free in one invocation */
-+#define FUSE_DAX_RECLAIM_CHUNK	(10)
-+
-+/*
-+ * Dax memory reclaim threshold in percentage of total ranges. When the
-+ * number of free ranges drops below this threshold, reclaim can trigger.
-+ * Default is 20%.
-+ */
-+#define FUSE_DAX_RECLAIM_THRESHOLD	(20)
-+
- /** List of active connections */
- extern struct list_head fuse_conn_list;
-
-@@ -53,12 +67,73 @@ extern struct mutex fuse_mutex;
- extern unsigned max_user_bgreq;
- extern unsigned max_user_congthresh;
-
-+/** Mount options */
-+struct fuse_mount_data {
-+	int fd;
-+	const char *tag; /* lifetime: .fill_super() data argument */
-+	unsigned rootmode;
-+	kuid_t user_id;
-+	kgid_t group_id;
-+	unsigned fd_present:1;
-+	unsigned tag_present:1;
-+	unsigned rootmode_present:1;
-+	unsigned user_id_present:1;
-+	unsigned group_id_present:1;
-+	unsigned default_permissions:1;
-+	unsigned allow_other:1;
-+	unsigned dax:1;
-+	unsigned destroy:1;
-+	unsigned max_read;
-+	unsigned blksize;
-+
-+	/* DAX device, may be NULL */
-+	struct dax_device *dax_dev;
-+
-+	/* fuse input queue operations */
-+	const struct fuse_iqueue_ops *fiq_ops;
-+
-+	/* device-specific state for fuse_iqueue */
-+	void *fiq_priv;
-+
-+	/* fuse_dev pointer to fill in, should contain NULL on entry */
-+	void **fudptr;
-+};
-+
- /* One forget request */
- struct fuse_forget_link {
- 	struct fuse_forget_one forget_one;
- 	struct fuse_forget_link *next;
- };
-
-+#define START(node) ((node)->start)
-+#define LAST(node) ((node)->end)
-+
-+/** Translation information for file offsets to DAX window offsets */
-+struct fuse_dax_mapping {
-+	/* Pointer to inode where this memory range is mapped */
-+	struct inode *inode;
-+
-+	/* Will connect in fc->free_ranges to keep track of free memory */
-+	struct list_head list;
-+
-+	/* For interval tree in file/inode */
-+	struct rb_node rb;
-+	/** Start position in file */
-+	__u64 start;
-+	/** End position in file */
-+	__u64 end;
-+	__u64 __subtree_last;
-+
-+	/* Will connect in fc->busy_ranges to keep track of busy memory */
-+	struct list_head busy_list;
-+
-+	/** Position in DAX window */
-+	u64 window_offset;
-+
-+	/** Length of mapping, in bytes */
-+	loff_t length;
-+};
-+
- /** FUSE inode */
- struct fuse_inode {
- 	/** Inode data */
-@@ -108,6 +183,22 @@ struct fuse_inode {
-
- 	/** Lock for serializing lookup and readdir for back compatibility*/
- 	struct mutex mutex;
-+
-+	/*
-+	 * Semaphore to protect modifications to dmap_tree
-+	 */
-+	struct rw_semaphore i_dmap_sem;
-+
-+	/**
-+	 * Can't take the inode lock in the fault path (leads to a circular
-+	 * dependency). So take this in the fuse dax fault path to make sure
-+	 * truncate and punch hole etc. can't make progress in parallel.
-+	 */
-+	struct rw_semaphore i_mmap_sem;
-+
-+	/** Sorted rb tree of struct fuse_dax_mapping elements */
-+	struct rb_root_cached dmap_tree;
-+	unsigned long nr_dmaps;
- };
-
- /** FUSE inode state bits */
-@@ -382,8 +473,44 @@ struct fuse_req {
-
- 	/** Request is stolen from fuse_file->reserved_req */
- 	struct file *stolen_file;
-+
-+	/** virtio-fs's physically contiguous buffer for in and out args */
-+	void *argbuf;
- };
-
-+struct fuse_iqueue;
-+
-+/**
-+ * Input queue callbacks
-+ *
-+ * Input queue signalling is device-specific. For example, the /dev/fuse file
-+ * uses fiq->waitq and fasync to wake processes that are waiting on queue
-+ * readiness. These callbacks allow other device types to respond to input
-+ * queue activity. 
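-+ *
-+ * Each callback is invoked with fiq->waitq.lock held and must release
-+ * it before returning, as the __releases() annotations below document.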
-+ */
-+struct fuse_iqueue_ops {
-+	/**
-+	 * Signal that a forget has been queued
-+	 */
-+	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
-+		__releases(fiq->waitq.lock);
-+
-+	/**
-+	 * Signal that an INTERRUPT request has been queued
-+	 */
-+	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
-+		__releases(fiq->waitq.lock);
-+
-+	/**
-+	 * Signal that a request has been queued
-+	 */
-+	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
-+		__releases(fiq->waitq.lock);
-+};
-+
-+/** /dev/fuse input queue operations */
-+extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
-+
- struct fuse_iqueue {
- 	/** Connection established */
- 	unsigned connected;
-@@ -409,6 +536,12 @@ struct fuse_iqueue {
-
- 	/** O_ASYNC requests */
- 	struct fasync_struct *fasync;
-+
-+	/** Device-specific callbacks */
-+	const struct fuse_iqueue_ops *ops;
-+
-+	/** Device-specific state */
-+	void *priv;
- };
-
- struct fuse_pqueue {
-@@ -675,6 +808,28 @@ struct fuse_conn {
-
- 	/** List of device instances belonging to this connection */
- 	struct list_head devices;
-+
-+	/** DAX device, non-NULL if DAX is supported */
-+	struct dax_device *dax_dev;
-+
-+	/* List of memory ranges which are busy */
-+	unsigned long nr_busy_ranges;
-+	struct list_head busy_ranges;
-+
-+	/* Worker to free up memory ranges */
-+	struct delayed_work dax_free_work;
-+
-+	/* Wait queue for a dax range to become free */
-+	wait_queue_head_t dax_range_waitq;
-+
-+	/*
-+	 * DAX Window Free Ranges. TODO: This might not be the best place to
-+	 * store this free list.
-+	 */
-+	unsigned long nr_free_ranges;
-+	struct list_head free_ranges;
-+
-+	unsigned long nr_ranges;
- };
-
- static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
-@@ -860,6 +1015,11 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
- void fuse_request_send_background_locked(struct fuse_conn *fc,
- 					 struct fuse_req *req);
-
-+/**
-+ * End a finished request
-+ */
-+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
-+
- /* Abort all requests */
- void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
- void fuse_wait_aborted(struct fuse_conn *fc);
-@@ -881,16 +1041,42 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
- /**
-  * Initialize fuse_conn
-  */
--void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
-+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
-+		    struct dax_device *dax_dev,
-+		    const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
-
- /**
-  * Release reference to fuse_conn
-  */
- void fuse_conn_put(struct fuse_conn *fc);
-
--struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
-+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
-+struct fuse_dev *fuse_dev_alloc(void);
-+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
- void fuse_dev_free(struct fuse_dev *fud);
-
-+/**
-+ * Parse a mount options string
-+ */
-+int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
-+		   struct user_namespace *user_ns);
-+
-+/**
-+ * Fill in superblock and initialize fuse connection
-+ * @sb: partially-initialized superblock to fill in
-+ * @mount_data: mount parameters
-+ */
-+int fuse_fill_super_common(struct super_block *sb,
-+			   struct fuse_mount_data *mount_data);
-+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
-+
-+/**
-+ * Disassociate fuse connection from superblock and kill the superblock
-+ *
-+ * Calls kill_anon_super(); do not use with bdev mounts. 
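-+ *
-+ * Exported so that virtio-fs, whose mounts are identified by a tag
-+ * rather than a block device, can reuse it as its ->kill_sb().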
-+ */ -+void fuse_kill_sb_anon(struct super_block *sb); -+ - /** - * Add connection to control filesystem - */ -@@ -992,4 +1178,16 @@ struct posix_acl; - struct posix_acl *fuse_get_acl(struct inode *inode, int type); - int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); - -+/** -+ * Return the number of bytes in an arguments list -+ */ -+unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args); -+ -+/** -+ * Get the next unique ID for a request -+ */ -+u64 fuse_get_unique(struct fuse_iqueue *fiq); -+void fuse_dax_free_mem_worker(struct work_struct *work); -+void fuse_removemapping(struct inode *inode); -+ - #endif /* _FS_FUSE_I_H */ -diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c -index db9e60b7e..dd16c7f6a 100644 ---- a/fs/fuse/inode.c -+++ b/fs/fuse/inode.c -@@ -22,6 +22,8 @@ - #include - #include - #include -+#include -+#include - - MODULE_AUTHOR("Miklos Szeredi "); - MODULE_DESCRIPTION("Filesystem in Userspace"); -@@ -59,21 +61,6 @@ MODULE_PARM_DESC(max_user_congthresh, - /** Congestion starts at 75% of maximum */ - #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) - --struct fuse_mount_data { -- int fd; -- unsigned rootmode; -- kuid_t user_id; -- kgid_t group_id; -- unsigned fd_present:1; -- unsigned rootmode_present:1; -- unsigned user_id_present:1; -- unsigned group_id_present:1; -- unsigned default_permissions:1; -- unsigned allow_other:1; -- unsigned max_read; -- unsigned blksize; --}; -- - struct fuse_forget_link *fuse_alloc_forget(void) - { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); -@@ -96,11 +83,14 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) - fi->writectr = 0; - fi->orig_ino = 0; - fi->state = 0; -+ fi->nr_dmaps = 0; - INIT_LIST_HEAD(&fi->write_files); - INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); - init_waitqueue_head(&fi->page_waitq); - mutex_init(&fi->mutex); -+ init_rwsem(&fi->i_mmap_sem); -+ init_rwsem(&fi->i_dmap_sem); - fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, inode); -@@ -133,6 +123,10 @@ static void fuse_evict_inode(struct inode *inode) - if (inode->i_sb->s_flags & SB_ACTIVE) { - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); -+ if (IS_DAX(inode)) { -+ fuse_removemapping(inode); -+ WARN_ON(fi->nr_dmaps); -+ } - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; - } -@@ -447,6 +441,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) - - enum { - OPT_FD, -+ OPT_TAG, - OPT_ROOTMODE, - OPT_USER_ID, - OPT_GROUP_ID, -@@ -454,11 +449,13 @@ enum { - OPT_ALLOW_OTHER, - OPT_MAX_READ, - OPT_BLKSIZE, -+ OPT_DAX, - OPT_ERR - }; - - static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, -+ {OPT_TAG, "tag=%s"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, -@@ -466,6 +463,7 @@ static const match_table_t tokens = { - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, -+ {OPT_DAX, "dax"}, - {OPT_ERR, NULL} - }; - -@@ -480,7 +478,7 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) - return err; - } - --static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, -+int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) - { - char *p; -@@ -505,6 +503,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - 
d->fd_present = 1; - break; - -+ case OPT_TAG: -+ d->tag = args[0].from; -+ d->tag_present = 1; -+ break; -+ - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; -@@ -552,17 +555,22 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - d->blksize = value; - break; - -+ case OPT_DAX: -+ d->dax = 1; -+ break; -+ - default: - return 0; - } - } - -- if (!d->fd_present || !d->rootmode_present || -- !d->user_id_present || !d->group_id_present) -+ if (!d->rootmode_present || !d->user_id_present || -+ !d->group_id_present) - return 0; - - return 1; - } -+EXPORT_SYMBOL_GPL(parse_fuse_opt); - - static int fuse_show_options(struct seq_file *m, struct dentry *root) - { -@@ -579,10 +587,14 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); -+ if (fc->dax_dev) -+ seq_printf(m, ",dax"); - return 0; - } - --static void fuse_iqueue_init(struct fuse_iqueue *fiq) -+static void fuse_iqueue_init(struct fuse_iqueue *fiq, -+ const struct fuse_iqueue_ops *ops, -+ void *priv) - { - memset(fiq, 0, sizeof(struct fuse_iqueue)); - init_waitqueue_head(&fiq->waitq); -@@ -590,6 +602,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) - INIT_LIST_HEAD(&fiq->interrupts); - fiq->forget_list_tail = &fiq->forget_list_head; - fiq->connected = 1; -+ fiq->ops = ops; -+ fiq->priv = priv; - } - - static void fuse_pqueue_init(struct fuse_pqueue *fpq) -@@ -601,7 +615,84 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) - fpq->connected = 1; - } - --void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) -+static void fuse_free_dax_mem_ranges(struct list_head *mem_list) -+{ -+ struct fuse_dax_mapping *range, *temp; -+ -+ /* Free All allocated elements */ -+ list_for_each_entry_safe(range, temp, mem_list, list) { -+ list_del(&range->list); -+ if (!list_empty(&range->busy_list)) -+ list_del(&range->busy_list); -+ kfree(range); -+ } -+} -+ -+#ifdef CONFIG_FS_DAX -+static int fuse_dax_mem_range_init(struct fuse_conn *fc, -+ struct dax_device *dax_dev) -+{ -+ long nr_pages, nr_ranges; -+ void *kaddr; -+ pfn_t pfn; -+ struct fuse_dax_mapping *range; -+ LIST_HEAD(mem_ranges); -+ phys_addr_t phys_addr; -+ int ret = 0, id; -+ size_t dax_size = -1; -+ unsigned long allocated_ranges = 0, i; -+ -+ id = dax_read_lock(); -+ nr_pages = dax_direct_access(dax_dev, 0, PHYS_PFN(dax_size), &kaddr, -+ &pfn); -+ dax_read_unlock(id); -+ if (nr_pages < 0) { -+ pr_debug("dax_direct_access() returned %ld\n", nr_pages); -+ return nr_pages; -+ } -+ -+ phys_addr = pfn_t_to_phys(pfn); -+ nr_ranges = nr_pages/FUSE_DAX_MEM_RANGE_PAGES; -+ printk("fuse_dax_mem_range_init(): dax mapped %ld pages. nr_ranges=%ld\n", nr_pages, nr_ranges); -+ -+ for (i = 0; i < nr_ranges; i++) { -+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); -+ if (!range) { -+ pr_debug("memory allocation for mem_range failed.\n"); -+ ret = -ENOMEM; -+ goto out_err; -+ } -+ /* TODO: This offset only works if virtio-fs driver is not -+ * having some memory hidden at the beginning. 
This needs -+ * better handling -+ */ -+ range->window_offset = i * FUSE_DAX_MEM_RANGE_SZ; -+ range->length = FUSE_DAX_MEM_RANGE_SZ; -+ list_add_tail(&range->list, &mem_ranges); -+ INIT_LIST_HEAD(&range->busy_list); -+ allocated_ranges++; -+ } -+ -+ list_replace_init(&mem_ranges, &fc->free_ranges); -+ fc->nr_free_ranges = allocated_ranges; -+ fc->nr_ranges = allocated_ranges; -+ return 0; -+out_err: -+ /* Free All allocated elements */ -+ fuse_free_dax_mem_ranges(&mem_ranges); -+ return ret; -+} -+#else /* !CONFIG_FS_DAX */ -+static inline int fuse_dax_mem_range_init(struct fuse_conn *fc, -+ struct dax_device *dax_dev) -+{ -+ return 0; -+} -+#endif /* CONFIG_FS_DAX */ -+ -+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, -+ struct dax_device *dax_dev, -+ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) - { - memset(fc, 0, sizeof(*fc)); - spin_lock_init(&fc->lock); -@@ -610,7 +701,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) - atomic_set(&fc->dev_count, 1); - init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); -- fuse_iqueue_init(&fc->iq); -+ init_waitqueue_head(&fc->dax_range_waitq); -+ fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); - INIT_LIST_HEAD(&fc->bg_queue); - INIT_LIST_HEAD(&fc->entry); - INIT_LIST_HEAD(&fc->devices); -@@ -625,7 +717,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) - fc->attr_version = 1; - get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); - fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); -+ fc->dax_dev = dax_dev; - fc->user_ns = get_user_ns(user_ns); -+ INIT_LIST_HEAD(&fc->free_ranges); -+ INIT_LIST_HEAD(&fc->busy_ranges); -+ INIT_DELAYED_WORK(&fc->dax_free_work, fuse_dax_free_mem_worker); - } - EXPORT_SYMBOL_GPL(fuse_conn_init); - -@@ -634,6 +730,9 @@ void fuse_conn_put(struct fuse_conn *fc) - if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); -+ flush_delayed_work(&fc->dax_free_work); -+ if (fc->dax_dev) -+ fuse_free_dax_mem_ranges(&fc->free_ranges); - put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); - fc->release(fc); -@@ -943,7 +1042,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) - wake_up_all(&fc->blocked_waitq); - } - --static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) -+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) - { - struct fuse_init_in *arg = &req->misc.init_in; - -@@ -972,6 +1071,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) - req->end = process_init_reply; - fuse_request_send_background(fc, req); - } -+EXPORT_SYMBOL_GPL(fuse_send_init); - - static void fuse_free_conn(struct fuse_conn *fc) - { -@@ -1019,24 +1119,38 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) - return 0; - } - --struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) --{ -+struct fuse_dev *fuse_dev_alloc(void) { - struct fuse_dev *fud; - - fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); -- if (fud) { -- fud->fc = fuse_conn_get(fc); -+ if (fud) - fuse_pqueue_init(&fud->pq); - -- spin_lock(&fc->lock); -- list_add_tail(&fud->entry, &fc->devices); -- spin_unlock(&fc->lock); -- } -- - return fud; - } - EXPORT_SYMBOL_GPL(fuse_dev_alloc); - -+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) { -+ fud->fc = fuse_conn_get(fc); -+ spin_lock(&fc->lock); -+ list_add_tail(&fud->entry, &fc->devices); -+ spin_unlock(&fc->lock); -+} 
-+EXPORT_SYMBOL_GPL(fuse_dev_install); -+ -+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) -+{ -+ struct fuse_dev *fud; -+ -+ fud = fuse_dev_alloc(); -+ if (!fud) -+ return NULL; -+ -+ fuse_dev_install(fud, fc); -+ return fud; -+} -+EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); -+ - void fuse_dev_free(struct fuse_dev *fud) - { - struct fuse_conn *fc = fud->fc; -@@ -1052,15 +1166,13 @@ void fuse_dev_free(struct fuse_dev *fud) - } - EXPORT_SYMBOL_GPL(fuse_dev_free); - --static int fuse_fill_super(struct super_block *sb, void *data, int silent) -+int fuse_fill_super_common(struct super_block *sb, -+ struct fuse_mount_data *mount_data) - { - struct fuse_dev *fud; - struct fuse_conn *fc; - struct inode *root; -- struct fuse_mount_data d; -- struct file *file; - struct dentry *root_dentry; -- struct fuse_req *init_req; - int err; - int is_bdev = sb->s_bdev != NULL; - -@@ -1070,13 +1182,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - -- if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) -- goto err; -- - if (is_bdev) { - #ifdef CONFIG_BLOCK - err = -EINVAL; -- if (!sb_set_blocksize(sb, d.blksize)) -+ if (!sb_set_blocksize(sb, mount_data->blksize)) - goto err; - #endif - } else { -@@ -1093,19 +1202,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - if (sb->s_user_ns != &init_user_ns) - sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - -- file = fget(d.fd); -- err = -EINVAL; -- if (!file) -- goto err; -- -- /* -- * Require mount to happen from the same user namespace which -- * opened /dev/fuse to prevent potential attacks. -- */ -- if (file->f_op != &fuse_dev_operations || -- file->f_cred->user_ns != sb->s_user_ns) -- goto err_fput; -- - /* - * If we are not in the initial user namespace posix - * acls must be translated. 
-@@ -1116,12 +1212,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) -- goto err_fput; -+ goto err; - -- fuse_conn_init(fc, sb->s_user_ns); -+ fuse_conn_init(fc, sb->s_user_ns, mount_data->dax_dev, -+ mount_data->fiq_ops, mount_data->fiq_priv); - fc->release = fuse_free_conn; - -- fud = fuse_dev_alloc(fc); -+ if (mount_data->dax_dev) { -+ err = fuse_dax_mem_range_init(fc, mount_data->dax_dev); -+ if (err) { -+ pr_debug("fuse_dax_mem_range_init() returned %d\n", err); -+ goto err_free_ranges; -+ } -+ } -+ -+ fud = fuse_dev_alloc_install(fc); - if (!fud) - goto err_put_conn; - -@@ -1136,17 +1241,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - fc->dont_mask = 1; - sb->s_flags |= SB_POSIXACL; - -- fc->default_permissions = d.default_permissions; -- fc->allow_other = d.allow_other; -- fc->user_id = d.user_id; -- fc->group_id = d.group_id; -- fc->max_read = max_t(unsigned, 4096, d.max_read); -+ fc->default_permissions = mount_data->default_permissions; -+ fc->allow_other = mount_data->allow_other; -+ fc->user_id = mount_data->user_id; -+ fc->group_id = mount_data->group_id; -+ fc->max_read = max_t(unsigned, 4096, mount_data->max_read); - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; - - err = -ENOMEM; -- root = fuse_get_root_inode(sb, d.rootmode); -+ root = fuse_get_root_inode(sb, mount_data->rootmode); - sb->s_d_op = &fuse_root_dentry_operations; - root_dentry = d_make_root(root); - if (!root_dentry) -@@ -1154,20 +1259,15 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - /* Root dentry doesn't have .d_revalidate */ - sb->s_d_op = &fuse_dentry_operations; - -- init_req = fuse_request_alloc(0); -- if (!init_req) -- goto err_put_root; -- __set_bit(FR_BACKGROUND, &init_req->flags); -- -- if (is_bdev) { -+ if (mount_data->destroy) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) -- goto err_free_init_req; -+ goto err_put_root; - } - - mutex_lock(&fuse_mutex); - err = -EINVAL; -- if (file->private_data) -+ if (*mount_data->fudptr) - goto err_unlock; - - err = fuse_ctl_add_conn(fc); -@@ -1176,35 +1276,82 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) - - list_add_tail(&fc->entry, &fuse_conn_list); - sb->s_root = root_dentry; -- file->private_data = fud; -+ *mount_data->fudptr = fud; - mutex_unlock(&fuse_mutex); -- /* -- * atomic_dec_and_test() in fput() provides the necessary -- * memory barrier for file->private_data to be visible on all -- * CPUs after this -- */ -- fput(file); -- -- fuse_send_init(fc, init_req); -- - return 0; - - err_unlock: - mutex_unlock(&fuse_mutex); -- err_free_init_req: -- fuse_request_free(init_req); - err_put_root: - dput(root_dentry); - err_dev_free: - fuse_dev_free(fud); -+ err_free_ranges: -+ if (mount_data->dax_dev) -+ fuse_free_dax_mem_ranges(&fc->free_ranges); - err_put_conn: - fuse_conn_put(fc); - sb->s_fs_info = NULL; -- err_fput: -- fput(file); - err: - return err; - } -+EXPORT_SYMBOL_GPL(fuse_fill_super_common); -+ -+static int fuse_fill_super(struct super_block *sb, void *data, int silent) -+{ -+ struct fuse_mount_data d; -+ struct file *file; -+ int is_bdev = sb->s_bdev != NULL; -+ int err; -+ struct fuse_req *init_req; -+ -+ err = -EINVAL; -+ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) -+ goto err; -+ if (!d.fd_present || d.tag_present) -+ goto err; -+ -+ file = fget(d.fd); -+ if (!file) -+ goto err; -+ -+ /* -+ * Require mount 
to happen from the same user namespace which -+ * opened /dev/fuse to prevent potential attacks. -+ */ -+ if ((file->f_op != &fuse_dev_operations) || -+ (file->f_cred->user_ns != sb->s_user_ns)) -+ goto err_fput; -+ -+ init_req = fuse_request_alloc(0); -+ if (!init_req) -+ goto err_fput; -+ __set_bit(FR_BACKGROUND, &init_req->flags); -+ -+ d.dax_dev = NULL; -+ d.fiq_ops = &fuse_dev_fiq_ops; -+ d.fiq_priv = NULL; -+ d.fudptr = &file->private_data; -+ d.destroy = is_bdev; -+ err = fuse_fill_super_common(sb, &d); -+ if (err < 0) -+ goto err_free_init_req; -+ /* -+ * atomic_dec_and_test() in fput() provides the necessary -+ * memory barrier for file->private_data to be visible on all -+ * CPUs after this -+ */ -+ fput(file); -+ fuse_send_init(get_fuse_conn_super(sb), init_req); -+ return 0; -+ -+err_free_init_req: -+ fuse_request_free(init_req); -+err_fput: -+ fput(file); -+err: -+ return err; -+} - - static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, -@@ -1229,11 +1376,12 @@ static void fuse_sb_destroy(struct super_block *sb) - } - } - --static void fuse_kill_sb_anon(struct super_block *sb) -+void fuse_kill_sb_anon(struct super_block *sb) - { - fuse_sb_destroy(sb); - kill_anon_super(sb); - } -+EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); - - static struct file_system_type fuse_fs_type = { - .owner = THIS_MODULE, -diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c -new file mode 100644 -index 000000000..a0a2cd1ce ---- /dev/null -+++ b/fs/fuse/virtio_fs.c -@@ -0,0 +1,1121 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * virtio-fs: Virtio Filesystem -+ * Copyright (C) 2018 Red Hat, Inc. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "fuse_i.h" -+ -+/* List of virtio-fs device instances and a lock for the list */ -+static DEFINE_MUTEX(virtio_fs_mutex); -+static LIST_HEAD(virtio_fs_instances); -+ -+enum { -+ VQ_HIPRIO, -+ VQ_REQUEST -+}; -+ -+/* Per-virtqueue state */ -+struct virtio_fs_vq { -+ struct virtqueue *vq; /* protected by fpq->lock */ -+ struct work_struct done_work; -+ struct list_head queued_reqs; -+ struct delayed_work dispatch_work; -+ struct fuse_dev *fud; -+ char name[24]; -+} ____cacheline_aligned_in_smp; -+ -+/* State needed for devm_memremap_pages(). This API is called on the -+ * underlying pci_dev instead of struct virtio_fs (layering violation). Since -+ * the memremap release function only gets called when the pci_dev is released, -+ * keep the associated state separate from struct virtio_fs (it has a different -+ * lifecycle from pci_dev). 
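-+ *
-+ * The percpu_ref and completion below let the release path wait until
-+ * all references to the DAX window pages have been dropped before the
-+ * mapping goes away.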
-+ */ -+struct virtio_fs_memremap_info { -+ struct dev_pagemap pgmap; -+ struct percpu_ref ref; -+ struct completion completion; -+}; -+ -+/* A virtio-fs device instance */ -+struct virtio_fs { -+ struct list_head list; /* on virtio_fs_instances */ -+ char *tag; -+ struct virtio_fs_vq *vqs; -+ unsigned nvqs; /* number of virtqueues */ -+ unsigned num_queues; /* number of request queues */ -+ struct dax_device *dax_dev; -+ -+ /* DAX memory window where file contents are mapped */ -+ void *window_kaddr; -+ phys_addr_t window_phys_addr; -+ size_t window_len; -+}; -+ -+struct virtio_fs_forget { -+ struct fuse_in_header ih; -+ struct fuse_forget_in arg; -+ /* This request can be temporarily queued on virt queue */ -+ struct list_head list; -+}; -+ -+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) -+{ -+ struct virtio_fs *fs = vq->vdev->priv; -+ -+ return &fs->vqs[vq->index]; -+} -+ -+static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) -+{ -+ return &vq_to_fsvq(vq)->fud->pq; -+} -+ -+/* Add a new instance to the list or return -EEXIST if tag name exists*/ -+static int virtio_fs_add_instance(struct virtio_fs *fs) -+{ -+ struct virtio_fs *fs2; -+ bool duplicate = false; -+ -+ mutex_lock(&virtio_fs_mutex); -+ -+ list_for_each_entry(fs2, &virtio_fs_instances, list) { -+ if (strcmp(fs->tag, fs2->tag) == 0) -+ duplicate = true; -+ } -+ -+ if (!duplicate) -+ list_add_tail(&fs->list, &virtio_fs_instances); -+ -+ mutex_unlock(&virtio_fs_mutex); -+ -+ if (duplicate) -+ return -EEXIST; -+ return 0; -+} -+ -+/* Return the virtio_fs with a given tag, or NULL */ -+static struct virtio_fs *virtio_fs_find_instance(const char *tag) -+{ -+ struct virtio_fs *fs; -+ -+ mutex_lock(&virtio_fs_mutex); -+ -+ list_for_each_entry(fs, &virtio_fs_instances, list) { -+ if (strcmp(fs->tag, tag) == 0) -+ goto found; -+ } -+ -+ fs = NULL; /* not found */ -+ -+found: -+ mutex_unlock(&virtio_fs_mutex); -+ -+ return fs; -+} -+ -+static void virtio_fs_free_devs(struct virtio_fs *fs) -+{ -+ unsigned int i; -+ -+ /* TODO lock */ -+ -+ for (i = 0; i < fs->nvqs; i++) { -+ struct virtio_fs_vq *fsvq = &fs->vqs[i]; -+ -+ if (!fsvq->fud) -+ continue; -+ -+ flush_work(&fsvq->done_work); -+ flush_delayed_work(&fsvq->dispatch_work); -+ -+ fuse_dev_free(fsvq->fud); /* TODO need to quiesce/end_requests/decrement dev_count */ -+ fsvq->fud = NULL; -+ } -+} -+ -+/* Read filesystem name from virtio config into fs->tag (must kfree()). 
*/ -+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) -+{ -+ char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; -+ char *end; -+ size_t len; -+ -+ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), -+ &tag_buf, sizeof(tag_buf)); -+ end = memchr(tag_buf, '\0', sizeof(tag_buf)); -+ if (end == tag_buf) -+ return -EINVAL; /* empty tag */ -+ if (!end) -+ end = &tag_buf[sizeof(tag_buf)]; -+ -+ len = end - tag_buf; -+ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); -+ if (!fs->tag) -+ return -ENOMEM; -+ memcpy(fs->tag, tag_buf, len); -+ fs->tag[len] = '\0'; -+ return 0; -+} -+ -+/* Work function for hiprio completion */ -+static void virtio_fs_hiprio_done_work(struct work_struct *work) -+{ -+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, -+ done_work); -+ struct fuse_pqueue *fpq = &fsvq->fud->pq; -+ struct virtqueue *vq = fsvq->vq; -+ -+ /* Free completed FUSE_FORGET requests */ -+ spin_lock(&fpq->lock); -+ do { -+ unsigned len; -+ void *req; -+ -+ virtqueue_disable_cb(vq); -+ -+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) -+ kfree(req); -+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); -+ spin_unlock(&fpq->lock); -+} -+ -+static void virtio_fs_dummy_dispatch_work(struct work_struct *work) -+{ -+ return; -+} -+ -+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) -+{ -+ struct virtio_fs_forget *forget; -+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, -+ dispatch_work.work); -+ struct fuse_pqueue *fpq = &fsvq->fud->pq; -+ struct virtqueue *vq = fsvq->vq; -+ struct scatterlist sg; -+ struct scatterlist *sgs[] = {&sg}; -+ bool notify; -+ int ret; -+ -+ pr_debug("worker virtio_fs_hiprio_dispatch_work() called.\n"); -+ while(1) { -+ spin_lock(&fpq->lock); -+ forget = list_first_entry_or_null(&fsvq->queued_reqs, -+ struct virtio_fs_forget, list); -+ if (!forget) { -+ spin_unlock(&fpq->lock); -+ return; -+ } -+ -+ list_del(&forget->list); -+ sg_init_one(&sg, forget, sizeof(*forget)); -+ -+ /* Enqueue the request */ -+ dev_dbg(&vq->vdev->dev, "%s\n", __func__); -+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); -+ if (ret < 0) { -+ if (ret == -ENOMEM || ret == -ENOSPC) { -+ pr_debug("virtio-fs: Could not queue FORGET:" -+ " err=%d. Will try later\n", ret); -+ list_add_tail(&forget->list, -+ &fsvq->queued_reqs); -+ schedule_delayed_work(&fsvq->dispatch_work, -+ msecs_to_jiffies(1)); -+ } else { -+ pr_debug("virtio-fs: Could not queue FORGET:" -+ " err=%d. 
Dropping it.\n", ret); -+ kfree(forget); -+ } -+ spin_unlock(&fpq->lock); -+ return; -+ } -+ -+ notify = virtqueue_kick_prepare(vq); -+ spin_unlock(&fpq->lock); -+ -+ if (notify) -+ virtqueue_notify(vq); -+ pr_debug("worker virtio_fs_hiprio_dispatch_work() dispatched one forget request.\n"); -+ } -+} -+ -+/* Allocate and copy args into req->argbuf */ -+static int copy_args_to_argbuf(struct fuse_req *req) -+{ -+ unsigned offset = 0; -+ unsigned num_in; -+ unsigned num_out; -+ unsigned len; -+ unsigned i; -+ -+ num_in = req->in.numargs - req->in.argpages; -+ num_out = req->out.numargs - req->out.argpages; -+ len = fuse_len_args(num_in, (struct fuse_arg *)req->in.args) + -+ fuse_len_args(num_out, req->out.args); -+ -+ req->argbuf = kmalloc(len, GFP_ATOMIC); -+ if (!req->argbuf) -+ return -ENOMEM; -+ -+ for (i = 0; i < num_in; i++) { -+ memcpy(req->argbuf + offset, -+ req->in.args[i].value, -+ req->in.args[i].size); -+ offset += req->in.args[i].size; -+ } -+ -+ return 0; -+} -+ -+/* Copy args out of and free req->argbuf */ -+static void copy_args_from_argbuf(struct fuse_req *req) -+{ -+ unsigned remaining; -+ unsigned offset; -+ unsigned num_in; -+ unsigned num_out; -+ unsigned i; -+ -+ remaining = req->out.h.len - sizeof(req->out.h); -+ num_in = req->in.numargs - req->in.argpages; -+ num_out = req->out.numargs - req->out.argpages; -+ offset = fuse_len_args(num_in, (struct fuse_arg *)req->in.args); -+ -+ for (i = 0; i < num_out; i++) { -+ unsigned argsize = req->out.args[i].size; -+ -+ if (req->out.argvar && -+ i == req->out.numargs - 1 && -+ argsize > remaining) { -+ argsize = remaining; -+ } -+ -+ memcpy(req->out.args[i].value, req->argbuf + offset, argsize); -+ offset += argsize; -+ -+ if (i != req->out.numargs - 1) -+ remaining -= argsize; -+ } -+ -+ /* Store the actual size of the variable-length arg */ -+ if (req->out.argvar) -+ req->out.args[req->out.numargs - 1].size = remaining; -+ -+ kfree(req->argbuf); -+ req->argbuf = NULL; -+} -+ -+/* Work function for request completion */ -+static void virtio_fs_requests_done_work(struct work_struct *work) -+{ -+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, -+ done_work); -+ struct fuse_pqueue *fpq = &fsvq->fud->pq; -+ struct fuse_conn *fc = fsvq->fud->fc; -+ struct virtqueue *vq = fsvq->vq; -+ struct fuse_req *req; -+ struct fuse_req *next; -+ LIST_HEAD(reqs); -+ -+ /* Collect completed requests off the virtqueue */ -+ spin_lock(&fpq->lock); -+ do { -+ unsigned len; -+ -+ virtqueue_disable_cb(vq); -+ -+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) -+ list_move_tail(&req->list, &reqs); -+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); -+ spin_unlock(&fpq->lock); -+ -+ /* End requests */ -+ list_for_each_entry_safe(req, next, &reqs, list) { -+ /* TODO check unique */ -+ /* TODO fuse_len_args(out) against oh.len */ -+ -+ copy_args_from_argbuf(req); -+ -+ /* TODO zeroing? 
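(i.e. whether any unused tail of the argbuf should be cleared before the args are copied out; this is a reading of the TODO, not confirmed by the patch)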
*/ -+ -+ spin_lock(&fpq->lock); -+ clear_bit(FR_SENT, &req->flags); -+ list_del_init(&req->list); -+ spin_unlock(&fpq->lock); -+ -+ fuse_request_end(fc, req); -+ } -+} -+ -+/* Virtqueue interrupt handler */ -+static void virtio_fs_vq_done(struct virtqueue *vq) -+{ -+ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); -+ -+ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); -+ -+ schedule_work(&fsvq->done_work); -+} -+ -+/* Initialize virtqueues */ -+static int virtio_fs_setup_vqs(struct virtio_device *vdev, -+ struct virtio_fs *fs) -+{ -+ struct virtqueue **vqs; -+ vq_callback_t **callbacks; -+ const char **names; -+ unsigned i; -+ int ret; -+ -+ virtio_cread(vdev, struct virtio_fs_config, num_queues, -+ &fs->num_queues); -+ if (fs->num_queues == 0) -+ return -EINVAL; -+ -+ fs->nvqs = 1 + fs->num_queues; -+ -+ fs->vqs = devm_kcalloc(&vdev->dev, fs->nvqs, -+ sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); -+ if (!fs->vqs) -+ return -ENOMEM; -+ -+ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); -+ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), -+ GFP_KERNEL); -+ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); -+ if (!vqs || !callbacks || !names) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; -+ snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), -+ "hiprio"); -+ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; -+ INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); -+ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); -+ INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, -+ virtio_fs_hiprio_dispatch_work); -+ -+ /* Initialize the requests virtqueues */ -+ for (i = VQ_REQUEST; i < fs->nvqs; i++) { -+ INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); -+ INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, -+ virtio_fs_dummy_dispatch_work); -+ INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); -+ snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), -+ "requests.%u", i - VQ_REQUEST); -+ callbacks[i] = virtio_fs_vq_done; -+ names[i] = fs->vqs[i].name; -+ } -+ -+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); -+ if (ret < 0) -+ goto out; -+ -+ for (i = 0; i < fs->nvqs; i++) -+ fs->vqs[i].vq = vqs[i]; -+ -+out: -+ kfree(names); -+ kfree(callbacks); -+ kfree(vqs); -+ return ret; -+} -+ -+/* Free virtqueues (device must already be reset) */ -+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, -+ struct virtio_fs *fs) -+{ -+ vdev->config->del_vqs(vdev); -+} -+ -+/* Map a window offset to a page frame number. The window offset will have -+ * been produced by .iomap_begin(), which maps a file offset to a window -+ * offset. -+ */ -+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, -+ long nr_pages, void **kaddr, pfn_t *pfn) -+{ -+ struct virtio_fs *fs = dax_get_private(dax_dev); -+ phys_addr_t offset = PFN_PHYS(pgoff); -+ size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; -+ -+ pr_debug("virtio_fs_direct_access(): called. nr_pages=%ld max_nr_pages=%zu\n", nr_pages, max_nr_pages); -+ -+ if (kaddr) -+ *kaddr = fs->window_kaddr + offset; -+ if (pfn) -+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, -+ PFN_DEV | PFN_MAP); -+ return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; -+} -+ -+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, -+ pgoff_t pgoff, void *addr, -+ size_t bytes, struct iov_iter *i) -+{ -+ return copy_from_iter(addr, bytes, i); -+} -+ -+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, -+ pgoff_t pgoff, void *addr, -+ size_t bytes, struct iov_iter *i) -+{ -+ return copy_to_iter(addr, bytes, i); -+} -+ -+static const struct dax_operations virtio_fs_dax_ops = { -+ .direct_access = virtio_fs_direct_access, -+ .copy_from_iter = virtio_fs_copy_from_iter, -+ .copy_to_iter = virtio_fs_copy_to_iter, -+}; -+ -+static void virtio_fs_percpu_release(struct percpu_ref *ref) -+{ -+ struct virtio_fs_memremap_info *mi = -+ container_of(ref, struct virtio_fs_memremap_info, ref); -+ -+ complete(&mi->completion); -+} -+ -+static void virtio_fs_percpu_exit(void *data) -+{ -+ struct virtio_fs_memremap_info *mi = data; -+ -+ wait_for_completion(&mi->completion); -+ percpu_ref_exit(&mi->ref); -+} -+ -+static void virtio_fs_percpu_kill(struct percpu_ref *ref) -+{ -+ percpu_ref_kill(ref); -+} -+ -+static void virtio_fs_cleanup_dax(void *data) -+{ -+ struct virtio_fs *fs = data; -+ -+ kill_dax(fs->dax_dev); -+ put_dax(fs->dax_dev); -+} -+ -+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) -+{ -+ struct virtio_shm_region cache_reg; -+ struct virtio_fs_memremap_info *mi; -+ struct dev_pagemap *pgmap; -+ bool have_cache; -+ int ret; -+ -+ if (!IS_ENABLED(CONFIG_DAX_DRIVER)) -+ return 0; -+ -+ /* Get cache region */ -+ have_cache = virtio_get_shm_region(vdev, -+ &cache_reg, -+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE); -+ if (!have_cache) { -+ dev_err(&vdev->dev, "%s: No cache capability\n", __func__); -+ return -ENXIO; -+ } else { -+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", -+ cache_reg.len, cache_reg.addr); -+ } -+ -+ mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL); -+ if (!mi) -+ return -ENOMEM; -+ -+ init_completion(&mi->completion); -+ ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0, -+ GFP_KERNEL); -+ if (ret < 0) { -+ dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n", -+ __func__, ret); -+ return ret; -+ } -+ -+ ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi); -+ if (ret < 0) { -+ percpu_ref_exit(&mi->ref); -+ return ret; -+ } -+ -+ pgmap = &mi->pgmap; -+ pgmap->altmap_valid = false; -+ pgmap->ref = &mi->ref; -+ pgmap->kill = virtio_fs_percpu_kill; -+ pgmap->type = MEMORY_DEVICE_FS_DAX; -+ -+ /* Ideally we would directly use the PCI BAR resource but -+ * devm_memremap_pages() wants its own copy in pgmap. So -+ * initialize a struct resource from scratch (only the start -+ * and end fields will be used). 
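-+	 * devm_memremap_pages() also allocates struct pages for the
-+	 * window, which the fs/dax.c fault paths require.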
-+ */ -+ pgmap->res = (struct resource){ -+ .name = "virtio-fs dax window", -+ .start = (phys_addr_t) cache_reg.addr, -+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len, -+ }; -+ -+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); -+ if (IS_ERR(fs->window_kaddr)) -+ return PTR_ERR(fs->window_kaddr); -+ -+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr; -+ fs->window_len = (phys_addr_t) cache_reg.len; -+ -+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx" -+ " len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, -+ cache_reg.len); -+ -+ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops); -+ if (!fs->dax_dev) -+ return -ENOMEM; -+ -+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs); -+} -+ -+static int virtio_fs_probe(struct virtio_device *vdev) -+{ -+ struct virtio_fs *fs; -+ int ret; -+ -+ fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL); -+ if (!fs) -+ return -ENOMEM; -+ vdev->priv = fs; -+ -+ ret = virtio_fs_read_tag(vdev, fs); -+ if (ret < 0) -+ goto out; -+ -+ ret = virtio_fs_setup_vqs(vdev, fs); -+ if (ret < 0) -+ goto out; -+ -+ /* TODO vq affinity */ -+ /* TODO populate notifications vq */ -+ -+ ret = virtio_fs_setup_dax(vdev, fs); -+ if (ret < 0) -+ goto out_vqs; -+ -+ /* Bring the device online in case the filesystem is mounted and -+ * requests need to be sent before we return. -+ */ -+ virtio_device_ready(vdev); -+ -+ ret = virtio_fs_add_instance(fs); -+ if (ret < 0) -+ goto out_vqs; -+ -+ return 0; -+ -+out_vqs: -+ vdev->config->reset(vdev); -+ virtio_fs_cleanup_vqs(vdev, fs); -+out: -+ vdev->priv = NULL; -+ return ret; -+} -+ -+static void virtio_fs_remove(struct virtio_device *vdev) -+{ -+ struct virtio_fs *fs = vdev->priv; -+ -+ virtio_fs_free_devs(fs); -+ -+ vdev->config->reset(vdev); -+ virtio_fs_cleanup_vqs(vdev, fs); -+ -+ mutex_lock(&virtio_fs_mutex); -+ list_del(&fs->list); -+ mutex_unlock(&virtio_fs_mutex); -+ -+ vdev->priv = NULL; -+} -+ -+#ifdef CONFIG_PM -+static int virtio_fs_freeze(struct virtio_device *vdev) -+{ -+ return 0; /* TODO */ -+} -+ -+static int virtio_fs_restore(struct virtio_device *vdev) -+{ -+ return 0; /* TODO */ -+} -+#endif /* CONFIG_PM */ -+ -+const static struct virtio_device_id id_table[] = { -+ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, -+ {}, -+}; -+ -+const static unsigned int feature_table[] = {}; -+ -+static struct virtio_driver virtio_fs_driver = { -+ .driver.name = KBUILD_MODNAME, -+ .driver.owner = THIS_MODULE, -+ .id_table = id_table, -+ .feature_table = feature_table, -+ .feature_table_size = ARRAY_SIZE(feature_table), -+ /* TODO validate config_get != NULL */ -+ .probe = virtio_fs_probe, -+ .remove = virtio_fs_remove, -+#ifdef CONFIG_PM_SLEEP -+ .freeze = virtio_fs_freeze, -+ .restore = virtio_fs_restore, -+#endif -+}; -+ -+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) -+__releases(fiq->waitq.lock) -+{ -+ struct fuse_forget_link *link; -+ struct virtio_fs_forget *forget; -+ struct fuse_pqueue *fpq; -+ struct scatterlist sg; -+ struct scatterlist *sgs[] = {&sg}; -+ struct virtio_fs *fs; -+ struct virtqueue *vq; -+ struct virtio_fs_vq *fsvq; -+ bool notify; -+ u64 unique; -+ int ret; -+ -+ BUG_ON(!fiq->forget_list_head.next); -+ link = fiq->forget_list_head.next; -+ BUG_ON(link->next); -+ fiq->forget_list_head.next = NULL; -+ fiq->forget_list_tail = &fiq->forget_list_head; -+ -+ unique = fuse_get_unique(fiq); -+ -+ fs = fiq->priv; -+ fsvq = &fs->vqs[VQ_HIPRIO]; -+ spin_unlock(&fiq->waitq.lock); -+ -+ /* Allocate a buffer for the request */ -+ 
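/* GFP_ATOMIC keeps this path non-blocking; if the allocation fails
-+	 * the FORGET is simply dropped (see the TODO below) and the
-+	 * server misses one nlookup decrement for this nodeid.
-+	 */
-+	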
forget = kmalloc(sizeof(*forget), GFP_ATOMIC); -+ if (!forget) { -+ pr_err("virtio-fs: dropped FORGET: kmalloc failed\n"); -+ goto out; /* TODO avoid dropping it? */ -+ } -+ -+ forget->ih = (struct fuse_in_header){ -+ .opcode = FUSE_FORGET, -+ .nodeid = link->forget_one.nodeid, -+ .unique = unique, -+ .len = sizeof(*forget), -+ }; -+ forget->arg = (struct fuse_forget_in){ -+ .nlookup = link->forget_one.nlookup, -+ }; -+ -+ sg_init_one(&sg, forget, sizeof(*forget)); -+ -+ /* Enqueue the request */ -+ vq = fsvq->vq; -+ dev_dbg(&vq->vdev->dev, "%s\n", __func__); -+ fpq = vq_to_fpq(vq); -+ spin_lock(&fpq->lock); -+ -+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); -+ if (ret < 0) { -+ if (ret == -ENOMEM || ret == -ENOSPC) { -+ pr_debug("virtio-fs: Could not queue FORGET: err=%d." -+ " Will try later.\n", ret); -+ list_add_tail(&forget->list, &fsvq->queued_reqs); -+ schedule_delayed_work(&fsvq->dispatch_work, -+ msecs_to_jiffies(1)); -+ } else { -+ pr_debug("virtio-fs: Could not queue FORGET: err=%d." -+ " Dropping it.\n", ret); -+ kfree(forget); -+ } -+ spin_unlock(&fpq->lock); -+ goto out; -+ } -+ -+ notify = virtqueue_kick_prepare(vq); -+ -+ spin_unlock(&fpq->lock); -+ -+ if (notify) -+ virtqueue_notify(vq); -+out: -+ kfree(link); -+} -+ -+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) -+__releases(fiq->waitq.lock) -+{ -+ /* TODO */ -+ spin_unlock(&fiq->waitq.lock); -+} -+ -+/* Return the number of scatter-gather list elements required */ -+static unsigned sg_count_fuse_req(struct fuse_req *req) -+{ -+ unsigned total_sgs = 1 /* fuse_in_header */; -+ -+ if (req->in.numargs - req->in.argpages) -+ total_sgs += 1; -+ -+ if (req->in.argpages) -+ total_sgs += req->num_pages; -+ -+ if (!test_bit(FR_ISREPLY, &req->flags)) -+ return total_sgs; -+ -+ total_sgs += 1 /* fuse_out_header */; -+ -+ if (req->out.numargs - req->out.argpages) -+ total_sgs += 1; -+ -+ if (req->out.argpages) -+ total_sgs += req->num_pages; -+ -+ return total_sgs; -+} -+ -+/* Add pages to scatter-gather list and return number of elements used */ -+static unsigned sg_init_fuse_pages(struct scatterlist *sg, -+ struct page **pages, -+ struct fuse_page_desc *page_descs, -+ unsigned num_pages) -+{ -+ unsigned i; -+ -+ for (i = 0; i < num_pages; i++) { -+ sg_init_table(&sg[i], 1); -+ sg_set_page(&sg[i], pages[i], -+ page_descs[i].length, -+ page_descs[i].offset); -+ } -+ -+ return i; -+} -+ -+/* Add args to scatter-gather list and return number of elements used */ -+static unsigned sg_init_fuse_args(struct scatterlist *sg, -+ struct fuse_req *req, -+ struct fuse_arg *args, -+ unsigned numargs, -+ bool argpages, -+ void *argbuf, -+ unsigned *len_used) -+{ -+ unsigned total_sgs = 0; -+ unsigned len; -+ -+ len = fuse_len_args(numargs - argpages, args); -+ if (len) -+ sg_init_one(&sg[total_sgs++], argbuf, len); -+ -+ if (argpages) -+ total_sgs += sg_init_fuse_pages(&sg[total_sgs], -+ req->pages, -+ req->page_descs, -+ req->num_pages); -+ -+ if (len_used) -+ *len_used = len; -+ -+ return total_sgs; -+} -+ -+/* Add a request to a virtqueue and kick the device */ -+static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req) -+{ -+ struct scatterlist *stack_sgs[6 /* requests need at least 4 elements */]; -+ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; -+ struct scatterlist **sgs = stack_sgs; -+ struct scatterlist *sg = stack_sg; -+ struct fuse_pqueue *fpq; -+ unsigned argbuf_used = 0; -+ unsigned out_sgs = 0; -+ unsigned in_sgs = 0; -+ unsigned total_sgs; -+ unsigned i; -+ int 
-+/* Add a request to a virtqueue and kick the device */
-+static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
-+{
-+	struct scatterlist *stack_sgs[6 /* requests need at least 4 elements */];
-+	struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
-+	struct scatterlist **sgs = stack_sgs;
-+	struct scatterlist *sg = stack_sg;
-+	struct fuse_pqueue *fpq;
-+	unsigned argbuf_used = 0;
-+	unsigned out_sgs = 0;
-+	unsigned in_sgs = 0;
-+	unsigned total_sgs;
-+	unsigned i;
-+	int ret;
-+	bool notify;
-+
-+	/* Does the sglist fit on the stack? */
-+	total_sgs = sg_count_fuse_req(req);
-+	if (total_sgs > ARRAY_SIZE(stack_sgs)) {
-+		sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
-+		sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
-+		if (!sgs || !sg) {
-+			ret = -ENOMEM;
-+			goto out;
-+		}
-+	}
-+
-+	/* Use a bounce buffer since stack args cannot be mapped */
-+	ret = copy_args_to_argbuf(req);
-+	if (ret < 0)
-+		goto out;
-+
-+	/* Request elements */
-+	sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
-+	out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
-+				     (struct fuse_arg *)req->in.args,
-+				     req->in.numargs, req->in.argpages,
-+				     req->argbuf, &argbuf_used);
-+
-+	/* Reply elements */
-+	if (test_bit(FR_ISREPLY, &req->flags)) {
-+		sg_init_one(&sg[out_sgs + in_sgs++],
-+			    &req->out.h, sizeof(req->out.h));
-+		in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
-+					    req->out.args, req->out.numargs,
-+					    req->out.argpages,
-+					    req->argbuf + argbuf_used, NULL);
-+	}
-+
-+	BUG_ON(out_sgs + in_sgs != total_sgs);
-+
-+	for (i = 0; i < total_sgs; i++)
-+		sgs[i] = &sg[i];
-+
-+	fpq = vq_to_fpq(vq);
-+	spin_lock(&fpq->lock);
-+
-+	ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
-+	if (ret < 0) {
-+		/* TODO handle full virtqueue */
-+		spin_unlock(&fpq->lock);
-+		goto out;
-+	}
-+
-+	notify = virtqueue_kick_prepare(vq);
-+
-+	spin_unlock(&fpq->lock);
-+
-+	if (notify)
-+		virtqueue_notify(vq);
-+
-+out:
-+	if (ret < 0 && req->argbuf) {
-+		kfree(req->argbuf);
-+		req->argbuf = NULL;
-+	}
-+	if (sgs != stack_sgs) {
-+		kfree(sgs);
-+		kfree(sg);
-+	}
-+
-+	return ret;
-+}
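[Editor's note: for readers unfamiliar with the virtio ring API used above: virtqueue_add_sgs() takes an array of scatterlist pointers where the first out_sgs entries are device-readable and the next in_sgs entries are device-writable, and the opaque cookie is handed back by virtqueue_get_buf() on completion. A stripped-down sketch, with names that are illustrative only and not part of the patch:]

static int example_enqueue(struct virtqueue *vq, void *cmd, size_t cmd_len,
			   void *resp, size_t resp_len, void *cookie)
{
	struct scatterlist sg_cmd, sg_resp;
	struct scatterlist *sgs[2];

	sg_init_one(&sg_cmd, cmd, cmd_len);	/* device-readable */
	sg_init_one(&sg_resp, resp, resp_len);	/* device-writable */
	sgs[0] = &sg_cmd;
	sgs[1] = &sg_resp;

	/* 1 out sg followed by 1 in sg; cookie comes back via virtqueue_get_buf() */
	return virtqueue_add_sgs(vq, sgs, 1, 1, cookie, GFP_ATOMIC);
}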
-+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
-+__releases(fiq->waitq.lock)
-+{
-+	unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */
-+	struct virtio_fs *fs;
-+	struct fuse_conn *fc;
-+	struct fuse_req *req;
-+	struct fuse_pqueue *fpq;
-+	int ret;
-+
-+	BUG_ON(list_empty(&fiq->pending));
-+	req = list_last_entry(&fiq->pending, struct fuse_req, list);
-+	clear_bit(FR_PENDING, &req->flags);
-+	list_del_init(&req->list);
-+	BUG_ON(!list_empty(&fiq->pending));
-+	spin_unlock(&fiq->waitq.lock);
-+
-+	fs = fiq->priv;
-+	fc = fs->vqs[queue_id].fud->fc;
-+
-+	dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
-+		"%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
-+		__func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid,
-+		req->in.h.len, fuse_len_args(req->out.numargs, req->out.args));
-+
-+	/* TODO put request onto fpq->io list? */
-+
-+	fpq = &fs->vqs[queue_id].fud->pq;
-+	spin_lock(&fpq->lock);
-+	if (!fpq->connected) {
-+		spin_unlock(&fpq->lock);
-+		req->out.h.error = -ENODEV;
-+		printk(KERN_ERR "%s: disconnected\n", __func__);
-+		fuse_request_end(fc, req);
-+		return;
-+	}
-+	list_add_tail(&req->list, &fpq->processing);
-+	spin_unlock(&fpq->lock);
-+	set_bit(FR_SENT, &req->flags);
-+	/* matches barrier in request_wait_answer() */
-+	smp_mb__after_atomic();
-+	/* TODO check for FR_INTERRUPTED? */
-+
-+	ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req);
-+	if (ret < 0) {
-+		req->out.h.error = ret;
-+		printk(KERN_ERR "%s: virtio_fs_enqueue_req failed %d\n",
-+		       __func__, ret);
-+		fuse_request_end(fc, req);
-+		return;
-+	}
-+}
-+
-+static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
-+	.wake_forget_and_unlock		= virtio_fs_wake_forget_and_unlock,
-+	.wake_interrupt_and_unlock	= virtio_fs_wake_interrupt_and_unlock,
-+	.wake_pending_and_unlock	= virtio_fs_wake_pending_and_unlock,
-+};
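[Editor's note: these callbacks replace the /dev/fuse wakeup paths. The fuse core queues a request under fiq->waitq.lock and then invokes the matching hook, which must release that lock before returning (hence the __releases annotations above). Roughly, and assuming the fiq->ops indirection this series introduces:]

/* Sketch of the caller side in the fuse core; illustrative only */
static void example_queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
{
	spin_lock(&fiq->waitq.lock);
	req->in.h.unique = fuse_get_unique(fiq);
	list_add_tail(&req->list, &fiq->pending);
	/* The callback drops fiq->waitq.lock before returning */
	fiq->ops->wake_pending_and_unlock(fiq);
}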
-+static int virtio_fs_fill_super(struct super_block *sb, void *data,
-+				int silent)
-+{
-+	struct fuse_mount_data d;
-+	struct fuse_conn *fc;
-+	struct virtio_fs *fs;
-+	int is_bdev = sb->s_bdev != NULL;
-+	unsigned int i;
-+	int err;
-+	struct fuse_req *init_req;
-+
-+	err = -EINVAL;
-+	if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
-+		goto err;
-+	if (d.fd_present) {
-+		printk(KERN_ERR "virtio-fs: fd option cannot be used\n");
-+		goto err;
-+	}
-+	if (!d.tag_present) {
-+		printk(KERN_ERR "virtio-fs: missing tag option\n");
-+		goto err;
-+	}
-+
-+	fs = virtio_fs_find_instance(d.tag);
-+	if (!fs) {
-+		printk(KERN_ERR "virtio-fs: tag not found\n");
-+		err = -ENOENT;
-+		goto err;
-+	}
-+
-+	/* TODO lock */
-+	if (fs->vqs[VQ_REQUEST].fud) {
-+		printk(KERN_ERR "virtio-fs: device already in use\n");
-+		err = -EBUSY;
-+		goto err;
-+	}
-+
-+	err = -ENOMEM;
-+	/* Allocate fuse_dev for hiprio and notification queues */
-+	for (i = 0; i < VQ_REQUEST; i++) {
-+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
-+
-+		fsvq->fud = fuse_dev_alloc();
-+		if (!fsvq->fud)
-+			goto err_free_fuse_devs;
-+	}
-+
-+	init_req = fuse_request_alloc(0);
-+	if (!init_req)
-+		goto err_free_fuse_devs;
-+	__set_bit(FR_BACKGROUND, &init_req->flags);
-+
-+	d.dax_dev = d.dax ? fs->dax_dev : NULL;
-+	d.fiq_ops = &virtio_fs_fiq_ops;
-+	d.fiq_priv = fs;
-+	d.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
-+	d.destroy = true; /* Send destroy request on unmount */
-+	err = fuse_fill_super_common(sb, &d);
-+	if (err < 0)
-+		goto err_free_init_req;
-+
-+	fc = fs->vqs[VQ_REQUEST].fud->fc;
-+
-+	/* TODO take fuse_mutex around this loop? */
-+	for (i = 0; i < fs->nvqs; i++) {
-+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
-+
-+		if (i == VQ_REQUEST)
-+			continue; /* already initialized */
-+		fuse_dev_install(fsvq->fud, fc);
-+		atomic_inc(&fc->dev_count);
-+	}
-+
-+	fuse_send_init(fc, init_req);
-+	return 0;
-+
-+err_free_init_req:
-+	fuse_request_free(init_req);
-+err_free_fuse_devs:
-+	for (i = 0; i < fs->nvqs; i++) {
-+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
-+
-+		if (fsvq->fud)
-+			fuse_dev_free(fsvq->fud);
-+	}
-+err:
-+	return err;
-+}
-+
-+static void virtio_kill_sb(struct super_block *sb)
-+{
-+	struct fuse_conn *fc = get_fuse_conn_super(sb);
-+
-+	fuse_kill_sb_anon(sb);
-+	if (fc) {
-+		struct virtio_fs *vfs = fc->iq.priv;
-+
-+		virtio_fs_free_devs(vfs);
-+	}
-+}
-+
-+static struct dentry *virtio_fs_mount(struct file_system_type *fs_type,
-+				      int flags, const char *dev_name,
-+				      void *raw_data)
-+{
-+	return mount_nodev(fs_type, flags, raw_data, virtio_fs_fill_super);
-+}
-+
-+static struct file_system_type virtio_fs_type = {
-+	.owner		= THIS_MODULE,
-+	.name		= KBUILD_MODNAME,
-+	.mount		= virtio_fs_mount,
-+	.kill_sb	= virtio_kill_sb,
-+};
-+
-+static int __init virtio_fs_init(void)
-+{
-+	int ret;
-+
-+	ret = register_virtio_driver(&virtio_fs_driver);
-+	if (ret < 0)
-+		return ret;
-+
-+	ret = register_filesystem(&virtio_fs_type);
-+	if (ret < 0) {
-+		unregister_virtio_driver(&virtio_fs_driver);
-+		return ret;
-+	}
-+
-+	return 0;
-+}
-+module_init(virtio_fs_init);
-+
-+static void __exit virtio_fs_exit(void)
-+{
-+	unregister_filesystem(&virtio_fs_type);
-+	unregister_virtio_driver(&virtio_fs_driver);
-+}
-+module_exit(virtio_fs_exit);
-+
-+MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
-+MODULE_DESCRIPTION("Virtio Filesystem");
-+MODULE_LICENSE("GPL");
-+MODULE_ALIAS_FS(KBUILD_MODNAME);
-+MODULE_DEVICE_TABLE(virtio, id_table);
-diff --git a/fs/splice.c b/fs/splice.c
-index 485e409ef..c74f18098 100644
---- a/fs/splice.c
-+++ b/fs/splice.c
-@@ -365,7 +365,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
- 	return res;
- }
- 
--static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
-+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
- 				 struct pipe_inode_info *pipe, size_t len,
- 				 unsigned int flags)
- {
-@@ -429,6 +429,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
- 	iov_iter_advance(&to, copied);	/* truncates and discards */
- 	return res;
- }
-+EXPORT_SYMBOL(default_file_splice_read);
- 
- /*
-  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
-diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
-index b69786694..c97f8a0cb 100644
---- a/fs/xfs/xfs_aops.c
-+++ b/fs/xfs/xfs_aops.c
-@@ -953,7 +953,7 @@ xfs_dax_writepages(
- 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- 	return dax_writeback_mapping_range(mapping,
--			xfs_find_bdev_for_inode(mapping->host), wbc);
-+			xfs_find_bdev_for_inode(mapping->host), NULL, wbc);
- }
- 
- STATIC int
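[Editor's note: the XFS hunk above keeps passing the bdev and a NULL dax_dev; the point of the widened signature is that a filesystem with no block device can do the opposite. A sketch of the virtio-fs-style caller, assuming the fc->dax_dev field added elsewhere in this series:]

static int example_fuse_dax_writepages(struct address_space *mapping,
				       struct writeback_control *wbc)
{
	struct fuse_conn *fc = get_fuse_conn(mapping->host);

	/* No bdev to pass; hand over the dax_device directly */
	return dax_writeback_mapping_range(mapping, NULL, fc->dax_dev, wbc);
}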
-diff --git a/include/linux/dax.h b/include/linux/dax.h
-index 450b28db9..a8461841f 100644
---- a/include/linux/dax.h
-+++ b/include/linux/dax.h
-@@ -85,7 +85,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
- 
- struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
- int dax_writeback_mapping_range(struct address_space *mapping,
--		struct block_device *bdev, struct writeback_control *wbc);
-+		struct block_device *bdev, struct dax_device *dax_dev,
-+		struct writeback_control *wbc);
- 
- struct page *dax_layout_busy_page(struct address_space *mapping);
- bool dax_lock_mapping_entry(struct page *page);
-@@ -117,7 +118,8 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping)
- }
- 
- static inline int dax_writeback_mapping_range(struct address_space *mapping,
--		struct block_device *bdev, struct writeback_control *wbc)
-+		struct block_device *bdev, struct dax_device *dax_dev,
-+		struct writeback_control *wbc)
- {
- 	return -EOPNOTSUPP;
- }
-diff --git a/include/linux/fs.h b/include/linux/fs.h
-index d4e1b43a5..374122b5b 100644
---- a/include/linux/fs.h
-+++ b/include/linux/fs.h
-@@ -3000,6 +3000,8 @@ extern void block_sync_page(struct page *page);
- /* fs/splice.c */
- extern ssize_t generic_file_splice_read(struct file *, loff_t *,
- 		struct pipe_inode_info *, size_t, unsigned int);
-+extern ssize_t default_file_splice_read(struct file *, loff_t *,
-+		struct pipe_inode_info *, size_t, unsigned int);
- extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
- 		struct file *, loff_t *, size_t, unsigned int);
- extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
-diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
-index 32baf8e26..8f85d1d8a 100644
---- a/include/linux/virtio_config.h
-+++ b/include/linux/virtio_config.h
-@@ -10,6 +10,11 @@
- 
- struct irq_affinity;
- 
-+struct virtio_shm_region {
-+	u64 addr;
-+	u64 len;
-+};
-+
- /**
-  * virtio_config_ops - operations for configuring a virtio device
-  * @get: read the value of a configuration field
-@@ -60,6 +65,7 @@ struct irq_affinity;
-  *	the caller can then copy.
-  * @set_vq_affinity: set the affinity for a virtqueue.
-  * @get_vq_affinity: get the affinity for a virtqueue (optional).
-+ * @get_shm_region: get a shared memory region based on the index.
-  */
- typedef void vq_callback_t(struct virtqueue *);
- struct virtio_config_ops {
-@@ -83,6 +89,8 @@ struct virtio_config_ops {
- 			const struct cpumask *cpu_mask);
- 	const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev,
- 			int index);
-+	bool (*get_shm_region)(struct virtio_device *vdev,
-+			struct virtio_shm_region *region, u8 id);
- };
- 
- /* If driver didn't advertise the feature, it will never appear. */
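[Editor's note: get_shm_region is a transport hook; drivers are expected to go through the virtio_get_shm_region() wrapper added in the next hunk. A usage sketch, locating the DAX cache region of a virtio-fs device (VIRTIO_FS_SHMCAP_ID_CACHE is defined later in this patch; error handling elided):]

static int example_find_cache_region(struct virtio_device *vdev)
{
	struct virtio_shm_region cache_reg;

	if (!virtio_get_shm_region(vdev, &cache_reg, VIRTIO_FS_SHMCAP_ID_CACHE))
		return -ENXIO;	/* transport exposes no such region */

	dev_dbg(&vdev->dev, "shm region: addr 0x%llx len 0x%llx\n",
		cache_reg.addr, cache_reg.len);
	return 0;
}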
-@@ -245,6 +253,15 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
- 	return 0;
- }
- 
-+static inline
-+bool virtio_get_shm_region(struct virtio_device *vdev,
-+			   struct virtio_shm_region *region, u8 id)
-+{
-+	if (!vdev->config->get_shm_region)
-+		return false;
-+	return vdev->config->get_shm_region(vdev, region, id);
-+}
-+
- static inline bool virtio_is_little_endian(struct virtio_device *vdev)
- {
- 	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
-diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
-index 2170e58a2..2f35b3791 100644
---- a/include/uapi/linux/fuse.h
-+++ b/include/uapi/linux/fuse.h
-@@ -383,6 +383,8 @@ enum fuse_opcode {
- 	FUSE_READDIRPLUS	= 44,
- 	FUSE_RENAME2		= 45,
- 	FUSE_LSEEK		= 46,
-+	FUSE_SETUPMAPPING	= 48,
-+	FUSE_REMOVEMAPPING	= 49,
- 
- 	/* CUSE specific operations */
- 	CUSE_INIT		= 4096,
-@@ -794,4 +796,36 @@ struct fuse_lseek_out {
- 	uint64_t	offset;
- };
- 
-+#define FUSE_SETUPMAPPING_ENTRIES 8
-+#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
-+#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
-+struct fuse_setupmapping_in {
-+	/* An already open handle */
-+	uint64_t	fh;
-+	/* Offset into the file to start the mapping */
-+	uint64_t	foffset;
-+	/* Length of mapping required */
-+	uint64_t	len;
-+	/* Flags, FUSE_SETUPMAPPING_FLAG_* */
-+	uint64_t	flags;
-+	/* Offset in Memory Window */
-+	uint64_t	moffset;
-+};
-+
-+struct fuse_setupmapping_out {
-+	/* Offsets into the cache of mappings */
-+	uint64_t	coffset[FUSE_SETUPMAPPING_ENTRIES];
-+	/* Lengths of each mapping */
-+	uint64_t	len[FUSE_SETUPMAPPING_ENTRIES];
-+};
-+
-+struct fuse_removemapping_in {
-+	/* An already open handle */
-+	uint64_t	fh;
-+	/* Offset into the dax window to start the unmapping */
-+	uint64_t	moffset;
-+	/* Length of mapping to remove */
-+	uint64_t	len;
-+};
-+
- #endif /* _LINUX_FUSE_H */
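[Editor's note: to make the new message concrete, a minimal sketch of a client filling fuse_setupmapping_in to map the first 2 MiB of an open file read/write at the start of the DAX window. fh would come from an earlier FUSE_OPEN; the helper name is illustrative only:]

static void example_fill_setupmapping(struct fuse_setupmapping_in *in,
				      uint64_t fh)
{
	*in = (struct fuse_setupmapping_in){
		.fh	 = fh,
		.foffset = 0,			/* file offset */
		.len	 = 2 * 1024 * 1024,	/* 2 MiB mapping */
		.flags	 = FUSE_SETUPMAPPING_FLAG_READ |
			   FUSE_SETUPMAPPING_FLAG_WRITE,
		.moffset = 0,			/* DAX window offset */
	};
}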
-diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
-new file mode 100644
-index 000000000..d4bb54956
---- /dev/null
-+++ b/include/uapi/linux/virtio_fs.h
-@@ -0,0 +1,44 @@
-+#ifndef _UAPI_LINUX_VIRTIO_FS_H
-+#define _UAPI_LINUX_VIRTIO_FS_H
-+/* This header is BSD licensed so anyone can use the definitions to implement
-+ * compatible drivers/servers.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ * 1. Redistributions of source code must retain the above copyright
-+ *    notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ *    notice, this list of conditions and the following disclaimer in the
-+ *    documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of IBM nor the names of its contributors
-+ *    may be used to endorse or promote products derived from this software
-+ *    without specific prior written permission.
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
-+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-+ * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-+ * SUCH DAMAGE. */
-+#include <linux/types.h>
-+#include <linux/virtio_ids.h>
-+#include <linux/virtio_config.h>
-+#include <linux/virtio_types.h>
-+
-+struct virtio_fs_config {
-+	/* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */
-+	__u8 tag[36];
-+
-+	/* Number of request queues */
-+	__u32 num_queues;
-+} __attribute__((packed));
-+
-+/* For the id field in virtio_pci_shm_cap */
-+#define VIRTIO_FS_SHMCAP_ID_CACHE 0
-+
-+#endif /* _UAPI_LINUX_VIRTIO_FS_H */
-diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
-index 6d5c3b2d4..884b0e273 100644
---- a/include/uapi/linux/virtio_ids.h
-+++ b/include/uapi/linux/virtio_ids.h
-@@ -43,5 +43,6 @@
- #define VIRTIO_ID_INPUT        18 /* virtio input */
- #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
- #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
-+#define VIRTIO_ID_FS           26 /* virtio filesystem */
- 
- #endif /* _LINUX_VIRTIO_IDS_H */
-diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h
-index c4b09689a..0650f91be 100644
---- a/include/uapi/linux/virtio_mmio.h
-+++ b/include/uapi/linux/virtio_mmio.h
-@@ -122,6 +122,17 @@
- #define VIRTIO_MMIO_QUEUE_USED_LOW	0x0a0
- #define VIRTIO_MMIO_QUEUE_USED_HIGH	0x0a4
- 
-+/* Shared memory region id */
-+#define VIRTIO_MMIO_SHM_SEL		0x0ac
-+
-+/* Shared memory region length, 64 bits in two halves */
-+#define VIRTIO_MMIO_SHM_LEN_LOW		0x0b0
-+#define VIRTIO_MMIO_SHM_LEN_HIGH	0x0b4
-+
-+/* Shared memory region base address, 64 bits in two halves */
-+#define VIRTIO_MMIO_SHM_BASE_LOW	0x0b8
-+#define VIRTIO_MMIO_SHM_BASE_HIGH	0x0bc
-+
- /* Configuration atomicity value */
- #define VIRTIO_MMIO_CONFIG_GENERATION	0x0fc
- 
-diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
-index 90007a1ab..31841a60a 100644
---- a/include/uapi/linux/virtio_pci.h
-+++ b/include/uapi/linux/virtio_pci.h
-@@ -113,6 +113,8 @@
- #define VIRTIO_PCI_CAP_DEVICE_CFG	4
- /* PCI configuration access */
- #define VIRTIO_PCI_CAP_PCI_CFG		5
-+/* Additional shared memory capability */
-+#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8
- 
- /* This is the PCI capability header: */
- struct virtio_pci_cap {
-@@ -163,6 +165,14 @@ struct virtio_pci_cfg_cap {
- 	__u8 pci_cfg_data[4]; /* Data for BAR access. */
- };
- 
-+/* Fields in VIRTIO_PCI_CAP_SHARED_MEMORY_CFG */
-+struct virtio_pci_shm_cap {
-+	struct virtio_pci_cap cap;
-+	__le32 offset_hi;	/* Most sig 32 bits of offset */
-+	__le32 length_hi;	/* Most sig 32 bits of length */
-+	__u8 id;		/* To distinguish shm chunks */
-+};
-+
- /* Macro versions of offsets for the Old Timers! */
- #define VIRTIO_PCI_CAP_VNDR		0
- #define VIRTIO_PCI_CAP_NEXT		1
--- 
-2.19.2
-
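[Editor's note: the 64-bit region offset and length are split between the generic virtio_pci_cap fields and the _hi halves above, so a PCI transport's get_shm_region() would reassemble them roughly like this (sketch; capability search and BAR mapping elided, helper name illustrative only):]

static void example_read_shm_cap(const struct virtio_pci_shm_cap *shm,
				 u64 *offset, u64 *len)
{
	/* Low halves live in the embedded virtio_pci_cap */
	*offset = ((u64)le32_to_cpu(shm->offset_hi) << 32) |
		  le32_to_cpu(shm->cap.offset);
	*len	= ((u64)le32_to_cpu(shm->length_hi) << 32) |
		  le32_to_cpu(shm->cap.length);
}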