diff --git a/kernel/configs/x86_64_kata_kvm_4.19.x b/kernel/configs/x86_64_kata_kvm_4.19.x index 02dde38246..bf61ec9705 100644 --- a/kernel/configs/x86_64_kata_kvm_4.19.x +++ b/kernel/configs/x86_64_kata_kvm_4.19.x @@ -1,13 +1,13 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.19.24 Kernel Configuration +# Linux/x86 4.19.28 Kernel Configuration # # -# Compiler: gcc (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609 +# Compiler: gcc (Ubuntu 7.4.0-1ubuntu1~16.04~ppa1) 7.4.0 # CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=50400 +CONFIG_GCC_VERSION=70400 CONFIG_CLANG_VERSION=0 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_EXTABLE_SORT=y @@ -2597,7 +2597,9 @@ CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set CONFIG_AUTOFS4_FS=y CONFIG_AUTOFS_FS=y -# CONFIG_FUSE_FS is not set +CONFIG_FUSE_FS=y +# CONFIG_CUSE is not set +CONFIG_VIRTIO_FS=y # CONFIG_OVERLAY_FS is not set # diff --git a/kernel/kata_config_version b/kernel/kata_config_version index 7facc89938..81b5c5d06c 100644 --- a/kernel/kata_config_version +++ b/kernel/kata_config_version @@ -1 +1 @@ -36 +37 diff --git a/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch b/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch new file mode 100644 index 0000000000..0d6d631e80 --- /dev/null +++ b/kernel/patches/4.19.x/0001-fuse-add-skeleton-virtio_fs.ko-module.patch @@ -0,0 +1,4604 @@ +From e480fb43fda5d90a6277e969ac74b9a5a60c3f05 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Tue, 12 Jun 2018 09:41:17 +0100 +Subject: [PATCH] fuse: add skeleton virtio_fs.ko module + +Add a basic file system module for virtio-fs. + +Signed-off-by: Stefan Hajnoczi + +fuse: add probe/remove virtio driver + +Add basic probe/remove functionality for the new virtio-fs device. + +Signed-off-by: Stefan Hajnoczi + +fuse: extract fuse_fill_super_common() + +fuse_fill_super() includes code to process the fd= option and link the +struct fuse_dev to the fd's struct file. In virtio-fs there is no file +descriptor because /dev/fuse is not used. + +This patch extracts fuse_fill_super_common() so that both classic fuse +and virtio-fs can share the code to initialize a mount. + +parse_fuse_opt() is also extracted so that the fuse_fill_super_common() +caller has access to the mount options. This allows classic fuse to +handle the fd= option outside fuse_fill_super_common(). + +Signed-off-by: Stefan Hajnoczi + +virtio_fs: get mount working + +Provide definitions of ->mount and ->kill_sb. This is still WIP. + +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_end_request() + +virtio-fs will need to complete requests from outside fs/fuse/dev.c. +Make the symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_len_args() + +virtio-fs will need to query the length of fuse_arg lists. Make the +symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: Export fuse_send_init_request() + +This will be used by virtio-fs to send init request to fuse server after +initialization of virt queues. + +Signed-off-by: Vivek Goyal + +fuse: add fuse_iqueue_ops callbacks + +The /dev/fuse device uses fiq->waitq and fasync to signal that requests +are available. These mechanisms do not apply to virtio-fs. This patch +introduces callbacks so alternative behavior can be used. 
+ +Note that queue_interrupt() changes along these lines: + + spin_lock(&fiq->waitq.lock); + wake_up_locked(&fiq->waitq); ++ kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->waitq.lock); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + +Since queue_request() and queue_forget() also call kill_fasync() inside +the spinlock this should be safe. + +Signed-off-by: Stefan Hajnoczi + +fuse: Separate fuse device allocation and installation in fuse_conn + +As of now fuse_dev_alloc() both allocates a fuse device and installs it +in the fuse_conn list. fuse_dev_alloc() can fail if fuse_device allocation +fails. + +virtio-fs needs to initialize multiple fuse devices (one per virtio +queue). It initializes one fuse device as part of the call to +fuse_fill_super_common() and the rest of the devices are allocated and +installed after that. + +But we can't afford to fail after calling fuse_fill_super_common() as +we don't have a way to undo all the actions done by fuse_fill_super_common(). +So to avoid failures after the call to fuse_fill_super_common(), +pre-allocate all fuse devices early and install them into the fuse connection +later. + +This patch provides two separate helpers for fuse device allocation and +fuse device installation in fuse_conn. + +Signed-off-by: Vivek Goyal + +fuse: process request queues + +Send normal requests to the device and handle completions. + +This is enough to get mount and basic I/O working. The hiprio and +notification queues still need to be implemented for full FUSE +functionality. + +Signed-off-by: Vivek Goyal +Signed-off-by: Stefan Hajnoczi + +fuse: export fuse_get_unique() + +virtio-fs will need unique IDs for FORGET requests from outside +fs/fuse/dev.c. Make the symbol visible. + +Signed-off-by: Stefan Hajnoczi + +fuse: implement FUSE_FORGET for virtio-fs + +Send single FUSE_FORGET requests on the hiprio queue. In the future it +may be possible to do FUSE_BATCH_FORGET but that is tricky since +virtio-fs gets called synchronously when forgets are queued. + +Signed-off-by: Stefan Hajnoczi + +virtio_fs: Set up dax_device + +Set up a dax device. + +Signed-off-by: Stefan Hajnoczi + +dax: remove block device dependencies + +Although struct dax_device itself is not tied to a block device, some +DAX code assumes there is a block device. Make block devices optional +by allowing bdev to be NULL in commonly used DAX APIs. + +When there is no block device: + * Skip the partition offset calculation in bdev_dax_pgoff() + * Skip the blkdev_issue_zeroout() optimization + +Note that more block device assumptions remain but I haven't reached those +code paths yet. + +Signed-off-by: Stefan Hajnoczi + +dax: Pass dax_dev to dax_writeback_mapping_range() + +Right now dax_writeback_mapping_range() is passed a bdev and the dax_dev +is looked up from that bdev name. + +virtio-fs does not have a bdev. So pass in dax_dev also to +dax_writeback_mapping_range(). If dax_dev is passed in, bdev is not +used; otherwise dax_dev is looked up using bdev. + +Signed-off-by: Vivek Goyal + +fuse: add fuse_conn->dax_dev field + +A struct dax_device instance is a prerequisite for the DAX filesystem +APIs. Let virtio_fs associate a dax_device with a fuse_conn. Classic +FUSE and CUSE set the pointer to NULL, disabling DAX. + +Signed-off-by: Stefan Hajnoczi + +virtio: Add get_shm_region method + +Virtio defines 'shared memory regions' that provide a continuously +shared region between the host and guest. + +Provide a method to find a particular region on a device. + +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio: Implement get_shm_region for PCI transport + +On PCI the shm regions are found using capability entries; +find a region by searching for the capability. + +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio: Implement get_shm_region for MMIO transport + +On MMIO a new set of registers is defined for finding SHM +regions. Add their definitions and use them to find the region. + +Signed-off-by: Sebastien Boeuf + +fuse: map virtio_fs DAX window + +Use the shm capability to find the cache entry and map it. + +The DAX window is accessed by the fs/dax.c infrastructure and must have +struct pages (at least on x86). Use devm_memremap_pages() to map the +DAX window PCI BAR and allocate struct page. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Sebastien Boeuf +Signed-off-by: Dr. David Alan Gilbert + +virtio-fs: Make dax optional + +Add a 'dax' option and only enable dax when it's on. + +Also show "dax" in the mount options if the filesystem was mounted with dax +enabled. + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal + +Limit number of pages returned by direct_access() + +Truncate the number of pages mapped by direct_access() so that it remains within the window +size. A user might request mapping pages beyond the window size. + +Signed-off-by: Vivek Goyal + +fuse: Introduce fuse_dax_mapping + +Introduce fuse_dax_mapping. This type will be used to keep track of +per-inode dax mappings. + +Signed-off-by: Vivek Goyal + +Create a list of free memory ranges + +Divide the dax memory range into fixed size ranges (2MB for now) and put +them in a list. This will track free ranges. Once an inode requires a +free range, we will take one from here and put it in the interval tree +of ranges assigned to the inode. + +Signed-off-by: Vivek Goyal + +fuse: simplify fuse_fill_super_common() calling + +Add more fields to "struct fuse_mount_data" so that fewer parameters +have to be passed to fuse_fill_super_common(). + +Signed-off-by: Miklos Szeredi + +fuse: Introduce setupmapping/removemapping commands + +Introduce two new fuse commands to set up/remove memory mappings. + +Signed-off-by: Vivek Goyal + +Introduce interval tree basic data structures + +We want to use an interval tree to keep track of per-inode dax mappings. +Introduce the basic data structures. + +Signed-off-by: Vivek Goyal + +fuse: Implement basic DAX read/write support commands + +This patch implements basic DAX support. mmap() is not implemented +yet and will come in later patches. This patch implements +read/write. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal + +fuse: Maintain a list of busy elements + +This list will be used for selecting a fuse_dax_mapping to free when the number of +free mappings drops below a threshold. + +Signed-off-by: Vivek Goyal + +Do fallocate() to grow file before mapping for file growing writes + +How should file growing writes be handled? For now, this patch does fallocate() to +grow the file and then maps it using dax. We need to figure out what's the best +way to handle it. + +This patch does the fallocate() and setup mapping operations in +fuse_dax_write_iter(), instead of iomap_begin(). I don't have access to the file +pointer needed to send a message to the fuse daemon in iomap_begin(). + +Dave Chinner has expressed concerns with this approach as it is not +atomic. If the guest crashes after fallocate() but before the data was written, +the user will think that the filesystem lost its data. So this is still an +outstanding issue. 
+ +Signed-off-by: Vivek Goyal + +fuse: add DAX mmap support + +Add DAX mmap() support. + +Signed-off-by: Stefan Hajnoczi + +fuse: delete dentry if timeout is zero + +Don't hold onto a dentry in the lru list if it needs to be re-looked up anyway at the next +access. + +A more advanced version of this patch would periodically flush out dentries +from the lru which have gone stale. + +Signed-off-by: Miklos Szeredi + +fuse: Define dax address space operations + +This is done along the lines of ext4 and xfs. I primarily wanted the ->writepages +hook at this time so that I could call into dax_writeback_mapping_range(). +This in turn will decide which pfns need to be written back and call +dax_flush() on those. + +Signed-off-by: Vivek Goyal + +fuse, dax: Take ->i_mmap_sem lock during dax page fault + +We need some kind of locking mechanism here. Normal file systems like +ext4 and xfs seem to take their own semaphore to protect against +truncate while a fault is going on. + +We have an additional requirement to protect against fuse dax memory range +reclaim. When a range has been selected for reclaim, we need to make sure +no other read/write/fault can try to access that memory range while +reclaim is in progress. Once reclaim is complete, the lock will be released +and read/write/fault will trigger allocation of a fresh dax range. + +Taking inode_lock() is not an option in the fault path as lockdep complains +about circular dependencies. So define a new fuse_inode->i_mmap_sem. + +Signed-off-by: Vivek Goyal + +fuse: Add logic to free up a memory range + +Add logic to free up a busy memory range. The freed memory range will be +returned to the free pool. Add a worker which can be started to select +and free some busy memory ranges. + +Signed-off-by: Vivek Goyal + +fuse: Add logic to do direct reclaim of memory + +This can be done only from the same inode. Also, it can be done only for +the read/write case and not for the fault case. The reason is that, as of now, reclaim requires +holding the inode_lock, fuse_inode->i_mmap_sem and fuse_inode->dmap_tree +locks in that order, and only the read/write path allows that (and not +the fault path). + +Signed-off-by: Vivek Goyal + +fuse: Kick worker when free memory drops below 20% of total ranges + +Kick the worker to free up some memory when the number of free ranges drops below +20% of the total free ranges at the time of initialization. + +Signed-off-by: Vivek Goyal + +fuse: multiplex cached/direct_io/dax file operations + +Dispatch FORGET requests later instead of dropping them + +If the virtio queue is full, then don't drop FORGET requests. Instead, wait +a bit and try to dispatch them a little later using a worker thread. + +Signed-off-by: Vivek Goyal + +Release file in process context + +fuse_file_put(sync) can be called with sync=true/false. If sync=true, +it waits for the release request response and then calls iput() in the +caller's context. If sync=false, it does not wait for the release request +response, frees the fuse_file struct immediately and the req->end function +does the iput(). + +iput() can be a problem with DAX if called in req->end context. If this +is the last reference to the inode (VFS has let go of its reference already), then +iput() will clean up DAX mappings as well, send REMOVEMAPPING requests +and wait for completion. (All of this happens in the worker thread context which is +processing fuse replies from the daemon on the host.) + +That means it blocks the worker thread, which stops processing further +replies, and the system deadlocks. + +So for now, force a sync release of the file in the case of DAX inodes. 
+ +Signed-off-by: Vivek Goyal + +fuse: Do not block on inode lock while freeing memory range + +Once we select a memory range to free, we currently block on the inode +lock. Do not block; use trylock instead, and move on to the next memory +range if trylock fails. + +The reason is that in the next few patches I want to enable waiting for +memory ranges to become free in fuse_iomap_begin(). So instead of +returning -EBUSY, a process will wait for a memory range to become +free. + +We don't want to end up in a situation where a process is sleeping in +iomap_begin() with the inode lock held and the worker is trying to free +memory from the same inode, resulting in deadlock. + +To avoid deadlock, use trylock instead. + +Signed-off-by: Vivek Goyal + +fuse: Reschedule dax free work if too many EAGAIN attempts + +fuse_dax_free_memory() can be very cpu intensive in corner cases. For example, +if one inode has consumed all the memory and a setupmapping request is +pending, that means the inode lock is held by the request and the worker thread will +not get the lock for a while. And given there is only one inode consuming all +the dax ranges, all the attempts to acquire the lock will fail. + +So if there are too many inode lock failures (-EAGAIN), reschedule the +worker with a 10ms delay. + +Signed-off-by: Vivek Goyal + +fuse: Wait for memory ranges to become free + +Sometimes we run out of memory ranges. So in that case, wait for memory +ranges to become free, instead of returning -EBUSY. + +The dax fault path is holding fuse_inode->i_mmap_sem, and once that is being +held, memory reclaim can't be done. It's not safe to wait while holding +fuse_inode->i_mmap_sem for two reasons. + +- The worker thread that frees memory might block on fuse_inode->i_mmap_sem as well. +- This inode is holding all the memory and more memory can't be freed. + +In both cases, deadlock will ensue. So return -ENOSPC from iomap_begin() +in the fault path if memory can't be allocated. Drop fuse_inode->i_mmap_sem, +wait for a free range to become available, and retry. + +The read/write path is a different story. We hold the inode lock, and lock ordering +allows us to grab fuse_inode->i_mmap_sem if needed. That means we can do direct +reclaim in that path. But if there is no memory allocated to this inode, +then direct reclaim will not work and we need to wait for a memory range +to become free. So try the following order. + +A. Try to get a free range. +B. If not, try direct reclaim. +C. If not, wait for a memory range to become free. + +Here sleeping with locks held should be fine because in step B we made +sure this inode is not holding any ranges. That means other inodes are +holding ranges and somebody should be able to free memory. Also, the worker +thread does a trylock() on the inode lock. That means the worker thread will not +wait on this inode and will move on to the next memory range. Hence the above sequence +should be deadlock-free. + +Signed-off-by: Vivek Goyal + +fuse: Take inode lock for dax inode truncation + +When a file is opened with O_TRUNC, we need to make sure that any other +DAX operation is not in progress. DAX expects i_size to be stable. + +In fuse_iomap_begin() we check i_size at multiple places and we expect +i_size not to change. + +Another problem is that if we set up a mapping in fuse_iomap_begin(), and the +file gets truncated and a dax read/write happens, KVM currently hangs. +It tries to fault in a page which does not exist on the host (the file got +truncated). This probably requires fixing in KVM. + +So for now, take the inode lock. Once KVM is fixed, we might have to +have a look at it again. 
+ +Signed-off-by: Vivek Goyal + +fuse: Clear setuid bit even in direct I/O path + +With cache=never, we fall back to direct IO. pjdfstest chmod test 12.t was +failing because if a file has setuid bit, it should be cleared if an +unpriviledged user opens it for write and writes to it. + +Call fuse_remove_privs() even for direct I/O path. + +Signed-off-by: Vivek Goyal + +virtio: Free fuse devices on umount + +When unmounting the fs close all the fuse devices. +This includes making sure the daemon gets a FUSE_DESTROY to +tell it. + +Signed-off-by: Dr. David Alan Gilbert + +virtio-fs: Fix a race in range reclaim + +We have the notion of doing inline dax range reclaim where caller does not +have to drop inode lock and reclaim one of it's dax ranges. It assumed +there is no other reader/writer using that inode (hence not using dax +range being reclaimed). + +But fuse read path takes shared inode lock. That means there could be other +readers while we need to do reclaim. If we try to reclaim now, it is possible +we end up reclaiming the range used by another process. + +To remove that race, do not try to do inline reclaim for read path. Instead +return -ENOSPC and fuse read path will try again when a free range is +available. + +Reported-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +--- + drivers/dax/super.c | 3 +- + drivers/virtio/virtio_mmio.c | 32 + + drivers/virtio/virtio_pci_modern.c | 108 +++ + fs/dax.c | 23 +- + fs/ext2/inode.c | 2 +- + fs/ext4/inode.c | 2 +- + fs/fuse/Kconfig | 11 + + fs/fuse/Makefile | 1 + + fs/fuse/cuse.c | 5 +- + fs/fuse/dev.c | 80 +- + fs/fuse/dir.c | 28 +- + fs/fuse/file.c | 1001 +++++++++++++++++++++++-- + fs/fuse/fuse_i.h | 202 ++++- + fs/fuse/inode.c | 316 +++++--- + fs/fuse/virtio_fs.c | 1121 ++++++++++++++++++++++++++++ + fs/splice.c | 3 +- + fs/xfs/xfs_aops.c | 2 +- + include/linux/dax.h | 6 +- + include/linux/fs.h | 2 + + include/linux/virtio_config.h | 17 + + include/uapi/linux/fuse.h | 34 + + include/uapi/linux/virtio_fs.h | 44 ++ + include/uapi/linux/virtio_ids.h | 1 + + include/uapi/linux/virtio_mmio.h | 11 + + include/uapi/linux/virtio_pci.h | 10 + + 25 files changed, 2883 insertions(+), 182 deletions(-) + create mode 100644 fs/fuse/virtio_fs.c + create mode 100644 include/uapi/linux/virtio_fs.h + +diff --git a/drivers/dax/super.c b/drivers/dax/super.c +index 6e928f37d084..74f3bf7ae822 100644 +--- a/drivers/dax/super.c ++++ b/drivers/dax/super.c +@@ -52,7 +52,8 @@ EXPORT_SYMBOL_GPL(dax_read_unlock); + int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) + { +- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; ++ sector_t start_sect = bdev ? 
get_start_sect(bdev) : 0; ++ phys_addr_t phys_off = (start_sect + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); +diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c +index 4cd9ea5c75be..9642fa8dbeb0 100644 +--- a/drivers/virtio/virtio_mmio.c ++++ b/drivers/virtio/virtio_mmio.c +@@ -494,6 +494,37 @@ static const char *vm_bus_name(struct virtio_device *vdev) + return vm_dev->pdev->name; + } + ++static bool vm_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); ++ u64 len, addr; ++ ++ /* Select the region we're interested in */ ++ writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); ++ ++ /* Read the region size */ ++ len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); ++ len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; ++ ++ region->len = len; ++ ++ /* Check if region length is -1. If that's the case, the shared memory ++ * region does not exist and there is no need to proceed further. ++ */ ++ if (len == ~(u64)0) { ++ return false; ++ } ++ ++ /* Read the region base address */ ++ addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); ++ addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; ++ ++ region->addr = addr; ++ ++ return true; ++} ++ + static const struct virtio_config_ops virtio_mmio_config_ops = { + .get = vm_get, + .set = vm_set, +@@ -506,6 +537,7 @@ static const struct virtio_config_ops virtio_mmio_config_ops = { + .get_features = vm_get_features, + .finalize_features = vm_finalize_features, + .bus_name = vm_bus_name, ++ .get_shm_region = vm_get_shm_region, + }; + + +diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c +index 07571daccfec..51c9e6eca5ac 100644 +--- a/drivers/virtio/virtio_pci_modern.c ++++ b/drivers/virtio/virtio_pci_modern.c +@@ -446,6 +446,112 @@ static void del_vq(struct virtio_pci_vq_info *info) + vring_del_virtqueue(vq); + } + ++static int virtio_pci_find_shm_cap(struct pci_dev *dev, ++ u8 required_id, ++ u8 *bar, u64 *offset, u64 *len) ++{ ++ int pos; ++ ++ for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); ++ pos > 0; ++ pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { ++ u8 type, cap_len, id; ++ u32 tmp32; ++ u64 res_offset, res_length; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ cfg_type), ++ &type); ++ if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) ++ continue; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ cap_len), ++ &cap_len); ++ if (cap_len != sizeof(struct virtio_pci_shm_cap)) { ++ printk(KERN_ERR "%s: shm cap with bad size offset: %d size: %d\n", ++ __func__, pos, cap_len); ++ continue; ++ }; ++ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_shm_cap, ++ id), ++ &id); ++ if (id != required_id) ++ continue; ++ ++ /* Type, and ID match, looks good */ ++ pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, ++ bar), ++ bar); ++ ++ /* Read the lower 32bit of length and offset */ ++ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, offset), ++ &tmp32); ++ res_offset = tmp32; ++ pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, length), ++ &tmp32); ++ res_length = tmp32; ++ ++ /* and now the top half */ ++ pci_read_config_dword(dev, ++ pos + offsetof(struct virtio_pci_shm_cap, ++ offset_hi), ++ &tmp32); ++ res_offset |= ((u64)tmp32) << 32; ++ pci_read_config_dword(dev, ++ pos + offsetof(struct virtio_pci_shm_cap, ++ 
length_hi), ++ &tmp32); ++ res_length |= ((u64)tmp32) << 32; ++ ++ *offset = res_offset; ++ *len = res_length; ++ ++ return pos; ++ } ++ return 0; ++} ++ ++static bool vp_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ struct virtio_pci_device *vp_dev = to_vp_device(vdev); ++ struct pci_dev *pci_dev = vp_dev->pci_dev; ++ u8 bar; ++ u64 offset, len; ++ phys_addr_t phys_addr; ++ size_t bar_len; ++ char *bar_name; ++ int ret; ++ ++ if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len)) { ++ return false; ++ } ++ ++ ret = pci_request_region(pci_dev, bar, "virtio-pci-shm"); ++ if (ret < 0) { ++ dev_err(&pci_dev->dev, "%s: failed to request BAR\n", ++ __func__); ++ return false; ++ } ++ ++ phys_addr = pci_resource_start(pci_dev, bar); ++ bar_len = pci_resource_len(pci_dev, bar); ++ ++ if (offset + len > bar_len) { ++ dev_err(&pci_dev->dev, ++ "%s: bar shorter than cap offset+len\n", ++ __func__); ++ return false; ++ } ++ ++ region->len = len; ++ region->addr = (u64) phys_addr + offset; ++ ++ return true; ++} ++ + static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .get = NULL, + .set = NULL, +@@ -460,6 +566,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, ++ .get_shm_region = vp_get_shm_region, + }; + + static const struct virtio_config_ops virtio_pci_config_ops = { +@@ -476,6 +583,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = { + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, + .get_vq_affinity = vp_get_vq_affinity, ++ .get_shm_region = vp_get_shm_region, + }; + + /** +diff --git a/fs/dax.c b/fs/dax.c +index 09fa70683c41..408a06b74335 100644 +--- a/fs/dax.c ++++ b/fs/dax.c +@@ -1021,12 +1021,12 @@ static int dax_writeback_one(struct dax_device *dax_dev, + * on persistent storage prior to completion of the operation. + */ + int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc) ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc) + { + struct inode *inode = mapping->host; + pgoff_t start_index, end_index; + pgoff_t indices[PAGEVEC_SIZE]; +- struct dax_device *dax_dev; + struct pagevec pvec; + bool done = false; + int i, ret = 0; +@@ -1037,9 +1037,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + +- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); +- if (!dax_dev) +- return -EIO; ++ if (bdev) { ++ WARN_ON(dax_dev); ++ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); ++ if (!dax_dev) ++ return -EIO; ++ } + + start_index = wbc->range_start >> PAGE_SHIFT; + end_index = wbc->range_end >> PAGE_SHIFT; +@@ -1073,7 +1076,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, + start_index = indices[pvec.nr - 1] + 1; + } + out: +- put_dax(dax_dev); ++ if (bdev) ++ put_dax(dax_dev); + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? 
ret : 0); + } +@@ -1141,7 +1145,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, + static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) + { +- unsigned short sector_size = bdev_logical_block_size(bdev); ++ unsigned short sector_size; ++ ++ if (!bdev) ++ return false; ++ ++ sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; +diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c +index e4bb9386c045..c9b024dafe09 100644 +--- a/fs/ext2/inode.c ++++ b/fs/ext2/inode.c +@@ -956,7 +956,7 @@ static int + ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) + { + return dax_writeback_mapping_range(mapping, +- mapping->host->i_sb->s_bdev, wbc); ++ mapping->host->i_sb->s_bdev, NULL, wbc); + } + + const struct address_space_operations ext2_aops = { +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 2c43c5b92229..a94aff38cda4 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2949,7 +2949,7 @@ static int ext4_dax_writepages(struct address_space *mapping, + percpu_down_read(&sbi->s_journal_flag_rwsem); + trace_ext4_writepages(inode, wbc); + +- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); ++ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, NULL, wbc); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); +diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig +index 76f09ce7e5b2..46e9a8ff9f7a 100644 +--- a/fs/fuse/Kconfig ++++ b/fs/fuse/Kconfig +@@ -26,3 +26,14 @@ config CUSE + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. ++ ++config VIRTIO_FS ++ tristate "Virtio Filesystem" ++ depends on FUSE_FS ++ select VIRTIO ++ help ++ The Virtio Filesystem allows guests to mount file systems from the ++ host. ++ ++ If you want to share files between guests or with the host, answer Y ++ or M. +diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile +index 60da84a86dab..d125ff826f2d 100644 +--- a/fs/fuse/Makefile ++++ b/fs/fuse/Makefile +@@ -4,5 +4,6 @@ + + obj-$(CONFIG_FUSE_FS) += fuse.o + obj-$(CONFIG_CUSE) += cuse.o ++obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o + + fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c +index 8f68181256c0..d49d64f42768 100644 +--- a/fs/fuse/cuse.c ++++ b/fs/fuse/cuse.c +@@ -503,9 +503,10 @@ static int cuse_channel_open(struct inode *inode, struct file *file) + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. 
+ */ +- fuse_conn_init(&cc->fc, file->f_cred->user_ns); ++ fuse_conn_init(&cc->fc, file->f_cred->user_ns, NULL, &fuse_dev_fiq_ops, ++ NULL); + +- fud = fuse_dev_alloc(&cc->fc); ++ fud = fuse_dev_alloc_install(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index baaed4d05b22..24d4a9b93fb6 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -103,6 +103,7 @@ void fuse_request_free(struct fuse_req *req) + } + kmem_cache_free(fuse_req_cachep, req); + } ++EXPORT_SYMBOL_GPL(fuse_request_free); + + void __fuse_get_request(struct fuse_req *req) + { +@@ -310,7 +311,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) + } + EXPORT_SYMBOL_GPL(fuse_put_request); + +-static unsigned len_args(unsigned numargs, struct fuse_arg *args) ++unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args) + { + unsigned nbytes = 0; + unsigned i; +@@ -320,19 +321,41 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) + + return nbytes; + } ++EXPORT_SYMBOL_GPL(fuse_len_args); + +-static u64 fuse_get_unique(struct fuse_iqueue *fiq) ++u64 fuse_get_unique(struct fuse_iqueue *fiq) + { + return ++fiq->reqctr; + } ++EXPORT_SYMBOL_GPL(fuse_get_unique); + +-static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) ++/** ++ * A new request is available, wake fiq->waitq ++ */ ++static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) + { +- req->in.h.len = sizeof(struct fuse_in_header) + +- len_args(req->in.numargs, (struct fuse_arg *) req->in.args); +- list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); ++ spin_unlock(&fiq->waitq.lock); ++} ++ ++const struct fuse_iqueue_ops fuse_dev_fiq_ops = { ++ .wake_forget_and_unlock = fuse_dev_wake_and_unlock, ++ .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, ++ .wake_pending_and_unlock = fuse_dev_wake_and_unlock, ++}; ++EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); ++ ++static void queue_request_and_unlock(struct fuse_iqueue *fiq, ++ struct fuse_req *req) ++__releases(fiq->waitq.lock) ++{ ++ req->in.h.len = sizeof(struct fuse_in_header) + ++ fuse_len_args(req->in.numargs, ++ (struct fuse_arg *) req->in.args); ++ list_add_tail(&req->list, &fiq->pending); ++ fiq->ops->wake_pending_and_unlock(fiq); + } + + void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, +@@ -347,12 +370,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; +- wake_up_locked(&fiq->waitq); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); ++ fiq->ops->wake_forget_and_unlock(fiq); + } else { + kfree(forget); ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); + } + + static void flush_bg_queue(struct fuse_conn *fc) +@@ -367,8 +389,7 @@ static void flush_bg_queue(struct fuse_conn *fc) + fc->active_background++; + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); +- queue_request(fiq, req); +- spin_unlock(&fiq->waitq.lock); ++ queue_request_and_unlock(fiq, req); + } + } + +@@ -380,7 +401,7 @@ static void flush_bg_queue(struct fuse_conn *fc) + * the 'end' callback is called if given, else the reference to the + * request is released + */ +-static void request_end(struct fuse_conn *fc, struct fuse_req *req) ++void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) + { + struct fuse_iqueue *fiq = &fc->iq; + +@@ -424,6 +445,7 
@@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) + put_request: + fuse_put_request(fc, req); + } ++EXPORT_SYMBOL_GPL(fuse_request_end); + + static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) + { +@@ -434,10 +456,10 @@ static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) + } + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); +- wake_up_locked(&fiq->waitq); ++ fiq->ops->wake_interrupt_and_unlock(fiq); ++ } else { ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); +- kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + } + + static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +@@ -496,14 +518,13 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) + req->out.h.error = -ENOTCONN; + } else { + req->in.h.unique = fuse_get_unique(fiq); +- queue_request(fiq, req); + /* acquire extra reference, since request is still needed +- after request_end() */ ++ after fuse_request_end() */ + __fuse_get_request(req); +- spin_unlock(&fiq->waitq.lock); ++ queue_request_and_unlock(fiq, req); + + request_wait_answer(fc, req); +- /* Pairs with smp_wmb() in request_end() */ ++ /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); + } + } +@@ -635,10 +656,11 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, + req->in.h.unique = unique; + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { +- queue_request(fiq, req); ++ queue_request_and_unlock(fiq, req); + err = 0; ++ } else { ++ spin_unlock(&fiq->waitq.lock); + } +- spin_unlock(&fiq->waitq.lock); + + return err; + } +@@ -1236,7 +1258,7 @@ __releases(fiq->waitq.lock) + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling +- * request_end(). Otherwise add it to the processing list, and set ++ * fuse_request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ + static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, +@@ -1295,7 +1317,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; +- request_end(fc, req); ++ fuse_request_end(fc, req); + goto restart; + } + spin_lock(&fpq->lock); +@@ -1337,7 +1359,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); +- request_end(fc, req); ++ fuse_request_end(fc, req); + return err; + + err_unlock: +@@ -1824,7 +1846,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + +- reqsize += len_args(out->numargs, out->args); ++ reqsize += fuse_len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; +@@ -1844,7 +1866,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. +- * The request is finished by calling request_end() ++ * The request is finished by calling fuse_request_end(). 
+ */ + static ssize_t fuse_dev_do_write(struct fuse_dev *fud, + struct fuse_copy_state *cs, size_t nbytes) +@@ -1931,7 +1953,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, + list_del_init(&req->list); + spin_unlock(&fpq->lock); + +- request_end(fc, req); ++ fuse_request_end(fc, req); + + return err ? err : nbytes; + +@@ -2077,7 +2099,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) + req->out.h.error = -ECONNABORTED; + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); +- request_end(fc, req); ++ fuse_request_end(fc, req); + } + } + +@@ -2223,7 +2245,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) + if (new->private_data) + return -EINVAL; + +- fud = fuse_dev_alloc(fc); ++ fud = fuse_dev_alloc_install(fc); + if (!fud) + return -ENOMEM; + +diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c +index 82a13221775e..3f923fe7841a 100644 +--- a/fs/fuse/dir.c ++++ b/fs/fuse/dir.c +@@ -44,12 +44,26 @@ union fuse_dentry { + struct rcu_head rcu; + }; + +-static inline void fuse_dentry_settime(struct dentry *entry, u64 time) ++static void fuse_dentry_settime(struct dentry *dentry, u64 time) + { +- ((union fuse_dentry *) entry->d_fsdata)->time = time; ++ /* ++ * Mess with DCACHE_OP_DELETE because dput() will be faster without it. ++ * Don't care about races, either way it's just an optimization ++ */ ++ if ((time && (dentry->d_flags & DCACHE_OP_DELETE)) || ++ (!time && !(dentry->d_flags & DCACHE_OP_DELETE))) { ++ spin_lock(&dentry->d_lock); ++ if (time) ++ dentry->d_flags &= ~DCACHE_OP_DELETE; ++ else ++ dentry->d_flags |= DCACHE_OP_DELETE; ++ spin_unlock(&dentry->d_lock); ++ } ++ ++ ((union fuse_dentry *) dentry->d_fsdata)->time = time; + } + +-static inline u64 fuse_dentry_time(struct dentry *entry) ++static inline u64 fuse_dentry_time(const struct dentry *entry) + { + return ((union fuse_dentry *) entry->d_fsdata)->time; + } +@@ -280,8 +294,14 @@ static void fuse_dentry_release(struct dentry *dentry) + kfree_rcu(fd, rcu); + } + ++static int fuse_dentry_delete(const struct dentry *dentry) ++{ ++ return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); ++} ++ + const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, ++ .d_delete = fuse_dentry_delete, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, + }; +@@ -1728,8 +1748,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, + */ + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { ++ down_write(&fi->i_mmap_sem); + truncate_pagecache(inode, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); ++ up_write(&fi->i_mmap_sem); + } + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index bd500c3b7858..51faed351c7c 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -18,8 +18,18 @@ + #include + #include + #include ++#include ++#include ++#include + +-static const struct file_operations fuse_direct_io_file_operations; ++INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, ++ rb, __u64, __subtree_last, ++ START, LAST, static inline, fuse_dax_interval_tree); ++ ++static long __fuse_file_fallocate(struct file *file, int mode, ++ loff_t offset, loff_t length); ++static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc, ++ struct inode *inode); + + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +@@ -170,13 +180,222 @@ static void 
fuse_link_write_file(struct file *file) + spin_unlock(&fc->lock); + } + ++static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc) ++{ ++ unsigned long free_threshold; ++ struct fuse_dax_mapping *dmap = NULL; ++ ++ spin_lock(&fc->lock); ++ ++ /* TODO: Add logic to try to free up memory if wait is allowed */ ++ if (fc->nr_free_ranges <= 0) { ++ spin_unlock(&fc->lock); ++ goto out_kick; ++ } ++ ++ WARN_ON(list_empty(&fc->free_ranges)); ++ ++ /* Take a free range */ ++ dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping, ++ list); ++ list_del_init(&dmap->list); ++ fc->nr_free_ranges--; ++ spin_unlock(&fc->lock); ++ ++out_kick: ++ /* If number of free ranges are below threshold, start reclaim */ ++ free_threshold = max((fc->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD)/100, ++ (unsigned long)1); ++ if (fc->nr_free_ranges < free_threshold) { ++ pr_debug("fuse: Kicking dax memory reclaim worker. nr_free_ranges=0x%ld nr_total_ranges=%ld\n", fc->nr_free_ranges, fc->nr_ranges); ++ queue_delayed_work(system_long_wq, &fc->dax_free_work, 0); ++ } ++ return dmap; ++} ++ ++/* This assumes fc->lock is held */ ++static void __dmap_remove_busy_list(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ list_del_init(&dmap->busy_list); ++ WARN_ON(fc->nr_busy_ranges == 0); ++ fc->nr_busy_ranges--; ++} ++ ++static void dmap_remove_busy_list(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ spin_lock(&fc->lock); ++ __dmap_remove_busy_list(fc, dmap); ++ spin_unlock(&fc->lock); ++} ++ ++/* This assumes fc->lock is held */ ++static void __free_dax_mapping(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ list_add_tail(&dmap->list, &fc->free_ranges); ++ fc->nr_free_ranges++; ++ /* TODO: Wake up only when needed */ ++ wake_up(&fc->dax_range_waitq); ++} ++ ++static void free_dax_mapping(struct fuse_conn *fc, ++ struct fuse_dax_mapping *dmap) ++{ ++ /* Return fuse_dax_mapping to free list */ ++ spin_lock(&fc->lock); ++ __free_dax_mapping(fc, dmap); ++ spin_unlock(&fc->lock); ++} ++ ++/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */ ++static int fuse_setup_one_mapping(struct inode *inode, ++ struct file *file, loff_t offset, ++ struct fuse_dax_mapping *dmap) ++{ ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_file *ff = NULL; ++ struct fuse_setupmapping_in inarg; ++ FUSE_ARGS(args); ++ ssize_t err; ++ ++ if (file) ++ ff = file->private_data; ++ ++ WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ); ++ WARN_ON(fc->nr_free_ranges < 0); ++ ++ /* Ask fuse daemon to setup mapping */ ++ memset(&inarg, 0, sizeof(inarg)); ++ inarg.foffset = offset; ++ if (ff) ++ inarg.fh = ff->fh; ++ else ++ inarg.fh = -1; ++ inarg.moffset = dmap->window_offset; ++ inarg.len = FUSE_DAX_MEM_RANGE_SZ; ++ if (file) { ++ inarg.flags |= (file->f_mode & FMODE_WRITE) ? ++ FUSE_SETUPMAPPING_FLAG_WRITE : 0; ++ inarg.flags |= (file->f_mode & FMODE_READ) ? ++ FUSE_SETUPMAPPING_FLAG_READ : 0; ++ } else { ++ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; ++ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; ++ } ++ args.in.h.opcode = FUSE_SETUPMAPPING; ++ args.in.h.nodeid = fi->nodeid; ++ args.in.numargs = 1; ++ args.in.args[0].size = sizeof(inarg); ++ args.in.args[0].value = &inarg; ++ err = fuse_simple_request(fc, &args); ++ if (err < 0) { ++ printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n", ++ __func__, dmap->window_offset, err); ++ return err; ++ } ++ ++ pr_debug("fuse_setup_one_mapping() succeeded. 
offset=0x%llx err=%zd\n", offset, err); ++ ++ /* ++ * We don't take a refernce on inode. inode is valid right now and ++ * when inode is going away, cleanup logic should first cleanup ++ * dmap entries. ++ * ++ * TODO: Do we need to ensure that we are holding inode lock ++ * as well. ++ */ ++ dmap->inode = inode; ++ dmap->start = offset; ++ dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1; ++ /* Protected by fi->i_dmap_sem */ ++ fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree); ++ fi->nr_dmaps++; ++ spin_lock(&fc->lock); ++ list_add_tail(&dmap->busy_list, &fc->busy_ranges); ++ fc->nr_busy_ranges++; ++ spin_unlock(&fc->lock); ++ return 0; ++} ++ ++static int fuse_removemapping_one(struct inode *inode, ++ struct fuse_dax_mapping *dmap) ++{ ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_removemapping_in inarg; ++ FUSE_ARGS(args); ++ ++ memset(&inarg, 0, sizeof(inarg)); ++ inarg.moffset = dmap->window_offset; ++ inarg.len = dmap->length; ++ args.in.h.opcode = FUSE_REMOVEMAPPING; ++ args.in.h.nodeid = fi->nodeid; ++ args.in.numargs = 1; ++ args.in.args[0].size = sizeof(inarg); ++ args.in.args[0].value = &inarg; ++ return fuse_simple_request(fc, &args); ++} ++ ++/* ++ * It is called from evict_inode() and by that time inode is going away. So ++ * this function does not take any locks like fi->i_dmap_sem for traversing ++ * that fuse inode interval tree. If that lock is taken then lock validator ++ * complains of deadlock situation w.r.t fs_reclaim lock. ++ */ ++void fuse_removemapping(struct inode *inode) ++{ ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ssize_t err; ++ struct fuse_dax_mapping *dmap; ++ ++ /* Clear the mappings list */ ++ while (true) { ++ WARN_ON(fi->nr_dmaps < 0); ++ ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, ++ -1); ++ if (dmap) { ++ fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree); ++ fi->nr_dmaps--; ++ dmap_remove_busy_list(fc, dmap); ++ } ++ ++ if (!dmap) ++ break; ++ ++ /* ++ * During umount/shutdown, fuse connection is dropped first ++ * and later evict_inode() is called later. That means any ++ * removemapping messages are going to fail. Send messages ++ * only if connection is up. Otherwise fuse daemon is ++ * responsible for cleaning up any leftover references and ++ * mappings. ++ */ ++ if (fc->connected) { ++ err = fuse_removemapping_one(inode, dmap); ++ if (err) { ++ pr_warn("Failed to removemapping. 
offset=0x%llx" ++ " len=0x%llx\n", dmap->window_offset, ++ dmap->length); ++ } ++ } ++ ++ dmap->inode = NULL; ++ ++ /* Add it back to free ranges list */ ++ free_dax_mapping(fc, dmap); ++ } ++} ++ + void fuse_finish_open(struct inode *inode, struct file *file) + { + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + +- if (ff->open_flags & FOPEN_DIRECT_IO) +- file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) +@@ -202,7 +421,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) + int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && +- fc->writeback_cache; ++ (fc->writeback_cache || IS_DAX(inode)); + + err = generic_file_open(inode, file); + if (err) +@@ -250,6 +469,7 @@ void fuse_release_common(struct file *file, bool isdir) + struct fuse_file *ff = file->private_data; + struct fuse_req *req = ff->reserved_req; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; ++ bool sync = false; + + fuse_prepare_release(ff, file->f_flags, opcode); + +@@ -270,8 +490,20 @@ void fuse_release_common(struct file *file, bool isdir) + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. ++ * ++ * For DAX, fuse server is trusted. So it should be fine to ++ * do a sync file put. Doing async file put is creating ++ * problems right now because when request finish, iput() ++ * can lead to freeing of inode. That means it tears down ++ * mappings backing DAX memory and sends REMOVEMAPPING message ++ * to server and blocks for completion. Currently, waiting ++ * in req->end context deadlocks the system as same worker thread ++ * can't process REMOVEMAPPING reply it is waiting for. + */ +- fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); ++ if (IS_DAX(req->misc.release.inode) || ff->fc->destroy_req != NULL) ++ sync = true; ++ ++ fuse_file_put(ff, sync, isdir); + } + + static int fuse_open(struct inode *inode, struct file *file) +@@ -916,11 +1148,23 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, + return err; + } + ++ ++static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to); ++static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ++ + static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + { +- struct inode *inode = iocb->ki_filp->f_mapping->host; ++ struct file *file = iocb->ki_filp; ++ struct fuse_file *ff = file->private_data; ++ struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_read_iter(iocb, to); ++ ++ if (IS_DAX(inode)) ++ return fuse_dax_read_iter(iocb, to); ++ + /* + * In auto invalidate mode, always update attributes on read. + * Otherwise, only update if we attempt to read past EOF (to ensure +@@ -1168,9 +1412,14 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, + return res > 0 ? 
res : err; + } + ++static ssize_t fuse_direct_write_iter(struct kiocb *iocb, ++ struct iov_iter *from); ++static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); ++ + static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { + struct file *file = iocb->ki_filp; ++ struct fuse_file *ff = file->private_data; + struct address_space *mapping = file->f_mapping; + ssize_t written = 0; + ssize_t written_buffered = 0; +@@ -1178,6 +1427,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + ssize_t err; + loff_t endbyte = 0; + ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_write_iter(iocb, from); ++ if (IS_DAX(inode)) ++ return fuse_dax_write_iter(iocb, from); ++ + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, file); +@@ -1442,16 +1696,279 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) + /* Don't allow parallel writes to the same file */ + inode_lock(inode); + res = generic_write_checks(iocb, from); +- if (res > 0) +- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); ++ if (res < 0) ++ goto out_invalidate; ++ ++ res = file_remove_privs(iocb->ki_filp); ++ if (res) ++ goto out_invalidate; ++ ++ res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); ++ if (res < 0) ++ goto out_invalidate; ++ + fuse_invalidate_attr(inode); +- if (res > 0) +- fuse_write_update_size(inode, iocb->ki_pos); ++ fuse_write_update_size(inode, iocb->ki_pos); + inode_unlock(inode); ++ return res; + ++out_invalidate: ++ fuse_invalidate_attr(inode); ++ inode_unlock(inode); + return res; + } + ++static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) ++{ ++ iomap->addr = IOMAP_NULL_ADDR; ++ iomap->length = length; ++ iomap->type = IOMAP_HOLE; ++} ++ ++static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, ++ struct iomap *iomap, struct fuse_dax_mapping *dmap, ++ unsigned flags) ++{ ++ loff_t offset, len; ++ loff_t i_size = i_size_read(inode); ++ ++ offset = pos - dmap->start; ++ len = min(length, dmap->length - offset); ++ ++ /* If length is beyond end of file, truncate further */ ++ if (pos + len > i_size) ++ len = i_size - pos; ++ ++ if (len > 0) { ++ iomap->addr = dmap->window_offset + offset; ++ iomap->length = len; ++ if (flags & IOMAP_FAULT) ++ iomap->length = ALIGN(len, PAGE_SIZE); ++ iomap->type = IOMAP_MAPPED; ++ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" ++ " length 0x%llx\n", __func__, iomap->addr, ++ iomap->offset, iomap->length); ++ } else { ++ /* Mapping beyond end of file is hole */ ++ fuse_fill_iomap_hole(iomap, length); ++ pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx" ++ "length 0x%llx\n", __func__, iomap->addr, ++ iomap->offset, iomap->length); ++ } ++} ++ ++/* This is just for DAX and the mapping is ephemeral, do not use it for other ++ * purposes since there is no block device with a permanent mapping. ++ */ ++static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, ++ unsigned flags, struct iomap *iomap) ++{ ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; ++ int ret; ++ ++ /* We don't support FIEMAP */ ++ BUG_ON(flags & IOMAP_REPORT); ++ ++ pr_debug("fuse_iomap_begin() called. 
pos=0x%llx length=0x%llx\n", ++ pos, length); ++ ++ iomap->offset = pos; ++ iomap->flags = 0; ++ iomap->bdev = NULL; ++ iomap->dax_dev = fc->dax_dev; ++ ++ /* ++ * Both read/write and mmap path can race here. So we need something ++ * to make sure if we are setting up mapping, then other path waits ++ * ++ * For now, use a semaphore for this. It probably needs to be ++ * optimized later. ++ */ ++ down_read(&fi->i_dmap_sem); ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos); ++ ++ if (dmap) { ++ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); ++ up_read(&fi->i_dmap_sem); ++ return 0; ++ } else { ++ up_read(&fi->i_dmap_sem); ++ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", ++ __func__, pos, length); ++ if (pos >= i_size_read(inode)) ++ goto iomap_hole; ++ ++ /* Can't do reclaim in fault path yet due to lock ordering. ++ * Read path takes shared inode lock and that's not sufficient ++ * for inline range reclaim. Caller needs to drop lock, wait ++ * and retry. ++ */ ++ if (flags & IOMAP_FAULT || !(flags & IOMAP_WRITE)) { ++ alloc_dmap = alloc_dax_mapping(fc); ++ if (!alloc_dmap) ++ return -ENOSPC; ++ } else { ++ alloc_dmap = alloc_dax_mapping_reclaim(fc, inode); ++ if (IS_ERR(alloc_dmap)) ++ return PTR_ERR(alloc_dmap); ++ } ++ ++ /* If we are here, we should have memory allocated */ ++ if (WARN_ON(!alloc_dmap)) ++ return -EBUSY; ++ ++ /* ++ * Drop read lock and take write lock so that only one ++ * caller can try to setup mapping and other waits ++ */ ++ down_write(&fi->i_dmap_sem); ++ /* ++ * We dropped lock. Check again if somebody else setup ++ * mapping already. ++ */ ++ dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, ++ pos); ++ if (dmap) { ++ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); ++ free_dax_mapping(fc, alloc_dmap); ++ up_write(&fi->i_dmap_sem); ++ return 0; ++ } ++ ++ /* Setup one mapping */ ++ ret = fuse_setup_one_mapping(inode, NULL, ++ ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), ++ alloc_dmap); ++ if (ret < 0) { ++ printk("fuse_setup_one_mapping() failed. err=%d" ++ " pos=0x%llx\n", ret, pos); ++ free_dax_mapping(fc, alloc_dmap); ++ up_write(&fi->i_dmap_sem); ++ return ret; ++ } ++ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); ++ up_write(&fi->i_dmap_sem); ++ return 0; ++ } ++ ++ /* ++ * If read beyond end of file happnes, fs code seems to return ++ * it as hole ++ */ ++iomap_hole: ++ fuse_fill_iomap_hole(iomap, length); ++ pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length); ++ return 0; ++} ++ ++static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, ++ ssize_t written, unsigned flags, ++ struct iomap *iomap) ++{ ++ /* DAX writes beyond end-of-file aren't handled using iomap, so the ++ * file size is unchanged and there is nothing to do here. 
++ */ ++ return 0; ++} ++ ++static const struct iomap_ops fuse_iomap_ops = { ++ .iomap_begin = fuse_iomap_begin, ++ .iomap_end = fuse_iomap_end, ++}; ++ ++static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ ssize_t ret; ++ bool retry = false; ++ ++retry: ++ if (retry && !(fc->nr_free_ranges > 0)) { ++ ret = -EINTR; ++ if (wait_event_killable_exclusive(fc->dax_range_waitq, ++ (fc->nr_free_ranges > 0))) { ++ goto out; ++ } ++ } ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock_shared(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock_shared(inode); ++ } ++ ++ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); ++ inode_unlock_shared(inode); ++ ++ /* If a dax range could not be allocated and it can't be reclaimed ++ * inline, then drop inode lock and retry. Range reclaim logic ++ * requires exclusive access to inode lock. ++ * ++ * TODO: What if -ENOSPC needs to be returned to user space. Fix it. ++ */ ++ if (ret == -ENOSPC) { ++ retry = true; ++ goto retry; ++ } ++ /* TODO file_accessed(iocb->f_filp) */ ++ ++out: ++ return ret; ++} ++ ++static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto out; ++ ++ ret = file_remove_privs(iocb->ki_filp); ++ if (ret) ++ goto out; ++ /* TODO file_update_time() but we don't want metadata I/O */ ++ ++ /* TODO handle growing the file */ ++ /* Grow file here if need be. iomap_begin() does not have access ++ * to file pointer ++ */ ++ if (iov_iter_rw(from) == WRITE && ++ ((iocb->ki_pos + iov_iter_count(from)) > i_size_read(inode))) { ++ ret = __fuse_file_fallocate(iocb->ki_filp, 0, iocb->ki_pos, ++ iov_iter_count(from)); ++ if (ret < 0) { ++ printk("fallocate(offset=0x%llx length=0x%zx)" ++ " failed. err=%zd\n", iocb->ki_pos, ++ iov_iter_count(from), ret); ++ goto out; ++ } ++ pr_debug("fallocate(offset=0x%llx length=0x%zx)" ++ " succeed. 
ret=%zd\n", iocb->ki_pos, iov_iter_count(from), ret); ++ } ++ ++ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); ++ ++out: ++ inode_unlock(inode); ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ return ret; ++} ++ + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) + { + int i; +@@ -1901,6 +2418,17 @@ static int fuse_writepages_fill(struct page *page, + return err; + } + ++static int fuse_dax_writepages(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ ++ struct inode *inode = mapping->host; ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ ++ return dax_writeback_mapping_range(mapping, ++ NULL, fc->dax_dev, wbc); ++} ++ + static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) + { +@@ -2074,8 +2602,20 @@ static const struct vm_operations_struct fuse_file_vm_ops = { + .page_mkwrite = fuse_page_mkwrite, + }; + ++static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma); ++static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); ++ + static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) + { ++ struct fuse_file *ff = file->private_data; ++ ++ /* DAX mmap is superior to direct_io mmap */ ++ if (IS_DAX(file_inode(file))) ++ return fuse_dax_mmap(file, vma); ++ ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return fuse_direct_mmap(file, vma); ++ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + +@@ -2095,6 +2635,103 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) + return generic_file_mmap(file, vma); + } + ++static ssize_t fuse_file_splice_read(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ struct fuse_file *ff = in->private_data; ++ ++ if (ff->open_flags & FOPEN_DIRECT_IO) ++ return default_file_splice_read(in, ppos, pipe, len, flags); ++ else ++ return generic_file_splice_read(in, ppos, pipe, len, flags); ++ ++} ++static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size, ++ bool write) ++{ ++ int ret, error = 0; ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct super_block *sb = inode->i_sb; ++ pfn_t pfn; ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ bool retry = false; ++ ++ if (write) ++ sb_start_pagefault(sb); ++ ++retry: ++ if (retry && !(fc->nr_free_ranges > 0)) { ++ ret = -EINTR; ++ if (wait_event_killable_exclusive(fc->dax_range_waitq, ++ (fc->nr_free_ranges > 0))) ++ goto out; ++ } ++ ++ /* ++ * We need to serialize against not only truncate but also against ++ * fuse dax memory range reclaim. While a range is being reclaimed, ++ * we do not want any read/write/mmap to make progress and try ++ * to populate page cache or access memory we are trying to free. 
++ */ ++ down_read(&get_fuse_inode(inode)->i_mmap_sem); ++ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); ++ if ((ret & VM_FAULT_ERROR) && error == -ENOSPC) { ++ error = 0; ++ retry = true; ++ up_read(&get_fuse_inode(inode)->i_mmap_sem); ++ goto retry; ++ } ++ ++ if (ret & VM_FAULT_NEEDDSYNC) ++ ret = dax_finish_sync_fault(vmf, pe_size, pfn); ++ ++ up_read(&get_fuse_inode(inode)->i_mmap_sem); ++ ++out: ++ if (write) ++ sb_end_pagefault(sb); ++ ++ return ret; ++} ++ ++static int fuse_dax_fault(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, ++ vmf->flags & FAULT_FLAG_WRITE); ++} ++ ++static int fuse_dax_huge_fault(struct vm_fault *vmf, ++ enum page_entry_size pe_size) ++{ ++ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); ++} ++ ++static int fuse_dax_page_mkwrite(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); ++} ++ ++static int fuse_dax_pfn_mkwrite(struct vm_fault *vmf) ++{ ++ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); ++} ++ ++static const struct vm_operations_struct fuse_dax_vm_ops = { ++ .fault = fuse_dax_fault, ++ .huge_fault = fuse_dax_huge_fault, ++ .page_mkwrite = fuse_dax_page_mkwrite, ++ .pfn_mkwrite = fuse_dax_pfn_mkwrite, ++}; ++ ++static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ vma->vm_ops = &fuse_dax_vm_ops; ++ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; ++ return 0; ++} ++ + static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, + struct file_lock *fl) +@@ -2938,8 +3575,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) + return ret; + } + +-static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, +- loff_t length) ++/* ++ * This variant does not take any inode lock and if locking is required, ++ * caller is supposed to hold lock ++ */ ++static long __fuse_file_fallocate(struct file *file, int mode, ++ loff_t offset, loff_t length) + { + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); +@@ -2953,8 +3594,6 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + .mode = mode + }; + int err; +- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || +- (mode & FALLOC_FL_PUNCH_HOLE); + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; +@@ -2962,17 +3601,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + if (fc->no_fallocate) + return -EOPNOTSUPP; + +- if (lock_inode) { +- inode_lock(inode); +- if (mode & FALLOC_FL_PUNCH_HOLE) { +- loff_t endbyte = offset + length - 1; +- err = filemap_write_and_wait_range(inode->i_mapping, +- offset, endbyte); +- if (err) +- goto out; +- +- fuse_sync_writes(inode); +- } ++ if (mode & FALLOC_FL_PUNCH_HOLE) { ++ loff_t endbyte = offset + length - 1; ++ err = filemap_write_and_wait_range(inode->i_mapping, offset, ++ endbyte); ++ if (err) ++ goto out; ++ fuse_sync_writes(inode); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE)) +@@ -2999,18 +3634,42 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + file_update_time(file); + } + +- if (mode & FALLOC_FL_PUNCH_HOLE) ++ if (mode & FALLOC_FL_PUNCH_HOLE) { ++ down_write(&fi->i_mmap_sem); + truncate_pagecache_range(inode, offset, offset + length - 1); +- ++ up_write(&fi->i_mmap_sem); ++ } + fuse_invalidate_attr(inode); + + out: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + ++ return err; ++} ++ ++static 
long fuse_file_fallocate(struct file *file, int mode, loff_t offset, ++ loff_t length) ++{ ++ struct fuse_file *ff = file->private_data; ++ struct inode *inode = file_inode(file); ++ struct fuse_conn *fc = ff->fc; ++ int err; ++ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || ++ (mode & FALLOC_FL_PUNCH_HOLE); ++ ++ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) ++ return -EOPNOTSUPP; ++ ++ if (fc->no_fallocate) ++ return -EOPNOTSUPP; ++ + if (lock_inode) +- inode_unlock(inode); ++ inode_lock(inode); + ++ err = __fuse_file_fallocate(file, mode, offset, length); ++ if (lock_inode) ++ inode_unlock(inode); + return err; + } + +@@ -3018,38 +3677,21 @@ static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read_iter = fuse_file_read_iter, + .write_iter = fuse_file_write_iter, +- .mmap = fuse_file_mmap, ++ .mmap = fuse_file_mmap, ++ .splice_read = fuse_file_splice_read, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, ++ .get_unmapped_area = thp_get_unmapped_area, + .flock = fuse_file_flock, +- .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, + }; + +-static const struct file_operations fuse_direct_io_file_operations = { +- .llseek = fuse_file_llseek, +- .read_iter = fuse_direct_read_iter, +- .write_iter = fuse_direct_write_iter, +- .mmap = fuse_direct_mmap, +- .open = fuse_open, +- .flush = fuse_flush, +- .release = fuse_release, +- .fsync = fuse_fsync, +- .lock = fuse_file_lock, +- .flock = fuse_file_flock, +- .unlocked_ioctl = fuse_file_ioctl, +- .compat_ioctl = fuse_file_compat_ioctl, +- .poll = fuse_file_poll, +- .fallocate = fuse_file_fallocate, +- /* no splice_read */ +-}; +- + static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, +@@ -3063,8 +3705,271 @@ static const struct address_space_operations fuse_file_aops = { + .write_end = fuse_write_end, + }; + ++static const struct address_space_operations fuse_dax_file_aops = { ++ .writepages = fuse_dax_writepages, ++ .direct_IO = noop_direct_IO, ++ .set_page_dirty = noop_set_page_dirty, ++ .invalidatepage = noop_invalidatepage, ++}; ++ + void fuse_init_file_inode(struct inode *inode) + { ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ struct fuse_conn *fc = get_fuse_conn(inode); ++ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; ++ fi->dmap_tree = RB_ROOT_CACHED; ++ ++ if (fc->dax_dev) { ++ inode->i_flags |= S_DAX; ++ inode->i_data.a_ops = &fuse_dax_file_aops; ++ } ++} ++ ++int fuse_dax_reclaim_dmap_locked(struct fuse_conn *fc, struct inode *inode, ++ struct fuse_dax_mapping *dmap) ++{ ++ int ret; ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ++ ret = filemap_fdatawrite_range(inode->i_mapping, dmap->start, ++ dmap->end); ++ if (ret) { ++ printk("filemap_fdatawrite_range() failed. err=%d start=0x%llx," ++ " end=0x%llx\n", ret, dmap->start, dmap->end); ++ return ret; ++ } ++ ++ ret = invalidate_inode_pages2_range(inode->i_mapping, ++ dmap->start >> PAGE_SHIFT, ++ dmap->end >> PAGE_SHIFT); ++ /* TODO: What to do if above fails? For now, ++ * leave the range in place. 
++	 */
++	if (ret) {
++		printk("invalidate_inode_pages2_range() failed err=%d\n", ret);
++		return ret;
++	}
++
++	/* Remove dax mapping from inode interval tree now */
++	fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
++	fi->nr_dmaps--;
++	return 0;
++}
++
++/* Find first mapping in the tree and free it. */
++struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping_locked(
++				struct fuse_conn *fc, struct inode *inode)
++{
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++	int ret;
++
++	/* Find fuse dax mapping at file offset inode. */
++	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, 0, -1);
++	if (!dmap)
++		return NULL;
++
++	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
++	if (ret < 0)
++		return ERR_PTR(ret);
++
++	/* Clean up dmap. Do not add back to free list */
++	dmap_remove_busy_list(fc, dmap);
++	dmap->inode = NULL;
++	dmap->start = dmap->end = 0;
++
++	pr_debug("fuse: reclaimed memory range window_offset=0x%llx,"
++		" length=0x%llx\n", dmap->window_offset,
++		dmap->length);
++	return dmap;
++}
++
++/*
++ * Find first mapping in the tree, free it and return it. Do not add
++ * it back to free pool.
++ *
++ * This is called with inode lock held.
++ */
++struct fuse_dax_mapping *fuse_dax_reclaim_first_mapping(struct fuse_conn *fc,
++					struct inode *inode)
++{
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++
++	down_write(&fi->i_mmap_sem);
++	down_write(&fi->i_dmap_sem);
++	dmap = fuse_dax_reclaim_first_mapping_locked(fc, inode);
++	up_write(&fi->i_dmap_sem);
++	up_write(&fi->i_mmap_sem);
++	return dmap;
++}
++
++static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
++					struct inode *inode)
++{
++	struct fuse_dax_mapping *dmap;
++	struct fuse_inode *fi = get_fuse_inode(inode);
++
++	while(1) {
++		dmap = alloc_dax_mapping(fc);
++		if (dmap)
++			return dmap;
++
++		if (fi->nr_dmaps)
++			return fuse_dax_reclaim_first_mapping(fc, inode);
++		/*
++		 * There are no mappings which can be reclaimed.
++		 * Wait for one.
++		 */
++		if (!(fc->nr_free_ranges > 0)) {
++			if (wait_event_killable_exclusive(fc->dax_range_waitq,
++					(fc->nr_free_ranges > 0)))
++				return ERR_PTR(-EINTR);
++		}
++	}
++}
++
++int fuse_dax_free_one_mapping_locked(struct fuse_conn *fc, struct inode *inode,
++				u64 dmap_start)
++{
++	int ret;
++	struct fuse_inode *fi = get_fuse_inode(inode);
++	struct fuse_dax_mapping *dmap;
++
++	WARN_ON(!inode_is_locked(inode));
++
++	/* Find fuse dax mapping at file offset inode. */
++	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
++						dmap_start);
++
++	/* Range already got cleaned up by somebody else */
++	if (!dmap)
++		return 0;
++
++	ret = fuse_dax_reclaim_dmap_locked(fc, inode, dmap);
++	if (ret < 0)
++		return ret;
++
++	/* Cleanup dmap entry and add back to free list */
++	spin_lock(&fc->lock);
++	__dmap_remove_busy_list(fc, dmap);
++	dmap->inode = NULL;
++	dmap->start = dmap->end = 0;
++	__free_dax_mapping(fc, dmap);
++	spin_unlock(&fc->lock);
++
++	pr_debug("fuse: freed memory range window_offset=0x%llx,"
++		" length=0x%llx\n", dmap->window_offset,
++		dmap->length);
++	return ret;
++}
++
++/*
++ * Free a range of memory.
++ * Locking.
++ * 1. Take inode->i_rwsem to prevent further read/write.
++ * 2. Take fuse_inode->i_mmap_sem to block dax faults.
++ * 3. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
++ *    be strictly necessary as lock 1 and 2 seem sufficient.
++ */ ++int fuse_dax_free_one_mapping(struct fuse_conn *fc, struct inode *inode, ++ u64 dmap_start) ++{ ++ int ret; ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ++ /* ++ * If process is blocked waiting for memory while holding inode ++ * lock, we will deadlock. So continue to free next range. ++ */ ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ down_write(&fi->i_mmap_sem); ++ down_write(&fi->i_dmap_sem); ++ ret = fuse_dax_free_one_mapping_locked(fc, inode, dmap_start); ++ up_write(&fi->i_dmap_sem); ++ up_write(&fi->i_mmap_sem); ++ inode_unlock(inode); ++ return ret; ++} ++ ++int fuse_dax_free_memory(struct fuse_conn *fc, unsigned long nr_to_free) ++{ ++ struct fuse_dax_mapping *dmap, *pos, *temp; ++ int ret, nr_freed = 0, nr_eagain = 0; ++ u64 dmap_start = 0, window_offset = 0; ++ struct inode *inode = NULL; ++ ++ /* Pick first busy range and free it for now*/ ++ while(1) { ++ if (nr_freed >= nr_to_free) ++ break; ++ ++ if (nr_eagain > 20) { ++ queue_delayed_work(system_long_wq, &fc->dax_free_work, ++ msecs_to_jiffies(10)); ++ return 0; ++ } ++ ++ dmap = NULL; ++ spin_lock(&fc->lock); ++ ++ list_for_each_entry_safe(pos, temp, &fc->busy_ranges, ++ busy_list) { ++ inode = igrab(pos->inode); ++ /* ++ * This inode is going away. That will free ++ * up all the ranges anyway, continue to ++ * next range. ++ */ ++ if (!inode) ++ continue; ++ /* ++ * Take this element off list and add it tail. If ++ * inode lock can't be obtained, this will help with ++ * selecting new element ++ */ ++ dmap = pos; ++ list_move_tail(&dmap->busy_list, &fc->busy_ranges); ++ dmap_start = dmap->start; ++ window_offset = dmap->window_offset; ++ break; ++ } ++ spin_unlock(&fc->lock); ++ if (!dmap) ++ return 0; ++ ++ ret = fuse_dax_free_one_mapping(fc, inode, dmap_start); ++ iput(inode); ++ if (ret && ret != -EAGAIN) { ++ printk("%s(window_offset=0x%llx) failed. err=%d\n", ++ __func__, window_offset, ret); ++ return ret; ++ } ++ ++ /* Could not get inode lock. Try next element */ ++ if (ret == -EAGAIN) { ++ nr_eagain++; ++ continue; ++ } ++ nr_freed++; ++ } ++ return 0; ++} ++ ++/* TODO: This probably should go in inode.c */ ++void fuse_dax_free_mem_worker(struct work_struct *work) ++{ ++ int ret; ++ struct fuse_conn *fc = container_of(work, struct fuse_conn, ++ dax_free_work.work); ++ pr_debug("fuse: Worker to free memory called.\n"); ++ pr_debug("fuse: Worker to free memory called. nr_free_ranges=%lu" ++ " nr_busy_ranges=%lu\n", fc->nr_free_ranges, ++ fc->nr_busy_ranges); ++ ret = fuse_dax_free_memory(fc, FUSE_DAX_RECLAIM_CHUNK); ++ if (ret) ++ pr_debug("fuse: fuse_dax_free_memory() failed with err=%d\n", ret); + } +diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h +index cec8b8e74969..1149281ab1e8 100644 +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -43,6 +43,20 @@ + /** Number of page pointers embedded in fuse_req */ + #define FUSE_REQ_INLINE_PAGES 1 + ++/* Default memory range size, 2MB */ ++#define FUSE_DAX_MEM_RANGE_SZ (2*1024*1024) ++#define FUSE_DAX_MEM_RANGE_PAGES (FUSE_DAX_MEM_RANGE_SZ/PAGE_SIZE) ++ ++/* Number of ranges reclaimer will try to free in one invocation */ ++#define FUSE_DAX_RECLAIM_CHUNK (10) ++ ++/* ++ * Dax memory reclaim threshold in percetage of total ranges. 
When free ++ * number of free ranges drops below this threshold, reclaim can trigger ++ * Default is 20% ++ * */ ++#define FUSE_DAX_RECLAIM_THRESHOLD (20) ++ + /** List of active connections */ + extern struct list_head fuse_conn_list; + +@@ -53,12 +67,73 @@ extern struct mutex fuse_mutex; + extern unsigned max_user_bgreq; + extern unsigned max_user_congthresh; + ++/** Mount options */ ++struct fuse_mount_data { ++ int fd; ++ const char *tag; /* lifetime: .fill_super() data argument */ ++ unsigned rootmode; ++ kuid_t user_id; ++ kgid_t group_id; ++ unsigned fd_present:1; ++ unsigned tag_present:1; ++ unsigned rootmode_present:1; ++ unsigned user_id_present:1; ++ unsigned group_id_present:1; ++ unsigned default_permissions:1; ++ unsigned allow_other:1; ++ unsigned dax:1; ++ unsigned destroy:1; ++ unsigned max_read; ++ unsigned blksize; ++ ++ /* DAX device, may be NULL */ ++ struct dax_device *dax_dev; ++ ++ /* fuse input queue operations */ ++ const struct fuse_iqueue_ops *fiq_ops; ++ ++ /* device-specific state for fuse_iqueue */ ++ void *fiq_priv; ++ ++ /* fuse_dev pointer to fill in, should contain NULL on entry */ ++ void **fudptr; ++}; ++ + /* One forget request */ + struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; + }; + ++#define START(node) ((node)->start) ++#define LAST(node) ((node)->end) ++ ++/** Translation information for file offsets to DAX window offsets */ ++struct fuse_dax_mapping { ++ /* Pointer to inode where this memory range is mapped */ ++ struct inode *inode; ++ ++ /* Will connect in fc->free_ranges to keep track of free memory */ ++ struct list_head list; ++ ++ /* For interval tree in file/inode */ ++ struct rb_node rb; ++ /** Start Position in file */ ++ __u64 start; ++ /** End Position in file */ ++ __u64 end; ++ __u64 __subtree_last; ++ ++ /* Will connect in fc->busy_ranges to keep track busy memory */ ++ struct list_head busy_list; ++ ++ /** Position in DAX window */ ++ u64 window_offset; ++ ++ /** Length of mapping, in bytes */ ++ loff_t length; ++}; ++ + /** FUSE inode */ + struct fuse_inode { + /** Inode data */ +@@ -108,6 +183,22 @@ struct fuse_inode { + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; ++ ++ /* ++ * Semaphore to protect modifications to dmap_tree ++ */ ++ struct rw_semaphore i_dmap_sem; ++ ++ /** ++ * Can't take inode lock in fault path (leads to circular dependency). ++ * So take this in fuse dax fault path to make sure truncate and ++ * punch hole etc. can't make progress in parallel. ++ */ ++ struct rw_semaphore i_mmap_sem; ++ ++ /** Sorted rb tree of struct fuse_dax_mapping elements */ ++ struct rb_root_cached dmap_tree; ++ unsigned long nr_dmaps; + }; + + /** FUSE inode state bits */ +@@ -382,8 +473,44 @@ struct fuse_req { + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; ++ ++ /** virtio-fs's physically contiguous buffer for in and out args */ ++ void *argbuf; + }; + ++struct fuse_iqueue; ++ ++/** ++ * Input queue callbacks ++ * ++ * Input queue signalling is device-specific. For example, the /dev/fuse file ++ * uses fiq->waitq and fasync to wake processes that are waiting on queue ++ * readiness. These callbacks allow other device types to respond to input ++ * queue activity. 
++ */
++struct fuse_iqueue_ops {
++	/**
++	 * Signal that a forget has been queued
++	 */
++	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++
++	/**
++	 * Signal that an INTERRUPT request has been queued
++	 */
++	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++
++	/**
++	 * Signal that a request has been queued
++	 */
++	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
++		__releases(fiq->waitq.lock);
++};
++
++/** /dev/fuse input queue operations */
++extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
++
+ struct fuse_iqueue {
+ 	/** Connection established */
+ 	unsigned connected;
+@@ -409,6 +536,12 @@ struct fuse_iqueue {
+ 
+ 	/** O_ASYNC requests */
+ 	struct fasync_struct *fasync;
++
++	/** Device-specific callbacks */
++	const struct fuse_iqueue_ops *ops;
++
++	/** Device-specific state */
++	void *priv;
+ };
+ 
+ struct fuse_pqueue {
+@@ -675,6 +808,28 @@ struct fuse_conn {
+ 
+ 	/** List of device instances belonging to this connection */
+ 	struct list_head devices;
++
++	/** DAX device, non-NULL if DAX is supported */
++	struct dax_device *dax_dev;
++
++	/* List of memory ranges which are busy */
++	unsigned long nr_busy_ranges;
++	struct list_head busy_ranges;
++
++	/* Worker to free up memory ranges */
++	struct delayed_work dax_free_work;
++
++	/* Wait queue for a dax range to become free */
++	wait_queue_head_t dax_range_waitq;
++
++	/*
++	 * DAX Window Free Ranges. TODO: This might not be best place to store
++	 * this free list
++	 */
++	unsigned long nr_free_ranges;
++	struct list_head free_ranges;
++
++	unsigned long nr_ranges;
+ };
+ 
+ static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+@@ -860,6 +1015,11 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+ void fuse_request_send_background_locked(struct fuse_conn *fc,
+ 					struct fuse_req *req);
+ 
++/**
++ * End a finished request
++ */
++void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
++
+ /* Abort all requests */
+ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+ void fuse_wait_aborted(struct fuse_conn *fc);
+@@ -881,16 +1041,42 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+ /**
+  * Initialize fuse_conn
+  */
+-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
++void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
++		struct dax_device *dax_dev,
++		const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
+ 
+ /**
+  * Release reference to fuse_conn
+  */
+ void fuse_conn_put(struct fuse_conn *fc);
+ 
+-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
++struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
++struct fuse_dev *fuse_dev_alloc(void);
++void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
+ void fuse_dev_free(struct fuse_dev *fud);
+ 
++/**
++ * Parse a mount options string
++ */
++int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
++		struct user_namespace *user_ns);
++
++/**
++ * Fill in superblock and initialize fuse connection
++ * @sb: partially-initialized superblock to fill in
++ * @mount_data: mount parameters
++ */
++int fuse_fill_super_common(struct super_block *sb,
++			struct fuse_mount_data *mount_data);
++void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
++
++/**
++ * Disassociate fuse connection from superblock and kill the superblock
++ *
++ * Calls kill_anon_super(), do not use with bdev mounts.
++ */ ++void fuse_kill_sb_anon(struct super_block *sb); ++ + /** + * Add connection to control filesystem + */ +@@ -992,4 +1178,16 @@ struct posix_acl; + struct posix_acl *fuse_get_acl(struct inode *inode, int type); + int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + ++/** ++ * Return the number of bytes in an arguments list ++ */ ++unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args); ++ ++/** ++ * Get the next unique ID for a request ++ */ ++u64 fuse_get_unique(struct fuse_iqueue *fiq); ++void fuse_dax_free_mem_worker(struct work_struct *work); ++void fuse_removemapping(struct inode *inode); ++ + #endif /* _FS_FUSE_I_H */ +diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c +index db9e60b7eb69..dd16c7f6a561 100644 +--- a/fs/fuse/inode.c ++++ b/fs/fuse/inode.c +@@ -22,6 +22,8 @@ + #include + #include + #include ++#include ++#include + + MODULE_AUTHOR("Miklos Szeredi "); + MODULE_DESCRIPTION("Filesystem in Userspace"); +@@ -59,21 +61,6 @@ MODULE_PARM_DESC(max_user_congthresh, + /** Congestion starts at 75% of maximum */ + #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +-struct fuse_mount_data { +- int fd; +- unsigned rootmode; +- kuid_t user_id; +- kgid_t group_id; +- unsigned fd_present:1; +- unsigned rootmode_present:1; +- unsigned user_id_present:1; +- unsigned group_id_present:1; +- unsigned default_permissions:1; +- unsigned allow_other:1; +- unsigned max_read; +- unsigned blksize; +-}; +- + struct fuse_forget_link *fuse_alloc_forget(void) + { + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +@@ -96,11 +83,14 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) + fi->writectr = 0; + fi->orig_ino = 0; + fi->state = 0; ++ fi->nr_dmaps = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + mutex_init(&fi->mutex); ++ init_rwsem(&fi->i_mmap_sem); ++ init_rwsem(&fi->i_dmap_sem); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); +@@ -133,6 +123,10 @@ static void fuse_evict_inode(struct inode *inode) + if (inode->i_sb->s_flags & SB_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); ++ if (IS_DAX(inode)) { ++ fuse_removemapping(inode); ++ WARN_ON(fi->nr_dmaps); ++ } + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +@@ -447,6 +441,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) + + enum { + OPT_FD, ++ OPT_TAG, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, +@@ -454,11 +449,13 @@ enum { + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, ++ OPT_DAX, + OPT_ERR + }; + + static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, ++ {OPT_TAG, "tag=%s"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, +@@ -466,6 +463,7 @@ static const match_table_t tokens = { + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, ++ {OPT_DAX, "dax"}, + {OPT_ERR, NULL} + }; + +@@ -480,7 +478,7 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) + return err; + } + +-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, ++int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + struct user_namespace *user_ns) + { + char *p; +@@ -505,6 +503,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + 
d->fd_present = 1; + break; + ++ case OPT_TAG: ++ d->tag = args[0].from; ++ d->tag_present = 1; ++ break; ++ + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; +@@ -552,17 +555,22 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + d->blksize = value; + break; + ++ case OPT_DAX: ++ d->dax = 1; ++ break; ++ + default: + return 0; + } + } + +- if (!d->fd_present || !d->rootmode_present || +- !d->user_id_present || !d->group_id_present) ++ if (!d->rootmode_present || !d->user_id_present || ++ !d->group_id_present) + return 0; + + return 1; + } ++EXPORT_SYMBOL_GPL(parse_fuse_opt); + + static int fuse_show_options(struct seq_file *m, struct dentry *root) + { +@@ -579,10 +587,14 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); ++ if (fc->dax_dev) ++ seq_printf(m, ",dax"); + return 0; + } + +-static void fuse_iqueue_init(struct fuse_iqueue *fiq) ++static void fuse_iqueue_init(struct fuse_iqueue *fiq, ++ const struct fuse_iqueue_ops *ops, ++ void *priv) + { + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); +@@ -590,6 +602,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; ++ fiq->ops = ops; ++ fiq->priv = priv; + } + + static void fuse_pqueue_init(struct fuse_pqueue *fpq) +@@ -601,7 +615,84 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) + fpq->connected = 1; + } + +-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) ++static void fuse_free_dax_mem_ranges(struct list_head *mem_list) ++{ ++ struct fuse_dax_mapping *range, *temp; ++ ++ /* Free All allocated elements */ ++ list_for_each_entry_safe(range, temp, mem_list, list) { ++ list_del(&range->list); ++ if (!list_empty(&range->busy_list)) ++ list_del(&range->busy_list); ++ kfree(range); ++ } ++} ++ ++#ifdef CONFIG_FS_DAX ++static int fuse_dax_mem_range_init(struct fuse_conn *fc, ++ struct dax_device *dax_dev) ++{ ++ long nr_pages, nr_ranges; ++ void *kaddr; ++ pfn_t pfn; ++ struct fuse_dax_mapping *range; ++ LIST_HEAD(mem_ranges); ++ phys_addr_t phys_addr; ++ int ret = 0, id; ++ size_t dax_size = -1; ++ unsigned long allocated_ranges = 0, i; ++ ++ id = dax_read_lock(); ++ nr_pages = dax_direct_access(dax_dev, 0, PHYS_PFN(dax_size), &kaddr, ++ &pfn); ++ dax_read_unlock(id); ++ if (nr_pages < 0) { ++ pr_debug("dax_direct_access() returned %ld\n", nr_pages); ++ return nr_pages; ++ } ++ ++ phys_addr = pfn_t_to_phys(pfn); ++ nr_ranges = nr_pages/FUSE_DAX_MEM_RANGE_PAGES; ++ printk("fuse_dax_mem_range_init(): dax mapped %ld pages. nr_ranges=%ld\n", nr_pages, nr_ranges); ++ ++ for (i = 0; i < nr_ranges; i++) { ++ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); ++ if (!range) { ++ pr_debug("memory allocation for mem_range failed.\n"); ++ ret = -ENOMEM; ++ goto out_err; ++ } ++ /* TODO: This offset only works if virtio-fs driver is not ++ * having some memory hidden at the beginning. 
This needs ++ * better handling ++ */ ++ range->window_offset = i * FUSE_DAX_MEM_RANGE_SZ; ++ range->length = FUSE_DAX_MEM_RANGE_SZ; ++ list_add_tail(&range->list, &mem_ranges); ++ INIT_LIST_HEAD(&range->busy_list); ++ allocated_ranges++; ++ } ++ ++ list_replace_init(&mem_ranges, &fc->free_ranges); ++ fc->nr_free_ranges = allocated_ranges; ++ fc->nr_ranges = allocated_ranges; ++ return 0; ++out_err: ++ /* Free All allocated elements */ ++ fuse_free_dax_mem_ranges(&mem_ranges); ++ return ret; ++} ++#else /* !CONFIG_FS_DAX */ ++static inline int fuse_dax_mem_range_init(struct fuse_conn *fc, ++ struct dax_device *dax_dev) ++{ ++ return 0; ++} ++#endif /* CONFIG_FS_DAX */ ++ ++void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, ++ struct dax_device *dax_dev, ++ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) + { + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); +@@ -610,7 +701,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) + atomic_set(&fc->dev_count, 1); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); +- fuse_iqueue_init(&fc->iq); ++ init_waitqueue_head(&fc->dax_range_waitq); ++ fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + INIT_LIST_HEAD(&fc->devices); +@@ -625,7 +717,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); ++ fc->dax_dev = dax_dev; + fc->user_ns = get_user_ns(user_ns); ++ INIT_LIST_HEAD(&fc->free_ranges); ++ INIT_LIST_HEAD(&fc->busy_ranges); ++ INIT_DELAYED_WORK(&fc->dax_free_work, fuse_dax_free_mem_worker); + } + EXPORT_SYMBOL_GPL(fuse_conn_init); + +@@ -634,6 +730,9 @@ void fuse_conn_put(struct fuse_conn *fc) + if (refcount_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); ++ flush_delayed_work(&fc->dax_free_work); ++ if (fc->dax_dev) ++ fuse_free_dax_mem_ranges(&fc->free_ranges); + put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); + fc->release(fc); +@@ -943,7 +1042,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) + wake_up_all(&fc->blocked_waitq); + } + +-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) ++void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) + { + struct fuse_init_in *arg = &req->misc.init_in; + +@@ -972,6 +1071,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) + req->end = process_init_reply; + fuse_request_send_background(fc, req); + } ++EXPORT_SYMBOL_GPL(fuse_send_init); + + static void fuse_free_conn(struct fuse_conn *fc) + { +@@ -1019,24 +1119,38 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) + return 0; + } + +-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +-{ ++struct fuse_dev *fuse_dev_alloc(void) { + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); +- if (fud) { +- fud->fc = fuse_conn_get(fc); ++ if (fud) + fuse_pqueue_init(&fud->pq); + +- spin_lock(&fc->lock); +- list_add_tail(&fud->entry, &fc->devices); +- spin_unlock(&fc->lock); +- } +- + return fud; + } + EXPORT_SYMBOL_GPL(fuse_dev_alloc); + ++void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) { ++ fud->fc = fuse_conn_get(fc); ++ spin_lock(&fc->lock); ++ list_add_tail(&fud->entry, &fc->devices); ++ spin_unlock(&fc->lock); ++} 
++EXPORT_SYMBOL_GPL(fuse_dev_install); ++ ++struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) ++{ ++ struct fuse_dev *fud; ++ ++ fud = fuse_dev_alloc(); ++ if (!fud) ++ return NULL; ++ ++ fuse_dev_install(fud, fc); ++ return fud; ++} ++EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); ++ + void fuse_dev_free(struct fuse_dev *fud) + { + struct fuse_conn *fc = fud->fc; +@@ -1052,15 +1166,13 @@ void fuse_dev_free(struct fuse_dev *fud) + } + EXPORT_SYMBOL_GPL(fuse_dev_free); + +-static int fuse_fill_super(struct super_block *sb, void *data, int silent) ++int fuse_fill_super_common(struct super_block *sb, ++ struct fuse_mount_data *mount_data) + { + struct fuse_dev *fud; + struct fuse_conn *fc; + struct inode *root; +- struct fuse_mount_data d; +- struct file *file; + struct dentry *root_dentry; +- struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + +@@ -1070,13 +1182,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + +- if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) +- goto err; +- + if (is_bdev) { + #ifdef CONFIG_BLOCK + err = -EINVAL; +- if (!sb_set_blocksize(sb, d.blksize)) ++ if (!sb_set_blocksize(sb, mount_data->blksize)) + goto err; + #endif + } else { +@@ -1093,19 +1202,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + +- file = fget(d.fd); +- err = -EINVAL; +- if (!file) +- goto err; +- +- /* +- * Require mount to happen from the same user namespace which +- * opened /dev/fuse to prevent potential attacks. +- */ +- if (file->f_op != &fuse_dev_operations || +- file->f_cred->user_ns != sb->s_user_ns) +- goto err_fput; +- + /* + * If we are not in the initial user namespace posix + * acls must be translated. 
+@@ -1116,12 +1212,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) +- goto err_fput; ++ goto err; + +- fuse_conn_init(fc, sb->s_user_ns); ++ fuse_conn_init(fc, sb->s_user_ns, mount_data->dax_dev, ++ mount_data->fiq_ops, mount_data->fiq_priv); + fc->release = fuse_free_conn; + +- fud = fuse_dev_alloc(fc); ++ if (mount_data->dax_dev) { ++ err = fuse_dax_mem_range_init(fc, mount_data->dax_dev); ++ if (err) { ++ pr_debug("fuse_dax_mem_range_init() returned %d\n", err); ++ goto err_free_ranges; ++ } ++ } ++ ++ fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err_put_conn; + +@@ -1136,17 +1241,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + fc->dont_mask = 1; + sb->s_flags |= SB_POSIXACL; + +- fc->default_permissions = d.default_permissions; +- fc->allow_other = d.allow_other; +- fc->user_id = d.user_id; +- fc->group_id = d.group_id; +- fc->max_read = max_t(unsigned, 4096, d.max_read); ++ fc->default_permissions = mount_data->default_permissions; ++ fc->allow_other = mount_data->allow_other; ++ fc->user_id = mount_data->user_id; ++ fc->group_id = mount_data->group_id; ++ fc->max_read = max_t(unsigned, 4096, mount_data->max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; +- root = fuse_get_root_inode(sb, d.rootmode); ++ root = fuse_get_root_inode(sb, mount_data->rootmode); + sb->s_d_op = &fuse_root_dentry_operations; + root_dentry = d_make_root(root); + if (!root_dentry) +@@ -1154,20 +1259,15 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + /* Root dentry doesn't have .d_revalidate */ + sb->s_d_op = &fuse_dentry_operations; + +- init_req = fuse_request_alloc(0); +- if (!init_req) +- goto err_put_root; +- __set_bit(FR_BACKGROUND, &init_req->flags); +- +- if (is_bdev) { ++ if (mount_data->destroy) { + fc->destroy_req = fuse_request_alloc(0); + if (!fc->destroy_req) +- goto err_free_init_req; ++ goto err_put_root; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; +- if (file->private_data) ++ if (*mount_data->fudptr) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); +@@ -1176,35 +1276,82 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; +- file->private_data = fud; ++ *mount_data->fudptr = fud; + mutex_unlock(&fuse_mutex); +- /* +- * atomic_dec_and_test() in fput() provides the necessary +- * memory barrier for file->private_data to be visible on all +- * CPUs after this +- */ +- fput(file); +- +- fuse_send_init(fc, init_req); +- + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); +- err_free_init_req: +- fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); ++ err_free_ranges: ++ if (mount_data->dax_dev) ++ fuse_free_dax_mem_ranges(&fc->free_ranges); + err_put_conn: + fuse_conn_put(fc); + sb->s_fs_info = NULL; +- err_fput: +- fput(file); + err: + return err; + } ++EXPORT_SYMBOL_GPL(fuse_fill_super_common); ++ ++static int fuse_fill_super(struct super_block *sb, void *data, int silent) ++{ ++ struct fuse_mount_data d; ++ struct file *file; ++ int is_bdev = sb->s_bdev != NULL; ++ int err; ++ struct fuse_req *init_req; ++ ++ err = -EINVAL; ++ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) ++ goto err; ++ if (!d.fd_present || d.tag_present) ++ goto err; ++ ++ file = fget(d.fd); ++ if (!file) ++ goto err; ++ ++ /* ++ * Require mount 
to happen from the same user namespace which ++ * opened /dev/fuse to prevent potential attacks. ++ */ ++ if ((file->f_op != &fuse_dev_operations) || ++ (file->f_cred->user_ns != sb->s_user_ns)) ++ goto err_fput; ++ ++ init_req = fuse_request_alloc(0); ++ if (!init_req) ++ goto err_fput; ++ __set_bit(FR_BACKGROUND, &init_req->flags); ++ ++ d.dax_dev = NULL; ++ d.fiq_ops = &fuse_dev_fiq_ops; ++ d.fiq_priv = NULL; ++ d.fudptr = &file->private_data; ++ d.destroy = is_bdev; ++ err = fuse_fill_super_common(sb, &d); ++ if (err < 0) ++ goto err_free_init_req; ++ /* ++ * atomic_dec_and_test() in fput() provides the necessary ++ * memory barrier for file->private_data to be visible on all ++ * CPUs after this ++ */ ++ fput(file); ++ fuse_send_init(get_fuse_conn_super(sb), init_req); ++ return 0; ++ ++err_free_init_req: ++ fuse_request_free(init_req); ++err_fput: ++ fput(file); ++err: ++ return err; ++} + + static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, +@@ -1229,11 +1376,12 @@ static void fuse_sb_destroy(struct super_block *sb) + } + } + +-static void fuse_kill_sb_anon(struct super_block *sb) ++void fuse_kill_sb_anon(struct super_block *sb) + { + fuse_sb_destroy(sb); + kill_anon_super(sb); + } ++EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); + + static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, +diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c +new file mode 100644 +index 000000000000..a0a2cd1cefc7 +--- /dev/null ++++ b/fs/fuse/virtio_fs.c +@@ -0,0 +1,1121 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * virtio-fs: Virtio Filesystem ++ * Copyright (C) 2018 Red Hat, Inc. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fuse_i.h" ++ ++/* List of virtio-fs device instances and a lock for the list */ ++static DEFINE_MUTEX(virtio_fs_mutex); ++static LIST_HEAD(virtio_fs_instances); ++ ++enum { ++ VQ_HIPRIO, ++ VQ_REQUEST ++}; ++ ++/* Per-virtqueue state */ ++struct virtio_fs_vq { ++ struct virtqueue *vq; /* protected by fpq->lock */ ++ struct work_struct done_work; ++ struct list_head queued_reqs; ++ struct delayed_work dispatch_work; ++ struct fuse_dev *fud; ++ char name[24]; ++} ____cacheline_aligned_in_smp; ++ ++/* State needed for devm_memremap_pages(). This API is called on the ++ * underlying pci_dev instead of struct virtio_fs (layering violation). Since ++ * the memremap release function only gets called when the pci_dev is released, ++ * keep the associated state separate from struct virtio_fs (it has a different ++ * lifecycle from pci_dev). 
++ */ ++struct virtio_fs_memremap_info { ++ struct dev_pagemap pgmap; ++ struct percpu_ref ref; ++ struct completion completion; ++}; ++ ++/* A virtio-fs device instance */ ++struct virtio_fs { ++ struct list_head list; /* on virtio_fs_instances */ ++ char *tag; ++ struct virtio_fs_vq *vqs; ++ unsigned nvqs; /* number of virtqueues */ ++ unsigned num_queues; /* number of request queues */ ++ struct dax_device *dax_dev; ++ ++ /* DAX memory window where file contents are mapped */ ++ void *window_kaddr; ++ phys_addr_t window_phys_addr; ++ size_t window_len; ++}; ++ ++struct virtio_fs_forget { ++ struct fuse_in_header ih; ++ struct fuse_forget_in arg; ++ /* This request can be temporarily queued on virt queue */ ++ struct list_head list; ++}; ++ ++static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) ++{ ++ struct virtio_fs *fs = vq->vdev->priv; ++ ++ return &fs->vqs[vq->index]; ++} ++ ++static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) ++{ ++ return &vq_to_fsvq(vq)->fud->pq; ++} ++ ++/* Add a new instance to the list or return -EEXIST if tag name exists*/ ++static int virtio_fs_add_instance(struct virtio_fs *fs) ++{ ++ struct virtio_fs *fs2; ++ bool duplicate = false; ++ ++ mutex_lock(&virtio_fs_mutex); ++ ++ list_for_each_entry(fs2, &virtio_fs_instances, list) { ++ if (strcmp(fs->tag, fs2->tag) == 0) ++ duplicate = true; ++ } ++ ++ if (!duplicate) ++ list_add_tail(&fs->list, &virtio_fs_instances); ++ ++ mutex_unlock(&virtio_fs_mutex); ++ ++ if (duplicate) ++ return -EEXIST; ++ return 0; ++} ++ ++/* Return the virtio_fs with a given tag, or NULL */ ++static struct virtio_fs *virtio_fs_find_instance(const char *tag) ++{ ++ struct virtio_fs *fs; ++ ++ mutex_lock(&virtio_fs_mutex); ++ ++ list_for_each_entry(fs, &virtio_fs_instances, list) { ++ if (strcmp(fs->tag, tag) == 0) ++ goto found; ++ } ++ ++ fs = NULL; /* not found */ ++ ++found: ++ mutex_unlock(&virtio_fs_mutex); ++ ++ return fs; ++} ++ ++static void virtio_fs_free_devs(struct virtio_fs *fs) ++{ ++ unsigned int i; ++ ++ /* TODO lock */ ++ ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ if (!fsvq->fud) ++ continue; ++ ++ flush_work(&fsvq->done_work); ++ flush_delayed_work(&fsvq->dispatch_work); ++ ++ fuse_dev_free(fsvq->fud); /* TODO need to quiesce/end_requests/decrement dev_count */ ++ fsvq->fud = NULL; ++ } ++} ++ ++/* Read filesystem name from virtio config into fs->tag (must kfree()). 
*/ ++static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) ++{ ++ char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; ++ char *end; ++ size_t len; ++ ++ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), ++ &tag_buf, sizeof(tag_buf)); ++ end = memchr(tag_buf, '\0', sizeof(tag_buf)); ++ if (end == tag_buf) ++ return -EINVAL; /* empty tag */ ++ if (!end) ++ end = &tag_buf[sizeof(tag_buf)]; ++ ++ len = end - tag_buf; ++ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); ++ if (!fs->tag) ++ return -ENOMEM; ++ memcpy(fs->tag, tag_buf, len); ++ fs->tag[len] = '\0'; ++ return 0; ++} ++ ++/* Work function for hiprio completion */ ++static void virtio_fs_hiprio_done_work(struct work_struct *work) ++{ ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ done_work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct virtqueue *vq = fsvq->vq; ++ ++ /* Free completed FUSE_FORGET requests */ ++ spin_lock(&fpq->lock); ++ do { ++ unsigned len; ++ void *req; ++ ++ virtqueue_disable_cb(vq); ++ ++ while ((req = virtqueue_get_buf(vq, &len)) != NULL) ++ kfree(req); ++ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); ++ spin_unlock(&fpq->lock); ++} ++ ++static void virtio_fs_dummy_dispatch_work(struct work_struct *work) ++{ ++ return; ++} ++ ++static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) ++{ ++ struct virtio_fs_forget *forget; ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ dispatch_work.work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct virtqueue *vq = fsvq->vq; ++ struct scatterlist sg; ++ struct scatterlist *sgs[] = {&sg}; ++ bool notify; ++ int ret; ++ ++ pr_debug("worker virtio_fs_hiprio_dispatch_work() called.\n"); ++ while(1) { ++ spin_lock(&fpq->lock); ++ forget = list_first_entry_or_null(&fsvq->queued_reqs, ++ struct virtio_fs_forget, list); ++ if (!forget) { ++ spin_unlock(&fpq->lock); ++ return; ++ } ++ ++ list_del(&forget->list); ++ sg_init_one(&sg, forget, sizeof(*forget)); ++ ++ /* Enqueue the request */ ++ dev_dbg(&vq->vdev->dev, "%s\n", __func__); ++ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); ++ if (ret < 0) { ++ if (ret == -ENOMEM || ret == -ENOSPC) { ++ pr_debug("virtio-fs: Could not queue FORGET:" ++ " err=%d. Will try later\n", ret); ++ list_add_tail(&forget->list, ++ &fsvq->queued_reqs); ++ schedule_delayed_work(&fsvq->dispatch_work, ++ msecs_to_jiffies(1)); ++ } else { ++ pr_debug("virtio-fs: Could not queue FORGET:" ++ " err=%d. 
Dropping it.\n", ret); ++ kfree(forget); ++ } ++ spin_unlock(&fpq->lock); ++ return; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++ pr_debug("worker virtio_fs_hiprio_dispatch_work() dispatched one forget request.\n"); ++ } ++} ++ ++/* Allocate and copy args into req->argbuf */ ++static int copy_args_to_argbuf(struct fuse_req *req) ++{ ++ unsigned offset = 0; ++ unsigned num_in; ++ unsigned num_out; ++ unsigned len; ++ unsigned i; ++ ++ num_in = req->in.numargs - req->in.argpages; ++ num_out = req->out.numargs - req->out.argpages; ++ len = fuse_len_args(num_in, (struct fuse_arg *)req->in.args) + ++ fuse_len_args(num_out, req->out.args); ++ ++ req->argbuf = kmalloc(len, GFP_ATOMIC); ++ if (!req->argbuf) ++ return -ENOMEM; ++ ++ for (i = 0; i < num_in; i++) { ++ memcpy(req->argbuf + offset, ++ req->in.args[i].value, ++ req->in.args[i].size); ++ offset += req->in.args[i].size; ++ } ++ ++ return 0; ++} ++ ++/* Copy args out of and free req->argbuf */ ++static void copy_args_from_argbuf(struct fuse_req *req) ++{ ++ unsigned remaining; ++ unsigned offset; ++ unsigned num_in; ++ unsigned num_out; ++ unsigned i; ++ ++ remaining = req->out.h.len - sizeof(req->out.h); ++ num_in = req->in.numargs - req->in.argpages; ++ num_out = req->out.numargs - req->out.argpages; ++ offset = fuse_len_args(num_in, (struct fuse_arg *)req->in.args); ++ ++ for (i = 0; i < num_out; i++) { ++ unsigned argsize = req->out.args[i].size; ++ ++ if (req->out.argvar && ++ i == req->out.numargs - 1 && ++ argsize > remaining) { ++ argsize = remaining; ++ } ++ ++ memcpy(req->out.args[i].value, req->argbuf + offset, argsize); ++ offset += argsize; ++ ++ if (i != req->out.numargs - 1) ++ remaining -= argsize; ++ } ++ ++ /* Store the actual size of the variable-length arg */ ++ if (req->out.argvar) ++ req->out.args[req->out.numargs - 1].size = remaining; ++ ++ kfree(req->argbuf); ++ req->argbuf = NULL; ++} ++ ++/* Work function for request completion */ ++static void virtio_fs_requests_done_work(struct work_struct *work) ++{ ++ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, ++ done_work); ++ struct fuse_pqueue *fpq = &fsvq->fud->pq; ++ struct fuse_conn *fc = fsvq->fud->fc; ++ struct virtqueue *vq = fsvq->vq; ++ struct fuse_req *req; ++ struct fuse_req *next; ++ LIST_HEAD(reqs); ++ ++ /* Collect completed requests off the virtqueue */ ++ spin_lock(&fpq->lock); ++ do { ++ unsigned len; ++ ++ virtqueue_disable_cb(vq); ++ ++ while ((req = virtqueue_get_buf(vq, &len)) != NULL) ++ list_move_tail(&req->list, &reqs); ++ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); ++ spin_unlock(&fpq->lock); ++ ++ /* End requests */ ++ list_for_each_entry_safe(req, next, &reqs, list) { ++ /* TODO check unique */ ++ /* TODO fuse_len_args(out) against oh.len */ ++ ++ copy_args_from_argbuf(req); ++ ++ /* TODO zeroing? 
*/ ++ ++ spin_lock(&fpq->lock); ++ clear_bit(FR_SENT, &req->flags); ++ list_del_init(&req->list); ++ spin_unlock(&fpq->lock); ++ ++ fuse_request_end(fc, req); ++ } ++} ++ ++/* Virtqueue interrupt handler */ ++static void virtio_fs_vq_done(struct virtqueue *vq) ++{ ++ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); ++ ++ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); ++ ++ schedule_work(&fsvq->done_work); ++} ++ ++/* Initialize virtqueues */ ++static int virtio_fs_setup_vqs(struct virtio_device *vdev, ++ struct virtio_fs *fs) ++{ ++ struct virtqueue **vqs; ++ vq_callback_t **callbacks; ++ const char **names; ++ unsigned i; ++ int ret; ++ ++ virtio_cread(vdev, struct virtio_fs_config, num_queues, ++ &fs->num_queues); ++ if (fs->num_queues == 0) ++ return -EINVAL; ++ ++ fs->nvqs = 1 + fs->num_queues; ++ ++ fs->vqs = devm_kcalloc(&vdev->dev, fs->nvqs, ++ sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); ++ if (!fs->vqs) ++ return -ENOMEM; ++ ++ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); ++ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), ++ GFP_KERNEL); ++ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); ++ if (!vqs || !callbacks || !names) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; ++ snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), ++ "hiprio"); ++ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; ++ INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); ++ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); ++ INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, ++ virtio_fs_hiprio_dispatch_work); ++ ++ /* Initialize the requests virtqueues */ ++ for (i = VQ_REQUEST; i < fs->nvqs; i++) { ++ INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); ++ INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, ++ virtio_fs_dummy_dispatch_work); ++ INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); ++ snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), ++ "requests.%u", i - VQ_REQUEST); ++ callbacks[i] = virtio_fs_vq_done; ++ names[i] = fs->vqs[i].name; ++ } ++ ++ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); ++ if (ret < 0) ++ goto out; ++ ++ for (i = 0; i < fs->nvqs; i++) ++ fs->vqs[i].vq = vqs[i]; ++ ++out: ++ kfree(names); ++ kfree(callbacks); ++ kfree(vqs); ++ return ret; ++} ++ ++/* Free virtqueues (device must already be reset) */ ++static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, ++ struct virtio_fs *fs) ++{ ++ vdev->config->del_vqs(vdev); ++} ++ ++/* Map a window offset to a page frame number. The window offset will have ++ * been produced by .iomap_begin(), which maps a file offset to a window ++ * offset. ++ */ ++static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, ++ long nr_pages, void **kaddr, pfn_t *pfn) ++{ ++ struct virtio_fs *fs = dax_get_private(dax_dev); ++ phys_addr_t offset = PFN_PHYS(pgoff); ++ size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; ++ ++ pr_debug("virtio_fs_direct_access(): called. nr_pages=%ld max_nr_pages=%zu\n", nr_pages, max_nr_pages); ++ ++ if (kaddr) ++ *kaddr = fs->window_kaddr + offset; ++ if (pfn) ++ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, ++ PFN_DEV | PFN_MAP); ++ return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; ++} ++ ++static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, ++ pgoff_t pgoff, void *addr, ++ size_t bytes, struct iov_iter *i) ++{ ++ return copy_from_iter(addr, bytes, i); ++} ++ ++static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, ++ pgoff_t pgoff, void *addr, ++ size_t bytes, struct iov_iter *i) ++{ ++ return copy_to_iter(addr, bytes, i); ++} ++ ++static const struct dax_operations virtio_fs_dax_ops = { ++ .direct_access = virtio_fs_direct_access, ++ .copy_from_iter = virtio_fs_copy_from_iter, ++ .copy_to_iter = virtio_fs_copy_to_iter, ++}; ++ ++static void virtio_fs_percpu_release(struct percpu_ref *ref) ++{ ++ struct virtio_fs_memremap_info *mi = ++ container_of(ref, struct virtio_fs_memremap_info, ref); ++ ++ complete(&mi->completion); ++} ++ ++static void virtio_fs_percpu_exit(void *data) ++{ ++ struct virtio_fs_memremap_info *mi = data; ++ ++ wait_for_completion(&mi->completion); ++ percpu_ref_exit(&mi->ref); ++} ++ ++static void virtio_fs_percpu_kill(struct percpu_ref *ref) ++{ ++ percpu_ref_kill(ref); ++} ++ ++static void virtio_fs_cleanup_dax(void *data) ++{ ++ struct virtio_fs *fs = data; ++ ++ kill_dax(fs->dax_dev); ++ put_dax(fs->dax_dev); ++} ++ ++static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) ++{ ++ struct virtio_shm_region cache_reg; ++ struct virtio_fs_memremap_info *mi; ++ struct dev_pagemap *pgmap; ++ bool have_cache; ++ int ret; ++ ++ if (!IS_ENABLED(CONFIG_DAX_DRIVER)) ++ return 0; ++ ++ /* Get cache region */ ++ have_cache = virtio_get_shm_region(vdev, ++ &cache_reg, ++ (u8)VIRTIO_FS_SHMCAP_ID_CACHE); ++ if (!have_cache) { ++ dev_err(&vdev->dev, "%s: No cache capability\n", __func__); ++ return -ENXIO; ++ } else { ++ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", ++ cache_reg.len, cache_reg.addr); ++ } ++ ++ mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL); ++ if (!mi) ++ return -ENOMEM; ++ ++ init_completion(&mi->completion); ++ ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0, ++ GFP_KERNEL); ++ if (ret < 0) { ++ dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n", ++ __func__, ret); ++ return ret; ++ } ++ ++ ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi); ++ if (ret < 0) { ++ percpu_ref_exit(&mi->ref); ++ return ret; ++ } ++ ++ pgmap = &mi->pgmap; ++ pgmap->altmap_valid = false; ++ pgmap->ref = &mi->ref; ++ pgmap->kill = virtio_fs_percpu_kill; ++ pgmap->type = MEMORY_DEVICE_FS_DAX; ++ ++ /* Ideally we would directly use the PCI BAR resource but ++ * devm_memremap_pages() wants its own copy in pgmap. So ++ * initialize a struct resource from scratch (only the start ++ * and end fields will be used). 
++ */ ++ pgmap->res = (struct resource){ ++ .name = "virtio-fs dax window", ++ .start = (phys_addr_t) cache_reg.addr, ++ .end = (phys_addr_t) cache_reg.addr + cache_reg.len, ++ }; ++ ++ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); ++ if (IS_ERR(fs->window_kaddr)) ++ return PTR_ERR(fs->window_kaddr); ++ ++ fs->window_phys_addr = (phys_addr_t) cache_reg.addr; ++ fs->window_len = (phys_addr_t) cache_reg.len; ++ ++ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx" ++ " len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, ++ cache_reg.len); ++ ++ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops); ++ if (!fs->dax_dev) ++ return -ENOMEM; ++ ++ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs); ++} ++ ++static int virtio_fs_probe(struct virtio_device *vdev) ++{ ++ struct virtio_fs *fs; ++ int ret; ++ ++ fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL); ++ if (!fs) ++ return -ENOMEM; ++ vdev->priv = fs; ++ ++ ret = virtio_fs_read_tag(vdev, fs); ++ if (ret < 0) ++ goto out; ++ ++ ret = virtio_fs_setup_vqs(vdev, fs); ++ if (ret < 0) ++ goto out; ++ ++ /* TODO vq affinity */ ++ /* TODO populate notifications vq */ ++ ++ ret = virtio_fs_setup_dax(vdev, fs); ++ if (ret < 0) ++ goto out_vqs; ++ ++ /* Bring the device online in case the filesystem is mounted and ++ * requests need to be sent before we return. ++ */ ++ virtio_device_ready(vdev); ++ ++ ret = virtio_fs_add_instance(fs); ++ if (ret < 0) ++ goto out_vqs; ++ ++ return 0; ++ ++out_vqs: ++ vdev->config->reset(vdev); ++ virtio_fs_cleanup_vqs(vdev, fs); ++out: ++ vdev->priv = NULL; ++ return ret; ++} ++ ++static void virtio_fs_remove(struct virtio_device *vdev) ++{ ++ struct virtio_fs *fs = vdev->priv; ++ ++ virtio_fs_free_devs(fs); ++ ++ vdev->config->reset(vdev); ++ virtio_fs_cleanup_vqs(vdev, fs); ++ ++ mutex_lock(&virtio_fs_mutex); ++ list_del(&fs->list); ++ mutex_unlock(&virtio_fs_mutex); ++ ++ vdev->priv = NULL; ++} ++ ++#ifdef CONFIG_PM ++static int virtio_fs_freeze(struct virtio_device *vdev) ++{ ++ return 0; /* TODO */ ++} ++ ++static int virtio_fs_restore(struct virtio_device *vdev) ++{ ++ return 0; /* TODO */ ++} ++#endif /* CONFIG_PM */ ++ ++const static struct virtio_device_id id_table[] = { ++ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, ++ {}, ++}; ++ ++const static unsigned int feature_table[] = {}; ++ ++static struct virtio_driver virtio_fs_driver = { ++ .driver.name = KBUILD_MODNAME, ++ .driver.owner = THIS_MODULE, ++ .id_table = id_table, ++ .feature_table = feature_table, ++ .feature_table_size = ARRAY_SIZE(feature_table), ++ /* TODO validate config_get != NULL */ ++ .probe = virtio_fs_probe, ++ .remove = virtio_fs_remove, ++#ifdef CONFIG_PM_SLEEP ++ .freeze = virtio_fs_freeze, ++ .restore = virtio_fs_restore, ++#endif ++}; ++ ++static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ struct fuse_forget_link *link; ++ struct virtio_fs_forget *forget; ++ struct fuse_pqueue *fpq; ++ struct scatterlist sg; ++ struct scatterlist *sgs[] = {&sg}; ++ struct virtio_fs *fs; ++ struct virtqueue *vq; ++ struct virtio_fs_vq *fsvq; ++ bool notify; ++ u64 unique; ++ int ret; ++ ++ BUG_ON(!fiq->forget_list_head.next); ++ link = fiq->forget_list_head.next; ++ BUG_ON(link->next); ++ fiq->forget_list_head.next = NULL; ++ fiq->forget_list_tail = &fiq->forget_list_head; ++ ++ unique = fuse_get_unique(fiq); ++ ++ fs = fiq->priv; ++ fsvq = &fs->vqs[VQ_HIPRIO]; ++ spin_unlock(&fiq->waitq.lock); ++ ++ /* Allocate a buffer for the request */ ++ 
forget = kmalloc(sizeof(*forget), GFP_ATOMIC); ++ if (!forget) { ++ pr_err("virtio-fs: dropped FORGET: kmalloc failed\n"); ++ goto out; /* TODO avoid dropping it? */ ++ } ++ ++ forget->ih = (struct fuse_in_header){ ++ .opcode = FUSE_FORGET, ++ .nodeid = link->forget_one.nodeid, ++ .unique = unique, ++ .len = sizeof(*forget), ++ }; ++ forget->arg = (struct fuse_forget_in){ ++ .nlookup = link->forget_one.nlookup, ++ }; ++ ++ sg_init_one(&sg, forget, sizeof(*forget)); ++ ++ /* Enqueue the request */ ++ vq = fsvq->vq; ++ dev_dbg(&vq->vdev->dev, "%s\n", __func__); ++ fpq = vq_to_fpq(vq); ++ spin_lock(&fpq->lock); ++ ++ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); ++ if (ret < 0) { ++ if (ret == -ENOMEM || ret == -ENOSPC) { ++ pr_debug("virtio-fs: Could not queue FORGET: err=%d." ++ " Will try later.\n", ret); ++ list_add_tail(&forget->list, &fsvq->queued_reqs); ++ schedule_delayed_work(&fsvq->dispatch_work, ++ msecs_to_jiffies(1)); ++ } else { ++ pr_debug("virtio-fs: Could not queue FORGET: err=%d." ++ " Dropping it.\n", ret); ++ kfree(forget); ++ } ++ spin_unlock(&fpq->lock); ++ goto out; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++out: ++ kfree(link); ++} ++ ++static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ /* TODO */ ++ spin_unlock(&fiq->waitq.lock); ++} ++ ++/* Return the number of scatter-gather list elements required */ ++static unsigned sg_count_fuse_req(struct fuse_req *req) ++{ ++ unsigned total_sgs = 1 /* fuse_in_header */; ++ ++ if (req->in.numargs - req->in.argpages) ++ total_sgs += 1; ++ ++ if (req->in.argpages) ++ total_sgs += req->num_pages; ++ ++ if (!test_bit(FR_ISREPLY, &req->flags)) ++ return total_sgs; ++ ++ total_sgs += 1 /* fuse_out_header */; ++ ++ if (req->out.numargs - req->out.argpages) ++ total_sgs += 1; ++ ++ if (req->out.argpages) ++ total_sgs += req->num_pages; ++ ++ return total_sgs; ++} ++ ++/* Add pages to scatter-gather list and return number of elements used */ ++static unsigned sg_init_fuse_pages(struct scatterlist *sg, ++ struct page **pages, ++ struct fuse_page_desc *page_descs, ++ unsigned num_pages) ++{ ++ unsigned i; ++ ++ for (i = 0; i < num_pages; i++) { ++ sg_init_table(&sg[i], 1); ++ sg_set_page(&sg[i], pages[i], ++ page_descs[i].length, ++ page_descs[i].offset); ++ } ++ ++ return i; ++} ++ ++/* Add args to scatter-gather list and return number of elements used */ ++static unsigned sg_init_fuse_args(struct scatterlist *sg, ++ struct fuse_req *req, ++ struct fuse_arg *args, ++ unsigned numargs, ++ bool argpages, ++ void *argbuf, ++ unsigned *len_used) ++{ ++ unsigned total_sgs = 0; ++ unsigned len; ++ ++ len = fuse_len_args(numargs - argpages, args); ++ if (len) ++ sg_init_one(&sg[total_sgs++], argbuf, len); ++ ++ if (argpages) ++ total_sgs += sg_init_fuse_pages(&sg[total_sgs], ++ req->pages, ++ req->page_descs, ++ req->num_pages); ++ ++ if (len_used) ++ *len_used = len; ++ ++ return total_sgs; ++} ++ ++/* Add a request to a virtqueue and kick the device */ ++static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req) ++{ ++ struct scatterlist *stack_sgs[6 /* requests need at least 4 elements */]; ++ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; ++ struct scatterlist **sgs = stack_sgs; ++ struct scatterlist *sg = stack_sg; ++ struct fuse_pqueue *fpq; ++ unsigned argbuf_used = 0; ++ unsigned out_sgs = 0; ++ unsigned in_sgs = 0; ++ unsigned total_sgs; ++ unsigned i; ++ int 
ret; ++ bool notify; ++ ++ /* Does the sglist fit on the stack? */ ++ total_sgs = sg_count_fuse_req(req); ++ if (total_sgs > ARRAY_SIZE(stack_sgs)) { ++ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); ++ sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); ++ if (!sgs || !sg) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ /* Use a bounce buffer since stack args cannot be mapped */ ++ ret = copy_args_to_argbuf(req); ++ if (ret < 0) ++ goto out; ++ ++ /* Request elements */ ++ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); ++ out_sgs += sg_init_fuse_args(&sg[out_sgs], req, ++ (struct fuse_arg *)req->in.args, ++ req->in.numargs, req->in.argpages, ++ req->argbuf, &argbuf_used); ++ ++ /* Reply elements */ ++ if (test_bit(FR_ISREPLY, &req->flags)) { ++ sg_init_one(&sg[out_sgs + in_sgs++], ++ &req->out.h, sizeof(req->out.h)); ++ in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, ++ req->out.args, req->out.numargs, ++ req->out.argpages, ++ req->argbuf + argbuf_used, NULL); ++ } ++ ++ BUG_ON(out_sgs + in_sgs != total_sgs); ++ ++ for (i = 0; i < total_sgs; i++) ++ sgs[i] = &sg[i]; ++ ++ fpq = vq_to_fpq(vq); ++ spin_lock(&fpq->lock); ++ ++ ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); ++ if (ret < 0) { ++ /* TODO handle full virtqueue */ ++ spin_unlock(&fpq->lock); ++ goto out; ++ } ++ ++ notify = virtqueue_kick_prepare(vq); ++ ++ spin_unlock(&fpq->lock); ++ ++ if (notify) ++ virtqueue_notify(vq); ++ ++out: ++ if (ret < 0 && req->argbuf) { ++ kfree(req->argbuf); ++ req->argbuf = NULL; ++ } ++ if (sgs != stack_sgs) { ++ kfree(sgs); ++ kfree(sg); ++ } ++ ++ return ret; ++} ++ ++static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) ++__releases(fiq->waitq.lock) ++{ ++ unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */ ++ struct virtio_fs *fs; ++ struct fuse_conn *fc; ++ struct fuse_req *req; ++ struct fuse_pqueue *fpq; ++ int ret; ++ ++ BUG_ON(list_empty(&fiq->pending)); ++ req = list_last_entry(&fiq->pending, struct fuse_req, list); ++ clear_bit(FR_PENDING, &req->flags); ++ list_del_init(&req->list); ++ BUG_ON(!list_empty(&fiq->pending)); ++ spin_unlock(&fiq->waitq.lock); ++ ++ fs = fiq->priv; ++ fc = fs->vqs[queue_id].fud->fc; ++ ++ dev_dbg(&fs->vqs[queue_id].vq->vdev->dev, ++ "%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", ++ __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, ++ req->in.h.len, fuse_len_args(req->out.numargs, req->out.args)); ++ ++ /* TODO put request onto fpq->io list? */ ++ ++ fpq = &fs->vqs[queue_id].fud->pq; ++ spin_lock(&fpq->lock); ++ if (!fpq->connected) { ++ spin_unlock(&fpq->lock); ++ req->out.h.error = -ENODEV; ++ printk(KERN_ERR "%s: disconnected\n", __func__); ++ fuse_request_end(fc, req); ++ return; ++ } ++ list_add_tail(&req->list, &fpq->processing); ++ spin_unlock(&fpq->lock); ++ set_bit(FR_SENT, &req->flags); ++ /* matches barrier in request_wait_answer() */ ++ smp_mb__after_atomic(); ++ /* TODO check for FR_INTERRUPTED? 
*/ ++ ++ ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req); ++ if (ret < 0) { ++ req->out.h.error = ret; ++ printk(KERN_ERR "%s: virtio_fs_enqueue_req failed %d\n", ++ __func__, ret); ++ fuse_request_end(fc, req); ++ return; ++ } ++} ++ ++const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { ++ .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, ++ .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, ++ .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, ++}; ++ ++static int virtio_fs_fill_super(struct super_block *sb, void *data, ++ int silent) ++{ ++ struct fuse_mount_data d; ++ struct fuse_conn *fc; ++ struct virtio_fs *fs; ++ int is_bdev = sb->s_bdev != NULL; ++ unsigned int i; ++ int err; ++ struct fuse_req *init_req; ++ ++ err = -EINVAL; ++ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) ++ goto err; ++ if (d.fd_present) { ++ printk(KERN_ERR "virtio-fs: fd option cannot be used\n"); ++ goto err; ++ } ++ if (!d.tag_present) { ++ printk(KERN_ERR "virtio-fs: missing tag option\n"); ++ goto err; ++ } ++ ++ fs = virtio_fs_find_instance(d.tag); ++ if (!fs) { ++ printk(KERN_ERR "virtio-fs: tag not found\n"); ++ err = -ENOENT; ++ goto err; ++ } ++ ++ /* TODO lock */ ++ if (fs->vqs[VQ_REQUEST].fud) { ++ printk(KERN_ERR "virtio-fs: device already in use\n"); ++ err = -EBUSY; ++ goto err; ++ } ++ ++ err = -ENOMEM; ++ /* Allocate fuse_dev for hiprio and notification queues */ ++ for (i = 0; i < VQ_REQUEST; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ fsvq->fud = fuse_dev_alloc(); ++ if (!fsvq->fud) ++ goto err_free_fuse_devs; ++ } ++ ++ init_req = fuse_request_alloc(0); ++ if (!init_req) ++ goto err; ++ __set_bit(FR_BACKGROUND, &init_req->flags); ++ ++ d.dax_dev = d.dax ? fs->dax_dev : NULL; ++ d.fiq_ops = &virtio_fs_fiq_ops; ++ d.fiq_priv = fs; ++ d.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; ++ d.destroy = true; /* Send destroy request on unmount */ ++ err = fuse_fill_super_common(sb, &d); ++ if (err < 0) ++ goto err_free_init_req; ++ ++ fc = fs->vqs[VQ_REQUEST].fud->fc; ++ ++ /* TODO take fuse_mutex around this loop? 
*/ ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ ++ if (i == VQ_REQUEST) ++ continue; /* already initialized */ ++ fuse_dev_install(fsvq->fud, fc); ++ atomic_inc(&fc->dev_count); ++ } ++ ++ fuse_send_init(fc, init_req); ++ return 0; ++ ++err_free_init_req: ++ fuse_request_free(init_req); ++err_free_fuse_devs: ++ for (i = 0; i < fs->nvqs; i++) { ++ struct virtio_fs_vq *fsvq = &fs->vqs[i]; ++ fuse_dev_free(fsvq->fud); ++ } ++err: ++ return err; ++} ++ ++static void virtio_kill_sb(struct super_block *sb) ++{ ++ struct fuse_conn *fc = get_fuse_conn_super(sb); ++ fuse_kill_sb_anon(sb); ++ if (fc) { ++ struct virtio_fs *vfs = fc->iq.priv; ++ virtio_fs_free_devs(vfs); ++ } ++} ++ ++static struct dentry *virtio_fs_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, ++ void *raw_data) ++{ ++ return mount_nodev(fs_type, flags, raw_data, virtio_fs_fill_super); ++} ++ ++static struct file_system_type virtio_fs_type = { ++ .owner = THIS_MODULE, ++ .name = KBUILD_MODNAME, ++ .mount = virtio_fs_mount, ++ .kill_sb = virtio_kill_sb, ++}; ++ ++static int __init virtio_fs_init(void) ++{ ++ int ret; ++ ++ ret = register_virtio_driver(&virtio_fs_driver); ++ if (ret < 0) ++ return ret; ++ ++ ret = register_filesystem(&virtio_fs_type); ++ if (ret < 0) { ++ unregister_virtio_driver(&virtio_fs_driver); ++ return ret; ++ } ++ ++ return 0; ++} ++module_init(virtio_fs_init); ++ ++static void __exit virtio_fs_exit(void) ++{ ++ unregister_filesystem(&virtio_fs_type); ++ unregister_virtio_driver(&virtio_fs_driver); ++} ++module_exit(virtio_fs_exit); ++ ++MODULE_AUTHOR("Stefan Hajnoczi "); ++MODULE_DESCRIPTION("Virtio Filesystem"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_FS(KBUILD_MODNAME); ++MODULE_DEVICE_TABLE(virtio, id_table); +diff --git a/fs/splice.c b/fs/splice.c +index b3daa971f597..d0bfbc13a417 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -365,7 +365,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec, + return res; + } + +-static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, ++ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) + { +@@ -429,6 +429,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + iov_iter_advance(&to, copied); /* truncates and discards */ + return res; + } ++EXPORT_SYMBOL(default_file_splice_read); + + /* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' +diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c +index b697866946d2..c97f8a0cb47b 100644 +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -953,7 +953,7 @@ xfs_dax_writepages( + { + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, +- xfs_find_bdev_for_inode(mapping->host), wbc); ++ xfs_find_bdev_for_inode(mapping->host), NULL, wbc); + } + + STATIC int +diff --git a/include/linux/dax.h b/include/linux/dax.h +index 450b28db9533..a8461841f148 100644 +--- a/include/linux/dax.h ++++ b/include/linux/dax.h +@@ -85,7 +85,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) + + struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); + int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc); ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc); + + struct page *dax_layout_busy_page(struct address_space *mapping); + bool 
dax_lock_mapping_entry(struct page *page); +@@ -117,7 +118,8 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping) + } + + static inline int dax_writeback_mapping_range(struct address_space *mapping, +- struct block_device *bdev, struct writeback_control *wbc) ++ struct block_device *bdev, struct dax_device *dax_dev, ++ struct writeback_control *wbc) + { + return -EOPNOTSUPP; + } +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7b6084854bfe..1c5ef6bf46e5 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2991,6 +2991,8 @@ extern void block_sync_page(struct page *page); + /* fs/splice.c */ + extern ssize_t generic_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); ++extern ssize_t default_file_splice_read(struct file *, loff_t *, ++ struct pipe_inode_info *, size_t, unsigned int); + extern ssize_t iter_file_splice_write(struct pipe_inode_info *, + struct file *, loff_t *, size_t, unsigned int); + extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, +diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h +index 32baf8e26735..8f85d1d8a895 100644 +--- a/include/linux/virtio_config.h ++++ b/include/linux/virtio_config.h +@@ -10,6 +10,11 @@ + + struct irq_affinity; + ++struct virtio_shm_region { ++ u64 addr; ++ u64 len; ++}; ++ + /** + * virtio_config_ops - operations for configuring a virtio device + * @get: read the value of a configuration field +@@ -60,6 +65,7 @@ struct irq_affinity; + * the caller can then copy. + * @set_vq_affinity: set the affinity for a virtqueue. + * @get_vq_affinity: get the affinity for a virtqueue (optional). ++ * @get_shm_region: get a shared memory region based on the index. + */ + typedef void vq_callback_t(struct virtqueue *); + struct virtio_config_ops { +@@ -83,6 +89,8 @@ struct virtio_config_ops { + const struct cpumask *cpu_mask); + const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev, + int index); ++ bool (*get_shm_region)(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id); + }; + + /* If driver didn't advertise the feature, it will never appear. 
*/ +@@ -245,6 +253,15 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) + return 0; + } + ++static inline ++bool virtio_get_shm_region(struct virtio_device *vdev, ++ struct virtio_shm_region *region, u8 id) ++{ ++ if (!vdev->config->get_shm_region) ++ return false; ++ return vdev->config->get_shm_region(vdev, region, id); ++} ++ + static inline bool virtio_is_little_endian(struct virtio_device *vdev) + { + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) || +diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h +index 92fa24c24c92..dbc5013ad747 100644 +--- a/include/uapi/linux/fuse.h ++++ b/include/uapi/linux/fuse.h +@@ -381,6 +381,8 @@ enum fuse_opcode { + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + FUSE_LSEEK = 46, ++ FUSE_SETUPMAPPING = 48, ++ FUSE_REMOVEMAPPING = 49, + + /* CUSE specific operations */ + CUSE_INIT = 4096, +@@ -792,4 +794,36 @@ struct fuse_lseek_out { + uint64_t offset; + }; + ++#define FUSE_SETUPMAPPING_ENTRIES 8 ++#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) ++#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) ++struct fuse_setupmapping_in { ++ /* An already open handle */ ++ uint64_t fh; ++ /* Offset into the file to start the mapping */ ++ uint64_t foffset; ++ /* Length of mapping required */ ++ uint64_t len; ++ /* Flags, FUSE_SETUPMAPPING_FLAG_* */ ++ uint64_t flags; ++ /* Offset in Memory Window */ ++ uint64_t moffset; ++}; ++ ++struct fuse_setupmapping_out { ++ /* Offsets into the cache of mappings */ ++ uint64_t coffset[FUSE_SETUPMAPPING_ENTRIES]; ++ /* Lengths of each mapping */ ++ uint64_t len[FUSE_SETUPMAPPING_ENTRIES]; ++}; ++ ++struct fuse_removemapping_in { ++ /* An already open handle */ ++ uint64_t fh; ++ /* Offset into the dax window start the unmapping */ ++ uint64_t moffset; ++ /* Length of mapping required */ ++ uint64_t len; ++}; ++ + #endif /* _LINUX_FUSE_H */ +diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h +new file mode 100644 +index 000000000000..d4bb549568eb +--- /dev/null ++++ b/include/uapi/linux/virtio_fs.h +@@ -0,0 +1,44 @@ ++#ifndef _UAPI_LINUX_VIRTIO_FS_H ++#define _UAPI_LINUX_VIRTIO_FS_H ++/* This header is BSD licensed so anyone can use the definitions to implement ++ * compatible drivers/servers. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of IBM nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. 
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. */ ++#include ++#include ++#include ++#include ++ ++struct virtio_fs_config { ++ /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */ ++ __u8 tag[36]; ++ ++ /* Number of request queues */ ++ __u32 num_queues; ++} __attribute__((packed)); ++ ++/* For the id field in virtio_pci_shm_cap */ ++#define VIRTIO_FS_SHMCAP_ID_CACHE 0 ++ ++#endif /* _UAPI_LINUX_VIRTIO_FS_H */ +diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h +index 6d5c3b2d4f4d..884b0e2734bb 100644 +--- a/include/uapi/linux/virtio_ids.h ++++ b/include/uapi/linux/virtio_ids.h +@@ -43,5 +43,6 @@ + #define VIRTIO_ID_INPUT 18 /* virtio input */ + #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ + #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ ++#define VIRTIO_ID_FS 26 /* virtio filesystem */ + + #endif /* _LINUX_VIRTIO_IDS_H */ +diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h +index c4b09689ab64..0650f91bea6c 100644 +--- a/include/uapi/linux/virtio_mmio.h ++++ b/include/uapi/linux/virtio_mmio.h +@@ -122,6 +122,17 @@ + #define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 + #define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 + ++/* Shared memory region id */ ++#define VIRTIO_MMIO_SHM_SEL 0x0ac ++ ++/* Shared memory region length, 64 bits in two halves */ ++#define VIRTIO_MMIO_SHM_LEN_LOW 0x0b0 ++#define VIRTIO_MMIO_SHM_LEN_HIGH 0x0b4 ++ ++/* Shared memory region base address, 64 bits in two halves */ ++#define VIRTIO_MMIO_SHM_BASE_LOW 0x0b8 ++#define VIRTIO_MMIO_SHM_BASE_HIGH 0x0bc ++ + /* Configuration atomicity value */ + #define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc + +diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h +index 90007a1abcab..31841a60a4ad 100644 +--- a/include/uapi/linux/virtio_pci.h ++++ b/include/uapi/linux/virtio_pci.h +@@ -113,6 +113,8 @@ + #define VIRTIO_PCI_CAP_DEVICE_CFG 4 + /* PCI configuration access */ + #define VIRTIO_PCI_CAP_PCI_CFG 5 ++/* Additional shared memory capability */ ++#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 + + /* This is the PCI capability header: */ + struct virtio_pci_cap { +@@ -163,6 +165,14 @@ struct virtio_pci_cfg_cap { + __u8 pci_cfg_data[4]; /* Data for BAR access. */ + }; + ++/* Fields in VIRTIO_PCI_CAP_SHARED_MEMORY_CFG */ ++struct virtio_pci_shm_cap { ++ struct virtio_pci_cap cap; ++ __le32 offset_hi; /* Most sig 32 bits of offset */ ++ __le32 length_hi; /* Most sig 32 bits of length */ ++ __u8 id; /* To distinguish shm chunks */ ++}; ++ + /* Macro versions of offsets for the Old Timers! 
*/ + #define VIRTIO_PCI_CAP_VNDR 0 + #define VIRTIO_PCI_CAP_NEXT 1 +-- +2.20.1 + diff --git a/kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch b/kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch similarity index 95% rename from kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch rename to kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch index 8d595a544a..6a4c7783a3 100644 --- a/kernel/patches/4.19.x/0001-Enable-memory-hotplug-using-probe-for-arm64.patch +++ b/kernel/patches/4.19.x/0002-Enable-memory-hotplug-using-probe-for-arm64.patch @@ -1,7 +1,7 @@ -From 074a6a9d83a4e790f892ef0fc91cdabbfbf26202 Mon Sep 17 00:00:00 2001 +From 33ffc9a93a1d9e72594d5eb3e4fc583a1a2911d1 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Tue, 19 Feb 2019 01:15:32 -0500 -Subject: [PATCH] Enable memory-hotplug using probe for arm64 +Subject: [PATCH 2/5] Enable memory-hotplug using probe for arm64 --- arch/arm64/Kconfig | 7 +++++++ @@ -94,5 +94,5 @@ index 146c04ceaa51..d276bd4d38b5 100644 + return 0; +} -- -2.17.1 +2.20.1 diff --git a/kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch b/kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch similarity index 80% rename from kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch rename to kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch index f2ada7eb85..86e587503e 100644 --- a/kernel/patches/4.19.x/0001-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch +++ b/kernel/patches/4.19.x/0003-NO-UPSTREAM-9P-always-use-cached-inode-to-fill-in-v9.patch @@ -1,7 +1,7 @@ -From 0a235af3130a0c40fe2198f18198c7ac4e799a03 Mon Sep 17 00:00:00 2001 +From cab495651e8f71c39e87a08abbe051916110b3ca Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 18 Sep 2017 11:46:59 -0500 -Subject: [PATCH 2/3] NO-UPSTREAM: 9P: always use cached inode to fill in +Subject: [PATCH 3/5] NO-UPSTREAM: 9P: always use cached inode to fill in v9fs_vfs_getattr So that if in cache=none mode, we don't have to lookup server that @@ -17,10 +17,10 @@ Signed-off-by: Peng Tao 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c -index bdabb27..30395e0 100644 +index 85ff859d3af5..efdc2a8f37bb 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c -@@ -1068,7 +1068,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, +@@ -1080,7 +1080,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); @@ -30,10 +30,10 @@ index bdabb27..30395e0 100644 return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c -index 7f6ae21..5d7e970 100644 +index 4823e1c46999..daa5e6a41864 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c -@@ -481,7 +481,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, +@@ -480,7 +480,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); @@ -43,5 +43,5 @@ index 7f6ae21..5d7e970 100644 return 0; } -- -2.9.5 +2.20.1 diff --git a/kernel/patches/4.19.x/0002-Compile-in-evged-always.patch b/kernel/patches/4.19.x/0004-Compile-in-evged-always.patch similarity index 87% rename from kernel/patches/4.19.x/0002-Compile-in-evged-always.patch rename to 
kernel/patches/4.19.x/0004-Compile-in-evged-always.patch index fbcabe67c7..c211adbe72 100644 --- a/kernel/patches/4.19.x/0002-Compile-in-evged-always.patch +++ b/kernel/patches/4.19.x/0004-Compile-in-evged-always.patch @@ -1,7 +1,7 @@ -From e35cb54fb8d07dd80fa8df44ff0de6eb5ff8d6cf Mon Sep 17 00:00:00 2001 +From d78297bf9d8e41711bddc6003f460e815340a214 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 10 Aug 2018 13:22:08 +0000 -Subject: [PATCH 108/108] Compile in evged always +Subject: [PATCH 4/5] Compile in evged always We need evged for NEMU (and in general for hw reduced) @@ -25,5 +25,5 @@ index 6d59aa109a91..97f2fbbd5014 100644 acpi-y += property.o acpi-$(CONFIG_X86) += acpi_cmos_rtc.o -- -2.18.0 +2.20.1 diff --git a/kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch b/kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch similarity index 99% rename from kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch rename to kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch index 17564efd2d..c51dd09425 100644 --- a/kernel/patches/4.19.x/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch +++ b/kernel/patches/4.19.x/0005-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch @@ -1,8 +1,8 @@ -From 60a4fed76e63c36cd327c4b404ec163e93a4805e Mon Sep 17 00:00:00 2001 +From 6823b343a7c5f6fc3b93d4a00e919d14cb6a4adb Mon Sep 17 00:00:00 2001 From: Penny Zheng Date: Tue, 19 Feb 2019 16:05:44 +0800 -Subject: [PATCH] arm64: backport Arm64 KVM Dynamic IPA and 52bit IPA support - to 4.19.X +Subject: [PATCH 5/5] arm64: backport Arm64 KVM Dynamic IPA and 52bit IPA + support to 4.19.X This patch is based on Suzuki K Poulose's [v6,00/18] kvm: arm64: Dynamic IPA and 52bit IPA @@ -258,10 +258,10 @@ index 460d616bb2d6..f6a7ea805232 100644 #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 1b1a0e95c751..f9162da575a9 100644 +index 881bea194d53..d77da7a56eb5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig -@@ -1132,6 +1132,19 @@ config ARM64_RAS_EXTN +@@ -1139,6 +1139,19 @@ config ARM64_RAS_EXTN and access the new registers if the system supports the extension. Platform RAS features may additionally depend on firmware support. @@ -2273,5 +2273,5 @@ index a2a175b08b17..b3d1f0985117 100644 } -- -2.17.1 +2.20.1