
Merge tag 'libnvdimm-for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull dax and libnvdimm updates from Dan Williams:
"The bulk of this is a rework of the dax_operations API after
discovering the obstacles it posed to the work-in-progress DAX+reflink
support for XFS and other copy-on-write filesystem mechanics.

Primarily the need to plumb a block_device through the API to handle
partition offsets was a sticking point and Christoph untangled that
dependency in addition to other cleanups to make landing the
DAX+reflink support easier.

The DAX_PMEM_COMPAT option has been around for 4 years and not only
are distributions shipping userspace that understands the current
configuration API, but some are not even bothering to turn this option
on anymore, so it seems a good time to remove it per the deprecation
schedule. Recall that this was added after the device-dax subsystem
moved from /sys/class/dax to /sys/bus/dax for its sysfs organization.
All recent functionality depends on /sys/bus/dax.

Some other miscellaneous cleanups and reflink prep patches are
included as well.

Summary:

- Simplify the dax_operations API:

    - Eliminate bdev_dax_pgoff() in favor of the filesystem
      maintaining and applying a partition offset to all its DAX iomap
      operations.

    - Remove wrappers and device-mapper stacked callbacks for
      ->copy_from_iter() and ->copy_to_iter() in favor of moving
      block_device relative offset responsibility to the
      dax_direct_access() caller.

    - Remove the need for an @bdev in filesystem-DAX infrastructure

    - Remove unused uio helpers copy_from_iter_flushcache() and
      copy_mc_to_iter() as only the non-check_copy_size() versions are
      used for DAX.

- Prepare XFS for the pending (next merge window) DAX+reflink support

- Remove deprecated DEV_DAX_PMEM_COMPAT support

- Cleanup a straggling misuse of the GUID api"
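
The effect of the dax_operations rework on filesystem callers is easiest to see as code. The sketch below is illustrative only: the example_* names are invented and it assumes a filesystem whose file offsets map 1:1 onto device offsets, but the fs_dax_get_by_bdev() and dax_direct_access() calls follow the signatures introduced in the patches below, with the filesystem capturing the partition offset once at mount instead of calling bdev_dax_pgoff() per operation.

#include <linux/blkdev.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>

/* hypothetical per-superblock state; only dax_dev and dax_part_off matter */
struct example_sb_info {
	struct dax_device *dax_dev;
	u64 dax_part_off;	/* byte offset of this partition on the dax device */
};

static int example_mount_dax(struct example_sb_info *sbi, struct block_device *bdev)
{
	/* one-time lookup replaces the old per-I/O bdev_dax_pgoff() calls */
	sbi->dax_dev = fs_dax_get_by_bdev(bdev, &sbi->dax_part_off);
	return sbi->dax_dev ? 0 : -EOPNOTSUPP;
}

static long example_direct_access(struct example_sb_info *sbi, loff_t pos,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	/*
	 * The caller, not the dax core, now accounts for the partition
	 * offset; assume for illustration a 1:1 file-to-device mapping.
	 */
	pgoff_t pgoff = PHYS_PFN(sbi->dax_part_off + (pos & PAGE_MASK));

	return dax_direct_access(sbi->dax_dev, pgoff, nr_pages, kaddr, pfn);
}

The same partition offset is what fs_dax_get_by_bdev() checks for page alignment, replacing the per-call alignment checks that used to live in bdev_dax_pgoff().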

* tag 'libnvdimm-for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (38 commits)
iomap: Fix error handling in iomap_zero_iter()
ACPI: NFIT: Import GUID before use
dax: remove the copy_from_iter and copy_to_iter methods
dax: remove the DAXDEV_F_SYNC flag
dax: simplify dax_synchronous and set_dax_synchronous
uio: remove copy_from_iter_flushcache() and copy_mc_to_iter()
iomap: turn the byte variable in iomap_zero_iter into a ssize_t
memremap: remove support for external pgmap refcounts
fsdax: don't require CONFIG_BLOCK
iomap: build the block based code conditionally
dax: fix up some of the block device related ifdefs
fsdax: shift partition offset handling into the file systems
dax: return the partition offset from fs_dax_get_by_bdev
iomap: add a IOMAP_DAX flag
xfs: pass the mapping flags to xfs_bmbt_to_iomap
xfs: use xfs_direct_write_iomap_ops for DAX zeroing
xfs: move dax device handling into xfs_{alloc,free}_buftarg
ext4: cleanup the dax handling in ext4_fill_super
ext2: cleanup the dax handling in ext2_fill_super
fsdax: decouple zeroing from the iomap buffered I/O code
...

+567 -1186
-22
Documentation/ABI/obsolete/sysfs-class-dax
···
-What:		/sys/class/dax/
-Date:		May, 2016
-KernelVersion:	v4.7
-Contact:	nvdimm@lists.linux.dev
-Description:	Device DAX is the device-centric analogue of Filesystem
-		DAX (CONFIG_FS_DAX). It allows memory ranges to be
-		allocated and mapped without need of an intervening file
-		system. Device DAX is strict, precise and predictable.
-		Specifically this interface:
-
-		1. Guarantees fault granularity with respect to a given
-		   page size (pte, pmd, or pud) set at configuration time.
-
-		2. Enforces deterministic behavior by being strict about
-		   what fault scenarios are supported.
-
-		The /sys/class/dax/ interface enumerates all the
-		device-dax instances in the system. The ABI is
-		deprecated and will be removed after 2020. It is
-		replaced with the DAX bus interface /sys/bus/dax/ where
-		device-dax instances can be found under
-		/sys/bus/dax/devices/
+3 -1
drivers/acpi/nfit/core.c
···
 int nfit_spa_type(struct acpi_nfit_system_address *spa)
 {
+	guid_t guid;
 	int i;
 
+	import_guid(&guid, spa->range_guid);
 	for (i = 0; i < NFIT_UUID_MAX; i++)
-		if (guid_equal(to_nfit_uuid(i), (guid_t *)&spa->range_guid))
+		if (guid_equal(to_nfit_uuid(i), &guid))
 			return i;
 	return -1;
 }
-13
drivers/dax/Kconfig
···
 # SPDX-License-Identifier: GPL-2.0-only
-config DAX_DRIVER
-	select DAX
-	bool
-
 menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
 	select SRCU
···
 	  device_dax driver and bound to this kmem driver on each boot.
 
 	  Say N if unsure.
-
-config DEV_DAX_PMEM_COMPAT
-	tristate "PMEM DAX: support the deprecated /sys/class/dax interface"
-	depends on m && DEV_DAX_PMEM=m
-	default DEV_DAX_PMEM
-	help
-	  Older versions of the libdaxctl library expect to find all
-	  device-dax instances under /sys/class/dax. If libdaxctl in
-	  your distribution is older than v58 say M, otherwise say N.
 
 endif
+2 -1
drivers/dax/Makefile
···
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 dax-y := super.o
 dax-y += bus.o
 device_dax-y := device.o
+dax_pmem-y := pmem.o
 
-obj-y += pmem/
 obj-y += hmem/
+8 -22
drivers/dax/bus.c
··· 10 10 #include "dax-private.h" 11 11 #include "bus.h" 12 12 13 - static struct class *dax_class; 14 - 15 13 static DEFINE_MUTEX(dax_bus_lock); 16 14 17 15 #define DAX_NAME_LEN 30 ··· 1321 1323 } 1322 1324 1323 1325 /* 1324 - * No 'host' or dax_operations since there is no access to this 1325 - * device outside of mmap of the resulting character device. 1326 + * No dax_operations since there is no access to this device outside of 1327 + * mmap of the resulting character device. 1326 1328 */ 1327 - dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); 1329 + dax_dev = alloc_dax(dev_dax, NULL); 1328 1330 if (IS_ERR(dax_dev)) { 1329 1331 rc = PTR_ERR(dax_dev); 1330 1332 goto err_alloc_dax; 1331 1333 } 1334 + set_dax_synchronous(dax_dev); 1335 + set_dax_nocache(dax_dev); 1336 + set_dax_nomc(dax_dev); 1332 1337 1333 1338 /* a device_dax instance is dead while the driver is not attached */ 1334 1339 kill_dax(dax_dev); ··· 1344 1343 1345 1344 inode = dax_inode(dax_dev); 1346 1345 dev->devt = inode->i_rdev; 1347 - if (data->subsys == DEV_DAX_BUS) 1348 - dev->bus = &dax_bus_type; 1349 - else 1350 - dev->class = dax_class; 1346 + dev->bus = &dax_bus_type; 1351 1347 dev->parent = parent; 1352 1348 dev->type = &dev_dax_type; 1353 1349 ··· 1443 1445 1444 1446 int __init dax_bus_init(void) 1445 1447 { 1446 - int rc; 1447 - 1448 - if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { 1449 - dax_class = class_create(THIS_MODULE, "dax"); 1450 - if (IS_ERR(dax_class)) 1451 - return PTR_ERR(dax_class); 1452 - } 1453 - 1454 - rc = bus_register(&dax_bus_type); 1455 - if (rc) 1456 - class_destroy(dax_class); 1457 - return rc; 1448 + return bus_register(&dax_bus_type); 1458 1449 } 1459 1450 1460 1451 void __exit dax_bus_exit(void) 1461 1452 { 1462 1453 bus_unregister(&dax_bus_type); 1463 - class_destroy(dax_class); 1464 1454 }
-13
drivers/dax/bus.h
··· 16 16 struct range *range, int target_node, unsigned int align, 17 17 unsigned long flags); 18 18 19 - enum dev_dax_subsys { 20 - DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ 21 - DEV_DAX_CLASS, 22 - }; 23 - 24 19 struct dev_dax_data { 25 20 struct dax_region *dax_region; 26 21 struct dev_pagemap *pgmap; 27 - enum dev_dax_subsys subsys; 28 22 resource_size_t size; 29 23 int id; 30 24 }; 31 25 32 26 struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); 33 - 34 - /* to be deleted when DEV_DAX_CLASS is removed */ 35 - struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); 36 27 37 28 struct dax_device_driver { 38 29 struct device_driver drv; ··· 39 48 __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) 40 49 void dax_driver_unregister(struct dax_device_driver *dax_drv); 41 50 void kill_dev_dax(struct dev_dax *dev_dax); 42 - 43 - #if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) 44 - int dev_dax_probe(struct dev_dax *dev_dax); 45 - #endif 46 51 47 52 /* 48 53 * While run_dax() is potentially a generic operation that could be
+1 -5
drivers/dax/device.c
···
 	inode = dax_inode(dax_dev);
 	cdev = inode->i_cdev;
 	cdev_init(cdev, &dax_fops);
-	if (dev->class) {
-		/* for the CONFIG_DEV_DAX_PMEM_COMPAT case */
-		cdev->owner = dev->parent->driver->owner;
-	} else
-		cdev->owner = dev->driver->owner;
+	cdev->owner = dev->driver->owner;
 	cdev_set_parent(cdev, &dev->kobj);
 	rc = cdev_add(cdev, dev->devt, 1);
 	if (rc)
-1
drivers/dax/pmem/Makefile
···
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o
-obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o
 
 dax_pmem-y := pmem.o
 dax_pmem_core-y := core.o
-72
drivers/dax/pmem/compat.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ 3 - #include <linux/percpu-refcount.h> 4 - #include <linux/memremap.h> 5 - #include <linux/module.h> 6 - #include <linux/pfn_t.h> 7 - #include <linux/nd.h> 8 - #include "../bus.h" 9 - 10 - /* we need the private definitions to implement compat suport */ 11 - #include "../dax-private.h" 12 - 13 - static int dax_pmem_compat_probe(struct device *dev) 14 - { 15 - struct dev_dax *dev_dax = __dax_pmem_probe(dev, DEV_DAX_CLASS); 16 - int rc; 17 - 18 - if (IS_ERR(dev_dax)) 19 - return PTR_ERR(dev_dax); 20 - 21 - if (!devres_open_group(&dev_dax->dev, dev_dax, GFP_KERNEL)) 22 - return -ENOMEM; 23 - 24 - device_lock(&dev_dax->dev); 25 - rc = dev_dax_probe(dev_dax); 26 - device_unlock(&dev_dax->dev); 27 - 28 - devres_close_group(&dev_dax->dev, dev_dax); 29 - if (rc) 30 - devres_release_group(&dev_dax->dev, dev_dax); 31 - 32 - return rc; 33 - } 34 - 35 - static int dax_pmem_compat_release(struct device *dev, void *data) 36 - { 37 - device_lock(dev); 38 - devres_release_group(dev, to_dev_dax(dev)); 39 - device_unlock(dev); 40 - 41 - return 0; 42 - } 43 - 44 - static void dax_pmem_compat_remove(struct device *dev) 45 - { 46 - device_for_each_child(dev, NULL, dax_pmem_compat_release); 47 - } 48 - 49 - static struct nd_device_driver dax_pmem_compat_driver = { 50 - .probe = dax_pmem_compat_probe, 51 - .remove = dax_pmem_compat_remove, 52 - .drv = { 53 - .name = "dax_pmem_compat", 54 - }, 55 - .type = ND_DRIVER_DAX_PMEM, 56 - }; 57 - 58 - static int __init dax_pmem_compat_init(void) 59 - { 60 - return nd_driver_register(&dax_pmem_compat_driver); 61 - } 62 - module_init(dax_pmem_compat_init); 63 - 64 - static void __exit dax_pmem_compat_exit(void) 65 - { 66 - driver_unregister(&dax_pmem_compat_driver.drv); 67 - } 68 - module_exit(dax_pmem_compat_exit); 69 - 70 - MODULE_LICENSE("GPL v2"); 71 - MODULE_AUTHOR("Intel Corporation"); 72 - MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
+30 -6
drivers/dax/pmem/core.c → drivers/dax/pmem.c
··· 3 3 #include <linux/memremap.h> 4 4 #include <linux/module.h> 5 5 #include <linux/pfn_t.h> 6 - #include "../../nvdimm/pfn.h" 7 - #include "../../nvdimm/nd.h" 8 - #include "../bus.h" 6 + #include "../nvdimm/pfn.h" 7 + #include "../nvdimm/nd.h" 8 + #include "bus.h" 9 9 10 - struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) 10 + static struct dev_dax *__dax_pmem_probe(struct device *dev) 11 11 { 12 12 struct range range; 13 13 int rc, id, region_id; ··· 63 63 .dax_region = dax_region, 64 64 .id = id, 65 65 .pgmap = &pgmap, 66 - .subsys = subsys, 67 66 .size = range_len(&range), 68 67 }; 69 68 dev_dax = devm_create_dev_dax(&data); ··· 72 73 73 74 return dev_dax; 74 75 } 75 - EXPORT_SYMBOL_GPL(__dax_pmem_probe); 76 + 77 + static int dax_pmem_probe(struct device *dev) 78 + { 79 + return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev)); 80 + } 81 + 82 + static struct nd_device_driver dax_pmem_driver = { 83 + .probe = dax_pmem_probe, 84 + .drv = { 85 + .name = "dax_pmem", 86 + }, 87 + .type = ND_DRIVER_DAX_PMEM, 88 + }; 89 + 90 + static int __init dax_pmem_init(void) 91 + { 92 + return nd_driver_register(&dax_pmem_driver); 93 + } 94 + module_init(dax_pmem_init); 95 + 96 + static void __exit dax_pmem_exit(void) 97 + { 98 + driver_unregister(&dax_pmem_driver.drv); 99 + } 100 + module_exit(dax_pmem_exit); 76 101 77 102 MODULE_LICENSE("GPL v2"); 78 103 MODULE_AUTHOR("Intel Corporation"); 104 + MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
-30
drivers/dax/pmem/pmem.c
··· 7 7 #include <linux/nd.h> 8 8 #include "../bus.h" 9 9 10 - static int dax_pmem_probe(struct device *dev) 11 - { 12 - return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev, DEV_DAX_BUS)); 13 - } 14 10 15 - static struct nd_device_driver dax_pmem_driver = { 16 - .probe = dax_pmem_probe, 17 - .drv = { 18 - .name = "dax_pmem", 19 - }, 20 - .type = ND_DRIVER_DAX_PMEM, 21 - }; 22 - 23 - static int __init dax_pmem_init(void) 24 - { 25 - return nd_driver_register(&dax_pmem_driver); 26 - } 27 - module_init(dax_pmem_init); 28 - 29 - static void __exit dax_pmem_exit(void) 30 - { 31 - driver_unregister(&dax_pmem_driver.drv); 32 - } 33 - module_exit(dax_pmem_exit); 34 - 35 - MODULE_LICENSE("GPL v2"); 36 - MODULE_AUTHOR("Intel Corporation"); 37 - #if !IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) 38 - /* For compat builds, don't load this module by default */ 39 - MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); 40 - #endif
+73 -211
drivers/dax/super.c
··· 7 7 #include <linux/mount.h> 8 8 #include <linux/pseudo_fs.h> 9 9 #include <linux/magic.h> 10 - #include <linux/genhd.h> 11 10 #include <linux/pfn_t.h> 12 11 #include <linux/cdev.h> 13 - #include <linux/hash.h> 14 12 #include <linux/slab.h> 15 13 #include <linux/uio.h> 16 14 #include <linux/dax.h> ··· 19 21 * struct dax_device - anchor object for dax services 20 22 * @inode: core vfs 21 23 * @cdev: optional character interface for "device dax" 22 - * @host: optional name for lookups where the device path is not available 23 24 * @private: dax driver private data 24 25 * @flags: state and boolean properties 25 26 */ 26 27 struct dax_device { 27 - struct hlist_node list; 28 28 struct inode inode; 29 29 struct cdev cdev; 30 - const char *host; 31 30 void *private; 32 31 unsigned long flags; 33 32 const struct dax_operations *ops; ··· 36 41 static DEFINE_IDA(dax_minor_ida); 37 42 static struct kmem_cache *dax_cache __read_mostly; 38 43 static struct super_block *dax_superblock __read_mostly; 39 - 40 - #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 41 - static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 42 - static DEFINE_SPINLOCK(dax_host_lock); 43 44 44 45 int dax_read_lock(void) 45 46 { ··· 49 58 } 50 59 EXPORT_SYMBOL_GPL(dax_read_unlock); 51 60 52 - static int dax_host_hash(const char *host) 53 - { 54 - return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 55 - } 56 - 57 - #ifdef CONFIG_BLOCK 61 + #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) 58 62 #include <linux/blkdev.h> 59 63 60 - int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 61 - pgoff_t *pgoff) 64 + static DEFINE_XARRAY(dax_hosts); 65 + 66 + int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) 62 67 { 63 - sector_t start_sect = bdev ? 
get_start_sect(bdev) : 0; 64 - phys_addr_t phys_off = (start_sect + sector) * 512; 65 - 66 - if (pgoff) 67 - *pgoff = PHYS_PFN(phys_off); 68 - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 69 - return -EINVAL; 70 - return 0; 68 + return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL); 71 69 } 72 - EXPORT_SYMBOL(bdev_dax_pgoff); 70 + EXPORT_SYMBOL_GPL(dax_add_host); 73 71 74 - #if IS_ENABLED(CONFIG_FS_DAX) 72 + void dax_remove_host(struct gendisk *disk) 73 + { 74 + xa_erase(&dax_hosts, (unsigned long)disk); 75 + } 76 + EXPORT_SYMBOL_GPL(dax_remove_host); 77 + 75 78 /** 76 - * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 77 - * @host: alternate name for the device registered by a dax driver 79 + * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax 80 + * @bdev: block device to find a dax_device for 81 + * @start_off: returns the byte offset into the dax_device that @bdev starts 78 82 */ 79 - static struct dax_device *dax_get_by_host(const char *host) 83 + struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) 80 84 { 81 - struct dax_device *dax_dev, *found = NULL; 82 - int hash, id; 83 - 84 - if (!host) 85 - return NULL; 86 - 87 - hash = dax_host_hash(host); 88 - 89 - id = dax_read_lock(); 90 - spin_lock(&dax_host_lock); 91 - hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 92 - if (!dax_alive(dax_dev) 93 - || strcmp(host, dax_dev->host) != 0) 94 - continue; 95 - 96 - if (igrab(&dax_dev->inode)) 97 - found = dax_dev; 98 - break; 99 - } 100 - spin_unlock(&dax_host_lock); 101 - dax_read_unlock(id); 102 - 103 - return found; 104 - } 105 - 106 - struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) 107 - { 108 - if (!blk_queue_dax(bdev->bd_disk->queue)) 109 - return NULL; 110 - return dax_get_by_host(bdev->bd_disk->disk_name); 111 - } 112 - EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 113 - 114 - bool generic_fsdax_supported(struct dax_device *dax_dev, 115 - struct block_device *bdev, int blocksize, sector_t start, 116 - sector_t sectors) 117 - { 118 - bool dax_enabled = false; 119 - pgoff_t pgoff, pgoff_end; 120 - void *kaddr, *end_kaddr; 121 - pfn_t pfn, end_pfn; 122 - sector_t last_page; 123 - long len, len2; 124 - int err, id; 125 - 126 - if (blocksize != PAGE_SIZE) { 127 - pr_info("%pg: error: unsupported blocksize for dax\n", bdev); 128 - return false; 129 - } 130 - 131 - if (!dax_dev) { 132 - pr_debug("%pg: error: dax unsupported by block device\n", bdev); 133 - return false; 134 - } 135 - 136 - err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); 137 - if (err) { 138 - pr_info("%pg: error: unaligned partition for dax\n", bdev); 139 - return false; 140 - } 141 - 142 - last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; 143 - err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); 144 - if (err) { 145 - pr_info("%pg: error: unaligned partition for dax\n", bdev); 146 - return false; 147 - } 148 - 149 - id = dax_read_lock(); 150 - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 151 - len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); 152 - 153 - if (len < 1 || len2 < 1) { 154 - pr_info("%pg: error: dax access failed (%ld)\n", 155 - bdev, len < 1 ? 
len : len2); 156 - dax_read_unlock(id); 157 - return false; 158 - } 159 - 160 - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { 161 - /* 162 - * An arch that has enabled the pmem api should also 163 - * have its drivers support pfn_t_devmap() 164 - * 165 - * This is a developer warning and should not trigger in 166 - * production. dax_flush() will crash since it depends 167 - * on being able to do (page_address(pfn_to_page())). 168 - */ 169 - WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); 170 - dax_enabled = true; 171 - } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { 172 - struct dev_pagemap *pgmap, *end_pgmap; 173 - 174 - pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); 175 - end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); 176 - if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX 177 - && pfn_t_to_page(pfn)->pgmap == pgmap 178 - && pfn_t_to_page(end_pfn)->pgmap == pgmap 179 - && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) 180 - && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) 181 - dax_enabled = true; 182 - put_dev_pagemap(pgmap); 183 - put_dev_pagemap(end_pgmap); 184 - 185 - } 186 - dax_read_unlock(id); 187 - 188 - if (!dax_enabled) { 189 - pr_info("%pg: error: dax support not enabled\n", bdev); 190 - return false; 191 - } 192 - return true; 193 - } 194 - EXPORT_SYMBOL_GPL(generic_fsdax_supported); 195 - 196 - bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 197 - int blocksize, sector_t start, sector_t len) 198 - { 199 - bool ret = false; 85 + struct dax_device *dax_dev; 86 + u64 part_size; 200 87 int id; 201 88 202 - if (!dax_dev) 203 - return false; 89 + if (!blk_queue_dax(bdev->bd_disk->queue)) 90 + return NULL; 91 + 92 + *start_off = get_start_sect(bdev) * SECTOR_SIZE; 93 + part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE; 94 + if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) { 95 + pr_info("%pg: error: unaligned partition for dax\n", bdev); 96 + return NULL; 97 + } 204 98 205 99 id = dax_read_lock(); 206 - if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) 207 - ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, 208 - start, len); 100 + dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); 101 + if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) 102 + dax_dev = NULL; 209 103 dax_read_unlock(id); 210 - return ret; 104 + 105 + return dax_dev; 211 106 } 212 - EXPORT_SYMBOL_GPL(dax_supported); 213 - #endif /* CONFIG_FS_DAX */ 214 - #endif /* CONFIG_BLOCK */ 107 + EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 108 + #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ 215 109 216 110 enum dax_device_flags { 217 111 /* !alive + rcu grace period == no new operations / mappings */ ··· 105 229 DAXDEV_WRITE_CACHE, 106 230 /* flag to check if device supports synchronous flush */ 107 231 DAXDEV_SYNC, 232 + /* do not leave the caches dirty after writes */ 233 + DAXDEV_NOCACHE, 234 + /* handle CPU fetch exceptions during reads */ 235 + DAXDEV_NOMC, 108 236 }; 109 237 110 238 /** ··· 150 270 if (!dax_alive(dax_dev)) 151 271 return 0; 152 272 153 - return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); 273 + /* 274 + * The userspace address for the memory copy has already been validated 275 + * via access_ok() in vfs_write, so use the 'no check' version to bypass 276 + * the HARDENED_USERCOPY overhead. 
277 + */ 278 + if (test_bit(DAXDEV_NOCACHE, &dax_dev->flags)) 279 + return _copy_from_iter_flushcache(addr, bytes, i); 280 + return _copy_from_iter(addr, bytes, i); 154 281 } 155 - EXPORT_SYMBOL_GPL(dax_copy_from_iter); 156 282 157 283 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 158 284 size_t bytes, struct iov_iter *i) ··· 166 280 if (!dax_alive(dax_dev)) 167 281 return 0; 168 282 169 - return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); 283 + /* 284 + * The userspace address for the memory copy has already been validated 285 + * via access_ok() in vfs_red, so use the 'no check' version to bypass 286 + * the HARDENED_USERCOPY overhead. 287 + */ 288 + if (test_bit(DAXDEV_NOMC, &dax_dev->flags)) 289 + return _copy_mc_to_iter(addr, bytes, i); 290 + return _copy_to_iter(addr, bytes, i); 170 291 } 171 - EXPORT_SYMBOL_GPL(dax_copy_to_iter); 172 292 173 293 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 174 294 size_t nr_pages) ··· 224 332 } 225 333 EXPORT_SYMBOL_GPL(dax_write_cache_enabled); 226 334 227 - bool __dax_synchronous(struct dax_device *dax_dev) 335 + bool dax_synchronous(struct dax_device *dax_dev) 228 336 { 229 337 return test_bit(DAXDEV_SYNC, &dax_dev->flags); 230 338 } 231 - EXPORT_SYMBOL_GPL(__dax_synchronous); 339 + EXPORT_SYMBOL_GPL(dax_synchronous); 232 340 233 - void __set_dax_synchronous(struct dax_device *dax_dev) 341 + void set_dax_synchronous(struct dax_device *dax_dev) 234 342 { 235 343 set_bit(DAXDEV_SYNC, &dax_dev->flags); 236 344 } 237 - EXPORT_SYMBOL_GPL(__set_dax_synchronous); 345 + EXPORT_SYMBOL_GPL(set_dax_synchronous); 346 + 347 + void set_dax_nocache(struct dax_device *dax_dev) 348 + { 349 + set_bit(DAXDEV_NOCACHE, &dax_dev->flags); 350 + } 351 + EXPORT_SYMBOL_GPL(set_dax_nocache); 352 + 353 + void set_dax_nomc(struct dax_device *dax_dev) 354 + { 355 + set_bit(DAXDEV_NOMC, &dax_dev->flags); 356 + } 357 + EXPORT_SYMBOL_GPL(set_dax_nomc); 238 358 239 359 bool dax_alive(struct dax_device *dax_dev) 240 360 { ··· 267 363 return; 268 364 269 365 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 270 - 271 366 synchronize_srcu(&dax_srcu); 272 - 273 - spin_lock(&dax_host_lock); 274 - hlist_del_init(&dax_dev->list); 275 - spin_unlock(&dax_host_lock); 276 367 } 277 368 EXPORT_SYMBOL_GPL(kill_dax); 278 369 ··· 299 400 static void dax_free_inode(struct inode *inode) 300 401 { 301 402 struct dax_device *dax_dev = to_dax_dev(inode); 302 - kfree(dax_dev->host); 303 - dax_dev->host = NULL; 304 403 if (inode->i_rdev) 305 404 ida_simple_remove(&dax_minor_ida, iminor(inode)); 306 405 kmem_cache_free(dax_cache, dax_dev); ··· 373 476 return dax_dev; 374 477 } 375 478 376 - static void dax_add_host(struct dax_device *dax_dev, const char *host) 377 - { 378 - int hash; 379 - 380 - /* 381 - * Unconditionally init dax_dev since it's coming from a 382 - * non-zeroed slab cache 383 - */ 384 - INIT_HLIST_NODE(&dax_dev->list); 385 - dax_dev->host = host; 386 - if (!host) 387 - return; 388 - 389 - hash = dax_host_hash(host); 390 - spin_lock(&dax_host_lock); 391 - hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 392 - spin_unlock(&dax_host_lock); 393 - } 394 - 395 - struct dax_device *alloc_dax(void *private, const char *__host, 396 - const struct dax_operations *ops, unsigned long flags) 479 + struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) 397 480 { 398 481 struct dax_device *dax_dev; 399 - const char *host; 400 482 dev_t devt; 401 483 int minor; 402 484 403 - if (ops && !ops->zero_page_range) { 
404 - pr_debug("%s: error: device does not provide dax" 405 - " operation zero_page_range()\n", 406 - __host ? __host : "Unknown"); 485 + if (WARN_ON_ONCE(ops && !ops->zero_page_range)) 407 486 return ERR_PTR(-EINVAL); 408 - } 409 - 410 - host = kstrdup(__host, GFP_KERNEL); 411 - if (__host && !host) 412 - return ERR_PTR(-ENOMEM); 413 487 414 488 minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); 415 489 if (minor < 0) 416 - goto err_minor; 490 + return ERR_PTR(-ENOMEM); 417 491 418 492 devt = MKDEV(MAJOR(dax_devt), minor); 419 493 dax_dev = dax_dev_get(devt); 420 494 if (!dax_dev) 421 495 goto err_dev; 422 496 423 - dax_add_host(dax_dev, host); 424 497 dax_dev->ops = ops; 425 498 dax_dev->private = private; 426 - if (flags & DAXDEV_F_SYNC) 427 - set_dax_synchronous(dax_dev); 428 - 429 499 return dax_dev; 430 500 431 501 err_dev: 432 502 ida_simple_remove(&dax_minor_ida, minor); 433 - err_minor: 434 - kfree(host); 435 503 return ERR_PTR(-ENOMEM); 436 504 } 437 505 EXPORT_SYMBOL_GPL(alloc_dax);
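
The super.c changes above replace the name-hash based dax_host_list lookup with an xarray keyed by the gendisk, and turn the cache, machine-check and synchronous behavior into flags the driver sets explicitly. Below is a rough, driver-agnostic sketch of the resulting registration flow; the my_* names are placeholders and the dax_operations would be filled in by a real driver, but the dax_* calls are the ones added or changed in this hunk.

#include <linux/blkdev.h>
#include <linux/dax.h>

/* hypothetical driver state; only dax_dev and sync_capable matter here */
struct my_device {
	struct dax_device *dax_dev;
	bool sync_capable;
};

/* a real driver must supply ->direct_access and ->zero_page_range here;
 * alloc_dax() rejects non-NULL ops without ->zero_page_range */
static const struct dax_operations my_dax_ops;

static int my_enable_dax(struct my_device *dev, struct gendisk *disk)
{
	struct dax_device *dax_dev;
	int rc;

	/* alloc_dax() no longer takes a host name or a flags argument */
	dax_dev = alloc_dax(dev, &my_dax_ops);
	if (IS_ERR(dax_dev))
		return PTR_ERR(dax_dev);

	/* behavior is selected with explicit setters instead of DAXDEV_F_SYNC */
	set_dax_nocache(dax_dev);
	set_dax_nomc(dax_dev);
	if (dev->sync_capable)
		set_dax_synchronous(dax_dev);

	/* filesystems now find the dax_device by gendisk, not by name */
	rc = dax_add_host(dax_dev, disk);
	if (rc) {
		kill_dax(dax_dev);
		put_dax(dax_dev);
		return rc;
	}

	dev->dax_dev = dax_dev;
	return 0;
}
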
+12 -51
drivers/md/dm-linear.c
··· 162 162 return fn(ti, lc->dev, lc->start, ti->len, data); 163 163 } 164 164 165 - #if IS_ENABLED(CONFIG_DAX_DRIVER) 165 + #if IS_ENABLED(CONFIG_FS_DAX) 166 + static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) 167 + { 168 + struct linear_c *lc = ti->private; 169 + sector_t sector = linear_map_sector(ti, *pgoff << PAGE_SECTORS_SHIFT); 170 + 171 + *pgoff = (get_start_sect(lc->dev->bdev) + sector) >> PAGE_SECTORS_SHIFT; 172 + return lc->dev->dax_dev; 173 + } 174 + 166 175 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 167 176 long nr_pages, void **kaddr, pfn_t *pfn) 168 177 { 169 - long ret; 170 - struct linear_c *lc = ti->private; 171 - struct block_device *bdev = lc->dev->bdev; 172 - struct dax_device *dax_dev = lc->dev->dax_dev; 173 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 178 + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); 174 179 175 - dev_sector = linear_map_sector(ti, sector); 176 - ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); 177 - if (ret) 178 - return ret; 179 180 return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); 180 - } 181 - 182 - static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, 183 - void *addr, size_t bytes, struct iov_iter *i) 184 - { 185 - struct linear_c *lc = ti->private; 186 - struct block_device *bdev = lc->dev->bdev; 187 - struct dax_device *dax_dev = lc->dev->dax_dev; 188 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 189 - 190 - dev_sector = linear_map_sector(ti, sector); 191 - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 192 - return 0; 193 - return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); 194 - } 195 - 196 - static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, 197 - void *addr, size_t bytes, struct iov_iter *i) 198 - { 199 - struct linear_c *lc = ti->private; 200 - struct block_device *bdev = lc->dev->bdev; 201 - struct dax_device *dax_dev = lc->dev->dax_dev; 202 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 203 - 204 - dev_sector = linear_map_sector(ti, sector); 205 - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 206 - return 0; 207 - return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); 208 181 } 209 182 210 183 static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, 211 184 size_t nr_pages) 212 185 { 213 - int ret; 214 - struct linear_c *lc = ti->private; 215 - struct block_device *bdev = lc->dev->bdev; 216 - struct dax_device *dax_dev = lc->dev->dax_dev; 217 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 186 + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); 218 187 219 - dev_sector = linear_map_sector(ti, sector); 220 - ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff); 221 - if (ret) 222 - return ret; 223 188 return dax_zero_page_range(dax_dev, pgoff, nr_pages); 224 189 } 225 190 226 191 #else 227 192 #define linear_dax_direct_access NULL 228 - #define linear_dax_copy_from_iter NULL 229 - #define linear_dax_copy_to_iter NULL 230 193 #define linear_dax_zero_page_range NULL 231 194 #endif 232 195 ··· 207 244 .prepare_ioctl = linear_prepare_ioctl, 208 245 .iterate_devices = linear_iterate_devices, 209 246 .direct_access = linear_dax_direct_access, 210 - .dax_copy_from_iter = linear_dax_copy_from_iter, 211 - .dax_copy_to_iter = linear_dax_copy_to_iter, 212 247 .dax_zero_page_range = linear_dax_zero_page_range, 213 248 }; 214 249
+10 -98
drivers/md/dm-log-writes.c
··· 901 901 limits->io_min = limits->physical_block_size; 902 902 } 903 903 904 - #if IS_ENABLED(CONFIG_DAX_DRIVER) 905 - static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, 906 - struct iov_iter *i) 904 + #if IS_ENABLED(CONFIG_FS_DAX) 905 + static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, 906 + pgoff_t *pgoff) 907 907 { 908 - struct pending_block *block; 908 + struct log_writes_c *lc = ti->private; 909 909 910 - if (!bytes) 911 - return 0; 912 - 913 - block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); 914 - if (!block) { 915 - DMERR("Error allocating dax pending block"); 916 - return -ENOMEM; 917 - } 918 - 919 - block->data = kzalloc(bytes, GFP_KERNEL); 920 - if (!block->data) { 921 - DMERR("Error allocating dax data space"); 922 - kfree(block); 923 - return -ENOMEM; 924 - } 925 - 926 - /* write data provided via the iterator */ 927 - if (!copy_from_iter(block->data, bytes, i)) { 928 - DMERR("Error copying dax data"); 929 - kfree(block->data); 930 - kfree(block); 931 - return -EIO; 932 - } 933 - 934 - /* rewind the iterator so that the block driver can use it */ 935 - iov_iter_revert(i, bytes); 936 - 937 - block->datalen = bytes; 938 - block->sector = bio_to_dev_sectors(lc, sector); 939 - block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; 940 - 941 - atomic_inc(&lc->pending_blocks); 942 - spin_lock_irq(&lc->blocks_lock); 943 - list_add_tail(&block->list, &lc->unflushed_blocks); 944 - spin_unlock_irq(&lc->blocks_lock); 945 - wake_up_process(lc->log_kthread); 946 - 947 - return 0; 910 + *pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT); 911 + return lc->dev->dax_dev; 948 912 } 949 913 950 914 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 951 915 long nr_pages, void **kaddr, pfn_t *pfn) 952 916 { 953 - struct log_writes_c *lc = ti->private; 954 - sector_t sector = pgoff * PAGE_SECTORS; 955 - int ret; 917 + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); 956 918 957 - ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff); 958 - if (ret) 959 - return ret; 960 - return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn); 961 - } 962 - 963 - static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, 964 - pgoff_t pgoff, void *addr, size_t bytes, 965 - struct iov_iter *i) 966 - { 967 - struct log_writes_c *lc = ti->private; 968 - sector_t sector = pgoff * PAGE_SECTORS; 969 - int err; 970 - 971 - if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 972 - return 0; 973 - 974 - /* Don't bother doing anything if logging has been disabled */ 975 - if (!lc->logging_enabled) 976 - goto dax_copy; 977 - 978 - err = log_dax(lc, sector, bytes, i); 979 - if (err) { 980 - DMWARN("Error %d logging DAX write", err); 981 - return 0; 982 - } 983 - dax_copy: 984 - return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); 985 - } 986 - 987 - static size_t log_writes_dax_copy_to_iter(struct dm_target *ti, 988 - pgoff_t pgoff, void *addr, size_t bytes, 989 - struct iov_iter *i) 990 - { 991 - struct log_writes_c *lc = ti->private; 992 - sector_t sector = pgoff * PAGE_SECTORS; 993 - 994 - if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 995 - return 0; 996 - return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); 919 + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); 997 920 } 998 921 999 922 static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t 
pgoff, 1000 923 size_t nr_pages) 1001 924 { 1002 - int ret; 1003 - struct log_writes_c *lc = ti->private; 1004 - sector_t sector = pgoff * PAGE_SECTORS; 925 + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); 1005 926 1006 - ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT, 1007 - &pgoff); 1008 - if (ret) 1009 - return ret; 1010 - return dax_zero_page_range(lc->dev->dax_dev, pgoff, 1011 - nr_pages << PAGE_SHIFT); 927 + return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); 1012 928 } 1013 929 1014 930 #else 1015 931 #define log_writes_dax_direct_access NULL 1016 - #define log_writes_dax_copy_from_iter NULL 1017 - #define log_writes_dax_copy_to_iter NULL 1018 932 #define log_writes_dax_zero_page_range NULL 1019 933 #endif 1020 934 ··· 946 1032 .iterate_devices = log_writes_iterate_devices, 947 1033 .io_hints = log_writes_io_hints, 948 1034 .direct_access = log_writes_dax_direct_access, 949 - .dax_copy_from_iter = log_writes_dax_copy_from_iter, 950 - .dax_copy_to_iter = log_writes_dax_copy_to_iter, 951 1035 .dax_zero_page_range = log_writes_dax_zero_page_range, 952 1036 }; 953 1037
+18 -71
drivers/md/dm-stripe.c
··· 300 300 return DM_MAPIO_REMAPPED; 301 301 } 302 302 303 - #if IS_ENABLED(CONFIG_DAX_DRIVER) 303 + #if IS_ENABLED(CONFIG_FS_DAX) 304 + static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) 305 + { 306 + struct stripe_c *sc = ti->private; 307 + struct block_device *bdev; 308 + sector_t dev_sector; 309 + uint32_t stripe; 310 + 311 + stripe_map_sector(sc, *pgoff * PAGE_SECTORS, &stripe, &dev_sector); 312 + dev_sector += sc->stripe[stripe].physical_start; 313 + bdev = sc->stripe[stripe].dev->bdev; 314 + 315 + *pgoff = (get_start_sect(bdev) + dev_sector) >> PAGE_SECTORS_SHIFT; 316 + return sc->stripe[stripe].dev->dax_dev; 317 + } 318 + 304 319 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 305 320 long nr_pages, void **kaddr, pfn_t *pfn) 306 321 { 307 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 308 - struct stripe_c *sc = ti->private; 309 - struct dax_device *dax_dev; 310 - struct block_device *bdev; 311 - uint32_t stripe; 312 - long ret; 322 + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); 313 323 314 - stripe_map_sector(sc, sector, &stripe, &dev_sector); 315 - dev_sector += sc->stripe[stripe].physical_start; 316 - dax_dev = sc->stripe[stripe].dev->dax_dev; 317 - bdev = sc->stripe[stripe].dev->bdev; 318 - 319 - ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); 320 - if (ret) 321 - return ret; 322 324 return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); 323 - } 324 - 325 - static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, 326 - void *addr, size_t bytes, struct iov_iter *i) 327 - { 328 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 329 - struct stripe_c *sc = ti->private; 330 - struct dax_device *dax_dev; 331 - struct block_device *bdev; 332 - uint32_t stripe; 333 - 334 - stripe_map_sector(sc, sector, &stripe, &dev_sector); 335 - dev_sector += sc->stripe[stripe].physical_start; 336 - dax_dev = sc->stripe[stripe].dev->dax_dev; 337 - bdev = sc->stripe[stripe].dev->bdev; 338 - 339 - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 340 - return 0; 341 - return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); 342 - } 343 - 344 - static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff, 345 - void *addr, size_t bytes, struct iov_iter *i) 346 - { 347 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 348 - struct stripe_c *sc = ti->private; 349 - struct dax_device *dax_dev; 350 - struct block_device *bdev; 351 - uint32_t stripe; 352 - 353 - stripe_map_sector(sc, sector, &stripe, &dev_sector); 354 - dev_sector += sc->stripe[stripe].physical_start; 355 - dax_dev = sc->stripe[stripe].dev->dax_dev; 356 - bdev = sc->stripe[stripe].dev->bdev; 357 - 358 - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) 359 - return 0; 360 - return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i); 361 325 } 362 326 363 327 static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, 364 328 size_t nr_pages) 365 329 { 366 - int ret; 367 - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 368 - struct stripe_c *sc = ti->private; 369 - struct dax_device *dax_dev; 370 - struct block_device *bdev; 371 - uint32_t stripe; 330 + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); 372 331 373 - stripe_map_sector(sc, sector, &stripe, &dev_sector); 374 - dev_sector += sc->stripe[stripe].physical_start; 375 - dax_dev = sc->stripe[stripe].dev->dax_dev; 376 - bdev = sc->stripe[stripe].dev->bdev; 377 - 378 - 
ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff); 379 - if (ret) 380 - return ret; 381 332 return dax_zero_page_range(dax_dev, pgoff, nr_pages); 382 333 } 383 334 384 335 #else 385 336 #define stripe_dax_direct_access NULL 386 - #define stripe_dax_copy_from_iter NULL 387 - #define stripe_dax_copy_to_iter NULL 388 337 #define stripe_dax_zero_page_range NULL 389 338 #endif 390 339 ··· 470 521 .iterate_devices = stripe_iterate_devices, 471 522 .io_hints = stripe_io_hints, 472 523 .direct_access = stripe_dax_direct_access, 473 - .dax_copy_from_iter = stripe_dax_copy_from_iter, 474 - .dax_copy_to_iter = stripe_dax_copy_to_iter, 475 524 .dax_zero_page_range = stripe_dax_zero_page_range, 476 525 }; 477 526
+11 -11
drivers/md/dm-table.c
··· 806 806 EXPORT_SYMBOL_GPL(dm_table_set_type); 807 807 808 808 /* validate the dax capability of the target device span */ 809 - int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, 809 + static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, 810 810 sector_t start, sector_t len, void *data) 811 811 { 812 - int blocksize = *(int *) data; 812 + if (dev->dax_dev) 813 + return false; 813 814 814 - return !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); 815 + DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev); 816 + return true; 815 817 } 816 818 817 819 /* Check devices support synchronous DAX */ ··· 823 821 return !dev->dax_dev || !dax_synchronous(dev->dax_dev); 824 822 } 825 823 826 - bool dm_table_supports_dax(struct dm_table *t, 827 - iterate_devices_callout_fn iterate_fn, int *blocksize) 824 + static bool dm_table_supports_dax(struct dm_table *t, 825 + iterate_devices_callout_fn iterate_fn) 828 826 { 829 827 struct dm_target *ti; 830 828 unsigned i; ··· 837 835 return false; 838 836 839 837 if (!ti->type->iterate_devices || 840 - ti->type->iterate_devices(ti, iterate_fn, blocksize)) 838 + ti->type->iterate_devices(ti, iterate_fn, NULL)) 841 839 return false; 842 840 } 843 841 ··· 864 862 struct dm_target *tgt; 865 863 struct list_head *devices = dm_table_get_devices(t); 866 864 enum dm_queue_mode live_md_type = dm_get_md_type(t->md); 867 - int page_size = PAGE_SIZE; 868 865 869 866 if (t->type != DM_TYPE_NONE) { 870 867 /* target already set the table's type */ ··· 907 906 verify_bio_based: 908 907 /* We must use this table as bio-based */ 909 908 t->type = DM_TYPE_BIO_BASED; 910 - if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) || 909 + if (dm_table_supports_dax(t, device_not_dax_capable) || 911 910 (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { 912 911 t->type = DM_TYPE_DAX_BIO_BASED; 913 912 } ··· 1977 1976 struct queue_limits *limits) 1978 1977 { 1979 1978 bool wc = false, fua = false; 1980 - int page_size = PAGE_SIZE; 1981 1979 int r; 1982 1980 1983 1981 /* ··· 2010 2010 } 2011 2011 blk_queue_write_cache(q, wc, fua); 2012 2012 2013 - if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) { 2013 + if (dm_table_supports_dax(t, device_not_dax_capable)) { 2014 2014 blk_queue_flag_set(QUEUE_FLAG_DAX, q); 2015 - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL)) 2015 + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) 2016 2016 set_dax_synchronous(t->md->dax_dev); 2017 2017 } 2018 2018 else
+1 -1
drivers/md/dm-writecache.c
···
 #define BITMAP_GRANULARITY PAGE_SIZE
 #endif
 
-#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
 #define DM_WRITECACHE_HAS_PMEM
 #endif
+12 -77
drivers/md/dm.c
··· 637 637 struct mapped_device *md) 638 638 { 639 639 struct block_device *bdev; 640 - 640 + u64 part_off; 641 641 int r; 642 642 643 643 BUG_ON(td->dm_dev.bdev); ··· 653 653 } 654 654 655 655 td->dm_dev.bdev = bdev; 656 - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev); 656 + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); 657 657 return 0; 658 658 } 659 659 ··· 1021 1021 nr_pages = min(len, nr_pages); 1022 1022 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 1023 1023 1024 - out: 1025 - dm_put_live_table(md, srcu_idx); 1026 - 1027 - return ret; 1028 - } 1029 - 1030 - static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 1031 - int blocksize, sector_t start, sector_t len) 1032 - { 1033 - struct mapped_device *md = dax_get_private(dax_dev); 1034 - struct dm_table *map; 1035 - bool ret = false; 1036 - int srcu_idx; 1037 - 1038 - map = dm_get_live_table(md, &srcu_idx); 1039 - if (!map) 1040 - goto out; 1041 - 1042 - ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize); 1043 - 1044 - out: 1045 - dm_put_live_table(md, srcu_idx); 1046 - 1047 - return ret; 1048 - } 1049 - 1050 - static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, 1051 - void *addr, size_t bytes, struct iov_iter *i) 1052 - { 1053 - struct mapped_device *md = dax_get_private(dax_dev); 1054 - sector_t sector = pgoff * PAGE_SECTORS; 1055 - struct dm_target *ti; 1056 - long ret = 0; 1057 - int srcu_idx; 1058 - 1059 - ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1060 - 1061 - if (!ti) 1062 - goto out; 1063 - if (!ti->type->dax_copy_from_iter) { 1064 - ret = copy_from_iter(addr, bytes, i); 1065 - goto out; 1066 - } 1067 - ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i); 1068 - out: 1069 - dm_put_live_table(md, srcu_idx); 1070 - 1071 - return ret; 1072 - } 1073 - 1074 - static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, 1075 - void *addr, size_t bytes, struct iov_iter *i) 1076 - { 1077 - struct mapped_device *md = dax_get_private(dax_dev); 1078 - sector_t sector = pgoff * PAGE_SECTORS; 1079 - struct dm_target *ti; 1080 - long ret = 0; 1081 - int srcu_idx; 1082 - 1083 - ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1084 - 1085 - if (!ti) 1086 - goto out; 1087 - if (!ti->type->dax_copy_to_iter) { 1088 - ret = copy_to_iter(addr, bytes, i); 1089 - goto out; 1090 - } 1091 - ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i); 1092 1024 out: 1093 1025 dm_put_live_table(md, srcu_idx); 1094 1026 ··· 1615 1683 bioset_exit(&md->io_bs); 1616 1684 1617 1685 if (md->dax_dev) { 1686 + dax_remove_host(md->disk); 1618 1687 kill_dax(md->dax_dev); 1619 1688 put_dax(md->dax_dev); 1620 1689 md->dax_dev = NULL; ··· 1717 1784 md->disk->private_data = md; 1718 1785 sprintf(md->disk->disk_name, "dm-%d", minor); 1719 1786 1720 - if (IS_ENABLED(CONFIG_DAX_DRIVER)) { 1721 - md->dax_dev = alloc_dax(md, md->disk->disk_name, 1722 - &dm_dax_ops, 0); 1723 - if (IS_ERR(md->dax_dev)) 1787 + if (IS_ENABLED(CONFIG_FS_DAX)) { 1788 + md->dax_dev = alloc_dax(md, &dm_dax_ops); 1789 + if (IS_ERR(md->dax_dev)) { 1790 + md->dax_dev = NULL; 1791 + goto bad; 1792 + } 1793 + set_dax_nocache(md->dax_dev); 1794 + set_dax_nomc(md->dax_dev); 1795 + if (dax_add_host(md->dax_dev, md->disk)) 1724 1796 goto bad; 1725 1797 } 1726 1798 ··· 2979 3041 2980 3042 static const struct dax_operations dm_dax_ops = { 2981 3043 .direct_access = dm_dax_direct_access, 2982 - .dax_supported = dm_dax_supported, 2983 - .copy_from_iter = 
dm_dax_copy_from_iter, 2984 - .copy_to_iter = dm_dax_copy_to_iter, 2985 3044 .zero_page_range = dm_dax_zero_page_range, 2986 3045 }; 2987 3046
-4
drivers/md/dm.h
···
 bool dm_table_request_based(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
-bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
-			   int *blocksize);
-int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
-			   sector_t start, sector_t len, void *data);
 
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
+1 -1
drivers/nvdimm/Kconfig
···
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
-	select DAX_DRIVER
+	select DAX
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
+12 -26
drivers/nvdimm/pmem.c
··· 301 301 return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); 302 302 } 303 303 304 - /* 305 - * Use the 'no check' versions of copy_from_iter_flushcache() and 306 - * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds 307 - * checking, both file offset and device offset, is handled by 308 - * dax_iomap_actor() 309 - */ 310 - static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, 311 - void *addr, size_t bytes, struct iov_iter *i) 312 - { 313 - return _copy_from_iter_flushcache(addr, bytes, i); 314 - } 315 - 316 - static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, 317 - void *addr, size_t bytes, struct iov_iter *i) 318 - { 319 - return _copy_mc_to_iter(addr, bytes, i); 320 - } 321 - 322 304 static const struct dax_operations pmem_dax_ops = { 323 305 .direct_access = pmem_dax_direct_access, 324 - .dax_supported = generic_fsdax_supported, 325 - .copy_from_iter = pmem_copy_from_iter, 326 - .copy_to_iter = pmem_copy_to_iter, 327 306 .zero_page_range = pmem_dax_zero_page_range, 328 307 }; 329 308 ··· 358 379 { 359 380 struct pmem_device *pmem = __pmem; 360 381 382 + dax_remove_host(pmem->disk); 361 383 kill_dax(pmem->dax_dev); 362 384 put_dax(pmem->dax_dev); 363 385 del_gendisk(pmem->disk); ··· 382 402 struct gendisk *disk; 383 403 void *addr; 384 404 int rc; 385 - unsigned long flags = 0UL; 386 405 387 406 pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); 388 407 if (!pmem) ··· 474 495 nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); 475 496 disk->bb = &pmem->bb; 476 497 477 - if (is_nvdimm_sync(nd_region)) 478 - flags = DAXDEV_F_SYNC; 479 - dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); 498 + dax_dev = alloc_dax(pmem, &pmem_dax_ops); 480 499 if (IS_ERR(dax_dev)) { 481 500 rc = PTR_ERR(dax_dev); 482 501 goto out; 483 502 } 503 + set_dax_nocache(dax_dev); 504 + set_dax_nomc(dax_dev); 505 + if (is_nvdimm_sync(nd_region)) 506 + set_dax_synchronous(dax_dev); 507 + rc = dax_add_host(dax_dev, disk); 508 + if (rc) 509 + goto out_cleanup_dax; 484 510 dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); 485 511 pmem->dax_dev = dax_dev; 486 512 487 513 rc = device_add_disk(dev, disk, pmem_attribute_groups); 488 514 if (rc) 489 - goto out_cleanup_dax; 515 + goto out_remove_host; 490 516 if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) 491 517 return -ENOMEM; 492 518 ··· 503 519 dev_warn(dev, "'badblocks' notification disabled\n"); 504 520 return 0; 505 521 522 + out_remove_host: 523 + dax_remove_host(pmem->disk); 506 524 out_cleanup_dax: 507 525 kill_dax(pmem->dax_dev); 508 526 put_dax(pmem->dax_dev);
+1 -1
drivers/pci/p2pdma.c
···
 	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
 			range_len(&pgmap->range), dev_to_node(&pdev->dev),
-			pgmap->ref);
+			&pgmap->ref);
 	if (error)
 		goto pages_free;
+1 -1
drivers/s390/block/Kconfig
···
 config DCSSBLK
 	def_tristate m
 	select FS_DAX_LIMITED
-	select DAX_DRIVER
+	select DAX
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
+8 -18
drivers/s390/block/dcssblk.c
··· 44 44 .release = dcssblk_release, 45 45 }; 46 46 47 - static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, 48 - pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) 49 - { 50 - return copy_from_iter(addr, bytes, i); 51 - } 52 - 53 - static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev, 54 - pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) 55 - { 56 - return copy_to_iter(addr, bytes, i); 57 - } 58 - 59 47 static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev, 60 48 pgoff_t pgoff, size_t nr_pages) 61 49 { ··· 60 72 61 73 static const struct dax_operations dcssblk_dax_ops = { 62 74 .direct_access = dcssblk_dax_direct_access, 63 - .dax_supported = generic_fsdax_supported, 64 - .copy_from_iter = dcssblk_dax_copy_from_iter, 65 - .copy_to_iter = dcssblk_dax_copy_to_iter, 66 75 .zero_page_range = dcssblk_dax_zero_page_range, 67 76 }; 68 77 ··· 672 687 if (rc) 673 688 goto put_dev; 674 689 675 - dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, 676 - &dcssblk_dax_ops, DAXDEV_F_SYNC); 690 + dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); 677 691 if (IS_ERR(dev_info->dax_dev)) { 678 692 rc = PTR_ERR(dev_info->dax_dev); 679 693 dev_info->dax_dev = NULL; 680 694 goto put_dev; 681 695 } 696 + set_dax_synchronous(dev_info->dax_dev); 697 + rc = dax_add_host(dev_info->dax_dev, dev_info->gd); 698 + if (rc) 699 + goto out_dax; 682 700 683 701 get_device(&dev_info->dev); 684 702 rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); 685 703 if (rc) 686 - goto out_dax; 704 + goto out_dax_host; 687 705 688 706 switch (dev_info->segment_type) { 689 707 case SEG_TYPE_SR: ··· 702 714 rc = count; 703 715 goto out; 704 716 717 + out_dax_host: 718 + dax_remove_host(dev_info->gd); 705 719 out_dax: 706 720 put_device(&dev_info->dev); 707 721 kill_dax(dev_info->dax_dev);
+4 -4
fs/Kconfig
···
 	  Enable this to perform validation of the parameter description for a
 	  filesystem when it is registered.
 
-if BLOCK
-
 config FS_IOMAP
 	bool
+
+if BLOCK
 
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
···
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
 source "fs/zonefs/Kconfig"
+
+endif # BLOCK
 
 config FS_DAX
 	bool "File system based Direct Access (DAX) support"
···
 # direct-I/O to a DAX mapping.
 config FS_DAX_LIMITED
 	bool
-
-endif # BLOCK
 
 # Posix ACL utility routines
 #
+92 -69
fs/dax.c
··· 709 709 return __dax_invalidate_entry(mapping, index, false); 710 710 } 711 711 712 - static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, 713 - sector_t sector, struct page *to, unsigned long vaddr) 712 + static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) 714 713 { 714 + return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); 715 + } 716 + 717 + static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) 718 + { 719 + pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); 715 720 void *vto, *kaddr; 716 - pgoff_t pgoff; 717 721 long rc; 718 722 int id; 719 723 720 - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); 721 - if (rc) 722 - return rc; 723 - 724 724 id = dax_read_lock(); 725 - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); 725 + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); 726 726 if (rc < 0) { 727 727 dax_read_unlock(id); 728 728 return rc; 729 729 } 730 - vto = kmap_atomic(to); 731 - copy_user_page(vto, (void __force *)kaddr, vaddr, to); 730 + vto = kmap_atomic(vmf->cow_page); 731 + copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); 732 732 kunmap_atomic(vto); 733 733 dax_read_unlock(id); 734 734 return 0; ··· 1005 1005 } 1006 1006 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 1007 1007 1008 - static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) 1009 - { 1010 - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; 1011 - } 1012 - 1013 1008 static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, 1014 1009 pfn_t *pfnp) 1015 1010 { 1016 - const sector_t sector = dax_iomap_sector(iomap, pos); 1017 - pgoff_t pgoff; 1011 + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1018 1012 int id, rc; 1019 1013 long length; 1020 1014 1021 - rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); 1022 - if (rc) 1023 - return rc; 1024 1015 id = dax_read_lock(); 1025 1016 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), 1026 1017 NULL, pfnp); ··· 1117 1126 } 1118 1127 #endif /* CONFIG_FS_DAX_PMD */ 1119 1128 1120 - s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) 1129 + static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, 1130 + unsigned int offset, size_t size) 1121 1131 { 1122 - sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); 1123 - pgoff_t pgoff; 1124 - long rc, id; 1125 1132 void *kaddr; 1126 - bool page_aligned = false; 1127 - unsigned offset = offset_in_page(pos); 1128 - unsigned size = min_t(u64, PAGE_SIZE - offset, length); 1133 + long ret; 1129 1134 1130 - if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && 1131 - (size == PAGE_SIZE)) 1132 - page_aligned = true; 1133 - 1134 - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); 1135 - if (rc) 1136 - return rc; 1137 - 1138 - id = dax_read_lock(); 1139 - 1140 - if (page_aligned) 1141 - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); 1142 - else 1143 - rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); 1144 - if (rc < 0) { 1145 - dax_read_unlock(id); 1146 - return rc; 1147 - } 1148 - 1149 - if (!page_aligned) { 1135 + ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); 1136 + if (ret > 0) { 1150 1137 memset(kaddr + offset, 0, size); 1151 - dax_flush(iomap->dax_dev, kaddr + offset, size); 1138 + dax_flush(dax_dev, kaddr + offset, size); 1152 1139 } 1153 - dax_read_unlock(id); 1154 - return size; 1140 + return ret; 1155 1141 } 1142 + 1143 + static s64 dax_zero_iter(struct 
iomap_iter *iter, bool *did_zero) 1144 + { 1145 + const struct iomap *iomap = &iter->iomap; 1146 + const struct iomap *srcmap = iomap_iter_srcmap(iter); 1147 + loff_t pos = iter->pos; 1148 + u64 length = iomap_length(iter); 1149 + s64 written = 0; 1150 + 1151 + /* already zeroed? we're done. */ 1152 + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) 1153 + return length; 1154 + 1155 + do { 1156 + unsigned offset = offset_in_page(pos); 1157 + unsigned size = min_t(u64, PAGE_SIZE - offset, length); 1158 + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1159 + long rc; 1160 + int id; 1161 + 1162 + id = dax_read_lock(); 1163 + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) 1164 + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); 1165 + else 1166 + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size); 1167 + dax_read_unlock(id); 1168 + 1169 + if (rc < 0) 1170 + return rc; 1171 + pos += size; 1172 + length -= size; 1173 + written += size; 1174 + if (did_zero) 1175 + *did_zero = true; 1176 + } while (length > 0); 1177 + 1178 + return written; 1179 + } 1180 + 1181 + int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 1182 + const struct iomap_ops *ops) 1183 + { 1184 + struct iomap_iter iter = { 1185 + .inode = inode, 1186 + .pos = pos, 1187 + .len = len, 1188 + .flags = IOMAP_DAX | IOMAP_ZERO, 1189 + }; 1190 + int ret; 1191 + 1192 + while ((ret = iomap_iter(&iter, ops)) > 0) 1193 + iter.processed = dax_zero_iter(&iter, did_zero); 1194 + return ret; 1195 + } 1196 + EXPORT_SYMBOL_GPL(dax_zero_range); 1197 + 1198 + int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 1199 + const struct iomap_ops *ops) 1200 + { 1201 + unsigned int blocksize = i_blocksize(inode); 1202 + unsigned int off = pos & (blocksize - 1); 1203 + 1204 + /* Block boundary? Nothing to do */ 1205 + if (!off) 1206 + return 0; 1207 + return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); 1208 + } 1209 + EXPORT_SYMBOL_GPL(dax_truncate_page); 1156 1210 1157 1211 static loff_t dax_iomap_iter(const struct iomap_iter *iomi, 1158 1212 struct iov_iter *iter) ··· 1205 1169 const struct iomap *iomap = &iomi->iomap; 1206 1170 loff_t length = iomap_length(iomi); 1207 1171 loff_t pos = iomi->pos; 1208 - struct block_device *bdev = iomap->bdev; 1209 1172 struct dax_device *dax_dev = iomap->dax_dev; 1210 1173 loff_t end = pos + length, done = 0; 1211 1174 ssize_t ret = 0; ··· 1238 1203 while (pos < end) { 1239 1204 unsigned offset = pos & (PAGE_SIZE - 1); 1240 1205 const size_t size = ALIGN(length + offset, PAGE_SIZE); 1241 - const sector_t sector = dax_iomap_sector(iomap, pos); 1206 + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1242 1207 ssize_t map_len; 1243 - pgoff_t pgoff; 1244 1208 void *kaddr; 1245 1209 1246 1210 if (fatal_signal_pending(current)) { 1247 1211 ret = -EINTR; 1248 1212 break; 1249 1213 } 1250 - 1251 - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 1252 - if (ret) 1253 - break; 1254 1214 1255 1215 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1256 1216 &kaddr, NULL); ··· 1260 1230 if (map_len > end - pos) 1261 1231 map_len = end - pos; 1262 1232 1263 - /* 1264 - * The userspace address for the memory copy has already been 1265 - * validated via access_ok() in either vfs_read() or 1266 - * vfs_write(), depending on which operation we are doing. 
1267 - */ 1268 1233 if (iov_iter_rw(iter) == WRITE) 1269 1234 xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1270 1235 map_len, iter); ··· 1299 1274 .inode = iocb->ki_filp->f_mapping->host, 1300 1275 .pos = iocb->ki_pos, 1301 1276 .len = iov_iter_count(iter), 1277 + .flags = IOMAP_DAX, 1302 1278 }; 1303 1279 loff_t done = 0; 1304 1280 int ret; ··· 1358 1332 static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, 1359 1333 const struct iomap_iter *iter) 1360 1334 { 1361 - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos); 1362 - unsigned long vaddr = vmf->address; 1363 1335 vm_fault_t ret; 1364 1336 int error = 0; 1365 1337 1366 1338 switch (iter->iomap.type) { 1367 1339 case IOMAP_HOLE: 1368 1340 case IOMAP_UNWRITTEN: 1369 - clear_user_highpage(vmf->cow_page, vaddr); 1341 + clear_user_highpage(vmf->cow_page, vmf->address); 1370 1342 break; 1371 1343 case IOMAP_MAPPED: 1372 - error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev, 1373 - sector, vmf->cow_page, vaddr); 1344 + error = copy_cow_page_dax(vmf, iter); 1374 1345 break; 1375 1346 default: 1376 1347 WARN_ON_ONCE(1); ··· 1453 1430 .inode = mapping->host, 1454 1431 .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, 1455 1432 .len = PAGE_SIZE, 1456 - .flags = IOMAP_FAULT, 1433 + .flags = IOMAP_DAX | IOMAP_FAULT, 1457 1434 }; 1458 1435 vm_fault_t ret = 0; 1459 1436 void *entry; ··· 1562 1539 struct iomap_iter iter = { 1563 1540 .inode = mapping->host, 1564 1541 .len = PMD_SIZE, 1565 - .flags = IOMAP_FAULT, 1542 + .flags = IOMAP_DAX | IOMAP_FAULT, 1566 1543 }; 1567 1544 vm_fault_t ret = VM_FAULT_FALLBACK; 1568 1545 pgoff_t max_pgoff;
+9 -2
fs/erofs/data.c
··· 192 192 /* primary device by default */ 193 193 map->m_bdev = sb->s_bdev; 194 194 map->m_daxdev = EROFS_SB(sb)->dax_dev; 195 + map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; 195 196 196 197 if (map->m_deviceid) { 197 198 down_read(&devs->rwsem); ··· 203 202 } 204 203 map->m_bdev = dif->bdev; 205 204 map->m_daxdev = dif->dax_dev; 205 + map->m_dax_part_off = dif->dax_part_off; 206 206 up_read(&devs->rwsem); 207 207 } else if (devs->extra_devices) { 208 208 down_read(&devs->rwsem); ··· 220 218 map->m_pa -= startoff; 221 219 map->m_bdev = dif->bdev; 222 220 map->m_daxdev = dif->dax_dev; 221 + map->m_dax_part_off = dif->dax_part_off; 223 222 break; 224 223 } 225 224 } ··· 251 248 if (ret) 252 249 return ret; 253 250 254 - iomap->bdev = mdev.m_bdev; 255 - iomap->dax_dev = mdev.m_daxdev; 256 251 iomap->offset = map.m_la; 252 + if (flags & IOMAP_DAX) { 253 + iomap->dax_dev = mdev.m_daxdev; 254 + iomap->offset += mdev.m_dax_part_off; 255 + } else { 256 + iomap->bdev = mdev.m_bdev; 257 + } 257 258 iomap->length = map.m_llen; 258 259 iomap->flags = 0; 259 260 iomap->private = NULL;
+3
fs/erofs/internal.h
··· 51 51 char *path; 52 52 struct block_device *bdev; 53 53 struct dax_device *dax_dev; 54 + u64 dax_part_off; 54 55 55 56 u32 blocks; 56 57 u32 mapped_blkaddr; ··· 116 115 #endif /* CONFIG_EROFS_FS_ZIP */ 117 116 struct erofs_dev_context *devs; 118 117 struct dax_device *dax_dev; 118 + u64 dax_part_off; 119 119 u64 total_blocks; 120 120 u32 primarydevice_blocks; 121 121 ··· 469 467 struct erofs_map_dev { 470 468 struct block_device *m_bdev; 471 469 struct dax_device *m_daxdev; 470 + u64 m_dax_part_off; 472 471 473 472 erofs_off_t m_pa; 474 473 unsigned int m_deviceid;
+9 -6
fs/erofs/super.c
··· 267 267 break; 268 268 } 269 269 dif->bdev = bdev; 270 - dif->dax_dev = fs_dax_get_by_bdev(bdev); 270 + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); 271 271 dif->blocks = le32_to_cpu(dis->blocks); 272 272 dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); 273 273 sbi->total_blocks += dif->blocks; ··· 597 597 598 598 sb->s_fs_info = sbi; 599 599 sbi->opt = ctx->opt; 600 - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); 600 + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off); 601 601 sbi->devs = ctx->devs; 602 602 ctx->devs = NULL; 603 603 ··· 605 605 if (err) 606 606 return err; 607 607 608 - if (test_opt(&sbi->opt, DAX_ALWAYS) && 609 - !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { 610 - errorfc(fc, "DAX unsupported by block device. Turning off DAX."); 611 - clear_opt(&sbi->opt, DAX_ALWAYS); 608 + if (test_opt(&sbi->opt, DAX_ALWAYS)) { 609 + BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); 610 + 611 + if (!sbi->dax_dev) { 612 + errorfc(fc, "DAX unsupported by block device. Turning off DAX."); 613 + clear_opt(&sbi->opt, DAX_ALWAYS); 614 + } 612 615 } 613 616 sb->s_flags |= SB_RDONLY | SB_NOATIME; 614 617 sb->s_maxbytes = MAX_LFS_FILESIZE;
+1
fs/ext2/ext2.h
··· 118 118 spinlock_t s_lock; 119 119 struct mb_cache *s_ea_block_cache; 120 120 struct dax_device *s_daxdev; 121 + u64 s_dax_part_off; 121 122 }; 122 123 123 124 static inline spinlock_t *
+10 -5
fs/ext2/inode.c
··· 36 36 #include <linux/iomap.h> 37 37 #include <linux/namei.h> 38 38 #include <linux/uio.h> 39 + #include <linux/dax.h> 39 40 #include "ext2.h" 40 41 #include "acl.h" 41 42 #include "xattr.h" ··· 817 816 return ret; 818 817 819 818 iomap->flags = 0; 820 - iomap->bdev = inode->i_sb->s_bdev; 821 819 iomap->offset = (u64)first_block << blkbits; 822 - iomap->dax_dev = sbi->s_daxdev; 820 + if (flags & IOMAP_DAX) 821 + iomap->dax_dev = sbi->s_daxdev; 822 + else 823 + iomap->bdev = inode->i_sb->s_bdev; 823 824 824 825 if (ret == 0) { 825 826 iomap->type = IOMAP_HOLE; ··· 830 827 } else { 831 828 iomap->type = IOMAP_MAPPED; 832 829 iomap->addr = (u64)bno << blkbits; 830 + if (flags & IOMAP_DAX) 831 + iomap->addr += sbi->s_dax_part_off; 833 832 iomap->length = (u64)ret << blkbits; 834 833 iomap->flags |= IOMAP_F_MERGED; 835 834 } ··· 1302 1297 inode_dio_wait(inode); 1303 1298 1304 1299 if (IS_DAX(inode)) { 1305 - error = iomap_zero_range(inode, newsize, 1306 - PAGE_ALIGN(newsize) - newsize, NULL, 1307 - &ext2_iomap_ops); 1300 + error = dax_zero_range(inode, newsize, 1301 + PAGE_ALIGN(newsize) - newsize, NULL, 1302 + &ext2_iomap_ops); 1308 1303 } else if (test_opt(inode->i_sb, NOBH)) 1309 1304 error = nobh_truncate_page(inode->i_mapping, 1310 1305 newsize, ext2_get_block);
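For reference, a minimal sketch of how a filesystem calls the new dax_zero_range() helper, modeled on the ext2_setsize() change above; the example_* names and example_iomap_ops are placeholders, not symbols from this series:

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* The filesystem's own iomap_ops; placeholder, defined elsewhere. */
extern const struct iomap_ops example_iomap_ops;

/* Zero the partial page beyond a new, smaller EOF on a DAX inode. */
static int example_dax_zero_eof(struct inode *inode, loff_t newsize)
{
        if (!IS_DAX(inode))
                return 0;
        return dax_zero_range(inode, newsize, PAGE_ALIGN(newsize) - newsize,
                              NULL, &example_iomap_ops);
}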
+8 -8
fs/ext2/super.c
··· 802 802 803 803 static int ext2_fill_super(struct super_block *sb, void *data, int silent) 804 804 { 805 - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); 806 805 struct buffer_head * bh; 807 806 struct ext2_sb_info * sbi; 808 807 struct ext2_super_block * es; ··· 821 822 822 823 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 823 824 if (!sbi) 824 - goto failed; 825 + return -ENOMEM; 825 826 826 827 sbi->s_blockgroup_lock = 827 828 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 828 829 if (!sbi->s_blockgroup_lock) { 829 830 kfree(sbi); 830 - goto failed; 831 + return -ENOMEM; 831 832 } 832 833 sb->s_fs_info = sbi; 833 834 sbi->s_sb_block = sb_block; 834 - sbi->s_daxdev = dax_dev; 835 + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); 835 836 836 837 spin_lock_init(&sbi->s_lock); 837 838 ret = -EINVAL; ··· 945 946 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 946 947 947 948 if (test_opt(sb, DAX)) { 948 - if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, 949 - bdev_nr_sectors(sb->s_bdev))) { 949 + if (!sbi->s_daxdev) { 950 950 ext2_msg(sb, KERN_ERR, 951 951 "DAX unsupported by block device. Turning off DAX."); 952 + clear_opt(sbi->s_mount_opt, DAX); 953 + } else if (blocksize != PAGE_SIZE) { 954 + ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); 952 955 clear_opt(sbi->s_mount_opt, DAX); 953 956 } 954 957 } ··· 1200 1199 failed_mount: 1201 1200 brelse(bh); 1202 1201 failed_sbi: 1202 + fs_put_dax(sbi->s_daxdev); 1203 1203 sb->s_fs_info = NULL; 1204 1204 kfree(sbi->s_blockgroup_lock); 1205 1205 kfree(sbi); 1206 - failed: 1207 - fs_put_dax(dax_dev); 1208 1206 return ret; 1209 1207 } 1210 1208
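The mount-time check above no longer calls dax_supported(): fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off) returns NULL when the device cannot do DAX and otherwise reports the partition start offset, so only the block size is left to verify. A condensed sketch of that test, with placeholder names:

#include <linux/dax.h>
#include <linux/mm.h>

/* dax_dev is what fs_dax_get_by_bdev() handed back at mount time. */
static bool example_can_use_dax(struct dax_device *dax_dev,
                                unsigned int blocksize)
{
        return dax_dev && blocksize == PAGE_SIZE;
}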
+1
fs/ext4/ext4.h
··· 1699 1699 */ 1700 1700 struct percpu_rw_semaphore s_writepages_rwsem; 1701 1701 struct dax_device *s_daxdev; 1702 + u64 s_dax_part_off; 1702 1703 #ifdef CONFIG_EXT4_DEBUG 1703 1704 unsigned long s_simulate_fail; 1704 1705 #endif
+16 -9
fs/ext4/inode.c
··· 41 41 #include <linux/bitops.h> 42 42 #include <linux/iomap.h> 43 43 #include <linux/iversion.h> 44 + #include <linux/dax.h> 44 45 45 46 #include "ext4_jbd2.h" 46 47 #include "xattr.h" ··· 3254 3253 3255 3254 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, 3256 3255 struct ext4_map_blocks *map, loff_t offset, 3257 - loff_t length) 3256 + loff_t length, unsigned int flags) 3258 3257 { 3259 3258 u8 blkbits = inode->i_blkbits; 3260 3259 ··· 3271 3270 if (map->m_flags & EXT4_MAP_NEW) 3272 3271 iomap->flags |= IOMAP_F_NEW; 3273 3272 3274 - iomap->bdev = inode->i_sb->s_bdev; 3275 - iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3273 + if (flags & IOMAP_DAX) 3274 + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3275 + else 3276 + iomap->bdev = inode->i_sb->s_bdev; 3276 3277 iomap->offset = (u64) map->m_lblk << blkbits; 3277 3278 iomap->length = (u64) map->m_len << blkbits; 3278 3279 ··· 3294 3291 if (map->m_flags & EXT4_MAP_UNWRITTEN) { 3295 3292 iomap->type = IOMAP_UNWRITTEN; 3296 3293 iomap->addr = (u64) map->m_pblk << blkbits; 3294 + if (flags & IOMAP_DAX) 3295 + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3297 3296 } else if (map->m_flags & EXT4_MAP_MAPPED) { 3298 3297 iomap->type = IOMAP_MAPPED; 3299 3298 iomap->addr = (u64) map->m_pblk << blkbits; 3299 + if (flags & IOMAP_DAX) 3300 + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3300 3301 } else { 3301 3302 iomap->type = IOMAP_HOLE; 3302 3303 iomap->addr = IOMAP_NULL_ADDR; ··· 3337 3330 * DAX and direct I/O are the only two operations that are currently 3338 3331 * supported with IOMAP_WRITE. 3339 3332 */ 3340 - WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); 3341 - if (IS_DAX(inode)) 3333 + WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); 3334 + if (flags & IOMAP_DAX) 3342 3335 m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; 3343 3336 /* 3344 3337 * We use i_size instead of i_disksize here because delalloc writeback ··· 3409 3402 if (ret < 0) 3410 3403 return ret; 3411 3404 out: 3412 - ext4_set_iomap(inode, iomap, &map, offset, length); 3405 + ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3413 3406 3414 3407 return 0; 3415 3408 } ··· 3529 3522 delalloc = ext4_iomap_is_delalloc(inode, &map); 3530 3523 3531 3524 set_iomap: 3532 - ext4_set_iomap(inode, iomap, &map, offset, length); 3525 + ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3533 3526 if (delalloc && iomap->type == IOMAP_HOLE) 3534 3527 iomap->type = IOMAP_DELALLOC; 3535 3528 ··· 3769 3762 length = max; 3770 3763 3771 3764 if (IS_DAX(inode)) { 3772 - return iomap_zero_range(inode, from, length, NULL, 3773 - &ext4_iomap_ops); 3765 + return dax_zero_range(inode, from, length, NULL, 3766 + &ext4_iomap_ops); 3774 3767 } 3775 3768 return __ext4_block_zero_page_range(handle, mapping, from, length); 3776 3769 }
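ext4_set_iomap() above shows the new contract for a DAX-aware ->iomap_begin: when IOMAP_DAX is set, the filesystem fills in the dax_device and adds the partition start offset to iomap->addr; otherwise it fills in the block_device as before. A condensed sketch of that branch, with placeholder names (real callers apply the offset only to mapped or unwritten extents):

#include <linux/blkdev.h>
#include <linux/dax.h>
#include <linux/iomap.h>

static void example_set_iomap_target(struct iomap *iomap,
                                     struct block_device *bdev,
                                     struct dax_device *dax_dev,
                                     u64 dax_part_off, unsigned int flags)
{
        if (flags & IOMAP_DAX) {
                iomap->dax_dev = dax_dev;
                iomap->addr += dax_part_off;    /* make addr device-relative */
        } else {
                iomap->bdev = bdev;
        }
}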
+7 -4
fs/ext4/super.c
··· 4338 4338 if (!sbi) 4339 4339 return NULL; 4340 4340 4341 - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev); 4341 + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); 4342 4342 4343 4343 sbi->s_blockgroup_lock = 4344 4344 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); ··· 4756 4756 goto failed_mount; 4757 4757 } 4758 4758 4759 - if (dax_supported(sbi->s_daxdev, sb->s_bdev, blocksize, 0, 4760 - bdev_nr_sectors(sb->s_bdev))) 4761 - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 4759 + if (sbi->s_daxdev) { 4760 + if (blocksize == PAGE_SIZE) 4761 + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 4762 + else 4763 + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); 4764 + } 4762 4765 4763 4766 if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { 4764 4767 if (ext4_has_feature_inline_data(sb)) {
+1 -1
fs/fuse/Kconfig
··· 45 45 select INTERVAL_TREE 46 46 depends on VIRTIO_FS 47 47 depends on FS_DAX 48 - depends on DAX_DRIVER 48 + depends on DAX 49 49 help 50 50 This allows bypassing guest page cache and allows mapping host page 51 51 cache directly in guest address space.
+1 -17
fs/fuse/virtio_fs.c
··· 765 765 return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; 766 766 } 767 767 768 - static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, 769 - pgoff_t pgoff, void *addr, 770 - size_t bytes, struct iov_iter *i) 771 - { 772 - return copy_from_iter(addr, bytes, i); 773 - } 774 - 775 - static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, 776 - pgoff_t pgoff, void *addr, 777 - size_t bytes, struct iov_iter *i) 778 - { 779 - return copy_to_iter(addr, bytes, i); 780 - } 781 - 782 768 static int virtio_fs_zero_page_range(struct dax_device *dax_dev, 783 769 pgoff_t pgoff, size_t nr_pages) 784 770 { ··· 781 795 782 796 static const struct dax_operations virtio_fs_dax_ops = { 783 797 .direct_access = virtio_fs_direct_access, 784 - .copy_from_iter = virtio_fs_copy_from_iter, 785 - .copy_to_iter = virtio_fs_copy_to_iter, 786 798 .zero_page_range = virtio_fs_zero_page_range, 787 799 }; 788 800 ··· 846 862 dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", 847 863 __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); 848 864 849 - fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); 865 + fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); 850 866 if (IS_ERR(fs->dax_dev)) 851 867 return PTR_ERR(fs->dax_dev); 852 868
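With the ->copy_from_iter()/->copy_to_iter() methods removed, a DAX provider such as virtio_fs implements only ->direct_access() and ->zero_page_range(), and alloc_dax() is down to two arguments. A minimal sketch of that shape; the example_* names are placeholders and the stub return values are arbitrary:

#include <linux/dax.h>
#include <linux/errno.h>
#include <linux/pfn_t.h>

static long example_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
                                  long nr_pages, void **kaddr, pfn_t *pfn)
{
        return -EOPNOTSUPP;     /* a real driver resolves pgoff to kaddr/pfn */
}

static int example_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                                   size_t nr_pages)
{
        return -EOPNOTSUPP;
}

static const struct dax_operations example_dax_ops = {
        .direct_access          = example_direct_access,
        .zero_page_range        = example_zero_page_range,
};

static struct dax_device *example_alloc_dax(void *private)
{
        return alloc_dax(private, &example_dax_ops);    /* no host/flags args */
}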
+2 -2
fs/iomap/Makefile
··· 9 9 obj-$(CONFIG_FS_IOMAP) += iomap.o 10 10 11 11 iomap-y += trace.o \ 12 - buffered-io.o \ 12 + iter.o 13 + iomap-$(CONFIG_BLOCK) += buffered-io.o \ 13 14 direct-io.o \ 14 15 fiemap.o \ 15 - iter.o \ 16 16 seek.o 17 17 iomap-$(CONFIG_SWAP) += swapfile.o
-10
fs/iomap/buffered-io.c
··· 897 897 898 898 static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) 899 899 { 900 - struct iomap *iomap = &iter->iomap; 901 900 const struct iomap *srcmap = iomap_iter_srcmap(iter); 902 901 loff_t pos = iter->pos; 903 902 loff_t length = iomap_length(iter); ··· 912 913 size_t offset; 913 914 size_t bytes = min_t(u64, SIZE_MAX, length); 914 915 915 - if (IS_DAX(iter->inode)) { 916 - s64 tmp = dax_iomap_zero(pos, bytes, iomap); 917 - if (tmp < 0) 918 - return tmp; 919 - bytes = tmp; 920 - goto good; 921 - } 922 - 923 916 status = iomap_write_begin(iter, pos, bytes, &folio); 924 917 if (status) 925 918 return status; ··· 924 933 folio_mark_accessed(folio); 925 934 926 935 bytes = iomap_write_end(iter, pos, bytes, bytes, folio); 927 - good: 928 936 if (WARN_ON_ONCE(bytes == 0)) 929 937 return -EIO; 930 938
+2 -2
fs/xfs/libxfs/xfs_bmap.c
··· 4551 4551 * the extent. Just return the real extent at this offset. 4552 4552 */ 4553 4553 if (!isnullstartblock(bma.got.br_startblock)) { 4554 - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); 4554 + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); 4555 4555 *seq = READ_ONCE(ifp->if_seq); 4556 4556 goto out_trans_cancel; 4557 4557 } ··· 4598 4598 XFS_STATS_INC(mp, xs_xstrat_quick); 4599 4599 4600 4600 ASSERT(!isnullstartblock(bma.got.br_startblock)); 4601 - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); 4601 + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); 4602 4602 *seq = READ_ONCE(ifp->if_seq); 4603 4603 4604 4604 if (whichfork == XFS_COW_FORK)
+1 -1
fs/xfs/xfs_aops.c
··· 359 359 isnullstartblock(imap.br_startblock)) 360 360 goto allocate_blocks; 361 361 362 - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); 362 + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); 363 363 trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); 364 364 return 0; 365 365 allocate_blocks:
+3 -4
fs/xfs/xfs_bmap_util.c
··· 1001 1001 1002 1002 /* 1003 1003 * Now that we've unmap all full blocks we'll have to zero out any 1004 - * partial block at the beginning and/or end. iomap_zero_range is smart 1004 + * partial block at the beginning and/or end. xfs_zero_range is smart 1005 1005 * enough to skip any holes, including those we just created, but we 1006 1006 * must take care not to zero beyond EOF and enlarge i_size. 1007 1007 */ ··· 1009 1009 return 0; 1010 1010 if (offset + len > XFS_ISIZE(ip)) 1011 1011 len = XFS_ISIZE(ip) - offset; 1012 - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, 1013 - &xfs_buffered_write_iomap_ops); 1012 + error = xfs_zero_range(ip, offset, len, NULL); 1014 1013 if (error) 1015 1014 return error; 1016 1015 1017 1016 /* 1018 1017 * If we zeroed right up to EOF and EOF straddles a page boundary we 1019 1018 * must make sure that the post-EOF area is also zeroed because the 1020 - * page could be mmap'd and iomap_zero_range doesn't do that for us. 1019 + * page could be mmap'd and xfs_zero_range doesn't do that for us. 1021 1020 * Writeback of the eof page will do this, albeit clumsily. 1022 1021 */ 1023 1022 if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
+4 -4
fs/xfs/xfs_buf.c
··· 1892 1892 list_lru_destroy(&btp->bt_lru); 1893 1893 1894 1894 blkdev_issue_flush(btp->bt_bdev); 1895 + fs_put_dax(btp->bt_daxdev); 1895 1896 1896 1897 kmem_free(btp); 1897 1898 } ··· 1933 1932 return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); 1934 1933 } 1935 1934 1936 - xfs_buftarg_t * 1935 + struct xfs_buftarg * 1937 1936 xfs_alloc_buftarg( 1938 1937 struct xfs_mount *mp, 1939 - struct block_device *bdev, 1940 - struct dax_device *dax_dev) 1938 + struct block_device *bdev) 1941 1939 { 1942 1940 xfs_buftarg_t *btp; 1943 1941 ··· 1945 1945 btp->bt_mount = mp; 1946 1946 btp->bt_dev = bdev->bd_dev; 1947 1947 btp->bt_bdev = bdev; 1948 - btp->bt_daxdev = dax_dev; 1948 + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); 1949 1949 1950 1950 /* 1951 1951 * Buffer IO error rate limiting. Limit it to no more than 10 messages
+3 -2
fs/xfs/xfs_buf.h
··· 89 89 dev_t bt_dev; 90 90 struct block_device *bt_bdev; 91 91 struct dax_device *bt_daxdev; 92 + u64 bt_dax_part_off; 92 93 struct xfs_mount *bt_mount; 93 94 unsigned int bt_meta_sectorsize; 94 95 size_t bt_meta_sectormask; ··· 339 338 /* 340 339 * Handling of buftargs. 341 340 */ 342 - extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, 343 - struct block_device *, struct dax_device *); 341 + struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, 342 + struct block_device *bdev); 344 343 extern void xfs_free_buftarg(struct xfs_buftarg *); 345 344 extern void xfs_buftarg_wait(struct xfs_buftarg *); 346 345 extern void xfs_buftarg_drain(struct xfs_buftarg *);
+1 -2
fs/xfs/xfs_file.c
··· 437 437 } 438 438 439 439 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); 440 - error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, 441 - NULL, &xfs_buffered_write_iomap_ops); 440 + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); 442 441 if (error) 443 442 return error; 444 443 } else
+63 -21
fs/xfs/xfs_iomap.c
··· 28 28 #include "xfs_dquot.h" 29 29 #include "xfs_reflink.h" 30 30 31 - 32 31 #define XFS_ALLOC_ALIGN(mp, off) \ 33 32 (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) 34 33 ··· 53 54 struct xfs_inode *ip, 54 55 struct iomap *iomap, 55 56 struct xfs_bmbt_irec *imap, 56 - u16 flags) 57 + unsigned int mapping_flags, 58 + u16 iomap_flags) 57 59 { 58 60 struct xfs_mount *mp = ip->i_mount; 59 61 struct xfs_buftarg *target = xfs_inode_buftarg(ip); ··· 71 71 iomap->type = IOMAP_DELALLOC; 72 72 } else { 73 73 iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); 74 + if (mapping_flags & IOMAP_DAX) 75 + iomap->addr += target->bt_dax_part_off; 76 + 74 77 if (imap->br_state == XFS_EXT_UNWRITTEN) 75 78 iomap->type = IOMAP_UNWRITTEN; 76 79 else 77 80 iomap->type = IOMAP_MAPPED; 81 + 78 82 } 79 83 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); 80 84 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); 81 - iomap->bdev = target->bt_bdev; 82 - iomap->dax_dev = target->bt_daxdev; 83 - iomap->flags = flags; 85 + if (mapping_flags & IOMAP_DAX) 86 + iomap->dax_dev = target->bt_daxdev; 87 + else 88 + iomap->bdev = target->bt_bdev; 89 + iomap->flags = iomap_flags; 84 90 85 91 if (xfs_ipincount(ip) && 86 92 (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) ··· 194 188 struct xfs_inode *ip, 195 189 xfs_fileoff_t offset_fsb, 196 190 xfs_fileoff_t count_fsb, 191 + unsigned int flags, 197 192 struct xfs_bmbt_irec *imap) 198 193 { 199 194 struct xfs_mount *mp = ip->i_mount; ··· 236 229 * the reserve block pool for bmbt block allocation if there is no space 237 230 * left but we need to do unwritten extent conversion. 238 231 */ 239 - if (IS_DAX(VFS_I(ip))) { 232 + if (flags & IOMAP_DAX) { 240 233 bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; 241 234 if (imap->br_state == XFS_EXT_UNWRITTEN) { 242 235 force = true; ··· 627 620 imap->br_startblock == DELAYSTARTBLOCK) 628 621 return true; 629 622 /* we convert unwritten extents before copying the data for DAX */ 630 - if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) 623 + if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) 631 624 return true; 632 625 return false; 633 626 } ··· 807 800 808 801 xfs_iunlock(ip, lockmode); 809 802 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 810 - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); 803 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); 811 804 812 805 allocate_blocks: 813 806 error = -EAGAIN; ··· 833 826 xfs_iunlock(ip, lockmode); 834 827 835 828 error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, 836 - &imap); 829 + flags, &imap); 837 830 if (error) 838 831 return error; 839 832 840 833 trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); 841 - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); 834 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 835 + iomap_flags | IOMAP_F_NEW); 842 836 843 837 out_found_cow: 844 838 xfs_iunlock(ip, lockmode); 845 839 length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); 846 840 trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); 847 841 if (imap.br_startblock != HOLESTARTBLOCK) { 848 - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); 842 + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); 849 843 if (error) 850 844 return error; 851 845 } 852 - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); 846 + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); 853 847 854 848 out_unlock: 
855 849 if (lockmode) ··· 1060 1052 */ 1061 1053 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1062 1054 trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); 1063 - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); 1055 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); 1064 1056 1065 1057 found_imap: 1066 1058 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1067 - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); 1059 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1068 1060 1069 1061 found_cow: 1070 1062 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1071 1063 if (imap.br_startoff <= offset_fsb) { 1072 - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); 1064 + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); 1073 1065 if (error) 1074 1066 return error; 1075 - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); 1067 + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 1068 + IOMAP_F_SHARED); 1076 1069 } 1077 1070 1078 1071 xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); 1079 - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); 1072 + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); 1080 1073 1081 1074 out_unlock: 1082 1075 xfs_iunlock(ip, XFS_ILOCK_EXCL); ··· 1186 1177 if (error) 1187 1178 return error; 1188 1179 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 1189 - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); 1180 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 1181 + shared ? IOMAP_F_SHARED : 0); 1190 1182 } 1191 1183 1192 1184 const struct iomap_ops xfs_read_iomap_ops = { ··· 1246 1236 if (data_fsb < cow_fsb + cmap.br_blockcount) 1247 1237 end_fsb = min(end_fsb, data_fsb); 1248 1238 xfs_trim_extent(&cmap, offset_fsb, end_fsb); 1249 - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); 1239 + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 1240 + IOMAP_F_SHARED); 1250 1241 /* 1251 1242 * This is a COW extent, so we must probe the page cache 1252 1243 * because there could be dirty page cache being backed ··· 1269 1258 imap.br_state = XFS_EXT_NORM; 1270 1259 done: 1271 1260 xfs_trim_extent(&imap, offset_fsb, end_fsb); 1272 - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); 1261 + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1273 1262 out_unlock: 1274 1263 xfs_iunlock(ip, lockmode); 1275 1264 return error; ··· 1316 1305 if (error) 1317 1306 return error; 1318 1307 ASSERT(nimaps); 1319 - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); 1308 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1320 1309 } 1321 1310 1322 1311 const struct iomap_ops xfs_xattr_iomap_ops = { 1323 1312 .iomap_begin = xfs_xattr_iomap_begin, 1324 1313 }; 1314 + 1315 + int 1316 + xfs_zero_range( 1317 + struct xfs_inode *ip, 1318 + loff_t pos, 1319 + loff_t len, 1320 + bool *did_zero) 1321 + { 1322 + struct inode *inode = VFS_I(ip); 1323 + 1324 + if (IS_DAX(inode)) 1325 + return dax_zero_range(inode, pos, len, did_zero, 1326 + &xfs_direct_write_iomap_ops); 1327 + return iomap_zero_range(inode, pos, len, did_zero, 1328 + &xfs_buffered_write_iomap_ops); 1329 + } 1330 + 1331 + int 1332 + xfs_truncate_page( 1333 + struct xfs_inode *ip, 1334 + loff_t pos, 1335 + bool *did_zero) 1336 + { 1337 + struct inode *inode = VFS_I(ip); 1338 + 1339 + if (IS_DAX(inode)) 1340 + return dax_truncate_page(inode, pos, did_zero, 1341 + &xfs_direct_write_iomap_ops); 1342 + return iomap_truncate_page(inode, pos, did_zero, 1343 + &xfs_buffered_write_iomap_ops); 1344 + }
+9 -3
fs/xfs/xfs_iomap.h
··· 12 12 struct xfs_bmbt_irec; 13 13 14 14 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, 15 - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); 15 + xfs_fileoff_t count_fsb, unsigned int flags, 16 + struct xfs_bmbt_irec *imap); 16 17 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); 17 18 xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, 18 19 xfs_fileoff_t end_fsb); 19 20 20 - int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, 21 - struct xfs_bmbt_irec *, u16); 21 + int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, 22 + struct xfs_bmbt_irec *imap, unsigned int mapping_flags, 23 + u16 iomap_flags); 24 + 25 + int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, 26 + bool *did_zero); 27 + int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); 22 28 23 29 static inline xfs_filblks_t 24 30 xfs_aligned_fsb_count(
+3 -4
fs/xfs/xfs_iops.c
··· 890 890 */ 891 891 if (newsize > oldsize) { 892 892 trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); 893 - error = iomap_zero_range(inode, oldsize, newsize - oldsize, 894 - &did_zeroing, &xfs_buffered_write_iomap_ops); 893 + error = xfs_zero_range(ip, oldsize, newsize - oldsize, 894 + &did_zeroing); 895 895 } else { 896 896 /* 897 897 * iomap won't detect a dirty page over an unwritten block (or a ··· 903 903 newsize); 904 904 if (error) 905 905 return error; 906 - error = iomap_truncate_page(inode, newsize, &did_zeroing, 907 - &xfs_buffered_write_iomap_ops); 906 + error = xfs_truncate_page(ip, newsize, &did_zeroing); 908 907 } 909 908 910 909 if (error)
+2 -2
fs/xfs/xfs_pnfs.c
··· 155 155 xfs_iunlock(ip, lock_flags); 156 156 157 157 error = xfs_iomap_write_direct(ip, offset_fsb, 158 - end_fsb - offset_fsb, &imap); 158 + end_fsb - offset_fsb, 0, &imap); 159 159 if (error) 160 160 goto out_unlock; 161 161 ··· 173 173 } 174 174 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 175 175 176 - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); 176 + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); 177 177 *device_generation = mp->m_generation; 178 178 return error; 179 179 out_unlock:
+1 -2
fs/xfs/xfs_reflink.c
··· 1272 1272 return 0; 1273 1273 1274 1274 trace_xfs_zero_eof(ip, isize, pos - isize); 1275 - return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, 1276 - &xfs_buffered_write_iomap_ops); 1275 + return xfs_zero_range(ip, isize, pos - isize, NULL); 1277 1276 } 1278 1277 1279 1278 /*
+34 -46
fs/xfs/xfs_super.c
··· 331 331 return xfs_is_inode32(mp) ? maxagi : agcount; 332 332 } 333 333 334 - static bool 335 - xfs_buftarg_is_dax( 336 - struct super_block *sb, 337 - struct xfs_buftarg *bt) 334 + static int 335 + xfs_setup_dax_always( 336 + struct xfs_mount *mp) 338 337 { 339 - return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0, 340 - bdev_nr_sectors(bt->bt_bdev)); 338 + if (!mp->m_ddev_targp->bt_daxdev && 339 + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) { 340 + xfs_alert(mp, 341 + "DAX unsupported by block device. Turning off DAX."); 342 + goto disable_dax; 343 + } 344 + 345 + if (mp->m_super->s_blocksize != PAGE_SIZE) { 346 + xfs_alert(mp, 347 + "DAX not supported for blocksize. Turning off DAX."); 348 + goto disable_dax; 349 + } 350 + 351 + if (xfs_has_reflink(mp)) { 352 + xfs_alert(mp, "DAX and reflink cannot be used together!"); 353 + return -EINVAL; 354 + } 355 + 356 + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); 357 + return 0; 358 + 359 + disable_dax: 360 + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); 361 + return 0; 341 362 } 342 363 343 364 STATIC int ··· 391 370 xfs_close_devices( 392 371 struct xfs_mount *mp) 393 372 { 394 - struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev; 395 - 396 373 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { 397 374 struct block_device *logdev = mp->m_logdev_targp->bt_bdev; 398 - struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; 399 375 400 376 xfs_free_buftarg(mp->m_logdev_targp); 401 377 xfs_blkdev_put(logdev); 402 - fs_put_dax(dax_logdev); 403 378 } 404 379 if (mp->m_rtdev_targp) { 405 380 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; 406 - struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; 407 381 408 382 xfs_free_buftarg(mp->m_rtdev_targp); 409 383 xfs_blkdev_put(rtdev); 410 - fs_put_dax(dax_rtdev); 411 384 } 412 385 xfs_free_buftarg(mp->m_ddev_targp); 413 - fs_put_dax(dax_ddev); 414 386 } 415 387 416 388 /* ··· 421 407 struct xfs_mount *mp) 422 408 { 423 409 struct block_device *ddev = mp->m_super->s_bdev; 424 - struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev); 425 - struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL; 426 410 struct block_device *logdev = NULL, *rtdev = NULL; 427 411 int error; 428 412 ··· 430 418 if (mp->m_logname) { 431 419 error = xfs_blkdev_get(mp, mp->m_logname, &logdev); 432 420 if (error) 433 - goto out; 434 - dax_logdev = fs_dax_get_by_bdev(logdev); 421 + return error; 435 422 } 436 423 437 424 if (mp->m_rtname) { ··· 444 433 error = -EINVAL; 445 434 goto out_close_rtdev; 446 435 } 447 - dax_rtdev = fs_dax_get_by_bdev(rtdev); 448 436 } 449 437 450 438 /* 451 439 * Setup xfs_mount buffer target pointers 452 440 */ 453 441 error = -ENOMEM; 454 - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev); 442 + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); 455 443 if (!mp->m_ddev_targp) 456 444 goto out_close_rtdev; 457 445 458 446 if (rtdev) { 459 - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev); 447 + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); 460 448 if (!mp->m_rtdev_targp) 461 449 goto out_free_ddev_targ; 462 450 } 463 451 464 452 if (logdev && logdev != ddev) { 465 - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev); 453 + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); 466 454 if (!mp->m_logdev_targp) 467 455 goto out_free_rtdev_targ; 468 456 } else { ··· 477 467 xfs_free_buftarg(mp->m_ddev_targp); 478 468 out_close_rtdev: 479 469 xfs_blkdev_put(rtdev); 480 - 
fs_put_dax(dax_rtdev); 481 470 out_close_logdev: 482 - if (logdev && logdev != ddev) { 471 + if (logdev && logdev != ddev) 483 472 xfs_blkdev_put(logdev); 484 - fs_put_dax(dax_logdev); 485 - } 486 - out: 487 - fs_put_dax(dax_ddev); 488 473 return error; 489 474 } 490 475 ··· 1598 1593 sb->s_flags |= SB_I_VERSION; 1599 1594 1600 1595 if (xfs_has_dax_always(mp)) { 1601 - bool rtdev_is_dax = false, datadev_is_dax; 1602 - 1603 - xfs_warn(mp, 1604 - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); 1605 - 1606 - datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp); 1607 - if (mp->m_rtdev_targp) 1608 - rtdev_is_dax = xfs_buftarg_is_dax(sb, 1609 - mp->m_rtdev_targp); 1610 - if (!rtdev_is_dax && !datadev_is_dax) { 1611 - xfs_alert(mp, 1612 - "DAX unsupported by block device. Turning off DAX."); 1613 - xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); 1614 - } 1615 - if (xfs_has_reflink(mp)) { 1616 - xfs_alert(mp, 1617 - "DAX and reflink cannot be used together!"); 1618 - error = -EINVAL; 1596 + error = xfs_setup_dax_always(mp); 1597 + if (error) 1619 1598 goto out_filestream_unmount; 1620 - } 1621 1599 } 1622 1600 1623 1601 if (xfs_has_discard(mp)) {
+41 -54
include/linux/dax.h
··· 6 6 #include <linux/mm.h> 7 7 #include <linux/radix-tree.h> 8 8 9 - /* Flag for synchronous flush */ 10 - #define DAXDEV_F_SYNC (1UL << 0) 11 - 12 9 typedef unsigned long dax_entry_t; 13 10 14 - struct iomap_ops; 15 - struct iomap; 16 11 struct dax_device; 12 + struct gendisk; 13 + struct iomap_ops; 14 + struct iomap_iter; 15 + struct iomap; 16 + 17 17 struct dax_operations { 18 18 /* 19 19 * direct_access: translate a device-relative ··· 28 28 */ 29 29 bool (*dax_supported)(struct dax_device *, struct block_device *, int, 30 30 sector_t, sector_t); 31 - /* copy_from_iter: required operation for fs-dax direct-i/o */ 32 - size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, 33 - struct iov_iter *); 34 - /* copy_to_iter: required operation for fs-dax direct-i/o */ 35 - size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, 36 - struct iov_iter *); 37 31 /* zero_page_range: required operation. Zero page range */ 38 32 int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); 39 33 }; 40 34 41 35 #if IS_ENABLED(CONFIG_DAX) 42 - struct dax_device *alloc_dax(void *private, const char *host, 43 - const struct dax_operations *ops, unsigned long flags); 36 + struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); 44 37 void put_dax(struct dax_device *dax_dev); 45 38 void kill_dax(struct dax_device *dax_dev); 46 39 void dax_write_cache(struct dax_device *dax_dev, bool wc); 47 40 bool dax_write_cache_enabled(struct dax_device *dax_dev); 48 - bool __dax_synchronous(struct dax_device *dax_dev); 49 - static inline bool dax_synchronous(struct dax_device *dax_dev) 50 - { 51 - return __dax_synchronous(dax_dev); 52 - } 53 - void __set_dax_synchronous(struct dax_device *dax_dev); 54 - static inline void set_dax_synchronous(struct dax_device *dax_dev) 55 - { 56 - __set_dax_synchronous(dax_dev); 57 - } 41 + bool dax_synchronous(struct dax_device *dax_dev); 42 + void set_dax_synchronous(struct dax_device *dax_dev); 58 43 /* 59 44 * Check if given mapping is supported by the file / underlying device. 
60 45 */ ··· 53 68 return dax_synchronous(dax_dev); 54 69 } 55 70 #else 56 - static inline struct dax_device *alloc_dax(void *private, const char *host, 57 - const struct dax_operations *ops, unsigned long flags) 71 + static inline struct dax_device *alloc_dax(void *private, 72 + const struct dax_operations *ops) 58 73 { 59 74 /* 60 75 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this ··· 89 104 } 90 105 #endif 91 106 107 + void set_dax_nocache(struct dax_device *dax_dev); 108 + void set_dax_nomc(struct dax_device *dax_dev); 109 + 92 110 struct writeback_control; 93 - int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); 94 - #if IS_ENABLED(CONFIG_FS_DAX) 95 - bool generic_fsdax_supported(struct dax_device *dax_dev, 96 - struct block_device *bdev, int blocksize, sector_t start, 97 - sector_t sectors); 98 - 99 - bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 100 - int blocksize, sector_t start, sector_t len); 101 - 111 + #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) 112 + int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); 113 + void dax_remove_host(struct gendisk *disk); 114 + struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, 115 + u64 *start_off); 102 116 static inline void fs_put_dax(struct dax_device *dax_dev) 103 117 { 104 118 put_dax(dax_dev); 105 119 } 120 + #else 121 + static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) 122 + { 123 + return 0; 124 + } 125 + static inline void dax_remove_host(struct gendisk *disk) 126 + { 127 + } 128 + static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, 129 + u64 *start_off) 130 + { 131 + return NULL; 132 + } 133 + static inline void fs_put_dax(struct dax_device *dax_dev) 134 + { 135 + } 136 + #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ 106 137 107 - struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); 138 + #if IS_ENABLED(CONFIG_FS_DAX) 108 139 int dax_writeback_mapping_range(struct address_space *mapping, 109 140 struct dax_device *dax_dev, struct writeback_control *wbc); 110 141 ··· 129 128 dax_entry_t dax_lock_page(struct page *page); 130 129 void dax_unlock_page(struct page *page, dax_entry_t cookie); 131 130 #else 132 - #define generic_fsdax_supported NULL 133 - 134 - static inline bool dax_supported(struct dax_device *dax_dev, 135 - struct block_device *bdev, int blocksize, sector_t start, 136 - sector_t len) 137 - { 138 - return false; 139 - } 140 - 141 - static inline void fs_put_dax(struct dax_device *dax_dev) 142 - { 143 - } 144 - 145 - static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) 146 - { 147 - return NULL; 148 - } 149 - 150 131 static inline struct page *dax_layout_busy_page(struct address_space *mapping) 151 132 { 152 133 return NULL; ··· 156 173 { 157 174 } 158 175 #endif 176 + 177 + int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 178 + const struct iomap_ops *ops); 179 + int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 180 + const struct iomap_ops *ops); 159 181 160 182 #if IS_ENABLED(CONFIG_DAX) 161 183 int dax_read_lock(void); ··· 196 208 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 197 209 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 198 210 pgoff_t index); 199 - s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap); 200 211 static inline bool dax_mapping(struct address_space *mapping) 201 212 { 202 213 return 
mapping->host && IS_DAX(mapping->host);
-4
include/linux/device-mapper.h
··· 147 147 */ 148 148 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, 149 149 long nr_pages, void **kaddr, pfn_t *pfn); 150 - typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, 151 - void *addr, size_t bytes, struct iov_iter *i); 152 150 typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, 153 151 size_t nr_pages); 154 152 ··· 198 200 dm_iterate_devices_fn iterate_devices; 199 201 dm_io_hints_fn io_hints; 200 202 dm_dax_direct_access_fn direct_access; 201 - dm_dax_copy_iter_fn dax_copy_from_iter; 202 - dm_dax_copy_iter_fn dax_copy_to_iter; 203 203 dm_dax_zero_page_range_fn dax_zero_page_range; 204 204 205 205 /* For internal device-mapper use. */
+5
include/linux/iomap.h
··· 141 141 #define IOMAP_NOWAIT (1 << 5) /* do not block */ 142 142 #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ 143 143 #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ 144 + #ifdef CONFIG_FS_DAX 145 + #define IOMAP_DAX (1 << 8) /* DAX mapping */ 146 + #else 147 + #define IOMAP_DAX 0 148 + #endif /* CONFIG_FS_DAX */ 144 149 145 150 struct iomap_ops { 146 151 /*
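Defining IOMAP_DAX as 0 when CONFIG_FS_DAX is disabled lets "flags & IOMAP_DAX" tests in shared filesystem code constant-fold away without #ifdefs. A trivial sketch of such a test, placeholder name only:

#include <linux/iomap.h>

static inline bool example_iomap_is_dax(unsigned int flags)
{
        /* Always false when CONFIG_FS_DAX=n, since IOMAP_DAX is then 0. */
        return flags & IOMAP_DAX;
}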
+3 -15
include/linux/memremap.h
··· 73 73 void (*page_free)(struct page *page); 74 74 75 75 /* 76 - * Transition the refcount in struct dev_pagemap to the dead state. 77 - */ 78 - void (*kill)(struct dev_pagemap *pgmap); 79 - 80 - /* 81 - * Wait for refcount in struct dev_pagemap to be idle and reap it. 82 - */ 83 - void (*cleanup)(struct dev_pagemap *pgmap); 84 - 85 - /* 86 76 * Used for private (un-addressable) device memory only. Must migrate 87 77 * the page back to a CPU accessible page. 88 78 */ ··· 85 95 * struct dev_pagemap - metadata for ZONE_DEVICE mappings 86 96 * @altmap: pre-allocated/reserved memory for vmemmap allocations 87 97 * @ref: reference count that pins the devm_memremap_pages() mapping 88 - * @internal_ref: internal reference if @ref is not provided by the caller 89 - * @done: completion for @internal_ref 98 + * @done: completion for @ref 90 99 * @type: memory type: see MEMORY_* in memory_hotplug.h 91 100 * @flags: PGMAP_* flags to specify defailed behavior 92 101 * @ops: method table ··· 98 109 */ 99 110 struct dev_pagemap { 100 111 struct vmem_altmap altmap; 101 - struct percpu_ref *ref; 102 - struct percpu_ref internal_ref; 112 + struct percpu_ref ref; 103 113 struct completion done; 104 114 enum memory_type type; 105 115 unsigned int flags; ··· 179 191 static inline void put_dev_pagemap(struct dev_pagemap *pgmap) 180 192 { 181 193 if (pgmap) 182 - percpu_ref_put(pgmap->ref); 194 + percpu_ref_put(&pgmap->ref); 183 195 } 184 196 185 197 #endif /* _LINUX_MEMREMAP_H_ */
+1 -19
include/linux/uio.h
··· 203 203 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE 204 204 /* 205 205 * Note, users like pmem that depend on the stricter semantics of 206 - * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for 206 + * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for 207 207 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the 208 208 * destination is flushed from the cache on return. 209 209 */ ··· 217 217 #else 218 218 #define _copy_mc_to_iter _copy_to_iter 219 219 #endif 220 - 221 - static __always_inline __must_check 222 - size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) 223 - { 224 - if (unlikely(!check_copy_size(addr, bytes, false))) 225 - return 0; 226 - else 227 - return _copy_from_iter_flushcache(addr, bytes, i); 228 - } 229 - 230 - static __always_inline __must_check 231 - size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i) 232 - { 233 - if (unlikely(!check_copy_size(addr, bytes, true))) 234 - return 0; 235 - else 236 - return _copy_mc_to_iter(addr, bytes, i); 237 - } 238 220 239 221 size_t iov_iter_zero(size_t bytes, struct iov_iter *); 240 222 unsigned long iov_iter_alignment(const struct iov_iter *i);
+12 -47
mm/memremap.c
··· 112 112 #define for_each_device_pfn(pfn, map, i) \ 113 113 for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) 114 114 115 - static void dev_pagemap_kill(struct dev_pagemap *pgmap) 116 - { 117 - if (pgmap->ops && pgmap->ops->kill) 118 - pgmap->ops->kill(pgmap); 119 - else 120 - percpu_ref_kill(pgmap->ref); 121 - } 122 - 123 - static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) 124 - { 125 - if (pgmap->ops && pgmap->ops->cleanup) { 126 - pgmap->ops->cleanup(pgmap); 127 - } else { 128 - wait_for_completion(&pgmap->done); 129 - percpu_ref_exit(pgmap->ref); 130 - } 131 - /* 132 - * Undo the pgmap ref assignment for the internal case as the 133 - * caller may re-enable the same pgmap. 134 - */ 135 - if (pgmap->ref == &pgmap->internal_ref) 136 - pgmap->ref = NULL; 137 - } 138 - 139 115 static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) 140 116 { 141 117 struct range *range = &pgmap->ranges[range_id]; ··· 143 167 unsigned long pfn; 144 168 int i; 145 169 146 - dev_pagemap_kill(pgmap); 170 + percpu_ref_kill(&pgmap->ref); 147 171 for (i = 0; i < pgmap->nr_range; i++) 148 172 for_each_device_pfn(pfn, pgmap, i) 149 173 put_page(pfn_to_page(pfn)); 150 - dev_pagemap_cleanup(pgmap); 174 + wait_for_completion(&pgmap->done); 175 + percpu_ref_exit(&pgmap->ref); 151 176 152 177 for (i = 0; i < pgmap->nr_range; i++) 153 178 pageunmap_range(pgmap, i); ··· 165 188 166 189 static void dev_pagemap_percpu_release(struct percpu_ref *ref) 167 190 { 168 - struct dev_pagemap *pgmap = 169 - container_of(ref, struct dev_pagemap, internal_ref); 191 + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref); 170 192 171 193 complete(&pgmap->done); 172 194 } ··· 271 295 memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], 272 296 PHYS_PFN(range->start), 273 297 PHYS_PFN(range_len(range)), pgmap); 274 - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) 275 - - pfn_first(pgmap, range_id)); 298 + percpu_ref_get_many(&pgmap->ref, 299 + pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)); 276 300 return 0; 277 301 278 302 err_add_memory: ··· 338 362 break; 339 363 } 340 364 341 - if (!pgmap->ref) { 342 - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) 343 - return ERR_PTR(-EINVAL); 344 - 345 - init_completion(&pgmap->done); 346 - error = percpu_ref_init(&pgmap->internal_ref, 347 - dev_pagemap_percpu_release, 0, GFP_KERNEL); 348 - if (error) 349 - return ERR_PTR(error); 350 - pgmap->ref = &pgmap->internal_ref; 351 - } else { 352 - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { 353 - WARN(1, "Missing reference count teardown definition\n"); 354 - return ERR_PTR(-EINVAL); 355 - } 356 - } 365 + init_completion(&pgmap->done); 366 + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, 367 + GFP_KERNEL); 368 + if (error) 369 + return ERR_PTR(error); 357 370 358 371 devmap_managed_enable_get(pgmap); 359 372 ··· 451 486 /* fall back to slow path lookup */ 452 487 rcu_read_lock(); 453 488 pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); 454 - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) 489 + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) 455 490 pgmap = NULL; 456 491 rcu_read_unlock(); 457 492
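With the refcount now embedded in struct dev_pagemap and managed by memremap_pages() itself, ZONE_DEVICE users no longer supply an external percpu_ref or ->kill()/->cleanup() ops. A sketch of the simplified caller, with placeholder names and MEMORY_DEVICE_GENERIC chosen only as an example type:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/ioport.h>
#include <linux/memremap.h>

static struct dev_pagemap example_pgmap;

static int example_map_device_memory(struct device *dev, struct resource *res)
{
        void *addr;

        example_pgmap.type = MEMORY_DEVICE_GENERIC;
        example_pgmap.range.start = res->start;
        example_pgmap.range.end = res->end;
        example_pgmap.nr_range = 1;
        /* No external percpu_ref and no ->kill()/->cleanup() ops needed. */
        addr = devm_memremap_pages(dev, &example_pgmap);
        return PTR_ERR_OR_ZERO(addr);
}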
+1 -7
tools/testing/nvdimm/Kbuild
··· 35 35 endif 36 36 obj-$(CONFIG_DEV_DAX) += device_dax.o 37 37 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 38 - obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o 39 - obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o 40 38 41 39 nfit-y := $(ACPI_SRC)/core.o 42 40 nfit-y += $(ACPI_SRC)/intel.o ··· 65 67 device_dax-y += device_dax_test.o 66 68 device_dax-y += config_check.o 67 69 68 - dax_pmem-y := $(DAX_SRC)/pmem/pmem.o 70 + dax_pmem-y := $(DAX_SRC)/pmem.o 69 71 dax_pmem-y += dax_pmem_test.o 70 - dax_pmem_core-y := $(DAX_SRC)/pmem/core.o 71 - dax_pmem_core-y += dax_pmem_core_test.o 72 - dax_pmem_compat-y := $(DAX_SRC)/pmem/compat.o 73 - dax_pmem_compat-y += dax_pmem_compat_test.o 74 72 dax_pmem-y += config_check.o 75 73 76 74 libnvdimm-y := $(NVDIMM_SRC)/core.o
-8
tools/testing/nvdimm/dax_pmem_compat_test.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - // Copyright(c) 2019 Intel Corporation. All rights reserved. 3 - 4 - #include <linux/module.h> 5 - #include <linux/printk.h> 6 - #include "watermark.h" 7 - 8 - nfit_test_watermark(dax_pmem_compat);
-8
tools/testing/nvdimm/dax_pmem_core_test.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - // Copyright(c) 2019 Intel Corporation. All rights reserved. 3 - 4 - #include <linux/module.h> 5 - #include <linux/printk.h> 6 - #include "watermark.h" 7 - 8 - nfit_test_watermark(dax_pmem_core);
+10 -29
tools/testing/nvdimm/test/iomap.c
··· 100 100 { 101 101 struct dev_pagemap *pgmap = _pgmap; 102 102 103 - WARN_ON(!pgmap || !pgmap->ref); 103 + WARN_ON(!pgmap); 104 104 105 - if (pgmap->ops && pgmap->ops->kill) 106 - pgmap->ops->kill(pgmap); 107 - else 108 - percpu_ref_kill(pgmap->ref); 105 + percpu_ref_kill(&pgmap->ref); 109 106 110 - if (pgmap->ops && pgmap->ops->cleanup) { 111 - pgmap->ops->cleanup(pgmap); 112 - } else { 113 - wait_for_completion(&pgmap->done); 114 - percpu_ref_exit(pgmap->ref); 115 - } 107 + wait_for_completion(&pgmap->done); 108 + percpu_ref_exit(&pgmap->ref); 116 109 } 117 110 118 111 static void dev_pagemap_percpu_release(struct percpu_ref *ref) 119 112 { 120 - struct dev_pagemap *pgmap = 121 - container_of(ref, struct dev_pagemap, internal_ref); 113 + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref); 122 114 123 115 complete(&pgmap->done); 124 116 } ··· 124 132 if (!nfit_res) 125 133 return devm_memremap_pages(dev, pgmap); 126 134 127 - if (!pgmap->ref) { 128 - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) 129 - return ERR_PTR(-EINVAL); 130 - 131 - init_completion(&pgmap->done); 132 - error = percpu_ref_init(&pgmap->internal_ref, 133 - dev_pagemap_percpu_release, 0, GFP_KERNEL); 134 - if (error) 135 - return ERR_PTR(error); 136 - pgmap->ref = &pgmap->internal_ref; 137 - } else { 138 - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { 139 - WARN(1, "Missing reference count teardown definition\n"); 140 - return ERR_PTR(-EINVAL); 141 - } 142 - } 135 + init_completion(&pgmap->done); 136 + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, 137 + GFP_KERNEL); 138 + if (error) 139 + return ERR_PTR(error); 143 140 144 141 error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); 145 142 if (error)
-4
tools/testing/nvdimm/test/ndtest.c
··· 1054 1054 libnvdimm_test(); 1055 1055 device_dax_test(); 1056 1056 dax_pmem_test(); 1057 - dax_pmem_core_test(); 1058 - #ifdef CONFIG_DEV_DAX_PMEM_COMPAT 1059 - dax_pmem_compat_test(); 1060 - #endif 1061 1057 1062 1058 nfit_test_setup(ndtest_resource_lookup, NULL); 1063 1059
-4
tools/testing/nvdimm/test/nfit.c
··· 3300 3300 acpi_nfit_test(); 3301 3301 device_dax_test(); 3302 3302 dax_pmem_test(); 3303 - dax_pmem_core_test(); 3304 - #ifdef CONFIG_DEV_DAX_PMEM_COMPAT 3305 - dax_pmem_compat_test(); 3306 - #endif 3307 3303 3308 3304 nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm); 3309 3305