Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'libnvdimm-for-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
"The bulk of this has been in multiple -next releases. There were a few
late breaking fixes and small features that got added in the last
couple days, but the whole set has received a build success
notification from the kbuild robot.

Change summary:

- Region media error reporting: A libnvdimm region device is the
parent to one or more namespaces. To date, media errors have been
reported via the "badblocks" attribute attached to pmem block
devices for namespaces in "raw" or "memory" mode. Given that
namespaces can be in "device-dax" or "btt-sector" mode this new
interface reports media errors generically, i.e. independent of
namespace modes or state.

This subsequently allows userspace tooling to craft "ACPI 6.1
Section 9.20.7.6 Function Index 4 - Clear Uncorrectable Error"
requests and submit them via the ioctl path for NVDIMM root bus
devices.

- Introduce 'struct dax_device' and 'struct dax_operations': Prompted
by a request from Linus and feedback from Christoph this allows for
dax capable drivers to publish their own custom dax operations.
This fixes the broken assumption that all dax operations are
related to a persistent memory device, and makes it easier for
other architectures and platforms to add customized persistent
memory support.

- 'libnvdimm' core updates: A new "deep_flush" sysfs attribute is
available for storage appliance applications to manually trigger
memory controllers to drain write-pending buffers that would
otherwise be flushed automatically by the platform ADR
(asynchronous-DRAM-refresh) mechanism at a power loss event.
Support for "locked" DIMMs is included to prevent namespaces from
surfacing when the namespace label data area is locked. Finally,
fixes for various reported deadlocks and crashes, also tagged for
-stable.

- ACPI / nfit driver updates: General updates of the nfit driver to
add DSM command overrides, ACPI 6.1 health state flags support, DSM
payload debug available by default, and various fixes.

Acknowledgements that came after the branch was pushed:

- commit 565851c972b5 "device-dax: fix sysfs attribute deadlock":
Tested-by: Yi Zhang <yizhan@redhat.com>

- commit 23f498448362 "libnvdimm: rework region badblocks clearing":
Tested-by: Toshi Kani <toshi.kani@hpe.com>"

* tag 'libnvdimm-for-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (52 commits)
libnvdimm, pfn: fix 'npfns' vs section alignment
libnvdimm: handle locked label storage areas
libnvdimm: convert NDD_ flags to use bitops, introduce NDD_LOCKED
brd: fix uninitialized use of brd->dax_dev
block, dax: use correct format string in bdev_dax_supported
device-dax: fix sysfs attribute deadlock
libnvdimm: restore "libnvdimm: band aid btt vs clear poison locking"
libnvdimm: fix nvdimm_bus_lock() vs device_lock() ordering
libnvdimm: rework region badblocks clearing
acpi, nfit: kill ACPI_NFIT_DEBUG
libnvdimm: fix clear length of nvdimm_forget_poison()
libnvdimm, pmem: fix a NULL pointer BUG in nd_pmem_notify
libnvdimm, region: sysfs trigger for nvdimm_flush()
libnvdimm: fix phys_addr for nvdimm_clear_poison
x86, dax, pmem: remove indirection around memcpy_from_pmem()
block: remove block_device_operations ->direct_access()
block, dax: convert bdev_dax_supported() to dax_direct_access()
filesystem-dax: convert to dax_direct_access()
Revert "block: use DAX for partition table reads"
ext2, ext4, xfs: retrieve dax_device for iomap operations
...

+1808 -945
+1
arch/powerpc/platforms/Kconfig
··· 284 284 config AXON_RAM 285 285 tristate "Axon DDR2 memory device driver" 286 286 depends on PPC_IBM_CELL_BLADE && BLOCK 287 + select DAX 287 288 default m 288 289 help 289 290 It registers one block device per Axon's DDR2 memory bank found
+33 -12
arch/powerpc/sysdev/axonram.c
··· 25 25 26 26 #include <linux/bio.h> 27 27 #include <linux/blkdev.h> 28 + #include <linux/dax.h> 28 29 #include <linux/device.h> 29 30 #include <linux/errno.h> 30 31 #include <linux/fs.h> ··· 63 62 struct axon_ram_bank { 64 63 struct platform_device *device; 65 64 struct gendisk *disk; 65 + struct dax_device *dax_dev; 66 66 unsigned int irq_id; 67 67 unsigned long ph_addr; 68 68 unsigned long io_addr; ··· 139 137 return BLK_QC_T_NONE; 140 138 } 141 139 142 - /** 143 - * axon_ram_direct_access - direct_access() method for block device 144 - * @device, @sector, @data: see block_device_operations method 145 - */ 140 + static const struct block_device_operations axon_ram_devops = { 141 + .owner = THIS_MODULE, 142 + }; 143 + 146 144 static long 147 - axon_ram_direct_access(struct block_device *device, sector_t sector, 148 - void **kaddr, pfn_t *pfn, long size) 145 + __axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages, 146 + void **kaddr, pfn_t *pfn) 149 147 { 150 - struct axon_ram_bank *bank = device->bd_disk->private_data; 151 - loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; 148 + resource_size_t offset = pgoff * PAGE_SIZE; 152 149 153 150 *kaddr = (void *) bank->io_addr + offset; 154 151 *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV); 155 - return bank->size - offset; 152 + return (bank->size - offset) / PAGE_SIZE; 156 153 } 157 154 158 - static const struct block_device_operations axon_ram_devops = { 159 - .owner = THIS_MODULE, 160 - .direct_access = axon_ram_direct_access 155 + static long 156 + axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 157 + void **kaddr, pfn_t *pfn) 158 + { 159 + struct axon_ram_bank *bank = dax_get_private(dax_dev); 160 + 161 + return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn); 162 + } 163 + 164 + static const struct dax_operations axon_ram_dax_ops = { 165 + .direct_access = axon_ram_dax_direct_access, 161 166 }; 162 167 163 168 /** ··· 
228 219 goto failed; 229 220 } 230 221 222 + 231 223 bank->disk->major = azfs_major; 232 224 bank->disk->first_minor = azfs_minor; 233 225 bank->disk->fops = &axon_ram_devops; ··· 236 226 237 227 sprintf(bank->disk->disk_name, "%s%d", 238 228 AXON_RAM_DEVICE_NAME, axon_ram_bank_id); 229 + 230 + bank->dax_dev = alloc_dax(bank, bank->disk->disk_name, 231 + &axon_ram_dax_ops); 232 + if (!bank->dax_dev) { 233 + rc = -ENOMEM; 234 + goto failed; 235 + } 239 236 240 237 bank->disk->queue = blk_alloc_queue(GFP_KERNEL); 241 238 if (bank->disk->queue == NULL) { ··· 295 278 del_gendisk(bank->disk); 296 279 put_disk(bank->disk); 297 280 } 281 + kill_dax(bank->dax_dev); 282 + put_dax(bank->dax_dev); 298 283 device->dev.platform_data = NULL; 299 284 if (bank->io_addr != 0) 300 285 iounmap((void __iomem *) bank->io_addr); ··· 319 300 320 301 device_remove_file(&device->dev, &dev_attr_ecc); 321 302 free_irq(bank->irq_id, device); 303 + kill_dax(bank->dax_dev); 304 + put_dax(bank->dax_dev); 322 305 del_gendisk(bank->disk); 323 306 put_disk(bank->disk); 324 307 iounmap((void __iomem *) bank->io_addr);
-5
arch/x86/include/asm/pmem.h
··· 44 44 BUG(); 45 45 } 46 46 47 - static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) 48 - { 49 - return memcpy_mcsafe(dst, src, n); 50 - } 51 - 52 47 /** 53 48 * arch_wb_cache_pmem - write back a cache range with CLWB 54 49 * @vaddr: virtual start address
+1
arch/x86/include/asm/string_64.h
··· 79 79 #define memset(s, c, n) __memset(s, c, n) 80 80 #endif 81 81 82 + #define __HAVE_ARCH_MEMCPY_MCSAFE 1 82 83 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); 83 84 DECLARE_STATIC_KEY_FALSE(mcsafe_key); 84 85
+1
block/Kconfig
··· 6 6 default y 7 7 select SBITMAP 8 8 select SRCU 9 + select DAX 9 10 help 10 11 Provide block layer support for the kernel. 11 12
+2 -15
block/partition-generic.c
··· 16 16 #include <linux/kmod.h> 17 17 #include <linux/ctype.h> 18 18 #include <linux/genhd.h> 19 - #include <linux/dax.h> 20 19 #include <linux/blktrace_api.h> 21 20 22 21 #include "partitions/check.h" ··· 629 630 return 0; 630 631 } 631 632 632 - static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) 633 - { 634 - struct address_space *mapping = bdev->bd_inode->i_mapping; 635 - 636 - return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), 637 - NULL); 638 - } 639 - 640 633 unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) 641 634 { 635 + struct address_space *mapping = bdev->bd_inode->i_mapping; 642 636 struct page *page; 643 637 644 - /* don't populate page cache for dax capable devices */ 645 - if (IS_DAX(bdev->bd_inode)) 646 - page = read_dax_sector(bdev, n); 647 - else 648 - page = read_pagecache_sector(bdev, n); 649 - 638 + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); 650 639 if (!IS_ERR(page)) { 651 640 if (PageError(page)) 652 641 goto fail;
+1 -1
drivers/Makefile
··· 71 71 obj-$(CONFIG_NVM) += lightnvm/ 72 72 obj-y += base/ block/ misc/ mfd/ nfc/ 73 73 obj-$(CONFIG_LIBNVDIMM) += nvdimm/ 74 - obj-$(CONFIG_DEV_DAX) += dax/ 74 + obj-$(CONFIG_DAX) += dax/ 75 75 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ 76 76 obj-$(CONFIG_NUBUS) += nubus/ 77 77 obj-y += macintosh/
-12
drivers/acpi/nfit/Kconfig
··· 12 12 13 13 To compile this driver as a module, choose M here: 14 14 the module will be called nfit. 15 - 16 - config ACPI_NFIT_DEBUG 17 - bool "NFIT DSM debug" 18 - depends on ACPI_NFIT 19 - depends on DYNAMIC_DEBUG 20 - default n 21 - help 22 - Enabling this option causes the nfit driver to dump the 23 - input and output buffers of _DSM operations on the ACPI0012 24 - device and its children. This can be very verbose, so leave 25 - it disabled unless you are debugging a hardware / firmware 26 - issue.
+166 -69
drivers/acpi/nfit/core.c
··· 49 49 static bool disable_vendor_specific; 50 50 module_param(disable_vendor_specific, bool, S_IRUGO); 51 51 MODULE_PARM_DESC(disable_vendor_specific, 52 - "Limit commands to the publicly specified set\n"); 52 + "Limit commands to the publicly specified set"); 53 + 54 + static unsigned long override_dsm_mask; 55 + module_param(override_dsm_mask, ulong, S_IRUGO); 56 + MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions"); 57 + 58 + static int default_dsm_family = -1; 59 + module_param(default_dsm_family, int, S_IRUGO); 60 + MODULE_PARM_DESC(default_dsm_family, 61 + "Try this DSM type first when identifying NVDIMM family"); 53 62 54 63 LIST_HEAD(acpi_descs); 55 64 DEFINE_MUTEX(acpi_desc_lock); ··· 184 175 return 0; 185 176 } 186 177 178 + static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status) 179 + { 180 + switch (cmd) { 181 + case ND_CMD_GET_CONFIG_SIZE: 182 + if (status >> 16 & ND_CONFIG_LOCKED) 183 + return -EACCES; 184 + break; 185 + default: 186 + break; 187 + } 188 + 189 + /* all other non-zero status results in an error */ 190 + if (status) 191 + return -EIO; 192 + return 0; 193 + } 194 + 187 195 static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, 188 196 u32 status) 189 197 { 190 198 if (!nvdimm) 191 199 return xlat_bus_status(buf, cmd, status); 192 - if (status) 193 - return -EIO; 194 - return 0; 200 + return xlat_nvdimm_status(buf, cmd, status); 195 201 } 196 202 197 203 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, ··· 283 259 in_buf.buffer.length = call_pkg->nd_size_in; 284 260 } 285 261 286 - if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { 287 - dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n", 288 - __func__, dimm_name, cmd, func, 289 - in_buf.buffer.length); 290 - print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, 262 + dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n", 263 + __func__, dimm_name, cmd, func, in_buf.buffer.length); 
264 + print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, 291 265 in_buf.buffer.pointer, 292 266 min_t(u32, 256, in_buf.buffer.length), true); 293 - } 294 267 295 268 out_obj = acpi_evaluate_dsm(handle, uuid, 1, func, &in_obj); 296 269 if (!out_obj) { ··· 319 298 goto out; 320 299 } 321 300 322 - if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { 323 - dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, 324 - dimm_name, cmd_name, out_obj->buffer.length); 325 - print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 326 - 4, out_obj->buffer.pointer, min_t(u32, 128, 327 - out_obj->buffer.length), true); 328 - } 301 + dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, dimm_name, 302 + cmd_name, out_obj->buffer.length); 303 + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4, 304 + out_obj->buffer.pointer, 305 + min_t(u32, 128, out_obj->buffer.length), true); 329 306 330 307 for (i = 0, offset = 0; i < desc->out_num; i++) { 331 308 u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, ··· 467 448 INIT_LIST_HEAD(&nfit_memdev->list); 468 449 memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev)); 469 450 list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); 470 - dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n", 451 + dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d flags: %#x\n", 471 452 __func__, memdev->device_handle, memdev->range_index, 472 - memdev->region_index); 453 + memdev->region_index, memdev->flags); 473 454 return true; 474 455 } 475 456 ··· 748 729 } 749 730 } 750 731 751 - static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, 732 + static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc, 752 733 struct acpi_nfit_system_address *spa) 753 734 { 754 735 struct nfit_mem *nfit_mem, *found; 755 736 struct nfit_memdev *nfit_memdev; 756 - int type = nfit_spa_type(spa); 737 + int type = spa ? 
nfit_spa_type(spa) : 0; 757 738 758 739 switch (type) { 759 740 case NFIT_SPA_DCR: 760 741 case NFIT_SPA_PM: 761 742 break; 762 743 default: 763 - return 0; 744 + if (spa) 745 + return 0; 764 746 } 765 747 748 + /* 749 + * This loop runs in two modes, when a dimm is mapped the loop 750 + * adds memdev associations to an existing dimm, or creates a 751 + * dimm. In the unmapped dimm case this loop sweeps for memdev 752 + * instances with an invalid / zero range_index and adds those 753 + * dimms without spa associations. 754 + */ 766 755 list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { 767 756 struct nfit_flush *nfit_flush; 768 757 struct nfit_dcr *nfit_dcr; 769 758 u32 device_handle; 770 759 u16 dcr; 771 760 772 - if (nfit_memdev->memdev->range_index != spa->range_index) 761 + if (spa && nfit_memdev->memdev->range_index != spa->range_index) 762 + continue; 763 + if (!spa && nfit_memdev->memdev->range_index) 773 764 continue; 774 765 found = NULL; 775 766 dcr = nfit_memdev->memdev->region_index; ··· 864 835 break; 865 836 } 866 837 nfit_mem_init_bdw(acpi_desc, nfit_mem, spa); 867 - } else { 838 + } else if (type == NFIT_SPA_PM) { 868 839 /* 869 840 * A single dimm may belong to multiple SPA-PM 870 841 * ranges, record at least one in addition to 871 842 * any SPA-DCR range. 872 843 */ 873 844 nfit_mem->memdev_pmem = nfit_memdev->memdev; 874 - } 845 + } else 846 + nfit_mem->memdev_dcr = nfit_memdev->memdev; 875 847 } 876 848 877 849 return 0; ··· 896 866 static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) 897 867 { 898 868 struct nfit_spa *nfit_spa; 869 + int rc; 870 + 899 871 900 872 /* 901 873 * For each SPA-DCR or SPA-PMEM address range find its ··· 908 876 * BDWs are optional. 
909 877 */ 910 878 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { 911 - int rc; 912 - 913 - rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa); 879 + rc = __nfit_mem_init(acpi_desc, nfit_spa->spa); 914 880 if (rc) 915 881 return rc; 916 882 } 883 + 884 + /* 885 + * If a DIMM has failed to be mapped into SPA there will be no 886 + * SPA entries above. Find and register all the unmapped DIMMs 887 + * for reporting and recovery purposes. 888 + */ 889 + rc = __nfit_mem_init(acpi_desc, NULL); 890 + if (rc) 891 + return rc; 917 892 918 893 list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); 919 894 ··· 1276 1237 { 1277 1238 u16 flags = to_nfit_memdev(dev)->flags; 1278 1239 1279 - return sprintf(buf, "%s%s%s%s%s\n", 1240 + return sprintf(buf, "%s%s%s%s%s%s%s\n", 1280 1241 flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "", 1281 1242 flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "", 1282 1243 flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "", 1283 1244 flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "", 1284 - flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : ""); 1245 + flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "", 1246 + flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "", 1247 + flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? 
"smart_notify " : ""); 1285 1248 } 1286 1249 static DEVICE_ATTR_RO(flags); 1287 1250 ··· 1331 1290 struct device *dev = container_of(kobj, struct device, kobj); 1332 1291 struct nvdimm *nvdimm = to_nvdimm(dev); 1333 1292 1334 - if (!to_nfit_dcr(dev)) 1293 + if (!to_nfit_dcr(dev)) { 1294 + /* Without a dcr only the memdev attributes can be surfaced */ 1295 + if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr 1296 + || a == &dev_attr_flags.attr 1297 + || a == &dev_attr_family.attr 1298 + || a == &dev_attr_dsm_mask.attr) 1299 + return a->mode; 1335 1300 return 0; 1301 + } 1302 + 1336 1303 if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1) 1337 1304 return 0; 1338 1305 return a->mode; ··· 1417 1368 unsigned long dsm_mask; 1418 1369 const u8 *uuid; 1419 1370 int i; 1371 + int family = -1; 1420 1372 1421 1373 /* nfit test assumes 1:1 relationship between commands and dsms */ 1422 1374 nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; ··· 1448 1398 */ 1449 1399 for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++) 1450 1400 if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) 1451 - break; 1401 + if (family < 0 || i == default_dsm_family) 1402 + family = i; 1452 1403 1453 1404 /* limit the supported commands to those that are publicly documented */ 1454 - nfit_mem->family = i; 1455 - if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { 1405 + nfit_mem->family = family; 1406 + if (override_dsm_mask && !disable_vendor_specific) 1407 + dsm_mask = override_dsm_mask; 1408 + else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { 1456 1409 dsm_mask = 0x3fe; 1457 1410 if (disable_vendor_specific) 1458 1411 dsm_mask &= ~(1 << ND_CMD_VENDOR); ··· 1515 1462 list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { 1516 1463 struct acpi_nfit_flush_address *flush; 1517 1464 unsigned long flags = 0, cmd_mask; 1465 + struct nfit_memdev *nfit_memdev; 1518 1466 u32 device_handle; 1519 1467 u16 mem_flags; 1520 1468 ··· 1527 1473 } 1528 1474 1529 1475 if 
(nfit_mem->bdw && nfit_mem->memdev_pmem) 1530 - flags |= NDD_ALIASING; 1476 + set_bit(NDD_ALIASING, &flags); 1477 + 1478 + /* collate flags across all memdevs for this dimm */ 1479 + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { 1480 + struct acpi_nfit_memory_map *dimm_memdev; 1481 + 1482 + dimm_memdev = __to_nfit_memdev(nfit_mem); 1483 + if (dimm_memdev->device_handle 1484 + != nfit_memdev->memdev->device_handle) 1485 + continue; 1486 + dimm_memdev->flags |= nfit_memdev->memdev->flags; 1487 + } 1531 1488 1532 1489 mem_flags = __to_nfit_memdev(nfit_mem)->flags; 1533 1490 if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED) 1534 - flags |= NDD_UNARMED; 1491 + set_bit(NDD_UNARMED, &flags); 1535 1492 1536 1493 rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); 1537 1494 if (rc) ··· 1572 1507 if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) 1573 1508 continue; 1574 1509 1575 - dev_info(acpi_desc->dev, "%s flags:%s%s%s%s\n", 1510 + dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n", 1576 1511 nvdimm_name(nvdimm), 1577 1512 mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "", 1578 1513 mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"", 1579 1514 mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "", 1580 - mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : ""); 1515 + mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "", 1516 + mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? 
" map_fail" : ""); 1581 1517 1582 1518 } 1583 1519 ··· 1849 1783 mmio_flush_range((void __force *) 1850 1784 mmio->addr.aperture + offset, c); 1851 1785 1852 - memcpy_from_pmem(iobuf + copied, 1853 - mmio->addr.aperture + offset, c); 1786 + memcpy(iobuf + copied, mmio->addr.aperture + offset, c); 1854 1787 } 1855 1788 1856 1789 copied += c; ··· 2590 2525 acpi_nfit_register_region(acpi_desc, nfit_spa); 2591 2526 } 2592 2527 } 2528 + acpi_desc->init_complete = 1; 2593 2529 2594 2530 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) 2595 2531 acpi_nfit_async_scrub(acpi_desc, nfit_spa); ··· 2613 2547 return rc; 2614 2548 } 2615 2549 2616 - queue_work(nfit_wq, &acpi_desc->work); 2550 + if (!acpi_desc->cancel) 2551 + queue_work(nfit_wq, &acpi_desc->work); 2617 2552 return 0; 2618 2553 } 2619 2554 ··· 2660 2593 return 0; 2661 2594 } 2662 2595 2663 - static void acpi_nfit_destruct(void *data) 2596 + static void acpi_nfit_unregister(void *data) 2664 2597 { 2665 2598 struct acpi_nfit_desc *acpi_desc = data; 2666 - struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); 2667 2599 2668 - /* 2669 - * Destruct under acpi_desc_lock so that nfit_handle_mce does not 2670 - * race teardown 2671 - */ 2672 - mutex_lock(&acpi_desc_lock); 2673 - acpi_desc->cancel = 1; 2674 - /* 2675 - * Bounce the nvdimm bus lock to make sure any in-flight 2676 - * acpi_nfit_ars_rescan() submissions have had a chance to 2677 - * either submit or see ->cancel set. 
2678 - */ 2679 - device_lock(bus_dev); 2680 - device_unlock(bus_dev); 2681 - 2682 - flush_workqueue(nfit_wq); 2683 - if (acpi_desc->scrub_count_state) 2684 - sysfs_put(acpi_desc->scrub_count_state); 2685 2600 nvdimm_bus_unregister(acpi_desc->nvdimm_bus); 2686 - acpi_desc->nvdimm_bus = NULL; 2687 - list_del(&acpi_desc->list); 2688 - mutex_unlock(&acpi_desc_lock); 2689 2601 } 2690 2602 2691 2603 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) ··· 2682 2636 if (!acpi_desc->nvdimm_bus) 2683 2637 return -ENOMEM; 2684 2638 2685 - rc = devm_add_action_or_reset(dev, acpi_nfit_destruct, 2639 + rc = devm_add_action_or_reset(dev, acpi_nfit_unregister, 2686 2640 acpi_desc); 2687 2641 if (rc) 2688 2642 return rc; ··· 2774 2728 device_lock(dev); 2775 2729 device_unlock(dev); 2776 2730 2731 + /* bounce the init_mutex to make init_complete valid */ 2732 + mutex_lock(&acpi_desc->init_mutex); 2733 + if (acpi_desc->cancel || acpi_desc->init_complete) { 2734 + mutex_unlock(&acpi_desc->init_mutex); 2735 + return 0; 2736 + } 2737 + 2777 2738 /* 2778 2739 * Scrub work could take 10s of seconds, userspace may give up so we 2779 2740 * need to be interruptible while waiting. 
··· 2788 2735 INIT_WORK_ONSTACK(&flush.work, flush_probe); 2789 2736 COMPLETION_INITIALIZER_ONSTACK(flush.cmp); 2790 2737 queue_work(nfit_wq, &flush.work); 2738 + mutex_unlock(&acpi_desc->init_mutex); 2791 2739 2792 2740 rc = wait_for_completion_interruptible(&flush.cmp); 2793 2741 cancel_work_sync(&flush.work); ··· 2825 2771 if (work_busy(&acpi_desc->work)) 2826 2772 return -EBUSY; 2827 2773 2828 - if (acpi_desc->cancel) 2829 - return 0; 2830 - 2831 2774 mutex_lock(&acpi_desc->init_mutex); 2775 + if (acpi_desc->cancel) { 2776 + mutex_unlock(&acpi_desc->init_mutex); 2777 + return 0; 2778 + } 2779 + 2832 2780 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { 2833 2781 struct acpi_nfit_system_address *spa = nfit_spa->spa; 2834 2782 ··· 2874 2818 } 2875 2819 EXPORT_SYMBOL_GPL(acpi_nfit_desc_init); 2876 2820 2821 + static void acpi_nfit_put_table(void *table) 2822 + { 2823 + acpi_put_table(table); 2824 + } 2825 + 2826 + void acpi_nfit_shutdown(void *data) 2827 + { 2828 + struct acpi_nfit_desc *acpi_desc = data; 2829 + struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); 2830 + 2831 + /* 2832 + * Destruct under acpi_desc_lock so that nfit_handle_mce does not 2833 + * race teardown 2834 + */ 2835 + mutex_lock(&acpi_desc_lock); 2836 + list_del(&acpi_desc->list); 2837 + mutex_unlock(&acpi_desc_lock); 2838 + 2839 + mutex_lock(&acpi_desc->init_mutex); 2840 + acpi_desc->cancel = 1; 2841 + mutex_unlock(&acpi_desc->init_mutex); 2842 + 2843 + /* 2844 + * Bounce the nvdimm bus lock to make sure any in-flight 2845 + * acpi_nfit_ars_rescan() submissions have had a chance to 2846 + * either submit or see ->cancel set. 
2847 + */ 2848 + device_lock(bus_dev); 2849 + device_unlock(bus_dev); 2850 + 2851 + flush_workqueue(nfit_wq); 2852 + } 2853 + EXPORT_SYMBOL_GPL(acpi_nfit_shutdown); 2854 + 2877 2855 static int acpi_nfit_add(struct acpi_device *adev) 2878 2856 { 2879 2857 struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; ··· 2924 2834 dev_dbg(dev, "failed to find NFIT at startup\n"); 2925 2835 return 0; 2926 2836 } 2837 + 2838 + rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl); 2839 + if (rc) 2840 + return rc; 2927 2841 sz = tbl->length; 2928 2842 2929 2843 acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); ··· 2955 2861 rc = acpi_nfit_init(acpi_desc, (void *) tbl 2956 2862 + sizeof(struct acpi_table_nfit), 2957 2863 sz - sizeof(struct acpi_table_nfit)); 2958 - return rc; 2864 + 2865 + if (rc) 2866 + return rc; 2867 + return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc); 2959 2868 } 2960 2869 2961 2870 static int acpi_nfit_remove(struct acpi_device *adev) 2962 2871 { 2963 - /* see acpi_nfit_destruct */ 2872 + /* see acpi_nfit_unregister */ 2964 2873 return 0; 2965 2874 } 2966 2875
+3 -1
drivers/acpi/nfit/nfit.h
··· 37 37 38 38 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ 39 39 | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ 40 - | ACPI_NFIT_MEM_NOT_ARMED) 40 + | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) 41 41 42 42 enum nfit_uuids { 43 43 /* for simplicity alias the uuid index with the family id */ ··· 163 163 unsigned int scrub_count; 164 164 unsigned int scrub_mode; 165 165 unsigned int cancel:1; 166 + unsigned int init_complete:1; 166 167 unsigned long dimm_cmd_force_en; 167 168 unsigned long bus_cmd_force_en; 168 169 int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, ··· 239 238 240 239 const u8 *to_nfit_uuid(enum nfit_uuids id); 241 240 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); 241 + void acpi_nfit_shutdown(void *data); 242 242 void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); 243 243 void __acpi_nvdimm_notify(struct device *dev, u32 event); 244 244 int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
+1
drivers/block/Kconfig
··· 323 323 324 324 config BLK_DEV_RAM 325 325 tristate "RAM block device support" 326 + select DAX if BLK_DEV_RAM_DAX 326 327 ---help--- 327 328 Saying Y here will allow you to use a portion of your RAM memory as 328 329 a block device, so that you can make file systems on it, read and
+37 -11
drivers/block/brd.c
··· 21 21 #include <linux/slab.h> 22 22 #ifdef CONFIG_BLK_DEV_RAM_DAX 23 23 #include <linux/pfn_t.h> 24 + #include <linux/dax.h> 24 25 #endif 25 26 26 27 #include <linux/uaccess.h> ··· 42 41 43 42 struct request_queue *brd_queue; 44 43 struct gendisk *brd_disk; 44 + #ifdef CONFIG_BLK_DEV_RAM_DAX 45 + struct dax_device *dax_dev; 46 + #endif 45 47 struct list_head brd_list; 46 48 47 49 /* ··· 330 326 } 331 327 332 328 #ifdef CONFIG_BLK_DEV_RAM_DAX 333 - static long brd_direct_access(struct block_device *bdev, sector_t sector, 334 - void **kaddr, pfn_t *pfn, long size) 329 + static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, 330 + long nr_pages, void **kaddr, pfn_t *pfn) 335 331 { 336 - struct brd_device *brd = bdev->bd_disk->private_data; 337 332 struct page *page; 338 333 339 334 if (!brd) 340 335 return -ENODEV; 341 - page = brd_insert_page(brd, sector); 336 + page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); 342 337 if (!page) 343 338 return -ENOSPC; 344 339 *kaddr = page_address(page); 345 340 *pfn = page_to_pfn_t(page); 346 341 347 - return PAGE_SIZE; 342 + return 1; 348 343 } 349 - #else 350 - #define brd_direct_access NULL 344 + 345 + static long brd_dax_direct_access(struct dax_device *dax_dev, 346 + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) 347 + { 348 + struct brd_device *brd = dax_get_private(dax_dev); 349 + 350 + return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); 351 + } 352 + 353 + static const struct dax_operations brd_dax_ops = { 354 + .direct_access = brd_dax_direct_access, 355 + }; 351 356 #endif 352 357 353 358 static const struct block_device_operations brd_fops = { 354 359 .owner = THIS_MODULE, 355 360 .rw_page = brd_rw_page, 356 - .direct_access = brd_direct_access, 357 361 }; 358 362 359 363 /* ··· 427 415 * is harmless) 428 416 */ 429 417 blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); 430 - #ifdef CONFIG_BLK_DEV_RAM_DAX 431 - queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); 432 - 
#endif 433 418 disk = brd->brd_disk = alloc_disk(max_part); 434 419 if (!disk) 435 420 goto out_free_queue; ··· 439 430 sprintf(disk->disk_name, "ram%d", i); 440 431 set_capacity(disk, rd_size * 2); 441 432 433 + #ifdef CONFIG_BLK_DEV_RAM_DAX 434 + queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); 435 + brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops); 436 + if (!brd->dax_dev) 437 + goto out_free_inode; 438 + #endif 439 + 440 + 442 441 return brd; 443 442 443 + #ifdef CONFIG_BLK_DEV_RAM_DAX 444 + out_free_inode: 445 + kill_dax(brd->dax_dev); 446 + put_dax(brd->dax_dev); 447 + #endif 444 448 out_free_queue: 445 449 blk_cleanup_queue(brd->brd_queue); 446 450 out_free_dev: ··· 493 471 static void brd_del_one(struct brd_device *brd) 494 472 { 495 473 list_del(&brd->brd_list); 474 + #ifdef CONFIG_BLK_DEV_RAM_DAX 475 + kill_dax(brd->dax_dev); 476 + put_dax(brd->dax_dev); 477 + #endif 496 478 del_gendisk(brd->brd_disk); 497 479 brd_free(brd); 498 480 }
+8 -4
drivers/dax/Kconfig
··· 1 - menuconfig DEV_DAX 1 + menuconfig DAX 2 2 tristate "DAX: direct access to differentiated memory" 3 - default m if NVDIMM_DAX 4 - depends on TRANSPARENT_HUGEPAGE 5 3 select SRCU 4 + default m if NVDIMM_DAX 5 + 6 + if DAX 7 + 8 + config DEV_DAX 9 + tristate "Device DAX: direct access mapping device" 10 + depends on TRANSPARENT_HUGEPAGE 6 11 help 7 12 Support raw access to differentiated (persistence, bandwidth, 8 13 latency...) memory via an mmap(2) capable character ··· 16 11 baseline memory pool. Mappings of a /dev/daxX.Y device impose 17 12 restrictions that make the mapping behavior deterministic. 18 13 19 - if DEV_DAX 20 14 21 15 config DEV_DAX_PMEM 22 16 tristate "PMEM DAX: direct access to persistent memory"
+4 -1
drivers/dax/Makefile
··· 1 - obj-$(CONFIG_DEV_DAX) += dax.o 1 + obj-$(CONFIG_DAX) += dax.o 2 + obj-$(CONFIG_DEV_DAX) += device_dax.o 2 3 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 3 4 5 + dax-y := super.o 4 6 dax_pmem-y := pmem.o 7 + device_dax-y := device.o
+57
drivers/dax/dax-private.h
··· 1 + /* 2 + * Copyright(c) 2016 Intel Corporation. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of version 2 of the GNU General Public License as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, but 9 + * WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + */ 13 + #ifndef __DAX_PRIVATE_H__ 14 + #define __DAX_PRIVATE_H__ 15 + 16 + #include <linux/device.h> 17 + #include <linux/cdev.h> 18 + 19 + /** 20 + * struct dax_region - mapping infrastructure for dax devices 21 + * @id: kernel-wide unique region for a memory range 22 + * @base: linear address corresponding to @res 23 + * @kref: to pin while other agents have a need to do lookups 24 + * @dev: parent device backing this region 25 + * @align: allocation and mapping alignment for child dax devices 26 + * @res: physical address range of the region 27 + * @pfn_flags: identify whether the pfns are paged back or not 28 + */ 29 + struct dax_region { 30 + int id; 31 + struct ida ida; 32 + void *base; 33 + struct kref kref; 34 + struct device *dev; 35 + unsigned int align; 36 + struct resource res; 37 + unsigned long pfn_flags; 38 + }; 39 + 40 + /** 41 + * struct dev_dax - instance data for a subdivision of a dax region 42 + * @region - parent region 43 + * @dax_dev - core dax functionality 44 + * @dev - device core 45 + * @id - child id in the region 46 + * @num_resources - number of physical address extents in this device 47 + * @res - array of physical address ranges 48 + */ 49 + struct dev_dax { 50 + struct dax_region *region; 51 + struct dax_device *dax_dev; 52 + struct device dev; 53 + int id; 54 + int num_resources; 55 + struct resource res[0]; 56 + }; 57 + #endif
+150 -351
drivers/dax/dax.c drivers/dax/device.c
··· 1 1 /* 2 - * Copyright(c) 2016 Intel Corporation. All rights reserved. 2 + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. 3 3 * 4 4 * This program is free software; you can redistribute it and/or modify 5 5 * it under the terms of version 2 of the GNU General Public License as ··· 13 13 #include <linux/pagemap.h> 14 14 #include <linux/module.h> 15 15 #include <linux/device.h> 16 - #include <linux/magic.h> 17 - #include <linux/mount.h> 18 16 #include <linux/pfn_t.h> 19 - #include <linux/hash.h> 20 17 #include <linux/cdev.h> 21 18 #include <linux/slab.h> 22 19 #include <linux/dax.h> 23 20 #include <linux/fs.h> 24 21 #include <linux/mm.h> 22 + #include "dax-private.h" 25 23 #include "dax.h" 26 24 27 - static dev_t dax_devt; 28 - DEFINE_STATIC_SRCU(dax_srcu); 29 25 static struct class *dax_class; 30 - static DEFINE_IDA(dax_minor_ida); 31 - static int nr_dax = CONFIG_NR_DEV_DAX; 32 - module_param(nr_dax, int, S_IRUGO); 33 - static struct vfsmount *dax_mnt; 34 - static struct kmem_cache *dax_cache __read_mostly; 35 - static struct super_block *dax_superblock __read_mostly; 36 - MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); 37 26 38 - /** 39 - * struct dax_region - mapping infrastructure for dax devices 40 - * @id: kernel-wide unique region for a memory range 41 - * @base: linear address corresponding to @res 42 - * @kref: to pin while other agents have a need to do lookups 43 - * @dev: parent device backing this region 44 - * @align: allocation and mapping alignment for child dax devices 45 - * @res: physical address range of the region 46 - * @pfn_flags: identify whether the pfns are paged back or not 27 + /* 28 + * Rely on the fact that drvdata is set before the attributes are 29 + * registered, and that the attributes are unregistered before drvdata 30 + * is cleared to assume that drvdata is always valid. 
47 31 */ 48 - struct dax_region { 49 - int id; 50 - struct ida ida; 51 - void *base; 52 - struct kref kref; 53 - struct device *dev; 54 - unsigned int align; 55 - struct resource res; 56 - unsigned long pfn_flags; 57 - }; 58 - 59 - /** 60 - * struct dax_dev - subdivision of a dax region 61 - * @region - parent region 62 - * @dev - device backing the character device 63 - * @cdev - core chardev data 64 - * @alive - !alive + srcu grace period == no new mappings can be established 65 - * @id - child id in the region 66 - * @num_resources - number of physical address extents in this device 67 - * @res - array of physical address ranges 68 - */ 69 - struct dax_dev { 70 - struct dax_region *region; 71 - struct inode *inode; 72 - struct device dev; 73 - struct cdev cdev; 74 - bool alive; 75 - int id; 76 - int num_resources; 77 - struct resource res[0]; 78 - }; 79 - 80 32 static ssize_t id_show(struct device *dev, 81 33 struct device_attribute *attr, char *buf) 82 34 { 83 - struct dax_region *dax_region; 84 - ssize_t rc = -ENXIO; 35 + struct dax_region *dax_region = dev_get_drvdata(dev); 85 36 86 - device_lock(dev); 87 - dax_region = dev_get_drvdata(dev); 88 - if (dax_region) 89 - rc = sprintf(buf, "%d\n", dax_region->id); 90 - device_unlock(dev); 91 - 92 - return rc; 37 + return sprintf(buf, "%d\n", dax_region->id); 93 38 } 94 39 static DEVICE_ATTR_RO(id); 95 40 96 41 static ssize_t region_size_show(struct device *dev, 97 42 struct device_attribute *attr, char *buf) 98 43 { 99 - struct dax_region *dax_region; 100 - ssize_t rc = -ENXIO; 44 + struct dax_region *dax_region = dev_get_drvdata(dev); 101 45 102 - device_lock(dev); 103 - dax_region = dev_get_drvdata(dev); 104 - if (dax_region) 105 - rc = sprintf(buf, "%llu\n", (unsigned long long) 106 - resource_size(&dax_region->res)); 107 - device_unlock(dev); 108 - 109 - return rc; 46 + return sprintf(buf, "%llu\n", (unsigned long long) 47 + resource_size(&dax_region->res)); 110 48 } 111 49 static struct device_attribute 
dev_attr_region_size = __ATTR(size, 0444, 112 50 region_size_show, NULL); ··· 52 114 static ssize_t align_show(struct device *dev, 53 115 struct device_attribute *attr, char *buf) 54 116 { 55 - struct dax_region *dax_region; 56 - ssize_t rc = -ENXIO; 117 + struct dax_region *dax_region = dev_get_drvdata(dev); 57 118 58 - device_lock(dev); 59 - dax_region = dev_get_drvdata(dev); 60 - if (dax_region) 61 - rc = sprintf(buf, "%u\n", dax_region->align); 62 - device_unlock(dev); 63 - 64 - return rc; 119 + return sprintf(buf, "%u\n", dax_region->align); 65 120 } 66 121 static DEVICE_ATTR_RO(align); 67 122 ··· 74 143 &dax_region_attribute_group, 75 144 NULL, 76 145 }; 77 - 78 - static struct inode *dax_alloc_inode(struct super_block *sb) 79 - { 80 - return kmem_cache_alloc(dax_cache, GFP_KERNEL); 81 - } 82 - 83 - static void dax_i_callback(struct rcu_head *head) 84 - { 85 - struct inode *inode = container_of(head, struct inode, i_rcu); 86 - 87 - kmem_cache_free(dax_cache, inode); 88 - } 89 - 90 - static void dax_destroy_inode(struct inode *inode) 91 - { 92 - call_rcu(&inode->i_rcu, dax_i_callback); 93 - } 94 - 95 - static const struct super_operations dax_sops = { 96 - .statfs = simple_statfs, 97 - .alloc_inode = dax_alloc_inode, 98 - .destroy_inode = dax_destroy_inode, 99 - .drop_inode = generic_delete_inode, 100 - }; 101 - 102 - static struct dentry *dax_mount(struct file_system_type *fs_type, 103 - int flags, const char *dev_name, void *data) 104 - { 105 - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); 106 - } 107 - 108 - static struct file_system_type dax_type = { 109 - .name = "dax", 110 - .mount = dax_mount, 111 - .kill_sb = kill_anon_super, 112 - }; 113 - 114 - static int dax_test(struct inode *inode, void *data) 115 - { 116 - return inode->i_cdev == data; 117 - } 118 - 119 - static int dax_set(struct inode *inode, void *data) 120 - { 121 - inode->i_cdev = data; 122 - return 0; 123 - } 124 - 125 - static struct inode *dax_inode_get(struct cdev 
*cdev, dev_t devt) 126 - { 127 - struct inode *inode; 128 - 129 - inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 130 - dax_test, dax_set, cdev); 131 - 132 - if (!inode) 133 - return NULL; 134 - 135 - if (inode->i_state & I_NEW) { 136 - inode->i_mode = S_IFCHR; 137 - inode->i_flags = S_DAX; 138 - inode->i_rdev = devt; 139 - mapping_set_gfp_mask(&inode->i_data, GFP_USER); 140 - unlock_new_inode(inode); 141 - } 142 - return inode; 143 - } 144 - 145 - static void init_once(void *inode) 146 - { 147 - inode_init_once(inode); 148 - } 149 - 150 - static int dax_inode_init(void) 151 - { 152 - int rc; 153 - 154 - dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, 155 - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 156 - SLAB_MEM_SPREAD|SLAB_ACCOUNT), 157 - init_once); 158 - if (!dax_cache) 159 - return -ENOMEM; 160 - 161 - rc = register_filesystem(&dax_type); 162 - if (rc) 163 - goto err_register_fs; 164 - 165 - dax_mnt = kern_mount(&dax_type); 166 - if (IS_ERR(dax_mnt)) { 167 - rc = PTR_ERR(dax_mnt); 168 - goto err_mount; 169 - } 170 - dax_superblock = dax_mnt->mnt_sb; 171 - 172 - return 0; 173 - 174 - err_mount: 175 - unregister_filesystem(&dax_type); 176 - err_register_fs: 177 - kmem_cache_destroy(dax_cache); 178 - 179 - return rc; 180 - } 181 - 182 - static void dax_inode_exit(void) 183 - { 184 - kern_unmount(dax_mnt); 185 - unregister_filesystem(&dax_type); 186 - kmem_cache_destroy(dax_cache); 187 - } 188 146 189 147 static void dax_region_free(struct kref *kref) 190 148 { ··· 143 323 } 144 324 EXPORT_SYMBOL_GPL(alloc_dax_region); 145 325 146 - static struct dax_dev *to_dax_dev(struct device *dev) 326 + static struct dev_dax *to_dev_dax(struct device *dev) 147 327 { 148 - return container_of(dev, struct dax_dev, dev); 328 + return container_of(dev, struct dev_dax, dev); 149 329 } 150 330 151 331 static ssize_t size_show(struct device *dev, 152 332 struct device_attribute *attr, char *buf) 153 333 { 154 - struct dax_dev *dax_dev = 
to_dax_dev(dev); 334 + struct dev_dax *dev_dax = to_dev_dax(dev); 155 335 unsigned long long size = 0; 156 336 int i; 157 337 158 - for (i = 0; i < dax_dev->num_resources; i++) 159 - size += resource_size(&dax_dev->res[i]); 338 + for (i = 0; i < dev_dax->num_resources; i++) 339 + size += resource_size(&dev_dax->res[i]); 160 340 161 341 return sprintf(buf, "%llu\n", size); 162 342 } 163 343 static DEVICE_ATTR_RO(size); 164 344 165 - static struct attribute *dax_device_attributes[] = { 345 + static struct attribute *dev_dax_attributes[] = { 166 346 &dev_attr_size.attr, 167 347 NULL, 168 348 }; 169 349 170 - static const struct attribute_group dax_device_attribute_group = { 171 - .attrs = dax_device_attributes, 350 + static const struct attribute_group dev_dax_attribute_group = { 351 + .attrs = dev_dax_attributes, 172 352 }; 173 353 174 354 static const struct attribute_group *dax_attribute_groups[] = { 175 - &dax_device_attribute_group, 355 + &dev_dax_attribute_group, 176 356 NULL, 177 357 }; 178 358 179 - static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, 359 + static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, 180 360 const char *func) 181 361 { 182 - struct dax_region *dax_region = dax_dev->region; 183 - struct device *dev = &dax_dev->dev; 362 + struct dax_region *dax_region = dev_dax->region; 363 + struct device *dev = &dev_dax->dev; 184 364 unsigned long mask; 185 365 186 - if (!dax_dev->alive) 366 + if (!dax_alive(dev_dax->dax_dev)) 187 367 return -ENXIO; 188 368 189 369 /* prevent private mappings from being established */ ··· 217 397 return 0; 218 398 } 219 399 220 - static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, 400 + /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ 401 + __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, 221 402 unsigned long size) 222 403 { 223 404 struct resource *res; 224 405 phys_addr_t phys; 225 406 int i; 226 407 227 - for 
(i = 0; i < dax_dev->num_resources; i++) { 228 - res = &dax_dev->res[i]; 408 + for (i = 0; i < dev_dax->num_resources; i++) { 409 + res = &dev_dax->res[i]; 229 410 phys = pgoff * PAGE_SIZE + res->start; 230 411 if (phys >= res->start && phys <= res->end) 231 412 break; 232 413 pgoff -= PHYS_PFN(resource_size(res)); 233 414 } 234 415 235 - if (i < dax_dev->num_resources) { 236 - res = &dax_dev->res[i]; 416 + if (i < dev_dax->num_resources) { 417 + res = &dev_dax->res[i]; 237 418 if (phys + size - 1 <= res->end) 238 419 return phys; 239 420 } ··· 242 421 return -1; 243 422 } 244 423 245 - static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 424 + static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 246 425 { 247 - struct device *dev = &dax_dev->dev; 426 + struct device *dev = &dev_dax->dev; 248 427 struct dax_region *dax_region; 249 428 int rc = VM_FAULT_SIGBUS; 250 429 phys_addr_t phys; 251 430 pfn_t pfn; 252 431 unsigned int fault_size = PAGE_SIZE; 253 432 254 - if (check_vma(dax_dev, vmf->vma, __func__)) 433 + if (check_vma(dev_dax, vmf->vma, __func__)) 255 434 return VM_FAULT_SIGBUS; 256 435 257 - dax_region = dax_dev->region; 436 + dax_region = dev_dax->region; 258 437 if (dax_region->align > PAGE_SIZE) { 259 - dev_dbg(dev, "%s: alignment > fault size\n", __func__); 438 + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", 439 + __func__, dax_region->align, fault_size); 260 440 return VM_FAULT_SIGBUS; 261 441 } 262 442 263 443 if (fault_size != dax_region->align) 264 444 return VM_FAULT_SIGBUS; 265 445 266 - phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); 446 + phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); 267 447 if (phys == -1) { 268 448 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 269 449 vmf->pgoff); ··· 283 461 return VM_FAULT_NOPAGE; 284 462 } 285 463 286 - static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 464 + static int 
__dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 287 465 { 288 466 unsigned long pmd_addr = vmf->address & PMD_MASK; 289 - struct device *dev = &dax_dev->dev; 467 + struct device *dev = &dev_dax->dev; 290 468 struct dax_region *dax_region; 291 469 phys_addr_t phys; 292 470 pgoff_t pgoff; 293 471 pfn_t pfn; 294 472 unsigned int fault_size = PMD_SIZE; 295 473 296 - if (check_vma(dax_dev, vmf->vma, __func__)) 474 + if (check_vma(dev_dax, vmf->vma, __func__)) 297 475 return VM_FAULT_SIGBUS; 298 476 299 - dax_region = dax_dev->region; 477 + dax_region = dev_dax->region; 300 478 if (dax_region->align > PMD_SIZE) { 301 - dev_dbg(dev, "%s: alignment > fault size\n", __func__); 479 + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", 480 + __func__, dax_region->align, fault_size); 302 481 return VM_FAULT_SIGBUS; 303 482 } 304 483 305 484 /* dax pmd mappings require pfn_t_devmap() */ 306 485 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { 307 - dev_dbg(dev, "%s: alignment > fault size\n", __func__); 486 + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); 308 487 return VM_FAULT_SIGBUS; 309 488 } 310 489 ··· 320 497 return VM_FAULT_SIGBUS; 321 498 322 499 pgoff = linear_page_index(vmf->vma, pmd_addr); 323 - phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); 500 + phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); 324 501 if (phys == -1) { 325 502 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 326 503 pgoff); ··· 334 511 } 335 512 336 513 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 337 - static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 514 + static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 338 515 { 339 516 unsigned long pud_addr = vmf->address & PUD_MASK; 340 - struct device *dev = &dax_dev->dev; 517 + struct device *dev = &dev_dax->dev; 341 518 struct dax_region *dax_region; 342 519 phys_addr_t phys; 343 520 pgoff_t pgoff; ··· 345 522 unsigned int 
fault_size = PUD_SIZE; 346 523 347 524 348 - if (check_vma(dax_dev, vmf->vma, __func__)) 525 + if (check_vma(dev_dax, vmf->vma, __func__)) 349 526 return VM_FAULT_SIGBUS; 350 527 351 - dax_region = dax_dev->region; 528 + dax_region = dev_dax->region; 352 529 if (dax_region->align > PUD_SIZE) { 353 - dev_dbg(dev, "%s: alignment > fault size\n", __func__); 530 + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", 531 + __func__, dax_region->align, fault_size); 354 532 return VM_FAULT_SIGBUS; 355 533 } 356 534 357 535 /* dax pud mappings require pfn_t_devmap() */ 358 536 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { 359 - dev_dbg(dev, "%s: alignment > fault size\n", __func__); 537 + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); 360 538 return VM_FAULT_SIGBUS; 361 539 } 362 540 ··· 372 548 return VM_FAULT_SIGBUS; 373 549 374 550 pgoff = linear_page_index(vmf->vma, pud_addr); 375 - phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); 551 + phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); 376 552 if (phys == -1) { 377 553 dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, 378 554 pgoff); ··· 385 561 vmf->flags & FAULT_FLAG_WRITE); 386 562 } 387 563 #else 388 - static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) 564 + static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 389 565 { 390 566 return VM_FAULT_FALLBACK; 391 567 } 392 568 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 393 569 394 - static int dax_dev_huge_fault(struct vm_fault *vmf, 570 + static int dev_dax_huge_fault(struct vm_fault *vmf, 395 571 enum page_entry_size pe_size) 396 572 { 397 573 int rc, id; 398 574 struct file *filp = vmf->vma->vm_file; 399 - struct dax_dev *dax_dev = filp->private_data; 575 + struct dev_dax *dev_dax = filp->private_data; 400 576 401 - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, 577 + dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", 
__func__, 402 578 current->comm, (vmf->flags & FAULT_FLAG_WRITE) 403 579 ? "write" : "read", 404 - vmf->vma->vm_start, vmf->vma->vm_end); 580 + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); 405 581 406 - id = srcu_read_lock(&dax_srcu); 582 + id = dax_read_lock(); 407 583 switch (pe_size) { 408 584 case PE_SIZE_PTE: 409 - rc = __dax_dev_pte_fault(dax_dev, vmf); 585 + rc = __dev_dax_pte_fault(dev_dax, vmf); 410 586 break; 411 587 case PE_SIZE_PMD: 412 - rc = __dax_dev_pmd_fault(dax_dev, vmf); 588 + rc = __dev_dax_pmd_fault(dev_dax, vmf); 413 589 break; 414 590 case PE_SIZE_PUD: 415 - rc = __dax_dev_pud_fault(dax_dev, vmf); 591 + rc = __dev_dax_pud_fault(dev_dax, vmf); 416 592 break; 417 593 default: 418 - return VM_FAULT_FALLBACK; 594 + rc = VM_FAULT_SIGBUS; 419 595 } 420 - srcu_read_unlock(&dax_srcu, id); 596 + dax_read_unlock(id); 421 597 422 598 return rc; 423 599 } 424 600 425 - static int dax_dev_fault(struct vm_fault *vmf) 601 + static int dev_dax_fault(struct vm_fault *vmf) 426 602 { 427 - return dax_dev_huge_fault(vmf, PE_SIZE_PTE); 603 + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); 428 604 } 429 605 430 - static const struct vm_operations_struct dax_dev_vm_ops = { 431 - .fault = dax_dev_fault, 432 - .huge_fault = dax_dev_huge_fault, 606 + static const struct vm_operations_struct dax_vm_ops = { 607 + .fault = dev_dax_fault, 608 + .huge_fault = dev_dax_huge_fault, 433 609 }; 434 610 435 611 static int dax_mmap(struct file *filp, struct vm_area_struct *vma) 436 612 { 437 - struct dax_dev *dax_dev = filp->private_data; 438 - int rc; 613 + struct dev_dax *dev_dax = filp->private_data; 614 + int rc, id; 439 615 440 - dev_dbg(&dax_dev->dev, "%s\n", __func__); 616 + dev_dbg(&dev_dax->dev, "%s\n", __func__); 441 617 442 - rc = check_vma(dax_dev, vma, __func__); 618 + /* 619 + * We lock to check dax_dev liveness and will re-check at 620 + * fault time. 
621 + */ 622 + id = dax_read_lock(); 623 + rc = check_vma(dev_dax, vma, __func__); 624 + dax_read_unlock(id); 443 625 if (rc) 444 626 return rc; 445 627 446 - vma->vm_ops = &dax_dev_vm_ops; 628 + vma->vm_ops = &dax_vm_ops; 447 629 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; 448 630 return 0; 449 631 } ··· 460 630 unsigned long flags) 461 631 { 462 632 unsigned long off, off_end, off_align, len_align, addr_align, align; 463 - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; 633 + struct dev_dax *dev_dax = filp ? filp->private_data : NULL; 464 634 struct dax_region *dax_region; 465 635 466 - if (!dax_dev || addr) 636 + if (!dev_dax || addr) 467 637 goto out; 468 638 469 - dax_region = dax_dev->region; 639 + dax_region = dev_dax->region; 470 640 align = dax_region->align; 471 641 off = pgoff << PAGE_SHIFT; 472 642 off_end = off + len; ··· 491 661 492 662 static int dax_open(struct inode *inode, struct file *filp) 493 663 { 494 - struct dax_dev *dax_dev; 664 + struct dax_device *dax_dev = inode_dax(inode); 665 + struct inode *__dax_inode = dax_inode(dax_dev); 666 + struct dev_dax *dev_dax = dax_get_private(dax_dev); 495 667 496 - dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); 497 - dev_dbg(&dax_dev->dev, "%s\n", __func__); 498 - inode->i_mapping = dax_dev->inode->i_mapping; 499 - inode->i_mapping->host = dax_dev->inode; 668 + dev_dbg(&dev_dax->dev, "%s\n", __func__); 669 + inode->i_mapping = __dax_inode->i_mapping; 670 + inode->i_mapping->host = __dax_inode; 500 671 filp->f_mapping = inode->i_mapping; 501 - filp->private_data = dax_dev; 672 + filp->private_data = dev_dax; 502 673 inode->i_flags = S_DAX; 503 674 504 675 return 0; ··· 507 676 508 677 static int dax_release(struct inode *inode, struct file *filp) 509 678 { 510 - struct dax_dev *dax_dev = filp->private_data; 679 + struct dev_dax *dev_dax = filp->private_data; 511 680 512 - dev_dbg(&dax_dev->dev, "%s\n", __func__); 681 + dev_dbg(&dev_dax->dev, "%s\n", __func__); 513 682 return 0; 
514 683 } 515 684 ··· 522 691 .mmap = dax_mmap, 523 692 }; 524 693 525 - static void dax_dev_release(struct device *dev) 694 + static void dev_dax_release(struct device *dev) 526 695 { 527 - struct dax_dev *dax_dev = to_dax_dev(dev); 528 - struct dax_region *dax_region = dax_dev->region; 696 + struct dev_dax *dev_dax = to_dev_dax(dev); 697 + struct dax_region *dax_region = dev_dax->region; 698 + struct dax_device *dax_dev = dev_dax->dax_dev; 529 699 530 - ida_simple_remove(&dax_region->ida, dax_dev->id); 531 - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); 700 + ida_simple_remove(&dax_region->ida, dev_dax->id); 532 701 dax_region_put(dax_region); 533 - iput(dax_dev->inode); 534 - kfree(dax_dev); 702 + put_dax(dax_dev); 703 + kfree(dev_dax); 535 704 } 536 705 537 - static void kill_dax_dev(struct dax_dev *dax_dev) 706 + static void kill_dev_dax(struct dev_dax *dev_dax) 538 707 { 539 - /* 540 - * Note, rcu is not protecting the liveness of dax_dev, rcu is 541 - * ensuring that any fault handlers that might have seen 542 - * dax_dev->alive == true, have completed. Any fault handlers 543 - * that start after synchronize_srcu() has started will abort 544 - * upon seeing dax_dev->alive == false. 
545 - */ 546 - dax_dev->alive = false; 547 - synchronize_srcu(&dax_srcu); 548 - unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); 708 + struct dax_device *dax_dev = dev_dax->dax_dev; 709 + struct inode *inode = dax_inode(dax_dev); 710 + 711 + kill_dax(dax_dev); 712 + unmap_mapping_range(inode->i_mapping, 0, 0, 1); 549 713 } 550 714 551 - static void unregister_dax_dev(void *dev) 715 + static void unregister_dev_dax(void *dev) 552 716 { 553 - struct dax_dev *dax_dev = to_dax_dev(dev); 717 + struct dev_dax *dev_dax = to_dev_dax(dev); 718 + struct dax_device *dax_dev = dev_dax->dax_dev; 719 + struct inode *inode = dax_inode(dax_dev); 720 + struct cdev *cdev = inode->i_cdev; 554 721 555 722 dev_dbg(dev, "%s\n", __func__); 556 723 557 - kill_dax_dev(dax_dev); 558 - cdev_device_del(&dax_dev->cdev, dev); 724 + kill_dev_dax(dev_dax); 725 + cdev_device_del(cdev, dev); 559 726 put_device(dev); 560 727 } 561 728 562 - struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, 729 + struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, 563 730 struct resource *res, int count) 564 731 { 565 732 struct device *parent = dax_region->dev; 566 - struct dax_dev *dax_dev; 567 - int rc = 0, minor, i; 733 + struct dax_device *dax_dev; 734 + struct dev_dax *dev_dax; 735 + struct inode *inode; 568 736 struct device *dev; 569 737 struct cdev *cdev; 570 - dev_t dev_t; 738 + int rc = 0, i; 571 739 572 - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); 573 - if (!dax_dev) 740 + dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); 741 + if (!dev_dax) 574 742 return ERR_PTR(-ENOMEM); 575 743 576 744 for (i = 0; i < count; i++) { ··· 579 749 rc = -EINVAL; 580 750 break; 581 751 } 582 - dax_dev->res[i].start = res[i].start; 583 - dax_dev->res[i].end = res[i].end; 752 + dev_dax->res[i].start = res[i].start; 753 + dev_dax->res[i].end = res[i].end; 584 754 } 585 755 586 756 if (i < count) 587 757 goto err_id; 588 758 589 - 
dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); 590 - if (dax_dev->id < 0) { 591 - rc = dax_dev->id; 759 + dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); 760 + if (dev_dax->id < 0) { 761 + rc = dev_dax->id; 592 762 goto err_id; 593 763 } 594 764 595 - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); 596 - if (minor < 0) { 597 - rc = minor; 598 - goto err_minor; 599 - } 600 - 601 - dev_t = MKDEV(MAJOR(dax_devt), minor); 602 - dev = &dax_dev->dev; 603 - dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); 604 - if (!dax_dev->inode) { 605 - rc = -ENOMEM; 606 - goto err_inode; 607 - } 765 + /* 766 + * No 'host' or dax_operations since there is no access to this 767 + * device outside of mmap of the resulting character device. 768 + */ 769 + dax_dev = alloc_dax(dev_dax, NULL, NULL); 770 + if (!dax_dev) 771 + goto err_dax; 608 772 609 773 /* from here on we're committed to teardown via dax_dev_release() */ 774 + dev = &dev_dax->dev; 610 775 device_initialize(dev); 611 776 612 - cdev = &dax_dev->cdev; 777 + inode = dax_inode(dax_dev); 778 + cdev = inode->i_cdev; 613 779 cdev_init(cdev, &dax_fops); 614 780 cdev->owner = parent->driver->owner; 615 781 616 - dax_dev->num_resources = count; 617 - dax_dev->alive = true; 618 - dax_dev->region = dax_region; 782 + dev_dax->num_resources = count; 783 + dev_dax->dax_dev = dax_dev; 784 + dev_dax->region = dax_region; 619 785 kref_get(&dax_region->kref); 620 786 621 - dev->devt = dev_t; 787 + dev->devt = inode->i_rdev; 622 788 dev->class = dax_class; 623 789 dev->parent = parent; 624 790 dev->groups = dax_attribute_groups; 625 - dev->release = dax_dev_release; 626 - dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); 791 + dev->release = dev_dax_release; 792 + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); 627 793 628 794 rc = cdev_device_add(cdev, dev); 629 795 if (rc) { 630 - kill_dax_dev(dax_dev); 796 + kill_dev_dax(dev_dax); 631 797 put_device(dev); 632 798 
return ERR_PTR(rc); 633 799 } 634 800 635 - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); 801 + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); 636 802 if (rc) 637 803 return ERR_PTR(rc); 638 804 639 - return dax_dev; 805 + return dev_dax; 640 806 641 - err_inode: 642 - ida_simple_remove(&dax_minor_ida, minor); 643 - err_minor: 644 - ida_simple_remove(&dax_region->ida, dax_dev->id); 807 + err_dax: 808 + ida_simple_remove(&dax_region->ida, dev_dax->id); 645 809 err_id: 646 - kfree(dax_dev); 810 + kfree(dev_dax); 647 811 648 812 return ERR_PTR(rc); 649 813 } 650 - EXPORT_SYMBOL_GPL(devm_create_dax_dev); 814 + EXPORT_SYMBOL_GPL(devm_create_dev_dax); 651 815 652 816 static int __init dax_init(void) 653 817 { 654 - int rc; 655 - 656 - rc = dax_inode_init(); 657 - if (rc) 658 - return rc; 659 - 660 - nr_dax = max(nr_dax, 256); 661 - rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); 662 - if (rc) 663 - goto err_chrdev; 664 - 665 818 dax_class = class_create(THIS_MODULE, "dax"); 666 - if (IS_ERR(dax_class)) { 667 - rc = PTR_ERR(dax_class); 668 - goto err_class; 669 - } 670 - 671 - return 0; 672 - 673 - err_class: 674 - unregister_chrdev_region(dax_devt, nr_dax); 675 - err_chrdev: 676 - dax_inode_exit(); 677 - return rc; 819 + return PTR_ERR_OR_ZERO(dax_class); 678 820 } 679 821 680 822 static void __exit dax_exit(void) 681 823 { 682 824 class_destroy(dax_class); 683 - unregister_chrdev_region(dax_devt, nr_dax); 684 - ida_destroy(&dax_minor_ida); 685 - dax_inode_exit(); 686 825 } 687 826 688 827 MODULE_AUTHOR("Intel Corporation");
+4 -11
drivers/dax/dax.h
··· 1 1 /* 2 - * Copyright(c) 2016 Intel Corporation. All rights reserved. 2 + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. 3 3 * 4 4 * This program is free software; you can redistribute it and/or modify 5 5 * it under the terms of version 2 of the GNU General Public License as ··· 12 12 */ 13 13 #ifndef __DAX_H__ 14 14 #define __DAX_H__ 15 - struct device; 16 - struct dax_dev; 17 - struct resource; 18 - struct dax_region; 19 - void dax_region_put(struct dax_region *dax_region); 20 - struct dax_region *alloc_dax_region(struct device *parent, 21 - int region_id, struct resource *res, unsigned int align, 22 - void *addr, unsigned long flags); 23 - struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, 24 - struct resource *res, int count); 15 + struct dax_device; 16 + struct dax_device *inode_dax(struct inode *inode); 17 + struct inode *dax_inode(struct dax_device *dax_dev); 25 18 #endif /* __DAX_H__ */
+25
drivers/dax/device-dax.h
··· 1 + /* 2 + * Copyright(c) 2016 Intel Corporation. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of version 2 of the GNU General Public License as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, but 9 + * WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + */ 13 + #ifndef __DEVICE_DAX_H__ 14 + #define __DEVICE_DAX_H__ 15 + struct device; 16 + struct dev_dax; 17 + struct resource; 18 + struct dax_region; 19 + void dax_region_put(struct dax_region *dax_region); 20 + struct dax_region *alloc_dax_region(struct device *parent, 21 + int region_id, struct resource *res, unsigned int align, 22 + void *addr, unsigned long flags); 23 + struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, 24 + struct resource *res, int count); 25 + #endif /* __DEVICE_DAX_H__ */
+5 -5
drivers/dax/pmem.c
··· 16 16 #include <linux/pfn_t.h> 17 17 #include "../nvdimm/pfn.h" 18 18 #include "../nvdimm/nd.h" 19 - #include "dax.h" 19 + #include "device-dax.h" 20 20 21 21 struct dax_pmem { 22 22 struct device *dev; ··· 61 61 int rc; 62 62 void *addr; 63 63 struct resource res; 64 - struct dax_dev *dax_dev; 65 64 struct nd_pfn_sb *pfn_sb; 65 + struct dev_dax *dev_dax; 66 66 struct dax_pmem *dax_pmem; 67 67 struct nd_region *nd_region; 68 68 struct nd_namespace_io *nsio; ··· 130 130 return -ENOMEM; 131 131 132 132 /* TODO: support for subdividing a dax region... */ 133 - dax_dev = devm_create_dax_dev(dax_region, &res, 1); 133 + dev_dax = devm_create_dev_dax(dax_region, &res, 1); 134 134 135 - /* child dax_dev instances now own the lifetime of the dax_region */ 135 + /* child dev_dax instances now own the lifetime of the dax_region */ 136 136 dax_region_put(dax_region); 137 137 138 - return PTR_ERR_OR_ZERO(dax_dev); 138 + return PTR_ERR_OR_ZERO(dev_dax); 139 139 } 140 140 141 141 static struct nd_device_driver dax_pmem_driver = {
+425
drivers/dax/super.c
··· 1 + /* 2 + * Copyright(c) 2017 Intel Corporation. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of version 2 of the GNU General Public License as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, but 9 + * WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + */ 13 + #include <linux/pagemap.h> 14 + #include <linux/module.h> 15 + #include <linux/mount.h> 16 + #include <linux/magic.h> 17 + #include <linux/cdev.h> 18 + #include <linux/hash.h> 19 + #include <linux/slab.h> 20 + #include <linux/dax.h> 21 + #include <linux/fs.h> 22 + 23 + static int nr_dax = CONFIG_NR_DEV_DAX; 24 + module_param(nr_dax, int, S_IRUGO); 25 + MODULE_PARM_DESC(nr_dax, "max number of dax device instances"); 26 + 27 + static dev_t dax_devt; 28 + DEFINE_STATIC_SRCU(dax_srcu); 29 + static struct vfsmount *dax_mnt; 30 + static DEFINE_IDA(dax_minor_ida); 31 + static struct kmem_cache *dax_cache __read_mostly; 32 + static struct super_block *dax_superblock __read_mostly; 33 + 34 + #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 35 + static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 36 + static DEFINE_SPINLOCK(dax_host_lock); 37 + 38 + int dax_read_lock(void) 39 + { 40 + return srcu_read_lock(&dax_srcu); 41 + } 42 + EXPORT_SYMBOL_GPL(dax_read_lock); 43 + 44 + void dax_read_unlock(int id) 45 + { 46 + srcu_read_unlock(&dax_srcu, id); 47 + } 48 + EXPORT_SYMBOL_GPL(dax_read_unlock); 49 + 50 + /** 51 + * struct dax_device - anchor object for dax services 52 + * @inode: core vfs 53 + * @cdev: optional character interface for "device dax" 54 + * @host: optional name for lookups where the device path is not available 55 + * @private: dax driver private data 56 + * @alive: !alive + rcu grace period == no 
new operations / mappings 57 + */ 58 + struct dax_device { 59 + struct hlist_node list; 60 + struct inode inode; 61 + struct cdev cdev; 62 + const char *host; 63 + void *private; 64 + bool alive; 65 + const struct dax_operations *ops; 66 + }; 67 + 68 + /** 69 + * dax_direct_access() - translate a device pgoff to an absolute pfn 70 + * @dax_dev: a dax_device instance representing the logical memory range 71 + * @pgoff: offset in pages from the start of the device to translate 72 + * @nr_pages: number of consecutive pages caller can handle relative to @pfn 73 + * @kaddr: output parameter that returns a virtual address mapping of pfn 74 + * @pfn: output parameter that returns an absolute pfn translation of @pgoff 75 + * 76 + * Return: negative errno if an error occurs, otherwise the number of 77 + * pages accessible at the device relative @pgoff. 78 + */ 79 + long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 80 + void **kaddr, pfn_t *pfn) 81 + { 82 + long avail; 83 + 84 + /* 85 + * The device driver is allowed to sleep, in order to make the 86 + * memory directly accessible. 
87 + */ 88 + might_sleep(); 89 + 90 + if (!dax_dev) 91 + return -EOPNOTSUPP; 92 + 93 + if (!dax_alive(dax_dev)) 94 + return -ENXIO; 95 + 96 + if (nr_pages < 0) 97 + return nr_pages; 98 + 99 + avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, 100 + kaddr, pfn); 101 + if (!avail) 102 + return -ERANGE; 103 + return min(avail, nr_pages); 104 + } 105 + EXPORT_SYMBOL_GPL(dax_direct_access); 106 + 107 + bool dax_alive(struct dax_device *dax_dev) 108 + { 109 + lockdep_assert_held(&dax_srcu); 110 + return dax_dev->alive; 111 + } 112 + EXPORT_SYMBOL_GPL(dax_alive); 113 + 114 + static int dax_host_hash(const char *host) 115 + { 116 + return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 117 + } 118 + 119 + /* 120 + * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring 121 + * that any fault handlers or operations that might have seen 122 + * dax_alive(), have completed. Any operations that start after 123 + * synchronize_srcu() has run will abort upon seeing !dax_alive(). 
124 + */ 125 + void kill_dax(struct dax_device *dax_dev) 126 + { 127 + if (!dax_dev) 128 + return; 129 + 130 + dax_dev->alive = false; 131 + 132 + synchronize_srcu(&dax_srcu); 133 + 134 + spin_lock(&dax_host_lock); 135 + hlist_del_init(&dax_dev->list); 136 + spin_unlock(&dax_host_lock); 137 + 138 + dax_dev->private = NULL; 139 + } 140 + EXPORT_SYMBOL_GPL(kill_dax); 141 + 142 + static struct inode *dax_alloc_inode(struct super_block *sb) 143 + { 144 + struct dax_device *dax_dev; 145 + 146 + dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); 147 + return &dax_dev->inode; 148 + } 149 + 150 + static struct dax_device *to_dax_dev(struct inode *inode) 151 + { 152 + return container_of(inode, struct dax_device, inode); 153 + } 154 + 155 + static void dax_i_callback(struct rcu_head *head) 156 + { 157 + struct inode *inode = container_of(head, struct inode, i_rcu); 158 + struct dax_device *dax_dev = to_dax_dev(inode); 159 + 160 + kfree(dax_dev->host); 161 + dax_dev->host = NULL; 162 + ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); 163 + kmem_cache_free(dax_cache, dax_dev); 164 + } 165 + 166 + static void dax_destroy_inode(struct inode *inode) 167 + { 168 + struct dax_device *dax_dev = to_dax_dev(inode); 169 + 170 + WARN_ONCE(dax_dev->alive, 171 + "kill_dax() must be called before final iput()\n"); 172 + call_rcu(&inode->i_rcu, dax_i_callback); 173 + } 174 + 175 + static const struct super_operations dax_sops = { 176 + .statfs = simple_statfs, 177 + .alloc_inode = dax_alloc_inode, 178 + .destroy_inode = dax_destroy_inode, 179 + .drop_inode = generic_delete_inode, 180 + }; 181 + 182 + static struct dentry *dax_mount(struct file_system_type *fs_type, 183 + int flags, const char *dev_name, void *data) 184 + { 185 + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); 186 + } 187 + 188 + static struct file_system_type dax_fs_type = { 189 + .name = "dax", 190 + .mount = dax_mount, 191 + .kill_sb = kill_anon_super, 192 + }; 193 + 194 + static int 
dax_test(struct inode *inode, void *data) 195 + { 196 + dev_t devt = *(dev_t *) data; 197 + 198 + return inode->i_rdev == devt; 199 + } 200 + 201 + static int dax_set(struct inode *inode, void *data) 202 + { 203 + dev_t devt = *(dev_t *) data; 204 + 205 + inode->i_rdev = devt; 206 + return 0; 207 + } 208 + 209 + static struct dax_device *dax_dev_get(dev_t devt) 210 + { 211 + struct dax_device *dax_dev; 212 + struct inode *inode; 213 + 214 + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 215 + dax_test, dax_set, &devt); 216 + 217 + if (!inode) 218 + return NULL; 219 + 220 + dax_dev = to_dax_dev(inode); 221 + if (inode->i_state & I_NEW) { 222 + dax_dev->alive = true; 223 + inode->i_cdev = &dax_dev->cdev; 224 + inode->i_mode = S_IFCHR; 225 + inode->i_flags = S_DAX; 226 + mapping_set_gfp_mask(&inode->i_data, GFP_USER); 227 + unlock_new_inode(inode); 228 + } 229 + 230 + return dax_dev; 231 + } 232 + 233 + static void dax_add_host(struct dax_device *dax_dev, const char *host) 234 + { 235 + int hash; 236 + 237 + /* 238 + * Unconditionally init dax_dev since it's coming from a 239 + * non-zeroed slab cache 240 + */ 241 + INIT_HLIST_NODE(&dax_dev->list); 242 + dax_dev->host = host; 243 + if (!host) 244 + return; 245 + 246 + hash = dax_host_hash(host); 247 + spin_lock(&dax_host_lock); 248 + hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 249 + spin_unlock(&dax_host_lock); 250 + } 251 + 252 + struct dax_device *alloc_dax(void *private, const char *__host, 253 + const struct dax_operations *ops) 254 + { 255 + struct dax_device *dax_dev; 256 + const char *host; 257 + dev_t devt; 258 + int minor; 259 + 260 + host = kstrdup(__host, GFP_KERNEL); 261 + if (__host && !host) 262 + return NULL; 263 + 264 + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); 265 + if (minor < 0) 266 + goto err_minor; 267 + 268 + devt = MKDEV(MAJOR(dax_devt), minor); 269 + dax_dev = dax_dev_get(devt); 270 + if (!dax_dev) 271 + goto err_dev; 272 + 273 + 
dax_add_host(dax_dev, host); 274 + dax_dev->ops = ops; 275 + dax_dev->private = private; 276 + return dax_dev; 277 + 278 + err_dev: 279 + ida_simple_remove(&dax_minor_ida, minor); 280 + err_minor: 281 + kfree(host); 282 + return NULL; 283 + } 284 + EXPORT_SYMBOL_GPL(alloc_dax); 285 + 286 + void put_dax(struct dax_device *dax_dev) 287 + { 288 + if (!dax_dev) 289 + return; 290 + iput(&dax_dev->inode); 291 + } 292 + EXPORT_SYMBOL_GPL(put_dax); 293 + 294 + /** 295 + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 296 + * @host: alternate name for the device registered by a dax driver 297 + */ 298 + struct dax_device *dax_get_by_host(const char *host) 299 + { 300 + struct dax_device *dax_dev, *found = NULL; 301 + int hash, id; 302 + 303 + if (!host) 304 + return NULL; 305 + 306 + hash = dax_host_hash(host); 307 + 308 + id = dax_read_lock(); 309 + spin_lock(&dax_host_lock); 310 + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 311 + if (!dax_alive(dax_dev) 312 + || strcmp(host, dax_dev->host) != 0) 313 + continue; 314 + 315 + if (igrab(&dax_dev->inode)) 316 + found = dax_dev; 317 + break; 318 + } 319 + spin_unlock(&dax_host_lock); 320 + dax_read_unlock(id); 321 + 322 + return found; 323 + } 324 + EXPORT_SYMBOL_GPL(dax_get_by_host); 325 + 326 + /** 327 + * inode_dax: convert a public inode into its dax_dev 328 + * @inode: An inode with i_cdev pointing to a dax_dev 329 + * 330 + * Note this is not equivalent to to_dax_dev() which is for private 331 + * internal use where we know the inode filesystem type == dax_fs_type. 
332 + */ 333 + struct dax_device *inode_dax(struct inode *inode) 334 + { 335 + struct cdev *cdev = inode->i_cdev; 336 + 337 + return container_of(cdev, struct dax_device, cdev); 338 + } 339 + EXPORT_SYMBOL_GPL(inode_dax); 340 + 341 + struct inode *dax_inode(struct dax_device *dax_dev) 342 + { 343 + return &dax_dev->inode; 344 + } 345 + EXPORT_SYMBOL_GPL(dax_inode); 346 + 347 + void *dax_get_private(struct dax_device *dax_dev) 348 + { 349 + return dax_dev->private; 350 + } 351 + EXPORT_SYMBOL_GPL(dax_get_private); 352 + 353 + static void init_once(void *_dax_dev) 354 + { 355 + struct dax_device *dax_dev = _dax_dev; 356 + struct inode *inode = &dax_dev->inode; 357 + 358 + inode_init_once(inode); 359 + } 360 + 361 + static int __dax_fs_init(void) 362 + { 363 + int rc; 364 + 365 + dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, 366 + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 367 + SLAB_MEM_SPREAD|SLAB_ACCOUNT), 368 + init_once); 369 + if (!dax_cache) 370 + return -ENOMEM; 371 + 372 + rc = register_filesystem(&dax_fs_type); 373 + if (rc) 374 + goto err_register_fs; 375 + 376 + dax_mnt = kern_mount(&dax_fs_type); 377 + if (IS_ERR(dax_mnt)) { 378 + rc = PTR_ERR(dax_mnt); 379 + goto err_mount; 380 + } 381 + dax_superblock = dax_mnt->mnt_sb; 382 + 383 + return 0; 384 + 385 + err_mount: 386 + unregister_filesystem(&dax_fs_type); 387 + err_register_fs: 388 + kmem_cache_destroy(dax_cache); 389 + 390 + return rc; 391 + } 392 + 393 + static void __dax_fs_exit(void) 394 + { 395 + kern_unmount(dax_mnt); 396 + unregister_filesystem(&dax_fs_type); 397 + kmem_cache_destroy(dax_cache); 398 + } 399 + 400 + static int __init dax_fs_init(void) 401 + { 402 + int rc; 403 + 404 + rc = __dax_fs_init(); 405 + if (rc) 406 + return rc; 407 + 408 + nr_dax = max(nr_dax, 256); 409 + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); 410 + if (rc) 411 + __dax_fs_exit(); 412 + return rc; 413 + } 414 + 415 + static void __exit dax_fs_exit(void) 416 + { 417 + 
unregister_chrdev_region(dax_devt, nr_dax); 418 + ida_destroy(&dax_minor_ida); 419 + __dax_fs_exit(); 420 + } 421 + 422 + MODULE_AUTHOR("Intel Corporation"); 423 + MODULE_LICENSE("GPL v2"); 424 + subsys_initcall(dax_fs_init); 425 + module_exit(dax_fs_exit);
+1
drivers/md/Kconfig
··· 200 200 config BLK_DEV_DM 201 201 tristate "Device mapper support" 202 202 select BLK_DEV_DM_BUILTIN 203 + select DAX 203 204 ---help--- 204 205 Device-mapper is a low level volume manager. It works by allowing 205 206 people to specify mappings for ranges of logical sectors. Various
+1
drivers/md/dm-core.h
··· 58 58 struct target_type *immutable_target_type; 59 59 60 60 struct gendisk *disk; 61 + struct dax_device *dax_dev; 61 62 char name[16]; 62 63 63 64 void *interface_ptr;
+12 -13
drivers/md/dm-linear.c
··· 9 9 #include <linux/init.h> 10 10 #include <linux/blkdev.h> 11 11 #include <linux/bio.h> 12 + #include <linux/dax.h> 12 13 #include <linux/slab.h> 13 14 #include <linux/device-mapper.h> 14 15 ··· 143 142 return fn(ti, lc->dev, lc->start, ti->len, data); 144 143 } 145 144 146 - static long linear_direct_access(struct dm_target *ti, sector_t sector, 147 - void **kaddr, pfn_t *pfn, long size) 145 + static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 146 + long nr_pages, void **kaddr, pfn_t *pfn) 148 147 { 148 + long ret; 149 149 struct linear_c *lc = ti->private; 150 150 struct block_device *bdev = lc->dev->bdev; 151 - struct blk_dax_ctl dax = { 152 - .sector = linear_map_sector(ti, sector), 153 - .size = size, 154 - }; 155 - long ret; 151 + struct dax_device *dax_dev = lc->dev->dax_dev; 152 + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 156 153 157 - ret = bdev_direct_access(bdev, &dax); 158 - *kaddr = dax.addr; 159 - *pfn = dax.pfn; 160 - 161 - return ret; 154 + dev_sector = linear_map_sector(ti, sector); 155 + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); 156 + if (ret) 157 + return ret; 158 + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); 162 159 } 163 160 164 161 static struct target_type linear_target = { ··· 170 171 .status = linear_status, 171 172 .prepare_ioctl = linear_prepare_ioctl, 172 173 .iterate_devices = linear_iterate_devices, 173 - .direct_access = linear_direct_access, 174 + .direct_access = linear_dax_direct_access, 174 175 }; 175 176 176 177 int __init dm_linear_init(void)
+3 -3
drivers/md/dm-snap.c
··· 2302 2302 return do_origin(o->dev, bio); 2303 2303 } 2304 2304 2305 - static long origin_direct_access(struct dm_target *ti, sector_t sector, 2306 - void **kaddr, pfn_t *pfn, long size) 2305 + static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 2306 + long nr_pages, void **kaddr, pfn_t *pfn) 2307 2307 { 2308 2308 DMWARN("device does not support dax."); 2309 2309 return -EIO; ··· 2368 2368 .postsuspend = origin_postsuspend, 2369 2369 .status = origin_status, 2370 2370 .iterate_devices = origin_iterate_devices, 2371 - .direct_access = origin_direct_access, 2371 + .direct_access = origin_dax_direct_access, 2372 2372 }; 2373 2373 2374 2374 static struct target_type snapshot_target = {
+14 -15
drivers/md/dm-stripe.c
··· 11 11 #include <linux/init.h> 12 12 #include <linux/blkdev.h> 13 13 #include <linux/bio.h> 14 + #include <linux/dax.h> 14 15 #include <linux/slab.h> 15 16 #include <linux/log2.h> 16 17 ··· 311 310 return DM_MAPIO_REMAPPED; 312 311 } 313 312 314 - static long stripe_direct_access(struct dm_target *ti, sector_t sector, 315 - void **kaddr, pfn_t *pfn, long size) 313 + static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 314 + long nr_pages, void **kaddr, pfn_t *pfn) 316 315 { 316 + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; 317 317 struct stripe_c *sc = ti->private; 318 - uint32_t stripe; 318 + struct dax_device *dax_dev; 319 319 struct block_device *bdev; 320 - struct blk_dax_ctl dax = { 321 - .size = size, 322 - }; 320 + uint32_t stripe; 323 321 long ret; 324 322 325 - stripe_map_sector(sc, sector, &stripe, &dax.sector); 326 - 327 - dax.sector += sc->stripe[stripe].physical_start; 323 + stripe_map_sector(sc, sector, &stripe, &dev_sector); 324 + dev_sector += sc->stripe[stripe].physical_start; 325 + dax_dev = sc->stripe[stripe].dev->dax_dev; 328 326 bdev = sc->stripe[stripe].dev->bdev; 329 327 330 - ret = bdev_direct_access(bdev, &dax); 331 - *kaddr = dax.addr; 332 - *pfn = dax.pfn; 333 - 334 - return ret; 328 + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); 329 + if (ret) 330 + return ret; 331 + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); 335 332 } 336 333 337 334 /* ··· 450 451 .status = stripe_status, 451 452 .iterate_devices = stripe_iterate_devices, 452 453 .io_hints = stripe_io_hints, 453 - .direct_access = stripe_direct_access, 454 + .direct_access = stripe_dax_direct_access, 454 455 }; 455 456 456 457 int __init dm_stripe_init(void)
+3 -3
drivers/md/dm-target.c
··· 142 142 { 143 143 } 144 144 145 - static long io_err_direct_access(struct dm_target *ti, sector_t sector, 146 - void **kaddr, pfn_t *pfn, long size) 145 + static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 146 + long nr_pages, void **kaddr, pfn_t *pfn) 147 147 { 148 148 return -EIO; 149 149 } ··· 157 157 .map = io_err_map, 158 158 .clone_and_map_rq = io_err_clone_and_map_rq, 159 159 .release_clone_rq = io_err_release_clone_rq, 160 - .direct_access = io_err_direct_access, 160 + .direct_access = io_err_dax_direct_access, 161 161 }; 162 162 163 163 int __init dm_target_init(void)
+54 -15
drivers/md/dm.c
··· 16 16 #include <linux/blkpg.h> 17 17 #include <linux/bio.h> 18 18 #include <linux/mempool.h> 19 + #include <linux/dax.h> 19 20 #include <linux/slab.h> 20 21 #include <linux/idr.h> 21 22 #include <linux/hdreg.h> ··· 630 629 } 631 630 632 631 td->dm_dev.bdev = bdev; 632 + td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 633 633 return 0; 634 634 } 635 635 ··· 644 642 645 643 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 646 644 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 645 + put_dax(td->dm_dev.dax_dev); 647 646 td->dm_dev.bdev = NULL; 647 + td->dm_dev.dax_dev = NULL; 648 648 } 649 649 650 650 static struct table_device *find_table_device(struct list_head *l, dev_t dev, ··· 924 920 } 925 921 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 926 922 927 - static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, 928 - void **kaddr, pfn_t *pfn, long size) 923 + static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 924 + sector_t sector, int *srcu_idx) 929 925 { 930 - struct mapped_device *md = bdev->bd_disk->private_data; 931 926 struct dm_table *map; 932 927 struct dm_target *ti; 933 - int srcu_idx; 934 - long len, ret = -EIO; 935 928 936 - map = dm_get_live_table(md, &srcu_idx); 929 + map = dm_get_live_table(md, srcu_idx); 937 930 if (!map) 938 - goto out; 931 + return NULL; 939 932 940 933 ti = dm_table_find_target(map, sector); 941 934 if (!dm_target_is_valid(ti)) 935 + return NULL; 936 + 937 + return ti; 938 + } 939 + 940 + static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 941 + long nr_pages, void **kaddr, pfn_t *pfn) 942 + { 943 + struct mapped_device *md = dax_get_private(dax_dev); 944 + sector_t sector = pgoff * PAGE_SECTORS; 945 + struct dm_target *ti; 946 + long len, ret = -EIO; 947 + int srcu_idx; 948 + 949 + ti = dm_dax_get_live_target(md, sector, &srcu_idx); 950 + 951 + if (!ti) 942 952 goto out; 943 - 944 - len = max_io_len(sector, ti) << SECTOR_SHIFT; 
945 - size = min(len, size); 946 - 953 + if (!ti->type->direct_access) 954 + goto out; 955 + len = max_io_len(sector, ti) / PAGE_SECTORS; 956 + if (len < 1) 957 + goto out; 958 + nr_pages = min(len, nr_pages); 947 959 if (ti->type->direct_access) 948 - ret = ti->type->direct_access(ti, sector, kaddr, pfn, size); 949 - out: 960 + ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 961 + 962 + out: 950 963 dm_put_live_table(md, srcu_idx); 951 - return min(ret, size); 964 + 965 + return ret; 952 966 } 953 967 954 968 /* ··· 1493 1471 } 1494 1472 1495 1473 static const struct block_device_operations dm_blk_dops; 1474 + static const struct dax_operations dm_dax_ops; 1496 1475 1497 1476 static void dm_wq_work(struct work_struct *work); 1498 1477 ··· 1540 1517 if (md->bs) 1541 1518 bioset_free(md->bs); 1542 1519 1520 + if (md->dax_dev) { 1521 + kill_dax(md->dax_dev); 1522 + put_dax(md->dax_dev); 1523 + md->dax_dev = NULL; 1524 + } 1525 + 1543 1526 if (md->disk) { 1544 1527 spin_lock(&_minor_lock); 1545 1528 md->disk->private_data = NULL; ··· 1573 1544 static struct mapped_device *alloc_dev(int minor) 1574 1545 { 1575 1546 int r, numa_node_id = dm_get_numa_node(); 1547 + struct dax_device *dax_dev; 1576 1548 struct mapped_device *md; 1577 1549 void *old_md; 1578 1550 ··· 1638 1608 md->disk->queue = md->queue; 1639 1609 md->disk->private_data = md; 1640 1610 sprintf(md->disk->disk_name, "dm-%d", minor); 1611 + 1612 + dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); 1613 + if (!dax_dev) 1614 + goto bad; 1615 + md->dax_dev = dax_dev; 1616 + 1641 1617 add_disk(md->disk); 1642 1618 format_dev_t(md->name, MKDEV(_major, minor)); 1643 1619 ··· 2852 2816 .open = dm_blk_open, 2853 2817 .release = dm_blk_close, 2854 2818 .ioctl = dm_blk_ioctl, 2855 - .direct_access = dm_blk_direct_access, 2856 2819 .getgeo = dm_blk_getgeo, 2857 2820 .pr_ops = &dm_pr_ops, 2858 2821 .owner = THIS_MODULE 2822 + }; 2823 + 2824 + static const struct dax_operations dm_dax_ops = { 2825 
+ .direct_access = dm_dax_direct_access, 2859 2826 }; 2860 2827 2861 2828 /*
+1
drivers/nvdimm/Kconfig
··· 20 20 config BLK_DEV_PMEM 21 21 tristate "PMEM: Persistent memory block device support" 22 22 default LIBNVDIMM 23 + select DAX 23 24 select ND_BTT if BTT 24 25 select ND_PFN if NVDIMM_PFN 25 26 help
+1 -1
drivers/nvdimm/btt_devs.c
··· 314 314 if (rc < 0) { 315 315 struct nd_btt *nd_btt = to_nd_btt(btt_dev); 316 316 317 - __nd_detach_ndns(btt_dev, &nd_btt->ndns); 317 + nd_detach_ndns(btt_dev, &nd_btt->ndns); 318 318 put_device(btt_dev); 319 319 } 320 320
+111 -11
drivers/nvdimm/bus.c
··· 27 27 #include <linux/nd.h> 28 28 #include "nd-core.h" 29 29 #include "nd.h" 30 + #include "pfn.h" 30 31 31 32 int nvdimm_major; 32 33 static int nvdimm_bus_major; ··· 172 171 } 173 172 EXPORT_SYMBOL_GPL(nvdimm_region_notify); 174 173 174 + struct clear_badblocks_context { 175 + resource_size_t phys, cleared; 176 + }; 177 + 178 + static int nvdimm_clear_badblocks_region(struct device *dev, void *data) 179 + { 180 + struct clear_badblocks_context *ctx = data; 181 + struct nd_region *nd_region; 182 + resource_size_t ndr_end; 183 + sector_t sector; 184 + 185 + /* make sure device is a region */ 186 + if (!is_nd_pmem(dev)) 187 + return 0; 188 + 189 + nd_region = to_nd_region(dev); 190 + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; 191 + 192 + /* make sure we are in the region */ 193 + if (ctx->phys < nd_region->ndr_start 194 + || (ctx->phys + ctx->cleared) > ndr_end) 195 + return 0; 196 + 197 + sector = (ctx->phys - nd_region->ndr_start) / 512; 198 + badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); 199 + 200 + return 0; 201 + } 202 + 203 + static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus, 204 + phys_addr_t phys, u64 cleared) 205 + { 206 + struct clear_badblocks_context ctx = { 207 + .phys = phys, 208 + .cleared = cleared, 209 + }; 210 + 211 + device_for_each_child(&nvdimm_bus->dev, &ctx, 212 + nvdimm_clear_badblocks_region); 213 + } 214 + 215 + static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, 216 + phys_addr_t phys, u64 cleared) 217 + { 218 + if (cleared > 0) 219 + nvdimm_forget_poison(nvdimm_bus, phys, cleared); 220 + 221 + if (cleared > 0 && cleared / 512) 222 + nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); 223 + } 224 + 175 225 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, 176 226 unsigned int len) 177 227 { ··· 270 218 if (cmd_rc < 0) 271 219 return cmd_rc; 272 220 273 - nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); 221 + 
nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared); 222 + 274 223 return clear_err.cleared; 275 224 } 276 225 EXPORT_SYMBOL_GPL(nvdimm_clear_poison); ··· 339 286 init_waitqueue_head(&nvdimm_bus->probe_wait); 340 287 nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); 341 288 mutex_init(&nvdimm_bus->reconfig_mutex); 289 + spin_lock_init(&nvdimm_bus->poison_lock); 342 290 if (nvdimm_bus->id < 0) { 343 291 kfree(nvdimm_bus); 344 292 return NULL; ··· 408 354 nd_synchronize(); 409 355 device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); 410 356 411 - nvdimm_bus_lock(&nvdimm_bus->dev); 357 + spin_lock(&nvdimm_bus->poison_lock); 412 358 free_poison_list(&nvdimm_bus->poison_list); 413 - nvdimm_bus_unlock(&nvdimm_bus->dev); 359 + spin_unlock(&nvdimm_bus->poison_lock); 414 360 415 361 nvdimm_bus_destroy_ndctl(nvdimm_bus); 416 362 ··· 823 769 } while (true); 824 770 } 825 771 826 - static int pmem_active(struct device *dev, void *data) 772 + static int nd_pmem_forget_poison_check(struct device *dev, void *data) 827 773 { 828 - if (is_nd_pmem(dev) && dev->driver) 774 + struct nd_cmd_clear_error *clear_err = 775 + (struct nd_cmd_clear_error *)data; 776 + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; 777 + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; 778 + struct nd_dax *nd_dax = is_nd_dax(dev) ? 
to_nd_dax(dev) : NULL; 779 + struct nd_namespace_common *ndns = NULL; 780 + struct nd_namespace_io *nsio; 781 + resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend; 782 + 783 + if (nd_dax || !dev->driver) 784 + return 0; 785 + 786 + start = clear_err->address; 787 + end = clear_err->address + clear_err->cleared - 1; 788 + 789 + if (nd_btt || nd_pfn || nd_dax) { 790 + if (nd_btt) 791 + ndns = nd_btt->ndns; 792 + else if (nd_pfn) 793 + ndns = nd_pfn->ndns; 794 + else if (nd_dax) 795 + ndns = nd_dax->nd_pfn.ndns; 796 + 797 + if (!ndns) 798 + return 0; 799 + } else 800 + ndns = to_ndns(dev); 801 + 802 + nsio = to_nd_namespace_io(&ndns->dev); 803 + pstart = nsio->res.start + offset; 804 + pend = nsio->res.end - end_trunc; 805 + 806 + if ((pstart >= start) && (pend <= end)) 829 807 return -EBUSY; 808 + 830 809 return 0; 810 + 811 + } 812 + 813 + static int nd_ns_forget_poison_check(struct device *dev, void *data) 814 + { 815 + return device_for_each_child(dev, data, nd_pmem_forget_poison_check); 831 816 } 832 817 833 818 /* set_config requires an idle interleave set */ 834 819 static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, 835 - struct nvdimm *nvdimm, unsigned int cmd) 820 + struct nvdimm *nvdimm, unsigned int cmd, void *data) 836 821 { 837 822 struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; 838 823 ··· 885 792 886 793 /* require clear error to go through the pmem driver */ 887 794 if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) 888 - return device_for_each_child(&nvdimm_bus->dev, NULL, 889 - pmem_active); 795 + return device_for_each_child(&nvdimm_bus->dev, data, 796 + nd_ns_forget_poison_check); 890 797 891 798 if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) 892 799 return 0; ··· 913 820 const char *cmd_name, *dimm_name; 914 821 unsigned long cmd_mask; 915 822 void *buf; 916 - int rc, i; 823 + int rc, i, cmd_rc; 917 824 918 825 if (nvdimm) { 919 826 desc = nd_cmd_dimm_desc(cmd); ··· 1020 927 } 1021 928 1022 929 
nvdimm_bus_lock(&nvdimm_bus->dev); 1023 - rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd); 930 + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf); 1024 931 if (rc) 1025 932 goto out_unlock; 1026 933 1027 - rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL); 934 + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc); 1028 935 if (rc < 0) 1029 936 goto out_unlock; 937 + 938 + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { 939 + struct nd_cmd_clear_error *clear_err = buf; 940 + 941 + nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address, 942 + clear_err->cleared); 943 + } 1030 944 nvdimm_bus_unlock(&nvdimm_bus->dev); 1031 945 1032 946 if (copy_to_user(p, buf, buf_len))
+23 -14
drivers/nvdimm/claim.c
··· 21 21 void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) 22 22 { 23 23 struct nd_namespace_common *ndns = *_ndns; 24 + struct nvdimm_bus *nvdimm_bus; 24 25 25 - lockdep_assert_held(&ndns->dev.mutex); 26 + if (!ndns) 27 + return; 28 + 29 + nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev); 30 + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); 26 31 dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__); 27 32 ndns->claim = NULL; 28 33 *_ndns = NULL; ··· 42 37 if (!ndns) 43 38 return; 44 39 get_device(&ndns->dev); 45 - device_lock(&ndns->dev); 40 + nvdimm_bus_lock(&ndns->dev); 46 41 __nd_detach_ndns(dev, _ndns); 47 - device_unlock(&ndns->dev); 42 + nvdimm_bus_unlock(&ndns->dev); 48 43 put_device(&ndns->dev); 49 44 } 50 45 51 46 bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, 52 47 struct nd_namespace_common **_ndns) 53 48 { 49 + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev); 50 + 54 51 if (attach->claim) 55 52 return false; 56 - lockdep_assert_held(&attach->dev.mutex); 53 + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); 57 54 dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__); 58 55 attach->claim = dev; 59 56 *_ndns = attach; ··· 68 61 { 69 62 bool claimed; 70 63 71 - device_lock(&attach->dev); 64 + nvdimm_bus_lock(&attach->dev); 72 65 claimed = __nd_attach_ndns(dev, attach, _ndns); 73 - device_unlock(&attach->dev); 66 + nvdimm_bus_unlock(&attach->dev); 74 67 return claimed; 75 68 } 76 69 ··· 121 114 struct nd_namespace_common **_ndns) 122 115 { 123 116 /* detach the namespace and destroy / reset the device */ 124 - nd_detach_ndns(dev, _ndns); 117 + __nd_detach_ndns(dev, _ndns); 125 118 if (is_idle(dev, *_ndns)) { 126 119 nd_device_unregister(dev, ND_ASYNC); 127 120 } else if (is_nd_btt(dev)) { ··· 191 184 } 192 185 193 186 WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); 194 - if (!nd_attach_ndns(dev, ndns, _ndns)) { 187 + if (!__nd_attach_ndns(dev, ndns, _ndns)) { 195 
188 dev_dbg(dev, "%s already claimed\n", 196 189 dev_name(&ndns->dev)); 197 190 len = -EBUSY; ··· 246 239 if (rw == READ) { 247 240 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) 248 241 return -EIO; 249 - return memcpy_from_pmem(buf, nsio->addr + offset, size); 242 + return memcpy_mcsafe(buf, nsio->addr + offset, size); 250 243 } 251 244 252 245 if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { 253 246 /* 254 247 * FIXME: nsio_rw_bytes() may be called from atomic 255 - * context in the btt case and nvdimm_clear_poison() 256 - * takes a sleeping lock. Until the locking can be 257 - * reworked this capability requires that the namespace 258 - * is not claimed by btt. 248 + * context in the btt case and the ACPI DSM path for 249 + * clearing the error takes sleeping locks and allocates 250 + * memory. An explicit error clearing path, and support 251 + * for tracking badblocks in BTT metadata is needed to 252 + * work around this collision. 259 253 */ 260 254 if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) 261 255 && (!ndns->claim || !is_nd_btt(ndns->claim))) { 262 256 long cleared; 263 257 264 - cleared = nvdimm_clear_poison(&ndns->dev, offset, size); 258 + cleared = nvdimm_clear_poison(&ndns->dev, 259 + nsio->res.start + offset, size); 265 260 if (cleared < size) 266 261 rc = -EIO; 267 262 if (cleared > 0 && cleared / 512) {
+35 -16
drivers/nvdimm/core.c
··· 518 518 } 519 519 EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); 520 520 521 + static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, 522 + struct nd_poison *pl, u64 addr, u64 length) 523 + { 524 + lockdep_assert_held(&nvdimm_bus->poison_lock); 525 + pl->start = addr; 526 + pl->length = length; 527 + list_add_tail(&pl->list, &nvdimm_bus->poison_list); 528 + } 529 + 521 530 static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, 522 531 gfp_t flags) 523 532 { ··· 536 527 if (!pl) 537 528 return -ENOMEM; 538 529 539 - pl->start = addr; 540 - pl->length = length; 541 - list_add_tail(&pl->list, &nvdimm_bus->poison_list); 542 - 530 + append_poison_entry(nvdimm_bus, pl, addr, length); 543 531 return 0; 544 532 } 545 533 546 534 static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) 547 535 { 548 - struct nd_poison *pl; 536 + struct nd_poison *pl, *pl_new; 549 537 550 - if (list_empty(&nvdimm_bus->poison_list)) 551 - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); 538 + spin_unlock(&nvdimm_bus->poison_lock); 539 + pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); 540 + spin_lock(&nvdimm_bus->poison_lock); 541 + 542 + if (list_empty(&nvdimm_bus->poison_list)) { 543 + if (!pl_new) 544 + return -ENOMEM; 545 + append_poison_entry(nvdimm_bus, pl_new, addr, length); 546 + return 0; 547 + } 552 548 553 549 /* 554 550 * There is a chance this is a duplicate, check for those first. 
··· 565 551 /* If length has changed, update this list entry */ 566 552 if (pl->length != length) 567 553 pl->length = length; 554 + kfree(pl_new); 568 555 return 0; 569 556 } 570 557 ··· 574 559 * as any overlapping ranges will get resolved when the list is consumed 575 560 * and converted to badblocks 576 561 */ 577 - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); 562 + if (!pl_new) 563 + return -ENOMEM; 564 + append_poison_entry(nvdimm_bus, pl_new, addr, length); 565 + 566 + return 0; 578 567 } 579 568 580 569 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) 581 570 { 582 571 int rc; 583 572 584 - nvdimm_bus_lock(&nvdimm_bus->dev); 573 + spin_lock(&nvdimm_bus->poison_lock); 585 574 rc = bus_add_poison(nvdimm_bus, addr, length); 586 - nvdimm_bus_unlock(&nvdimm_bus->dev); 575 + spin_unlock(&nvdimm_bus->poison_lock); 587 576 588 577 return rc; 589 578 } 590 579 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); 591 580 592 - void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, 593 - phys_addr_t start, unsigned int len) 581 + void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, 582 + unsigned int len) 594 583 { 595 584 struct list_head *poison_list = &nvdimm_bus->poison_list; 596 585 u64 clr_end = start + len - 1; 597 586 struct nd_poison *pl, *next; 598 587 599 - nvdimm_bus_lock(&nvdimm_bus->dev); 588 + spin_lock(&nvdimm_bus->poison_lock); 600 589 WARN_ON_ONCE(list_empty(poison_list)); 601 590 602 591 /* ··· 647 628 u64 new_len = pl_end - new_start + 1; 648 629 649 630 /* Add new entry covering the right half */ 650 - add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); 631 + add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); 651 632 /* Adjust this entry to cover the left half */ 652 633 pl->length = start - pl->start; 653 634 continue; 654 635 } 655 636 } 656 - nvdimm_bus_unlock(&nvdimm_bus->dev); 637 + spin_unlock(&nvdimm_bus->poison_lock); 657 638 } 658 - 
EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); 639 + EXPORT_SYMBOL_GPL(nvdimm_forget_poison); 659 640 660 641 #ifdef CONFIG_BLK_DEV_INTEGRITY 661 642 int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
+1 -1
drivers/nvdimm/dax_devs.c
··· 124 124 dev_dbg(dev, "%s: dax: %s\n", __func__, 125 125 rc == 0 ? dev_name(dax_dev) : "<none>"); 126 126 if (rc < 0) { 127 - __nd_detach_ndns(dax_dev, &nd_pfn->ndns); 127 + nd_detach_ndns(dax_dev, &nd_pfn->ndns); 128 128 put_device(dax_dev); 129 129 } else 130 130 __nd_device_register(dax_dev);
+2
drivers/nvdimm/dimm.c
··· 49 49 kref_init(&ndd->kref); 50 50 51 51 rc = nvdimm_init_nsarea(ndd); 52 + if (rc == -EACCES) 53 + nvdimm_set_locked(dev); 52 54 if (rc) 53 55 goto err; 54 56
+15 -4
drivers/nvdimm/dimm_devs.c
··· 34 34 35 35 if (!nvdimm->cmd_mask || 36 36 !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { 37 - if (nvdimm->flags & NDD_ALIASING) 37 + if (test_bit(NDD_ALIASING, &nvdimm->flags)) 38 38 return -ENXIO; 39 39 else 40 40 return -ENOTTY; ··· 67 67 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); 68 68 struct nvdimm_bus_descriptor *nd_desc; 69 69 int rc = validate_dimm(ndd); 70 + int cmd_rc = 0; 70 71 71 72 if (rc) 72 73 return rc; ··· 77 76 78 77 memset(cmd, 0, sizeof(*cmd)); 79 78 nd_desc = nvdimm_bus->nd_desc; 80 - return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), 81 - ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), NULL); 79 + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), 80 + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc); 81 + if (rc < 0) 82 + return rc; 83 + return cmd_rc; 82 84 } 83 85 84 86 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) ··· 192 188 { 193 189 struct nvdimm *nvdimm = to_nvdimm(dev); 194 190 195 - nvdimm->flags |= NDD_ALIASING; 191 + set_bit(NDD_ALIASING, &nvdimm->flags); 192 + } 193 + 194 + void nvdimm_set_locked(struct device *dev) 195 + { 196 + struct nvdimm *nvdimm = to_nvdimm(dev); 197 + 198 + set_bit(NDD_LOCKED, &nvdimm->flags); 196 199 } 197 200 198 201 static void nvdimm_release(struct device *dev)
+12 -5
drivers/nvdimm/namespace_devs.c
··· 2236 2236 int count, j; 2237 2237 2238 2238 /* 2239 - * If the dimm is disabled then prevent the region from 2240 - * being activated if it aliases DPA. 2239 + * If the dimm is disabled then we may need to prevent 2240 + * the region from being activated. 2241 2241 */ 2242 2242 if (!ndd) { 2243 - if ((nvdimm->flags & NDD_ALIASING) == 0) 2243 + if (test_bit(NDD_LOCKED, &nvdimm->flags)) 2244 + /* fail, label data may be unreadable */; 2245 + else if (test_bit(NDD_ALIASING, &nvdimm->flags)) 2246 + /* fail, labels needed to disambiguate dpa */; 2247 + else 2244 2248 return 0; 2245 - dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", 2246 - dev_name(&nd_mapping->nvdimm->dev)); 2249 + 2250 + dev_err(&nd_region->dev, "%s: is %s, failing probe\n", 2251 + dev_name(&nd_mapping->nvdimm->dev), 2252 + test_bit(NDD_LOCKED, &nvdimm->flags) 2253 + ? "locked" : "disabled"); 2247 2254 return -ENXIO; 2248 2255 } 2249 2256 nd_mapping->ndd = ndd;
+1
drivers/nvdimm/nd-core.h
··· 32 32 struct list_head poison_list; 33 33 struct list_head mapping_list; 34 34 struct mutex reconfig_mutex; 35 + spinlock_t poison_lock; 35 36 }; 36 37 37 38 struct nvdimm {
+2
drivers/nvdimm/nd.h
··· 154 154 u64 ndr_start; 155 155 int id, num_lanes, ro, numa_node; 156 156 void *provider_data; 157 + struct badblocks bb; 157 158 struct nd_interleave_set *nd_set; 158 159 struct nd_percpu_lane __percpu *lane; 159 160 struct nd_mapping mapping[0]; ··· 240 239 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, 241 240 unsigned int len); 242 241 void nvdimm_set_aliasing(struct device *dev); 242 + void nvdimm_set_locked(struct device *dev); 243 243 struct nd_btt *to_nd_btt(struct device *dev); 244 244 245 245 struct nd_gen_sb {
+5 -3
drivers/nvdimm/pfn_devs.c
··· 484 484 dev_dbg(dev, "%s: pfn: %s\n", __func__, 485 485 rc == 0 ? dev_name(pfn_dev) : "<none>"); 486 486 if (rc < 0) { 487 - __nd_detach_ndns(pfn_dev, &nd_pfn->ndns); 487 + nd_detach_ndns(pfn_dev, &nd_pfn->ndns); 488 488 put_device(pfn_dev); 489 489 } else 490 490 __nd_device_register(pfn_dev); ··· 538 538 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); 539 539 altmap = NULL; 540 540 } else if (nd_pfn->mode == PFN_MODE_PMEM) { 541 - nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE; 541 + nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) 542 + - offset) / PAGE_SIZE); 542 543 if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) 543 544 dev_info(&nd_pfn->dev, 544 545 "number of pfns truncated from %lld to %ld\n", ··· 626 625 */ 627 626 start += start_pad; 628 627 size = resource_size(&nsio->res); 629 - npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K; 628 + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) 629 + / PAGE_SIZE); 630 630 if (nd_pfn->mode == PFN_MODE_PMEM) { 631 631 /* 632 632 * vmemmap_populate_hugepages() allocates the memmap array in
+64 -26
drivers/nvdimm/pmem.c
··· 29 29 #include <linux/pfn_t.h> 30 30 #include <linux/slab.h> 31 31 #include <linux/pmem.h> 32 + #include <linux/dax.h> 32 33 #include <linux/nd.h> 33 34 #include "pmem.h" 34 35 #include "pfn.h" ··· 90 89 int rc; 91 90 void *mem = kmap_atomic(page); 92 91 93 - rc = memcpy_from_pmem(mem + off, pmem_addr, len); 92 + rc = memcpy_mcsafe(mem + off, pmem_addr, len); 94 93 kunmap_atomic(mem); 95 94 if (rc) 96 95 return -EIO; ··· 201 200 } 202 201 203 202 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ 204 - __weak long pmem_direct_access(struct block_device *bdev, sector_t sector, 205 - void **kaddr, pfn_t *pfn, long size) 203 + __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, 204 + long nr_pages, void **kaddr, pfn_t *pfn) 206 205 { 207 - struct pmem_device *pmem = bdev->bd_queue->queuedata; 208 - resource_size_t offset = sector * 512 + pmem->data_offset; 206 + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; 209 207 210 - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) 208 + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, 209 + PFN_PHYS(nr_pages)))) 211 210 return -EIO; 212 211 *kaddr = pmem->virt_addr + offset; 213 212 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); ··· 217 216 * requested range. 
218 217 */ 219 218 if (unlikely(pmem->bb.count)) 220 - return size; 221 - return pmem->size - pmem->pfn_pad - offset; 219 + return nr_pages; 220 + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); 222 221 } 223 222 224 223 static const struct block_device_operations pmem_fops = { 225 224 .owner = THIS_MODULE, 226 225 .rw_page = pmem_rw_page, 227 - .direct_access = pmem_direct_access, 228 226 .revalidate_disk = nvdimm_revalidate_disk, 227 + }; 228 + 229 + static long pmem_dax_direct_access(struct dax_device *dax_dev, 230 + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) 231 + { 232 + struct pmem_device *pmem = dax_get_private(dax_dev); 233 + 234 + return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); 235 + } 236 + 237 + static const struct dax_operations pmem_dax_ops = { 238 + .direct_access = pmem_dax_direct_access, 229 239 }; 230 240 231 241 static void pmem_release_queue(void *q) ··· 249 237 blk_freeze_queue_start(q); 250 238 } 251 239 252 - static void pmem_release_disk(void *disk) 240 + static void pmem_release_disk(void *__pmem) 253 241 { 254 - del_gendisk(disk); 255 - put_disk(disk); 242 + struct pmem_device *pmem = __pmem; 243 + 244 + kill_dax(pmem->dax_dev); 245 + put_dax(pmem->dax_dev); 246 + del_gendisk(pmem->disk); 247 + put_disk(pmem->disk); 256 248 } 257 249 258 250 static int pmem_attach_disk(struct device *dev, ··· 267 251 struct vmem_altmap __altmap, *altmap = NULL; 268 252 struct resource *res = &nsio->res; 269 253 struct nd_pfn *nd_pfn = NULL; 254 + struct dax_device *dax_dev; 270 255 int nid = dev_to_node(dev); 271 256 struct nd_pfn_sb *pfn_sb; 272 257 struct pmem_device *pmem; ··· 351 334 disk = alloc_disk_node(0, nid); 352 335 if (!disk) 353 336 return -ENOMEM; 337 + pmem->disk = disk; 354 338 355 339 disk->fops = &pmem_fops; 356 340 disk->queue = q; ··· 363 345 return -ENOMEM; 364 346 nvdimm_badblocks_populate(nd_region, &pmem->bb, res); 365 347 disk->bb = &pmem->bb; 366 - device_add_disk(dev, disk); 367 348 368 - if 
(devm_add_action_or_reset(dev, pmem_release_disk, disk)) 349 + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); 350 + if (!dax_dev) { 351 + put_disk(disk); 352 + return -ENOMEM; 353 + } 354 + pmem->dax_dev = dax_dev; 355 + 356 + device_add_disk(dev, disk); 357 + if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) 369 358 return -ENOMEM; 370 359 371 360 revalidate_disk(disk); ··· 422 397 423 398 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) 424 399 { 425 - struct pmem_device *pmem = dev_get_drvdata(dev); 426 - struct nd_region *nd_region = to_region(pmem); 400 + struct nd_region *nd_region; 427 401 resource_size_t offset = 0, end_trunc = 0; 428 402 struct nd_namespace_common *ndns; 429 403 struct nd_namespace_io *nsio; 430 404 struct resource res; 405 + struct badblocks *bb; 431 406 432 407 if (event != NVDIMM_REVALIDATE_POISON) 433 408 return; ··· 436 411 struct nd_btt *nd_btt = to_nd_btt(dev); 437 412 438 413 ndns = nd_btt->ndns; 439 - } else if (is_nd_pfn(dev)) { 440 - struct nd_pfn *nd_pfn = to_nd_pfn(dev); 441 - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; 414 + nd_region = to_nd_region(ndns->dev.parent); 415 + nsio = to_nd_namespace_io(&ndns->dev); 416 + bb = &nsio->bb; 417 + } else { 418 + struct pmem_device *pmem = dev_get_drvdata(dev); 442 419 443 - ndns = nd_pfn->ndns; 444 - offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad); 445 - end_trunc = __le32_to_cpu(pfn_sb->end_trunc); 446 - } else 447 - ndns = to_ndns(dev); 420 + nd_region = to_region(pmem); 421 + bb = &pmem->bb; 448 422 449 - nsio = to_nd_namespace_io(&ndns->dev); 423 + if (is_nd_pfn(dev)) { 424 + struct nd_pfn *nd_pfn = to_nd_pfn(dev); 425 + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; 426 + 427 + ndns = nd_pfn->ndns; 428 + offset = pmem->data_offset + 429 + __le32_to_cpu(pfn_sb->start_pad); 430 + end_trunc = __le32_to_cpu(pfn_sb->end_trunc); 431 + } else { 432 + ndns = to_ndns(dev); 433 + } 434 + 435 + nsio = to_nd_namespace_io(&ndns->dev); 436 
+ } 437 + 450 438 res.start = nsio->res.start + offset; 451 439 res.end = nsio->res.end - end_trunc; 452 - nvdimm_badblocks_populate(nd_region, &pmem->bb, &res); 440 + nvdimm_badblocks_populate(nd_region, bb, &res); 453 441 } 454 442 455 443 MODULE_ALIAS("pmem");
+5 -2
drivers/nvdimm/pmem.h
··· 5 5 #include <linux/pfn_t.h> 6 6 #include <linux/fs.h> 7 7 8 - long pmem_direct_access(struct block_device *bdev, sector_t sector, 9 - void **kaddr, pfn_t *pfn, long size); 10 8 /* this definition is in it's own header for tools/testing/nvdimm to consume */ 11 9 struct pmem_device { 12 10 /* One contiguous memory region per device */ ··· 18 20 /* trim size when namespace capacity has been section aligned */ 19 21 u32 pfn_pad; 20 22 struct badblocks bb; 23 + struct dax_device *dax_dev; 24 + struct gendisk *disk; 21 25 }; 26 + 27 + long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, 28 + long nr_pages, void **kaddr, pfn_t *pfn); 22 29 #endif /* __NVDIMM_PMEM_H__ */
+24
drivers/nvdimm/region.c
··· 14 14 #include <linux/module.h> 15 15 #include <linux/device.h> 16 16 #include <linux/nd.h> 17 + #include "nd-core.h" 17 18 #include "nd.h" 18 19 19 20 static int nd_region_probe(struct device *dev) ··· 52 51 53 52 if (rc && err && rc == err) 54 53 return -ENODEV; 54 + 55 + if (is_nd_pmem(&nd_region->dev)) { 56 + struct resource ndr_res; 57 + 58 + if (devm_init_badblocks(dev, &nd_region->bb)) 59 + return -ENODEV; 60 + ndr_res.start = nd_region->ndr_start; 61 + ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; 62 + nvdimm_badblocks_populate(nd_region, 63 + &nd_region->bb, &ndr_res); 64 + } 55 65 56 66 nd_region->btt_seed = nd_btt_create(nd_region); 57 67 nd_region->pfn_seed = nd_pfn_create(nd_region); ··· 116 104 117 105 static void nd_region_notify(struct device *dev, enum nvdimm_event event) 118 106 { 107 + if (event == NVDIMM_REVALIDATE_POISON) { 108 + struct nd_region *nd_region = to_nd_region(dev); 109 + struct resource res; 110 + 111 + if (is_nd_pmem(&nd_region->dev)) { 112 + res.start = nd_region->ndr_start; 113 + res.end = nd_region->ndr_start + 114 + nd_region->ndr_size - 1; 115 + nvdimm_badblocks_populate(nd_region, 116 + &nd_region->bb, &res); 117 + } 118 + } 119 119 device_for_each_child(dev, &event, child_notify); 120 120 } 121 121
+77 -6
drivers/nvdimm/region_devs.c
··· 222 222 struct nd_mapping *nd_mapping = &nd_region->mapping[i]; 223 223 struct nvdimm *nvdimm = nd_mapping->nvdimm; 224 224 225 - if (nvdimm->flags & NDD_ALIASING) 225 + if (test_bit(NDD_ALIASING, &nvdimm->flags)) 226 226 alias++; 227 227 } 228 228 if (alias) ··· 254 254 return sprintf(buf, "%llu\n", size); 255 255 } 256 256 static DEVICE_ATTR_RO(size); 257 + 258 + static ssize_t deep_flush_show(struct device *dev, 259 + struct device_attribute *attr, char *buf) 260 + { 261 + struct nd_region *nd_region = to_nd_region(dev); 262 + 263 + /* 264 + * NOTE: in the nvdimm_has_flush() error case this attribute is 265 + * not visible. 266 + */ 267 + return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region)); 268 + } 269 + 270 + static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr, 271 + const char *buf, size_t len) 272 + { 273 + bool flush; 274 + int rc = strtobool(buf, &flush); 275 + struct nd_region *nd_region = to_nd_region(dev); 276 + 277 + if (rc) 278 + return rc; 279 + if (!flush) 280 + return -EINVAL; 281 + nvdimm_flush(nd_region); 282 + 283 + return len; 284 + } 285 + static DEVICE_ATTR_RW(deep_flush); 257 286 258 287 static ssize_t mappings_show(struct device *dev, 259 288 struct device_attribute *attr, char *buf) ··· 477 448 } 478 449 static DEVICE_ATTR_RW(read_only); 479 450 451 + static ssize_t region_badblocks_show(struct device *dev, 452 + struct device_attribute *attr, char *buf) 453 + { 454 + struct nd_region *nd_region = to_nd_region(dev); 455 + 456 + return badblocks_show(&nd_region->bb, buf, 0); 457 + } 458 + 459 + static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL); 460 + 461 + static ssize_t resource_show(struct device *dev, 462 + struct device_attribute *attr, char *buf) 463 + { 464 + struct nd_region *nd_region = to_nd_region(dev); 465 + 466 + return sprintf(buf, "%#llx\n", nd_region->ndr_start); 467 + } 468 + static DEVICE_ATTR_RO(resource); 469 + 480 470 static struct attribute *nd_region_attributes[] = { 
481 471 &dev_attr_size.attr, 482 472 &dev_attr_nstype.attr, ··· 503 455 &dev_attr_btt_seed.attr, 504 456 &dev_attr_pfn_seed.attr, 505 457 &dev_attr_dax_seed.attr, 458 + &dev_attr_deep_flush.attr, 506 459 &dev_attr_read_only.attr, 507 460 &dev_attr_set_cookie.attr, 508 461 &dev_attr_available_size.attr, 509 462 &dev_attr_namespace_seed.attr, 510 463 &dev_attr_init_namespaces.attr, 464 + &dev_attr_badblocks.attr, 465 + &dev_attr_resource.attr, 511 466 NULL, 512 467 }; 513 468 ··· 526 475 527 476 if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr) 528 477 return 0; 478 + 479 + if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) 480 + return 0; 481 + 482 + if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) 483 + return 0; 484 + 485 + if (a == &dev_attr_deep_flush.attr) { 486 + int has_flush = nvdimm_has_flush(nd_region); 487 + 488 + if (has_flush == 1) 489 + return a->mode; 490 + else if (has_flush == 0) 491 + return 0444; 492 + else 493 + return 0; 494 + } 529 495 530 496 if (a != &dev_attr_set_cookie.attr 531 497 && a != &dev_attr_available_size.attr) ··· 881 813 return NULL; 882 814 } 883 815 884 - if (nvdimm->flags & NDD_UNARMED) 816 + if (test_bit(NDD_UNARMED, &nvdimm->flags)) 885 817 ro = 1; 886 818 } 887 819 ··· 1036 968 */ 1037 969 int nvdimm_has_flush(struct nd_region *nd_region) 1038 970 { 1039 - struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); 1040 971 int i; 1041 972 1042 973 /* no nvdimm == flushing capability unknown */ 1043 974 if (nd_region->ndr_mappings == 0) 1044 975 return -ENXIO; 1045 976 1046 - for (i = 0; i < nd_region->ndr_mappings; i++) 1047 - /* flush hints present, flushing required */ 1048 - if (ndrd_get_flush_wpq(ndrd, i, 0)) 977 + for (i = 0; i < nd_region->ndr_mappings; i++) { 978 + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; 979 + struct nvdimm *nvdimm = nd_mapping->nvdimm; 980 + 981 + /* flush hints present / available */ 982 + if (nvdimm->num_flush) 1049 983 return 1; 984 + } 1050 985 1051 986 /* 1052 
987 * The platform defines dimm devices without hints, assume
+1
drivers/s390/block/Kconfig
··· 14 14 15 15 config DCSSBLK 16 16 def_tristate m 17 + select DAX 17 18 prompt "DCSSBLK support" 18 19 depends on S390 && BLOCK 19 20 help
+33 -12
drivers/s390/block/dcssblk.c
··· 18 18 #include <linux/interrupt.h> 19 19 #include <linux/platform_device.h> 20 20 #include <linux/pfn_t.h> 21 + #include <linux/dax.h> 21 22 #include <asm/extmem.h> 22 23 #include <asm/io.h> 23 24 ··· 31 30 static void dcssblk_release(struct gendisk *disk, fmode_t mode); 32 31 static blk_qc_t dcssblk_make_request(struct request_queue *q, 33 32 struct bio *bio); 34 - static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 35 - void **kaddr, pfn_t *pfn, long size); 33 + static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 34 + long nr_pages, void **kaddr, pfn_t *pfn); 36 35 37 36 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; 38 37 ··· 41 40 .owner = THIS_MODULE, 42 41 .open = dcssblk_open, 43 42 .release = dcssblk_release, 44 - .direct_access = dcssblk_direct_access, 43 + }; 44 + 45 + static const struct dax_operations dcssblk_dax_ops = { 46 + .direct_access = dcssblk_dax_direct_access, 45 47 }; 46 48 47 49 struct dcssblk_dev_info { ··· 61 57 struct request_queue *dcssblk_queue; 62 58 int num_of_segments; 63 59 struct list_head seg_list; 60 + struct dax_device *dax_dev; 64 61 }; 65 62 66 63 struct segment_info { ··· 394 389 } 395 390 list_del(&dev_info->lh); 396 391 392 + kill_dax(dev_info->dax_dev); 393 + put_dax(dev_info->dax_dev); 397 394 del_gendisk(dev_info->gd); 398 395 blk_cleanup_queue(dev_info->dcssblk_queue); 399 396 dev_info->gd->queue = NULL; ··· 661 654 if (rc) 662 655 goto put_dev; 663 656 657 + dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, 658 + &dcssblk_dax_ops); 659 + if (!dev_info->dax_dev) { 660 + rc = -ENOMEM; 661 + goto put_dev; 662 + } 663 + 664 664 get_device(&dev_info->dev); 665 665 device_add_disk(&dev_info->dev, dev_info->gd); 666 666 ··· 766 752 } 767 753 768 754 list_del(&dev_info->lh); 755 + kill_dax(dev_info->dax_dev); 756 + put_dax(dev_info->dax_dev); 769 757 del_gendisk(dev_info->gd); 770 758 blk_cleanup_queue(dev_info->dcssblk_queue); 771 759 
dev_info->gd->queue = NULL; ··· 899 883 } 900 884 901 885 static long 902 - dcssblk_direct_access (struct block_device *bdev, sector_t secnum, 903 - void **kaddr, pfn_t *pfn, long size) 886 + __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff, 887 + long nr_pages, void **kaddr, pfn_t *pfn) 904 888 { 905 - struct dcssblk_dev_info *dev_info; 906 - unsigned long offset, dev_sz; 889 + resource_size_t offset = pgoff * PAGE_SIZE; 890 + unsigned long dev_sz; 907 891 908 - dev_info = bdev->bd_disk->private_data; 909 - if (!dev_info) 910 - return -ENODEV; 911 892 dev_sz = dev_info->end - dev_info->start + 1; 912 - offset = secnum * 512; 913 893 *kaddr = (void *) dev_info->start + offset; 914 894 *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV); 915 895 916 - return dev_sz - offset; 896 + return (dev_sz - offset) / PAGE_SIZE; 897 + } 898 + 899 + static long 900 + dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 901 + long nr_pages, void **kaddr, pfn_t *pfn) 902 + { 903 + struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev); 904 + 905 + return __dcssblk_direct_access(dev_info, pgoff, nr_pages, kaddr, pfn); 917 906 } 918 907 919 908 static void
+37 -84
fs/block_dev.c
··· 18 18 #include <linux/module.h> 19 19 #include <linux/blkpg.h> 20 20 #include <linux/magic.h> 21 + #include <linux/dax.h> 21 22 #include <linux/buffer_head.h> 22 23 #include <linux/swap.h> 23 24 #include <linux/pagevec.h> ··· 717 716 } 718 717 EXPORT_SYMBOL_GPL(bdev_write_page); 719 718 720 - /** 721 - * bdev_direct_access() - Get the address for directly-accessibly memory 722 - * @bdev: The device containing the memory 723 - * @dax: control and output parameters for ->direct_access 724 - * 725 - * If a block device is made up of directly addressable memory, this function 726 - * will tell the caller the PFN and the address of the memory. The address 727 - * may be directly dereferenced within the kernel without the need to call 728 - * ioremap(), kmap() or similar. The PFN is suitable for inserting into 729 - * page tables. 730 - * 731 - * Return: negative errno if an error occurs, otherwise the number of bytes 732 - * accessible at this address. 733 - */ 734 - long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) 719 + int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 720 + pgoff_t *pgoff) 735 721 { 736 - sector_t sector = dax->sector; 737 - long avail, size = dax->size; 738 - const struct block_device_operations *ops = bdev->bd_disk->fops; 722 + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; 739 723 740 - /* 741 - * The device driver is allowed to sleep, in order to make the 742 - * memory directly accessible. 
743 - */ 744 - might_sleep(); 745 - 746 - if (size < 0) 747 - return size; 748 - if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) 749 - return -EOPNOTSUPP; 750 - if ((sector + DIV_ROUND_UP(size, 512)) > 751 - part_nr_sects_read(bdev->bd_part)) 752 - return -ERANGE; 753 - sector += get_start_sect(bdev); 754 - if (sector % (PAGE_SIZE / 512)) 724 + if (pgoff) 725 + *pgoff = PHYS_PFN(phys_off); 726 + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 755 727 return -EINVAL; 756 - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); 757 - if (!avail) 758 - return -ERANGE; 759 - if (avail > 0 && avail & ~PAGE_MASK) 760 - return -ENXIO; 761 - return min(avail, size); 728 + return 0; 762 729 } 763 - EXPORT_SYMBOL_GPL(bdev_direct_access); 730 + EXPORT_SYMBOL(bdev_dax_pgoff); 764 731 765 732 /** 766 733 * bdev_dax_supported() - Check if the device supports dax for filesystem ··· 742 773 */ 743 774 int bdev_dax_supported(struct super_block *sb, int blocksize) 744 775 { 745 - struct blk_dax_ctl dax = { 746 - .sector = 0, 747 - .size = PAGE_SIZE, 748 - }; 749 - int err; 776 + struct block_device *bdev = sb->s_bdev; 777 + struct dax_device *dax_dev; 778 + pgoff_t pgoff; 779 + int err, id; 780 + void *kaddr; 781 + pfn_t pfn; 782 + long len; 750 783 751 784 if (blocksize != PAGE_SIZE) { 752 785 vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); 753 786 return -EINVAL; 754 787 } 755 788 756 - err = bdev_direct_access(sb->s_bdev, &dax); 757 - if (err < 0) { 758 - switch (err) { 759 - case -EOPNOTSUPP: 760 - vfs_msg(sb, KERN_ERR, 761 - "error: device does not support dax"); 762 - break; 763 - case -EINVAL: 764 - vfs_msg(sb, KERN_ERR, 765 - "error: unaligned partition for dax"); 766 - break; 767 - default: 768 - vfs_msg(sb, KERN_ERR, 769 - "error: dax access failed (%d)", err); 770 - } 789 + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); 790 + if (err) { 791 + vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); 771 792 return err; 
793 + } 794 + 795 + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 796 + if (!dax_dev) { 797 + vfs_msg(sb, KERN_ERR, "error: device does not support dax"); 798 + return -EOPNOTSUPP; 799 + } 800 + 801 + id = dax_read_lock(); 802 + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 803 + dax_read_unlock(id); 804 + 805 + put_dax(dax_dev); 806 + 807 + if (len < 1) { 808 + vfs_msg(sb, KERN_ERR, 809 + "error: dax access failed (%ld)", len); 810 + return len < 0 ? len : -EIO; 772 811 } 773 812 774 813 return 0; 775 814 } 776 815 EXPORT_SYMBOL_GPL(bdev_dax_supported); 777 - 778 - /** 779 - * bdev_dax_capable() - Return if the raw device is capable for dax 780 - * @bdev: The device for raw block device access 781 - */ 782 - bool bdev_dax_capable(struct block_device *bdev) 783 - { 784 - struct blk_dax_ctl dax = { 785 - .size = PAGE_SIZE, 786 - }; 787 - 788 - if (!IS_ENABLED(CONFIG_FS_DAX)) 789 - return false; 790 - 791 - dax.sector = 0; 792 - if (bdev_direct_access(bdev, &dax) < 0) 793 - return false; 794 - 795 - dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); 796 - if (bdev_direct_access(bdev, &dax) < 0) 797 - return false; 798 - 799 - return true; 800 - } 801 816 802 817 /* 803 818 * pseudo-fs
+153 -138
fs/dax.c
··· 55 55 } 56 56 fs_initcall(init_dax_wait_table); 57 57 58 - static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) 59 - { 60 - struct request_queue *q = bdev->bd_queue; 61 - long rc = -EIO; 62 - 63 - dax->addr = ERR_PTR(-EIO); 64 - if (blk_queue_enter(q, true) != 0) 65 - return rc; 66 - 67 - rc = bdev_direct_access(bdev, dax); 68 - if (rc < 0) { 69 - dax->addr = ERR_PTR(rc); 70 - blk_queue_exit(q); 71 - return rc; 72 - } 73 - return rc; 74 - } 75 - 76 - static void dax_unmap_atomic(struct block_device *bdev, 77 - const struct blk_dax_ctl *dax) 78 - { 79 - if (IS_ERR(dax->addr)) 80 - return; 81 - blk_queue_exit(bdev->bd_queue); 82 - } 83 - 84 58 static int dax_is_pmd_entry(void *entry) 85 59 { 86 60 return (unsigned long)entry & RADIX_DAX_PMD; ··· 73 99 static int dax_is_empty_entry(void *entry) 74 100 { 75 101 return (unsigned long)entry & RADIX_DAX_EMPTY; 76 - } 77 - 78 - struct page *read_dax_sector(struct block_device *bdev, sector_t n) 79 - { 80 - struct page *page = alloc_pages(GFP_KERNEL, 0); 81 - struct blk_dax_ctl dax = { 82 - .size = PAGE_SIZE, 83 - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), 84 - }; 85 - long rc; 86 - 87 - if (!page) 88 - return ERR_PTR(-ENOMEM); 89 - 90 - rc = dax_map_atomic(bdev, &dax); 91 - if (rc < 0) 92 - return ERR_PTR(rc); 93 - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); 94 - dax_unmap_atomic(bdev, &dax); 95 - return page; 96 102 } 97 103 98 104 /* ··· 536 582 return ret; 537 583 } 538 584 539 - static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, 540 - struct page *to, unsigned long vaddr) 585 + static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 586 + sector_t sector, size_t size, struct page *to, 587 + unsigned long vaddr) 541 588 { 542 - struct blk_dax_ctl dax = { 543 - .sector = sector, 544 - .size = size, 545 - }; 546 - void *vto; 589 + void *vto, *kaddr; 590 + pgoff_t pgoff; 591 + pfn_t pfn; 592 + long rc; 593 + int id; 547 
594 548 - if (dax_map_atomic(bdev, &dax) < 0) 549 - return PTR_ERR(dax.addr); 595 + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 596 + if (rc) 597 + return rc; 598 + 599 + id = dax_read_lock(); 600 + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 601 + if (rc < 0) { 602 + dax_read_unlock(id); 603 + return rc; 604 + } 550 605 vto = kmap_atomic(to); 551 - copy_user_page(vto, (void __force *)dax.addr, vaddr, to); 606 + copy_user_page(vto, (void __force *)kaddr, vaddr, to); 552 607 kunmap_atomic(vto); 553 - dax_unmap_atomic(bdev, &dax); 608 + dax_read_unlock(id); 554 609 return 0; 555 610 } 556 611 ··· 727 764 } 728 765 729 766 static int dax_writeback_one(struct block_device *bdev, 730 - struct address_space *mapping, pgoff_t index, void *entry) 767 + struct dax_device *dax_dev, struct address_space *mapping, 768 + pgoff_t index, void *entry) 731 769 { 732 770 struct radix_tree_root *page_tree = &mapping->page_tree; 733 - struct blk_dax_ctl dax; 734 - void *entry2, **slot; 735 - int ret = 0; 771 + void *entry2, **slot, *kaddr; 772 + long ret = 0, id; 773 + sector_t sector; 774 + pgoff_t pgoff; 775 + size_t size; 776 + pfn_t pfn; 736 777 737 778 /* 738 779 * A page got tagged dirty in DAX mapping? Something is seriously ··· 785 818 * 'entry'. This allows us to flush for PMD_SIZE and not have to 786 819 * worry about partial PMD writebacks. 787 820 */ 788 - dax.sector = dax_radix_sector(entry); 789 - dax.size = PAGE_SIZE << dax_radix_order(entry); 821 + sector = dax_radix_sector(entry); 822 + size = PAGE_SIZE << dax_radix_order(entry); 823 + 824 + id = dax_read_lock(); 825 + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 826 + if (ret) 827 + goto dax_unlock; 790 828 791 829 /* 792 - * We cannot hold tree_lock while calling dax_map_atomic() because it 793 - * eventually calls cond_resched(). 830 + * dax_direct_access() may sleep, so cannot hold tree_lock over 831 + * its invocation. 
794 832 */ 795 - ret = dax_map_atomic(bdev, &dax); 796 - if (ret < 0) { 797 - put_locked_mapping_entry(mapping, index, entry); 798 - return ret; 799 - } 833 + ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); 834 + if (ret < 0) 835 + goto dax_unlock; 800 836 801 - if (WARN_ON_ONCE(ret < dax.size)) { 837 + if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { 802 838 ret = -EIO; 803 - goto unmap; 839 + goto dax_unlock; 804 840 } 805 841 806 - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); 807 - wb_cache_pmem(dax.addr, dax.size); 842 + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); 843 + wb_cache_pmem(kaddr, size); 808 844 /* 809 845 * After we have flushed the cache, we can clear the dirty tag. There 810 846 * cannot be new dirty data in the pfn after the flush has completed as ··· 817 847 spin_lock_irq(&mapping->tree_lock); 818 848 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); 819 849 spin_unlock_irq(&mapping->tree_lock); 820 - unmap: 821 - dax_unmap_atomic(bdev, &dax); 850 + dax_unlock: 851 + dax_read_unlock(id); 822 852 put_locked_mapping_entry(mapping, index, entry); 823 853 return ret; 824 854 ··· 839 869 struct inode *inode = mapping->host; 840 870 pgoff_t start_index, end_index; 841 871 pgoff_t indices[PAGEVEC_SIZE]; 872 + struct dax_device *dax_dev; 842 873 struct pagevec pvec; 843 874 bool done = false; 844 875 int i, ret = 0; ··· 849 878 850 879 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 851 880 return 0; 881 + 882 + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 883 + if (!dax_dev) 884 + return -EIO; 852 885 853 886 start_index = wbc->range_start >> PAGE_SHIFT; 854 887 end_index = wbc->range_end >> PAGE_SHIFT; ··· 874 899 break; 875 900 } 876 901 877 - ret = dax_writeback_one(bdev, mapping, indices[i], 878 - pvec.pages[i]); 879 - if (ret < 0) 902 + ret = dax_writeback_one(bdev, dax_dev, mapping, 903 + indices[i], pvec.pages[i]); 904 + if (ret < 0) { 905 + 
put_dax(dax_dev); 880 906 return ret; 907 + } 881 908 } 882 909 } 910 + put_dax(dax_dev); 883 911 return 0; 884 912 } 885 913 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 886 914 887 915 static int dax_insert_mapping(struct address_space *mapping, 888 - struct block_device *bdev, sector_t sector, size_t size, 889 - void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) 916 + struct block_device *bdev, struct dax_device *dax_dev, 917 + sector_t sector, size_t size, void **entryp, 918 + struct vm_area_struct *vma, struct vm_fault *vmf) 890 919 { 891 920 unsigned long vaddr = vmf->address; 892 - struct blk_dax_ctl dax = { 893 - .sector = sector, 894 - .size = size, 895 - }; 896 - void *ret; 897 921 void *entry = *entryp; 922 + void *ret, *kaddr; 923 + pgoff_t pgoff; 924 + int id, rc; 925 + pfn_t pfn; 898 926 899 - if (dax_map_atomic(bdev, &dax) < 0) 900 - return PTR_ERR(dax.addr); 901 - dax_unmap_atomic(bdev, &dax); 927 + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 928 + if (rc) 929 + return rc; 902 930 903 - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); 931 + id = dax_read_lock(); 932 + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 933 + if (rc < 0) { 934 + dax_read_unlock(id); 935 + return rc; 936 + } 937 + dax_read_unlock(id); 938 + 939 + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); 904 940 if (IS_ERR(ret)) 905 941 return PTR_ERR(ret); 906 942 *entryp = ret; 907 943 908 - return vm_insert_mixed(vma, vaddr, dax.pfn); 944 + return vm_insert_mixed(vma, vaddr, pfn); 909 945 } 910 946 911 947 /** ··· 965 979 return true; 966 980 } 967 981 968 - int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 969 - unsigned int offset, unsigned int length) 982 + int __dax_zero_page_range(struct block_device *bdev, 983 + struct dax_device *dax_dev, sector_t sector, 984 + unsigned int offset, unsigned int size) 970 985 { 971 - struct blk_dax_ctl dax = { 972 - .sector = sector, 973 - 
.size = PAGE_SIZE, 974 - }; 975 - 976 - if (dax_range_is_aligned(bdev, offset, length)) { 977 - sector_t start_sector = dax.sector + (offset >> 9); 986 + if (dax_range_is_aligned(bdev, offset, size)) { 987 + sector_t start_sector = sector + (offset >> 9); 978 988 979 989 return blkdev_issue_zeroout(bdev, start_sector, 980 - length >> 9, GFP_NOFS, 0); 990 + size >> 9, GFP_NOFS, 0); 981 991 } else { 982 - if (dax_map_atomic(bdev, &dax) < 0) 983 - return PTR_ERR(dax.addr); 984 - clear_pmem(dax.addr + offset, length); 985 - dax_unmap_atomic(bdev, &dax); 992 + pgoff_t pgoff; 993 + long rc, id; 994 + void *kaddr; 995 + pfn_t pfn; 996 + 997 + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 998 + if (rc) 999 + return rc; 1000 + 1001 + id = dax_read_lock(); 1002 + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, 1003 + &pfn); 1004 + if (rc < 0) { 1005 + dax_read_unlock(id); 1006 + return rc; 1007 + } 1008 + clear_pmem(kaddr + offset, size); 1009 + dax_read_unlock(id); 986 1010 } 987 1011 return 0; 988 1012 } ··· 1007 1011 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 1008 1012 struct iomap *iomap) 1009 1013 { 1014 + struct block_device *bdev = iomap->bdev; 1015 + struct dax_device *dax_dev = iomap->dax_dev; 1010 1016 struct iov_iter *iter = data; 1011 1017 loff_t end = pos + length, done = 0; 1012 1018 ssize_t ret = 0; 1019 + int id; 1013 1020 1014 1021 if (iov_iter_rw(iter) == READ) { 1015 1022 end = min(end, i_size_read(inode)); ··· 1037 1038 (end - 1) >> PAGE_SHIFT); 1038 1039 } 1039 1040 1041 + id = dax_read_lock(); 1040 1042 while (pos < end) { 1041 1043 unsigned offset = pos & (PAGE_SIZE - 1); 1042 - struct blk_dax_ctl dax = { 0 }; 1044 + const size_t size = ALIGN(length + offset, PAGE_SIZE); 1045 + const sector_t sector = dax_iomap_sector(iomap, pos); 1043 1046 ssize_t map_len; 1047 + pgoff_t pgoff; 1048 + void *kaddr; 1049 + pfn_t pfn; 1044 1050 1045 1051 if (fatal_signal_pending(current)) { 1046 1052 ret = -EINTR; 1047 
1053 break; 1048 1054 } 1049 1055 1050 - dax.sector = dax_iomap_sector(iomap, pos); 1051 - dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; 1052 - map_len = dax_map_atomic(iomap->bdev, &dax); 1056 + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 1057 + if (ret) 1058 + break; 1059 + 1060 + map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1061 + &kaddr, &pfn); 1053 1062 if (map_len < 0) { 1054 1063 ret = map_len; 1055 1064 break; 1056 1065 } 1057 1066 1058 - dax.addr += offset; 1067 + map_len = PFN_PHYS(map_len); 1068 + kaddr += offset; 1059 1069 map_len -= offset; 1060 1070 if (map_len > end - pos) 1061 1071 map_len = end - pos; 1062 1072 1063 1073 if (iov_iter_rw(iter) == WRITE) 1064 - map_len = copy_from_iter_pmem(dax.addr, map_len, iter); 1074 + map_len = copy_from_iter_pmem(kaddr, map_len, iter); 1065 1075 else 1066 - map_len = copy_to_iter(dax.addr, map_len, iter); 1067 - dax_unmap_atomic(iomap->bdev, &dax); 1076 + map_len = copy_to_iter(kaddr, map_len, iter); 1068 1077 if (map_len <= 0) { 1069 1078 ret = map_len ? map_len : -EFAULT; 1070 1079 break; ··· 1082 1075 length -= map_len; 1083 1076 done += map_len; 1084 1077 } 1078 + dax_read_unlock(id); 1085 1079 1086 1080 return done ? 
done : ret; 1087 1081 } ··· 1189 1181 clear_user_highpage(vmf->cow_page, vaddr); 1190 1182 break; 1191 1183 case IOMAP_MAPPED: 1192 - error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, 1193 - vmf->cow_page, vaddr); 1184 + error = copy_user_dax(iomap.bdev, iomap.dax_dev, 1185 + sector, PAGE_SIZE, vmf->cow_page, vaddr); 1194 1186 break; 1195 1187 default: 1196 1188 WARN_ON_ONCE(1); ··· 1215 1207 mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); 1216 1208 major = VM_FAULT_MAJOR; 1217 1209 } 1218 - error = dax_insert_mapping(mapping, iomap.bdev, sector, 1219 - PAGE_SIZE, &entry, vmf->vma, vmf); 1210 + error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1211 + sector, PAGE_SIZE, &entry, vmf->vma, vmf); 1220 1212 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1221 1213 if (error == -EBUSY) 1222 1214 error = 0; ··· 1266 1258 loff_t pos, void **entryp) 1267 1259 { 1268 1260 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1261 + const sector_t sector = dax_iomap_sector(iomap, pos); 1262 + struct dax_device *dax_dev = iomap->dax_dev; 1269 1263 struct block_device *bdev = iomap->bdev; 1270 1264 struct inode *inode = mapping->host; 1271 - struct blk_dax_ctl dax = { 1272 - .sector = dax_iomap_sector(iomap, pos), 1273 - .size = PMD_SIZE, 1274 - }; 1275 - long length = dax_map_atomic(bdev, &dax); 1276 - void *ret = NULL; 1265 + const size_t size = PMD_SIZE; 1266 + void *ret = NULL, *kaddr; 1267 + long length = 0; 1268 + pgoff_t pgoff; 1269 + pfn_t pfn; 1270 + int id; 1277 1271 1278 - if (length < 0) /* dax_map_atomic() failed */ 1272 + if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) 1279 1273 goto fallback; 1280 - if (length < PMD_SIZE) 1281 - goto unmap_fallback; 1282 - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) 1283 - goto unmap_fallback; 1284 - if (!pfn_t_devmap(dax.pfn)) 1285 - goto unmap_fallback; 1286 1274 1287 - dax_unmap_atomic(bdev, &dax); 1275 + id = dax_read_lock(); 1276 + length = dax_direct_access(dax_dev, pgoff, 
PHYS_PFN(size), &kaddr, &pfn); 1277 + if (length < 0) 1278 + goto unlock_fallback; 1279 + length = PFN_PHYS(length); 1288 1280 1289 - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, 1281 + if (length < size) 1282 + goto unlock_fallback; 1283 + if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) 1284 + goto unlock_fallback; 1285 + if (!pfn_t_devmap(pfn)) 1286 + goto unlock_fallback; 1287 + dax_read_unlock(id); 1288 + 1289 + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, 1290 1290 RADIX_DAX_PMD); 1291 1291 if (IS_ERR(ret)) 1292 1292 goto fallback; 1293 1293 *entryp = ret; 1294 1294 1295 - trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); 1295 + trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); 1296 1296 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1297 - dax.pfn, vmf->flags & FAULT_FLAG_WRITE); 1297 + pfn, vmf->flags & FAULT_FLAG_WRITE); 1298 1298 1299 - unmap_fallback: 1300 - dax_unmap_atomic(bdev, &dax); 1299 + unlock_fallback: 1300 + dax_read_unlock(id); 1301 1301 fallback: 1302 - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, 1303 - dax.pfn, ret); 1302 + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); 1304 1303 return VM_FAULT_FALLBACK; 1305 1304 } 1306 1305
+8 -1
fs/ext2/inode.c
··· 799 799 static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 800 800 unsigned flags, struct iomap *iomap) 801 801 { 802 + struct block_device *bdev; 802 803 unsigned int blkbits = inode->i_blkbits; 803 804 unsigned long first_block = offset >> blkbits; 804 805 unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; ··· 813 812 return ret; 814 813 815 814 iomap->flags = 0; 816 - iomap->bdev = inode->i_sb->s_bdev; 815 + bdev = inode->i_sb->s_bdev; 816 + iomap->bdev = bdev; 817 817 iomap->offset = (u64)first_block << blkbits; 818 + if (blk_queue_dax(bdev->bd_queue)) 819 + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 820 + else 821 + iomap->dax_dev = NULL; 818 822 819 823 if (ret == 0) { 820 824 iomap->type = IOMAP_HOLE; ··· 841 835 ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, 842 836 ssize_t written, unsigned flags, struct iomap *iomap) 843 837 { 838 + put_dax(iomap->dax_dev); 844 839 if (iomap->type == IOMAP_MAPPED && 845 840 written < length && 846 841 (flags & IOMAP_WRITE))
+8 -1
fs/ext4/inode.c
··· 3305 3305 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3306 3306 unsigned flags, struct iomap *iomap) 3307 3307 { 3308 + struct block_device *bdev; 3308 3309 unsigned int blkbits = inode->i_blkbits; 3309 3310 unsigned long first_block = offset >> blkbits; 3310 3311 unsigned long last_block = (offset + length - 1) >> blkbits; ··· 3374 3373 } 3375 3374 3376 3375 iomap->flags = 0; 3377 - iomap->bdev = inode->i_sb->s_bdev; 3376 + bdev = inode->i_sb->s_bdev; 3377 + iomap->bdev = bdev; 3378 + if (blk_queue_dax(bdev->bd_queue)) 3379 + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 3380 + else 3381 + iomap->dax_dev = NULL; 3378 3382 iomap->offset = first_block << blkbits; 3379 3383 3380 3384 if (ret == 0) { ··· 3412 3406 int blkbits = inode->i_blkbits; 3413 3407 bool truncate = false; 3414 3408 3409 + put_dax(iomap->dax_dev); 3415 3410 if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) 3416 3411 return 0; 3417 3412
+2 -1
fs/iomap.c
··· 360 360 sector_t sector = iomap->blkno + 361 361 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); 362 362 363 - return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); 363 + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, 364 + offset, bytes); 364 365 } 365 366 366 367 static loff_t
+10
fs/xfs/xfs_iomap.c
··· 976 976 int nimaps = 1, error = 0; 977 977 bool shared = false, trimmed = false; 978 978 unsigned lockmode; 979 + struct block_device *bdev; 979 980 980 981 if (XFS_FORCED_SHUTDOWN(mp)) 981 982 return -EIO; ··· 1064 1063 } 1065 1064 1066 1065 xfs_bmbt_to_iomap(ip, iomap, &imap); 1066 + 1067 + /* optionally associate a dax device with the iomap bdev */ 1068 + bdev = iomap->bdev; 1069 + if (blk_queue_dax(bdev->bd_queue)) 1070 + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 1071 + else 1072 + iomap->dax_dev = NULL; 1073 + 1067 1074 if (shared) 1068 1075 iomap->flags |= IOMAP_F_SHARED; 1069 1076 return 0; ··· 1149 1140 unsigned flags, 1150 1141 struct iomap *iomap) 1151 1142 { 1143 + put_dax(iomap->dax_dev); 1152 1144 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) 1153 1145 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, 1154 1146 length, written, iomap);
+1 -18
include/linux/blkdev.h
··· 1923 1923 1924 1924 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1925 1925 1926 - /** 1927 - * struct blk_dax_ctl - control and output parameters for ->direct_access 1928 - * @sector: (input) offset relative to a block_device 1929 - * @addr: (output) kernel virtual address for @sector populated by driver 1930 - * @pfn: (output) page frame number for @addr populated by driver 1931 - * @size: (input) number of bytes requested 1932 - */ 1933 - struct blk_dax_ctl { 1934 - sector_t sector; 1935 - void *addr; 1936 - long size; 1937 - pfn_t pfn; 1938 - }; 1939 - 1940 1926 struct block_device_operations { 1941 1927 int (*open) (struct block_device *, fmode_t); 1942 1928 void (*release) (struct gendisk *, fmode_t); 1943 1929 int (*rw_page)(struct block_device *, sector_t, struct page *, bool); 1944 1930 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1945 1931 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1946 - long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *, 1947 - long); 1948 1932 unsigned int (*check_events) (struct gendisk *disk, 1949 1933 unsigned int clearing); 1950 1934 /* ->media_changed() is DEPRECATED, use ->check_events() instead */ ··· 1947 1963 extern int bdev_read_page(struct block_device *, sector_t, struct page *); 1948 1964 extern int bdev_write_page(struct block_device *, sector_t, struct page *, 1949 1965 struct writeback_control *); 1950 - extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); 1951 1966 extern int bdev_dax_supported(struct super_block *, int); 1952 - extern bool bdev_dax_capable(struct block_device *); 1967 + int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); 1953 1968 #else /* CONFIG_BLOCK */ 1954 1969 1955 1970 struct block_device;
+26 -8
include/linux/dax.h
··· 7 7 #include <asm/pgtable.h> 8 8 9 9 struct iomap_ops; 10 + struct dax_device; 11 + struct dax_operations { 12 + /* 13 + * direct_access: translate a device-relative 14 + * logical-page-offset into an absolute physical pfn. Return the 15 + * number of pages available for DAX at that pfn. 16 + */ 17 + long (*direct_access)(struct dax_device *, pgoff_t, long, 18 + void **, pfn_t *); 19 + }; 20 + 21 + int dax_read_lock(void); 22 + void dax_read_unlock(int id); 23 + struct dax_device *dax_get_by_host(const char *host); 24 + struct dax_device *alloc_dax(void *private, const char *host, 25 + const struct dax_operations *ops); 26 + void put_dax(struct dax_device *dax_dev); 27 + bool dax_alive(struct dax_device *dax_dev); 28 + void kill_dax(struct dax_device *dax_dev); 29 + void *dax_get_private(struct dax_device *dax_dev); 30 + long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 31 + void **kaddr, pfn_t *pfn); 10 32 11 33 /* 12 34 * We use lowest available bit in exceptional entry for locking, one bit for ··· 70 48 pgoff_t index, void *entry, bool wake_all); 71 49 72 50 #ifdef CONFIG_FS_DAX 73 - struct page *read_dax_sector(struct block_device *bdev, sector_t n); 74 - int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 51 + int __dax_zero_page_range(struct block_device *bdev, 52 + struct dax_device *dax_dev, sector_t sector, 75 53 unsigned int offset, unsigned int length); 76 54 #else 77 - static inline struct page *read_dax_sector(struct block_device *bdev, 78 - sector_t n) 79 - { 80 - return ERR_PTR(-ENXIO); 81 - } 82 55 static inline int __dax_zero_page_range(struct block_device *bdev, 83 - sector_t sector, unsigned int offset, unsigned int length) 56 + struct dax_device *dax_dev, sector_t sector, 57 + unsigned int offset, unsigned int length) 84 58 { 85 59 return -ENXIO; 86 60 }
+5 -3
include/linux/device-mapper.h
··· 130 130 * < 0 : error 131 131 * >= 0 : the number of bytes accessible at the address 132 132 */ 133 - typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector, 134 - void **kaddr, pfn_t *pfn, long size); 133 + typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, 134 + long nr_pages, void **kaddr, pfn_t *pfn); 135 + #define PAGE_SECTORS (PAGE_SIZE / 512) 135 136 136 137 void dm_error(const char *message); 137 138 138 139 struct dm_dev { 139 140 struct block_device *bdev; 141 + struct dax_device *dax_dev; 140 142 fmode_t mode; 141 143 char name[16]; 142 144 }; ··· 180 178 dm_busy_fn busy; 181 179 dm_iterate_devices_fn iterate_devices; 182 180 dm_io_hints_fn io_hints; 183 - dm_direct_access_fn direct_access; 181 + dm_dax_direct_access_fn direct_access; 184 182 185 183 /* For internal device-mapper use. */ 186 184 struct list_head list;
+1
include/linux/iomap.h
··· 41 41 u16 type; /* type of mapping */ 42 42 u16 flags; /* flags for mapping */ 43 43 struct block_device *bdev; /* block device for I/O */ 44 + struct dax_device *dax_dev; /* dax_dev for dax operations */ 44 45 }; 45 46 46 47 /*
+5 -3
include/linux/libnvdimm.h
··· 20 20 21 21 enum { 22 22 /* when a dimm supports both PMEM and BLK access a label is required */ 23 - NDD_ALIASING = 1 << 0, 23 + NDD_ALIASING = 0, 24 24 /* unarmed memory devices may not persist writes */ 25 - NDD_UNARMED = 1 << 1, 25 + NDD_UNARMED = 1, 26 + /* locked memory devices should not be accessed */ 27 + NDD_LOCKED = 2, 26 28 27 29 /* need to set a limit somewhere, but yes, this is likely overkill */ 28 30 ND_IOCTL_MAX_BUFLEN = SZ_4M, ··· 122 120 } 123 121 124 122 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); 125 - void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, 123 + void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, 126 124 phys_addr_t start, unsigned int len); 127 125 struct nvdimm_bus *nvdimm_bus_register(struct device *parent, 128 126 struct nvdimm_bus_descriptor *nfit_desc);
-23
include/linux/pmem.h
··· 31 31 BUG(); 32 32 } 33 33 34 - static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) 35 - { 36 - BUG(); 37 - return -EFAULT; 38 - } 39 - 40 34 static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, 41 35 struct iov_iter *i) 42 36 { ··· 57 63 static inline bool arch_has_pmem_api(void) 58 64 { 59 65 return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); 60 - } 61 - 62 - /* 63 - * memcpy_from_pmem - read from persistent memory with error handling 64 - * @dst: destination buffer 65 - * @src: source buffer 66 - * @size: transfer length 67 - * 68 - * Returns 0 on success negative error code on failure. 69 - */ 70 - static inline int memcpy_from_pmem(void *dst, void const *src, size_t size) 71 - { 72 - if (arch_has_pmem_api()) 73 - return arch_memcpy_from_pmem(dst, src, size); 74 - else 75 - memcpy(dst, src, size); 76 - return 0; 77 66 } 78 67 79 68 /**
+8
include/linux/string.h
··· 114 114 #ifndef __HAVE_ARCH_MEMCHR 115 115 extern void * memchr(const void *,int,__kernel_size_t); 116 116 #endif 117 + #ifndef __HAVE_ARCH_MEMCPY_MCSAFE 118 + static inline __must_check int memcpy_mcsafe(void *dst, const void *src, 119 + size_t cnt) 120 + { 121 + memcpy(dst, src, cnt); 122 + return 0; 123 + } 124 + #endif 117 125 void *memchr_inv(const void *s, int c, size_t n); 118 126 char *strreplace(char *s, char old, char new); 119 127
+1
include/uapi/linux/ndctl.h
··· 169 169 enum { 170 170 ND_ARS_VOLATILE = 1, 171 171 ND_ARS_PERSISTENT = 2, 172 + ND_CONFIG_LOCKED = 1, 172 173 }; 173 174 174 175 static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
+9 -2
tools/testing/nvdimm/Kbuild
··· 28 28 obj-$(CONFIG_ND_BLK) += nd_blk.o 29 29 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o 30 30 obj-$(CONFIG_ACPI_NFIT) += nfit.o 31 - obj-$(CONFIG_DEV_DAX) += dax.o 31 + ifeq ($(CONFIG_DAX),m) 32 + obj-$(CONFIG_DAX) += dax.o 33 + endif 34 + obj-$(CONFIG_DEV_DAX) += device_dax.o 32 35 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 33 36 34 37 nfit-y := $(ACPI_SRC)/core.o ··· 51 48 nd_e820-y := $(NVDIMM_SRC)/e820.o 52 49 nd_e820-y += config_check.o 53 50 54 - dax-y := $(DAX_SRC)/dax.o 51 + dax-y := $(DAX_SRC)/super.o 55 52 dax-y += config_check.o 53 + 54 + device_dax-y := $(DAX_SRC)/device.o 55 + device_dax-y += dax-dev.o 56 + device_dax-y += config_check.o 56 57 57 58 dax_pmem-y := $(DAX_SRC)/pmem.o 58 59 dax_pmem-y += config_check.o
+49
tools/testing/nvdimm/dax-dev.c
··· 1 + /* 2 + * Copyright (c) 2016, Intel Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or modify it 5 + * under the terms and conditions of the GNU General Public License, 6 + * version 2, as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope it will be useful, but WITHOUT 9 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 + * more details. 12 + */ 13 + #include "test/nfit_test.h" 14 + #include <linux/mm.h> 15 + #include "../../../drivers/dax/dax-private.h" 16 + 17 + phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, 18 + unsigned long size) 19 + { 20 + struct resource *res; 21 + phys_addr_t addr; 22 + int i; 23 + 24 + for (i = 0; i < dev_dax->num_resources; i++) { 25 + res = &dev_dax->res[i]; 26 + addr = pgoff * PAGE_SIZE + res->start; 27 + if (addr >= res->start && addr <= res->end) 28 + break; 29 + pgoff -= PHYS_PFN(resource_size(res)); 30 + } 31 + 32 + if (i < dev_dax->num_resources) { 33 + res = &dev_dax->res[i]; 34 + if (addr + size - 1 <= res->end) { 35 + if (get_nfit_res(addr)) { 36 + struct page *page; 37 + 38 + if (dev_dax->region->align > PAGE_SIZE) 39 + return -1; 40 + 41 + page = vmalloc_to_page((void *)addr); 42 + return PFN_PHYS(page_to_pfn(page)); 43 + } else 44 + return addr; 45 + } 46 + } 47 + 48 + return -1; 49 + }
+10 -11
tools/testing/nvdimm/pmem-dax.c
··· 15 15 #include <pmem.h> 16 16 #include <nd.h> 17 17 18 - long pmem_direct_access(struct block_device *bdev, sector_t sector, 19 - void **kaddr, pfn_t *pfn, long size) 18 + long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, 19 + long nr_pages, void **kaddr, pfn_t *pfn) 20 20 { 21 - struct pmem_device *pmem = bdev->bd_queue->queuedata; 22 - resource_size_t offset = sector * 512 + pmem->data_offset; 21 + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; 23 22 24 - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) 23 + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, 24 + PFN_PHYS(nr_pages)))) 25 25 return -EIO; 26 26 27 27 /* ··· 34 34 *kaddr = pmem->virt_addr + offset; 35 35 page = vmalloc_to_page(pmem->virt_addr + offset); 36 36 *pfn = page_to_pfn_t(page); 37 - dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent, 38 - "%s: sector: %#llx pfn: %#lx\n", __func__, 39 - (unsigned long long) sector, page_to_pfn(page)); 37 + pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n", 38 + __func__, pmem, pgoff, page_to_pfn(page)); 40 39 41 - return PAGE_SIZE; 40 + return 1; 42 41 } 43 42 44 43 *kaddr = pmem->virt_addr + offset; ··· 48 49 * requested range. 49 50 */ 50 51 if (unlikely(pmem->bb.count)) 51 - return size; 52 - return pmem->size - pmem->pfn_pad - offset; 52 + return nr_pages; 53 + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); 53 54 }
+50 -4
tools/testing/nvdimm/test/nfit.c
··· 132 132 [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1), 133 133 [4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0), 134 134 [5] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0), 135 + [6] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 1), 135 136 }; 136 137 137 138 static unsigned long dimm_fail_cmd_flags[NUM_DCR]; ··· 729 728 static int nfit_test1_alloc(struct nfit_test *t) 730 729 { 731 730 size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2 732 - + sizeof(struct acpi_nfit_memory_map) 733 - + offsetof(struct acpi_nfit_control_region, window_size); 731 + + sizeof(struct acpi_nfit_memory_map) * 2 732 + + offsetof(struct acpi_nfit_control_region, window_size) * 2; 734 733 int i; 735 734 736 735 t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); ··· 907 906 memdev->address = 0; 908 907 memdev->interleave_index = 0; 909 908 memdev->interleave_ways = 2; 909 + memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED; 910 910 911 911 /* mem-region2 (spa1, dimm0) */ 912 912 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2; ··· 923 921 memdev->address = SPA0_SIZE/2; 924 922 memdev->interleave_index = 0; 925 923 memdev->interleave_ways = 4; 924 + memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED; 926 925 927 926 /* mem-region3 (spa1, dimm1) */ 928 927 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3; ··· 954 951 memdev->address = SPA0_SIZE/2; 955 952 memdev->interleave_index = 0; 956 953 memdev->interleave_ways = 4; 954 + memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED; 957 955 958 956 /* mem-region5 (spa1, dimm3) */ 959 957 memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5; ··· 1090 1086 memdev->address = 0; 1091 1087 memdev->interleave_index = 0; 1092 1088 memdev->interleave_ways = 1; 1089 + memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED; 1093 1090 1094 1091 offset = offset + sizeof(struct acpi_nfit_memory_map) * 14; 1095 1092 /* dcr-descriptor0: blk */ ··· 1389 1384 memdev->address = 0; 1390 1385 memdev->interleave_index = 0; 1391 1386 memdev->interleave_ways = 1; 1387 
+ memdev->flags = ACPI_NFIT_MEM_HEALTH_ENABLED; 1392 1388 1393 1389 /* mem-region16 (spa/bdw4, dimm4) */ 1394 1390 memdev = nfit_buf + offset + ··· 1489 1483 dcr->region_index = 0+1; 1490 1484 dcr_common_init(dcr); 1491 1485 dcr->serial_number = ~handle[5]; 1486 + dcr->code = NFIT_FIC_BYTE; 1487 + dcr->windows = 0; 1488 + 1489 + offset += dcr->header.length; 1490 + memdev = nfit_buf + offset; 1491 + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; 1492 + memdev->header.length = sizeof(*memdev); 1493 + memdev->device_handle = handle[6]; 1494 + memdev->physical_id = 0; 1495 + memdev->region_id = 0; 1496 + memdev->range_index = 0; 1497 + memdev->region_index = 0+2; 1498 + memdev->region_size = SPA2_SIZE; 1499 + memdev->region_offset = 0; 1500 + memdev->address = 0; 1501 + memdev->interleave_index = 0; 1502 + memdev->interleave_ways = 1; 1503 + memdev->flags = ACPI_NFIT_MEM_MAP_FAILED; 1504 + 1505 + /* dcr-descriptor1 */ 1506 + offset += sizeof(*memdev); 1507 + dcr = nfit_buf + offset; 1508 + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; 1509 + dcr->header.length = offsetof(struct acpi_nfit_control_region, 1510 + window_size); 1511 + dcr->region_index = 0+2; 1512 + dcr_common_init(dcr); 1513 + dcr->serial_number = ~handle[6]; 1492 1514 dcr->code = NFIT_FIC_BYTE; 1493 1515 dcr->windows = 0; 1494 1516 ··· 1851 1817 if (rc) 1852 1818 return rc; 1853 1819 1820 + rc = devm_add_action_or_reset(&pdev->dev, acpi_nfit_shutdown, acpi_desc); 1821 + if (rc) 1822 + return rc; 1823 + 1854 1824 if (nfit_test->setup != nfit_test0_setup) 1855 1825 return 0; 1856 1826 ··· 1945 1907 case 1: 1946 1908 nfit_test->num_pm = 1; 1947 1909 nfit_test->dcr_idx = NUM_DCR; 1948 - nfit_test->num_dcr = 1; 1910 + nfit_test->num_dcr = 2; 1949 1911 nfit_test->alloc = nfit_test1_alloc; 1950 1912 nfit_test->setup = nfit_test1_setup; 1951 1913 break; ··· 1962 1924 put_device(&pdev->dev); 1963 1925 goto err_register; 1964 1926 } 1927 + get_device(&pdev->dev); 1965 1928 1966 1929 rc = 
dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); 1967 1930 if (rc) ··· 1981 1942 if (instances[i]) 1982 1943 platform_device_unregister(&instances[i]->pdev); 1983 1944 nfit_test_teardown(); 1945 + for (i = 0; i < NUM_NFITS; i++) 1946 + if (instances[i]) 1947 + put_device(&instances[i]->pdev.dev); 1948 + 1984 1949 return rc; 1985 1950 } 1986 1951 ··· 1992 1949 { 1993 1950 int i; 1994 1951 1995 - platform_driver_unregister(&nfit_test_driver); 1996 1952 for (i = 0; i < NUM_NFITS; i++) 1997 1953 platform_device_unregister(&instances[i]->pdev); 1954 + platform_driver_unregister(&nfit_test_driver); 1998 1955 nfit_test_teardown(); 1956 + 1957 + for (i = 0; i < NUM_NFITS; i++) 1958 + put_device(&instances[i]->pdev.dev); 1999 1959 class_destroy(nfit_test_dimm); 2000 1960 } 2001 1961