Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
at v5.15-rc2 719 lines 17 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Copyright(c) 2017 Intel Corporation. All rights reserved. 4 */ 5#include <linux/pagemap.h> 6#include <linux/module.h> 7#include <linux/mount.h> 8#include <linux/pseudo_fs.h> 9#include <linux/magic.h> 10#include <linux/genhd.h> 11#include <linux/pfn_t.h> 12#include <linux/cdev.h> 13#include <linux/hash.h> 14#include <linux/slab.h> 15#include <linux/uio.h> 16#include <linux/dax.h> 17#include <linux/fs.h> 18#include "dax-private.h" 19 20/** 21 * struct dax_device - anchor object for dax services 22 * @inode: core vfs 23 * @cdev: optional character interface for "device dax" 24 * @host: optional name for lookups where the device path is not available 25 * @private: dax driver private data 26 * @flags: state and boolean properties 27 */ 28struct dax_device { 29 struct hlist_node list; 30 struct inode inode; 31 struct cdev cdev; 32 const char *host; 33 void *private; 34 unsigned long flags; 35 const struct dax_operations *ops; 36}; 37 38static dev_t dax_devt; 39DEFINE_STATIC_SRCU(dax_srcu); 40static struct vfsmount *dax_mnt; 41static DEFINE_IDA(dax_minor_ida); 42static struct kmem_cache *dax_cache __read_mostly; 43static struct super_block *dax_superblock __read_mostly; 44 45#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 46static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 47static DEFINE_SPINLOCK(dax_host_lock); 48 49int dax_read_lock(void) 50{ 51 return srcu_read_lock(&dax_srcu); 52} 53EXPORT_SYMBOL_GPL(dax_read_lock); 54 55void dax_read_unlock(int id) 56{ 57 srcu_read_unlock(&dax_srcu, id); 58} 59EXPORT_SYMBOL_GPL(dax_read_unlock); 60 61static int dax_host_hash(const char *host) 62{ 63 return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 64} 65 66/** 67 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 68 * @host: alternate name for the device registered by a dax driver 69 */ 70static struct dax_device *dax_get_by_host(const char *host) 71{ 72 struct dax_device *dax_dev, *found = NULL; 73 int hash, id; 74 75 if (!host) 76 return NULL; 77 78 hash = dax_host_hash(host); 79 80 id = dax_read_lock(); 81 spin_lock(&dax_host_lock); 82 hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 83 if (!dax_alive(dax_dev) 84 || strcmp(host, dax_dev->host) != 0) 85 continue; 86 87 if (igrab(&dax_dev->inode)) 88 found = dax_dev; 89 break; 90 } 91 spin_unlock(&dax_host_lock); 92 dax_read_unlock(id); 93 94 return found; 95} 96 97#ifdef CONFIG_BLOCK 98#include <linux/blkdev.h> 99 100int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 101 pgoff_t *pgoff) 102{ 103 sector_t start_sect = bdev ? get_start_sect(bdev) : 0; 104 phys_addr_t phys_off = (start_sect + sector) * 512; 105 106 if (pgoff) 107 *pgoff = PHYS_PFN(phys_off); 108 if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 109 return -EINVAL; 110 return 0; 111} 112EXPORT_SYMBOL(bdev_dax_pgoff); 113 114#if IS_ENABLED(CONFIG_FS_DAX) 115struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) 116{ 117 if (!blk_queue_dax(bdev->bd_disk->queue)) 118 return NULL; 119 return dax_get_by_host(bdev->bd_disk->disk_name); 120} 121EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 122 123bool generic_fsdax_supported(struct dax_device *dax_dev, 124 struct block_device *bdev, int blocksize, sector_t start, 125 sector_t sectors) 126{ 127 bool dax_enabled = false; 128 pgoff_t pgoff, pgoff_end; 129 void *kaddr, *end_kaddr; 130 pfn_t pfn, end_pfn; 131 sector_t last_page; 132 long len, len2; 133 int err, id; 134 135 if (blocksize != PAGE_SIZE) { 136 pr_info("%pg: error: unsupported blocksize for dax\n", bdev); 137 return false; 138 } 139 140 if (!dax_dev) { 141 pr_debug("%pg: error: dax unsupported by block device\n", bdev); 142 return false; 143 } 144 145 err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); 146 if (err) { 147 pr_info("%pg: error: unaligned partition for dax\n", bdev); 148 return false; 149 } 150 151 last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; 152 err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); 153 if (err) { 154 pr_info("%pg: error: unaligned partition for dax\n", bdev); 155 return false; 156 } 157 158 id = dax_read_lock(); 159 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 160 len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); 161 162 if (len < 1 || len2 < 1) { 163 pr_info("%pg: error: dax access failed (%ld)\n", 164 bdev, len < 1 ? len : len2); 165 dax_read_unlock(id); 166 return false; 167 } 168 169 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { 170 /* 171 * An arch that has enabled the pmem api should also 172 * have its drivers support pfn_t_devmap() 173 * 174 * This is a developer warning and should not trigger in 175 * production. dax_flush() will crash since it depends 176 * on being able to do (page_address(pfn_to_page())). 177 */ 178 WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); 179 dax_enabled = true; 180 } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { 181 struct dev_pagemap *pgmap, *end_pgmap; 182 183 pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); 184 end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); 185 if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX 186 && pfn_t_to_page(pfn)->pgmap == pgmap 187 && pfn_t_to_page(end_pfn)->pgmap == pgmap 188 && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) 189 && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) 190 dax_enabled = true; 191 put_dev_pagemap(pgmap); 192 put_dev_pagemap(end_pgmap); 193 194 } 195 dax_read_unlock(id); 196 197 if (!dax_enabled) { 198 pr_info("%pg: error: dax support not enabled\n", bdev); 199 return false; 200 } 201 return true; 202} 203EXPORT_SYMBOL_GPL(generic_fsdax_supported); 204 205bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 206 int blocksize, sector_t start, sector_t len) 207{ 208 bool ret = false; 209 int id; 210 211 if (!dax_dev) 212 return false; 213 214 id = dax_read_lock(); 215 if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) 216 ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, 217 start, len); 218 dax_read_unlock(id); 219 return ret; 220} 221EXPORT_SYMBOL_GPL(dax_supported); 222#endif /* CONFIG_FS_DAX */ 223#endif /* CONFIG_BLOCK */ 224 225enum dax_device_flags { 226 /* !alive + rcu grace period == no new operations / mappings */ 227 DAXDEV_ALIVE, 228 /* gate whether dax_flush() calls the low level flush routine */ 229 DAXDEV_WRITE_CACHE, 230 /* flag to check if device supports synchronous flush */ 231 DAXDEV_SYNC, 232}; 233 234static ssize_t write_cache_show(struct device *dev, 235 struct device_attribute *attr, char *buf) 236{ 237 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 238 ssize_t rc; 239 240 WARN_ON_ONCE(!dax_dev); 241 if (!dax_dev) 242 return -ENXIO; 243 244 rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); 245 put_dax(dax_dev); 246 return rc; 247} 248 249static ssize_t write_cache_store(struct device *dev, 250 struct device_attribute *attr, const char *buf, size_t len) 251{ 252 bool write_cache; 253 int rc = strtobool(buf, &write_cache); 254 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 255 256 WARN_ON_ONCE(!dax_dev); 257 if (!dax_dev) 258 return -ENXIO; 259 260 if (rc) 261 len = rc; 262 else 263 dax_write_cache(dax_dev, write_cache); 264 265 put_dax(dax_dev); 266 return len; 267} 268static DEVICE_ATTR_RW(write_cache); 269 270static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) 271{ 272 struct device *dev = container_of(kobj, typeof(*dev), kobj); 273 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 274 275 WARN_ON_ONCE(!dax_dev); 276 if (!dax_dev) 277 return 0; 278 279#ifndef CONFIG_ARCH_HAS_PMEM_API 280 if (a == &dev_attr_write_cache.attr) 281 return 0; 282#endif 283 return a->mode; 284} 285 286static struct attribute *dax_attributes[] = { 287 &dev_attr_write_cache.attr, 288 NULL, 289}; 290 291struct attribute_group dax_attribute_group = { 292 .name = "dax", 293 .attrs = dax_attributes, 294 .is_visible = dax_visible, 295}; 296EXPORT_SYMBOL_GPL(dax_attribute_group); 297 298/** 299 * dax_direct_access() - translate a device pgoff to an absolute pfn 300 * @dax_dev: a dax_device instance representing the logical memory range 301 * @pgoff: offset in pages from the start of the device to translate 302 * @nr_pages: number of consecutive pages caller can handle relative to @pfn 303 * @kaddr: output parameter that returns a virtual address mapping of pfn 304 * @pfn: output parameter that returns an absolute pfn translation of @pgoff 305 * 306 * Return: negative errno if an error occurs, otherwise the number of 307 * pages accessible at the device relative @pgoff. 308 */ 309long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 310 void **kaddr, pfn_t *pfn) 311{ 312 long avail; 313 314 if (!dax_dev) 315 return -EOPNOTSUPP; 316 317 if (!dax_alive(dax_dev)) 318 return -ENXIO; 319 320 if (nr_pages < 0) 321 return -EINVAL; 322 323 avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, 324 kaddr, pfn); 325 if (!avail) 326 return -ERANGE; 327 return min(avail, nr_pages); 328} 329EXPORT_SYMBOL_GPL(dax_direct_access); 330 331size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 332 size_t bytes, struct iov_iter *i) 333{ 334 if (!dax_alive(dax_dev)) 335 return 0; 336 337 return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); 338} 339EXPORT_SYMBOL_GPL(dax_copy_from_iter); 340 341size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 342 size_t bytes, struct iov_iter *i) 343{ 344 if (!dax_alive(dax_dev)) 345 return 0; 346 347 return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); 348} 349EXPORT_SYMBOL_GPL(dax_copy_to_iter); 350 351int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 352 size_t nr_pages) 353{ 354 if (!dax_alive(dax_dev)) 355 return -ENXIO; 356 /* 357 * There are no callers that want to zero more than one page as of now. 358 * Once users are there, this check can be removed after the 359 * device mapper code has been updated to split ranges across targets. 360 */ 361 if (nr_pages != 1) 362 return -EIO; 363 364 return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); 365} 366EXPORT_SYMBOL_GPL(dax_zero_page_range); 367 368#ifdef CONFIG_ARCH_HAS_PMEM_API 369void arch_wb_cache_pmem(void *addr, size_t size); 370void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 371{ 372 if (unlikely(!dax_write_cache_enabled(dax_dev))) 373 return; 374 375 arch_wb_cache_pmem(addr, size); 376} 377#else 378void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 379{ 380} 381#endif 382EXPORT_SYMBOL_GPL(dax_flush); 383 384void dax_write_cache(struct dax_device *dax_dev, bool wc) 385{ 386 if (wc) 387 set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 388 else 389 clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 390} 391EXPORT_SYMBOL_GPL(dax_write_cache); 392 393bool dax_write_cache_enabled(struct dax_device *dax_dev) 394{ 395 return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 396} 397EXPORT_SYMBOL_GPL(dax_write_cache_enabled); 398 399bool __dax_synchronous(struct dax_device *dax_dev) 400{ 401 return test_bit(DAXDEV_SYNC, &dax_dev->flags); 402} 403EXPORT_SYMBOL_GPL(__dax_synchronous); 404 405void __set_dax_synchronous(struct dax_device *dax_dev) 406{ 407 set_bit(DAXDEV_SYNC, &dax_dev->flags); 408} 409EXPORT_SYMBOL_GPL(__set_dax_synchronous); 410 411bool dax_alive(struct dax_device *dax_dev) 412{ 413 lockdep_assert_held(&dax_srcu); 414 return test_bit(DAXDEV_ALIVE, &dax_dev->flags); 415} 416EXPORT_SYMBOL_GPL(dax_alive); 417 418/* 419 * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring 420 * that any fault handlers or operations that might have seen 421 * dax_alive(), have completed. Any operations that start after 422 * synchronize_srcu() has run will abort upon seeing !dax_alive(). 423 */ 424void kill_dax(struct dax_device *dax_dev) 425{ 426 if (!dax_dev) 427 return; 428 429 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 430 431 synchronize_srcu(&dax_srcu); 432 433 spin_lock(&dax_host_lock); 434 hlist_del_init(&dax_dev->list); 435 spin_unlock(&dax_host_lock); 436} 437EXPORT_SYMBOL_GPL(kill_dax); 438 439void run_dax(struct dax_device *dax_dev) 440{ 441 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 442} 443EXPORT_SYMBOL_GPL(run_dax); 444 445static struct inode *dax_alloc_inode(struct super_block *sb) 446{ 447 struct dax_device *dax_dev; 448 struct inode *inode; 449 450 dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); 451 if (!dax_dev) 452 return NULL; 453 454 inode = &dax_dev->inode; 455 inode->i_rdev = 0; 456 return inode; 457} 458 459static struct dax_device *to_dax_dev(struct inode *inode) 460{ 461 return container_of(inode, struct dax_device, inode); 462} 463 464static void dax_free_inode(struct inode *inode) 465{ 466 struct dax_device *dax_dev = to_dax_dev(inode); 467 kfree(dax_dev->host); 468 dax_dev->host = NULL; 469 if (inode->i_rdev) 470 ida_simple_remove(&dax_minor_ida, iminor(inode)); 471 kmem_cache_free(dax_cache, dax_dev); 472} 473 474static void dax_destroy_inode(struct inode *inode) 475{ 476 struct dax_device *dax_dev = to_dax_dev(inode); 477 WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), 478 "kill_dax() must be called before final iput()\n"); 479} 480 481static const struct super_operations dax_sops = { 482 .statfs = simple_statfs, 483 .alloc_inode = dax_alloc_inode, 484 .destroy_inode = dax_destroy_inode, 485 .free_inode = dax_free_inode, 486 .drop_inode = generic_delete_inode, 487}; 488 489static int dax_init_fs_context(struct fs_context *fc) 490{ 491 struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); 492 if (!ctx) 493 return -ENOMEM; 494 ctx->ops = &dax_sops; 495 return 0; 496} 497 498static struct file_system_type dax_fs_type = { 499 .name = "dax", 500 .init_fs_context = dax_init_fs_context, 501 .kill_sb = kill_anon_super, 502}; 503 504static int dax_test(struct inode *inode, void *data) 505{ 506 dev_t devt = *(dev_t *) data; 507 508 return inode->i_rdev == devt; 509} 510 511static int dax_set(struct inode *inode, void *data) 512{ 513 dev_t devt = *(dev_t *) data; 514 515 inode->i_rdev = devt; 516 return 0; 517} 518 519static struct dax_device *dax_dev_get(dev_t devt) 520{ 521 struct dax_device *dax_dev; 522 struct inode *inode; 523 524 inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 525 dax_test, dax_set, &devt); 526 527 if (!inode) 528 return NULL; 529 530 dax_dev = to_dax_dev(inode); 531 if (inode->i_state & I_NEW) { 532 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 533 inode->i_cdev = &dax_dev->cdev; 534 inode->i_mode = S_IFCHR; 535 inode->i_flags = S_DAX; 536 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 537 unlock_new_inode(inode); 538 } 539 540 return dax_dev; 541} 542 543static void dax_add_host(struct dax_device *dax_dev, const char *host) 544{ 545 int hash; 546 547 /* 548 * Unconditionally init dax_dev since it's coming from a 549 * non-zeroed slab cache 550 */ 551 INIT_HLIST_NODE(&dax_dev->list); 552 dax_dev->host = host; 553 if (!host) 554 return; 555 556 hash = dax_host_hash(host); 557 spin_lock(&dax_host_lock); 558 hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 559 spin_unlock(&dax_host_lock); 560} 561 562struct dax_device *alloc_dax(void *private, const char *__host, 563 const struct dax_operations *ops, unsigned long flags) 564{ 565 struct dax_device *dax_dev; 566 const char *host; 567 dev_t devt; 568 int minor; 569 570 if (ops && !ops->zero_page_range) { 571 pr_debug("%s: error: device does not provide dax" 572 " operation zero_page_range()\n", 573 __host ? __host : "Unknown"); 574 return ERR_PTR(-EINVAL); 575 } 576 577 host = kstrdup(__host, GFP_KERNEL); 578 if (__host && !host) 579 return ERR_PTR(-ENOMEM); 580 581 minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); 582 if (minor < 0) 583 goto err_minor; 584 585 devt = MKDEV(MAJOR(dax_devt), minor); 586 dax_dev = dax_dev_get(devt); 587 if (!dax_dev) 588 goto err_dev; 589 590 dax_add_host(dax_dev, host); 591 dax_dev->ops = ops; 592 dax_dev->private = private; 593 if (flags & DAXDEV_F_SYNC) 594 set_dax_synchronous(dax_dev); 595 596 return dax_dev; 597 598 err_dev: 599 ida_simple_remove(&dax_minor_ida, minor); 600 err_minor: 601 kfree(host); 602 return ERR_PTR(-ENOMEM); 603} 604EXPORT_SYMBOL_GPL(alloc_dax); 605 606void put_dax(struct dax_device *dax_dev) 607{ 608 if (!dax_dev) 609 return; 610 iput(&dax_dev->inode); 611} 612EXPORT_SYMBOL_GPL(put_dax); 613 614/** 615 * inode_dax: convert a public inode into its dax_dev 616 * @inode: An inode with i_cdev pointing to a dax_dev 617 * 618 * Note this is not equivalent to to_dax_dev() which is for private 619 * internal use where we know the inode filesystem type == dax_fs_type. 620 */ 621struct dax_device *inode_dax(struct inode *inode) 622{ 623 struct cdev *cdev = inode->i_cdev; 624 625 return container_of(cdev, struct dax_device, cdev); 626} 627EXPORT_SYMBOL_GPL(inode_dax); 628 629struct inode *dax_inode(struct dax_device *dax_dev) 630{ 631 return &dax_dev->inode; 632} 633EXPORT_SYMBOL_GPL(dax_inode); 634 635void *dax_get_private(struct dax_device *dax_dev) 636{ 637 if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) 638 return NULL; 639 return dax_dev->private; 640} 641EXPORT_SYMBOL_GPL(dax_get_private); 642 643static void init_once(void *_dax_dev) 644{ 645 struct dax_device *dax_dev = _dax_dev; 646 struct inode *inode = &dax_dev->inode; 647 648 memset(dax_dev, 0, sizeof(*dax_dev)); 649 inode_init_once(inode); 650} 651 652static int dax_fs_init(void) 653{ 654 int rc; 655 656 dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, 657 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 658 SLAB_MEM_SPREAD|SLAB_ACCOUNT), 659 init_once); 660 if (!dax_cache) 661 return -ENOMEM; 662 663 dax_mnt = kern_mount(&dax_fs_type); 664 if (IS_ERR(dax_mnt)) { 665 rc = PTR_ERR(dax_mnt); 666 goto err_mount; 667 } 668 dax_superblock = dax_mnt->mnt_sb; 669 670 return 0; 671 672 err_mount: 673 kmem_cache_destroy(dax_cache); 674 675 return rc; 676} 677 678static void dax_fs_exit(void) 679{ 680 kern_unmount(dax_mnt); 681 kmem_cache_destroy(dax_cache); 682} 683 684static int __init dax_core_init(void) 685{ 686 int rc; 687 688 rc = dax_fs_init(); 689 if (rc) 690 return rc; 691 692 rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); 693 if (rc) 694 goto err_chrdev; 695 696 rc = dax_bus_init(); 697 if (rc) 698 goto err_bus; 699 return 0; 700 701err_bus: 702 unregister_chrdev_region(dax_devt, MINORMASK+1); 703err_chrdev: 704 dax_fs_exit(); 705 return 0; 706} 707 708static void __exit dax_core_exit(void) 709{ 710 dax_bus_exit(); 711 unregister_chrdev_region(dax_devt, MINORMASK+1); 712 ida_destroy(&dax_minor_ida); 713 dax_fs_exit(); 714} 715 716MODULE_AUTHOR("Intel Corporation"); 717MODULE_LICENSE("GPL v2"); 718subsys_initcall(dax_core_init); 719module_exit(dax_core_exit);