at v5.3-rc2 727 lines 17 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Copyright(c) 2017 Intel Corporation. All rights reserved. 4 */ 5#include <linux/pagemap.h> 6#include <linux/module.h> 7#include <linux/mount.h> 8#include <linux/pseudo_fs.h> 9#include <linux/magic.h> 10#include <linux/genhd.h> 11#include <linux/pfn_t.h> 12#include <linux/cdev.h> 13#include <linux/hash.h> 14#include <linux/slab.h> 15#include <linux/uio.h> 16#include <linux/dax.h> 17#include <linux/fs.h> 18#include "dax-private.h" 19 20static dev_t dax_devt; 21DEFINE_STATIC_SRCU(dax_srcu); 22static struct vfsmount *dax_mnt; 23static DEFINE_IDA(dax_minor_ida); 24static struct kmem_cache *dax_cache __read_mostly; 25static struct super_block *dax_superblock __read_mostly; 26 27#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 28static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 29static DEFINE_SPINLOCK(dax_host_lock); 30 31int dax_read_lock(void) 32{ 33 return srcu_read_lock(&dax_srcu); 34} 35EXPORT_SYMBOL_GPL(dax_read_lock); 36 37void dax_read_unlock(int id) 38{ 39 srcu_read_unlock(&dax_srcu, id); 40} 41EXPORT_SYMBOL_GPL(dax_read_unlock); 42 43#ifdef CONFIG_BLOCK 44#include <linux/blkdev.h> 45 46int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 47 pgoff_t *pgoff) 48{ 49 phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; 50 51 if (pgoff) 52 *pgoff = PHYS_PFN(phys_off); 53 if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 54 return -EINVAL; 55 return 0; 56} 57EXPORT_SYMBOL(bdev_dax_pgoff); 58 59#if IS_ENABLED(CONFIG_FS_DAX) 60struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) 61{ 62 if (!blk_queue_dax(bdev->bd_queue)) 63 return NULL; 64 return fs_dax_get_by_host(bdev->bd_disk->disk_name); 65} 66EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 67#endif 68 69bool __generic_fsdax_supported(struct dax_device *dax_dev, 70 struct block_device *bdev, int blocksize, sector_t start, 71 sector_t sectors) 72{ 73 bool dax_enabled = false; 74 pgoff_t pgoff, pgoff_end; 75 char buf[BDEVNAME_SIZE]; 76 void *kaddr, *end_kaddr; 77 pfn_t pfn, end_pfn; 78 sector_t last_page; 79 long len, len2; 80 int err, id; 81 82 if (blocksize != PAGE_SIZE) { 83 pr_debug("%s: error: unsupported blocksize for dax\n", 84 bdevname(bdev, buf)); 85 return false; 86 } 87 88 err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); 89 if (err) { 90 pr_debug("%s: error: unaligned partition for dax\n", 91 bdevname(bdev, buf)); 92 return false; 93 } 94 95 last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; 96 err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); 97 if (err) { 98 pr_debug("%s: error: unaligned partition for dax\n", 99 bdevname(bdev, buf)); 100 return false; 101 } 102 103 id = dax_read_lock(); 104 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 105 len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); 106 dax_read_unlock(id); 107 108 if (len < 1 || len2 < 1) { 109 pr_debug("%s: error: dax access failed (%ld)\n", 110 bdevname(bdev, buf), len < 1 ? len : len2); 111 return false; 112 } 113 114 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { 115 /* 116 * An arch that has enabled the pmem api should also 117 * have its drivers support pfn_t_devmap() 118 * 119 * This is a developer warning and should not trigger in 120 * production. dax_flush() will crash since it depends 121 * on being able to do (page_address(pfn_to_page())). 122 */ 123 WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); 124 dax_enabled = true; 125 } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { 126 struct dev_pagemap *pgmap, *end_pgmap; 127 128 pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); 129 end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); 130 if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX 131 && pfn_t_to_page(pfn)->pgmap == pgmap 132 && pfn_t_to_page(end_pfn)->pgmap == pgmap 133 && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) 134 && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) 135 dax_enabled = true; 136 put_dev_pagemap(pgmap); 137 put_dev_pagemap(end_pgmap); 138 139 } 140 141 if (!dax_enabled) { 142 pr_debug("%s: error: dax support not enabled\n", 143 bdevname(bdev, buf)); 144 return false; 145 } 146 return true; 147} 148EXPORT_SYMBOL_GPL(__generic_fsdax_supported); 149 150/** 151 * __bdev_dax_supported() - Check if the device supports dax for filesystem 152 * @bdev: block device to check 153 * @blocksize: The block size of the device 154 * 155 * This is a library function for filesystems to check if the block device 156 * can be mounted with dax option. 157 * 158 * Return: true if supported, false if unsupported 159 */ 160bool __bdev_dax_supported(struct block_device *bdev, int blocksize) 161{ 162 struct dax_device *dax_dev; 163 struct request_queue *q; 164 char buf[BDEVNAME_SIZE]; 165 bool ret; 166 int id; 167 168 q = bdev_get_queue(bdev); 169 if (!q || !blk_queue_dax(q)) { 170 pr_debug("%s: error: request queue doesn't support dax\n", 171 bdevname(bdev, buf)); 172 return false; 173 } 174 175 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 176 if (!dax_dev) { 177 pr_debug("%s: error: device does not support dax\n", 178 bdevname(bdev, buf)); 179 return false; 180 } 181 182 id = dax_read_lock(); 183 ret = dax_supported(dax_dev, bdev, blocksize, 0, 184 i_size_read(bdev->bd_inode) / 512); 185 dax_read_unlock(id); 186 187 put_dax(dax_dev); 188 189 return ret; 190} 191EXPORT_SYMBOL_GPL(__bdev_dax_supported); 192#endif 193 194enum dax_device_flags { 195 /* !alive + rcu grace period == no new operations / mappings */ 196 DAXDEV_ALIVE, 197 /* gate whether dax_flush() calls the low level flush routine */ 198 DAXDEV_WRITE_CACHE, 199 /* flag to check if device supports synchronous flush */ 200 DAXDEV_SYNC, 201}; 202 203/** 204 * struct dax_device - anchor object for dax services 205 * @inode: core vfs 206 * @cdev: optional character interface for "device dax" 207 * @host: optional name for lookups where the device path is not available 208 * @private: dax driver private data 209 * @flags: state and boolean properties 210 */ 211struct dax_device { 212 struct hlist_node list; 213 struct inode inode; 214 struct cdev cdev; 215 const char *host; 216 void *private; 217 unsigned long flags; 218 const struct dax_operations *ops; 219}; 220 221static ssize_t write_cache_show(struct device *dev, 222 struct device_attribute *attr, char *buf) 223{ 224 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 225 ssize_t rc; 226 227 WARN_ON_ONCE(!dax_dev); 228 if (!dax_dev) 229 return -ENXIO; 230 231 rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); 232 put_dax(dax_dev); 233 return rc; 234} 235 236static ssize_t write_cache_store(struct device *dev, 237 struct device_attribute *attr, const char *buf, size_t len) 238{ 239 bool write_cache; 240 int rc = strtobool(buf, &write_cache); 241 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 242 243 WARN_ON_ONCE(!dax_dev); 244 if (!dax_dev) 245 return -ENXIO; 246 247 if (rc) 248 len = rc; 249 else 250 dax_write_cache(dax_dev, write_cache); 251 252 put_dax(dax_dev); 253 return len; 254} 255static DEVICE_ATTR_RW(write_cache); 256 257static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) 258{ 259 struct device *dev = container_of(kobj, typeof(*dev), kobj); 260 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 261 262 WARN_ON_ONCE(!dax_dev); 263 if (!dax_dev) 264 return 0; 265 266#ifndef CONFIG_ARCH_HAS_PMEM_API 267 if (a == &dev_attr_write_cache.attr) 268 return 0; 269#endif 270 return a->mode; 271} 272 273static struct attribute *dax_attributes[] = { 274 &dev_attr_write_cache.attr, 275 NULL, 276}; 277 278struct attribute_group dax_attribute_group = { 279 .name = "dax", 280 .attrs = dax_attributes, 281 .is_visible = dax_visible, 282}; 283EXPORT_SYMBOL_GPL(dax_attribute_group); 284 285/** 286 * dax_direct_access() - translate a device pgoff to an absolute pfn 287 * @dax_dev: a dax_device instance representing the logical memory range 288 * @pgoff: offset in pages from the start of the device to translate 289 * @nr_pages: number of consecutive pages caller can handle relative to @pfn 290 * @kaddr: output parameter that returns a virtual address mapping of pfn 291 * @pfn: output parameter that returns an absolute pfn translation of @pgoff 292 * 293 * Return: negative errno if an error occurs, otherwise the number of 294 * pages accessible at the device relative @pgoff. 295 */ 296long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 297 void **kaddr, pfn_t *pfn) 298{ 299 long avail; 300 301 if (!dax_dev) 302 return -EOPNOTSUPP; 303 304 if (!dax_alive(dax_dev)) 305 return -ENXIO; 306 307 if (nr_pages < 0) 308 return nr_pages; 309 310 avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, 311 kaddr, pfn); 312 if (!avail) 313 return -ERANGE; 314 return min(avail, nr_pages); 315} 316EXPORT_SYMBOL_GPL(dax_direct_access); 317 318bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 319 int blocksize, sector_t start, sector_t len) 320{ 321 if (!dax_alive(dax_dev)) 322 return false; 323 324 return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len); 325} 326 327size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 328 size_t bytes, struct iov_iter *i) 329{ 330 if (!dax_alive(dax_dev)) 331 return 0; 332 333 return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); 334} 335EXPORT_SYMBOL_GPL(dax_copy_from_iter); 336 337size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 338 size_t bytes, struct iov_iter *i) 339{ 340 if (!dax_alive(dax_dev)) 341 return 0; 342 343 return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); 344} 345EXPORT_SYMBOL_GPL(dax_copy_to_iter); 346 347#ifdef CONFIG_ARCH_HAS_PMEM_API 348void arch_wb_cache_pmem(void *addr, size_t size); 349void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 350{ 351 if (unlikely(!dax_write_cache_enabled(dax_dev))) 352 return; 353 354 arch_wb_cache_pmem(addr, size); 355} 356#else 357void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 358{ 359} 360#endif 361EXPORT_SYMBOL_GPL(dax_flush); 362 363void dax_write_cache(struct dax_device *dax_dev, bool wc) 364{ 365 if (wc) 366 set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 367 else 368 clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 369} 370EXPORT_SYMBOL_GPL(dax_write_cache); 371 372bool dax_write_cache_enabled(struct dax_device *dax_dev) 373{ 374 return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 375} 376EXPORT_SYMBOL_GPL(dax_write_cache_enabled); 377 378bool __dax_synchronous(struct dax_device *dax_dev) 379{ 380 return test_bit(DAXDEV_SYNC, &dax_dev->flags); 381} 382EXPORT_SYMBOL_GPL(__dax_synchronous); 383 384void __set_dax_synchronous(struct dax_device *dax_dev) 385{ 386 set_bit(DAXDEV_SYNC, &dax_dev->flags); 387} 388EXPORT_SYMBOL_GPL(__set_dax_synchronous); 389 390bool dax_alive(struct dax_device *dax_dev) 391{ 392 lockdep_assert_held(&dax_srcu); 393 return test_bit(DAXDEV_ALIVE, &dax_dev->flags); 394} 395EXPORT_SYMBOL_GPL(dax_alive); 396 397static int dax_host_hash(const char *host) 398{ 399 return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 400} 401 402/* 403 * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring 404 * that any fault handlers or operations that might have seen 405 * dax_alive(), have completed. Any operations that start after 406 * synchronize_srcu() has run will abort upon seeing !dax_alive(). 407 */ 408void kill_dax(struct dax_device *dax_dev) 409{ 410 if (!dax_dev) 411 return; 412 413 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 414 415 synchronize_srcu(&dax_srcu); 416 417 spin_lock(&dax_host_lock); 418 hlist_del_init(&dax_dev->list); 419 spin_unlock(&dax_host_lock); 420} 421EXPORT_SYMBOL_GPL(kill_dax); 422 423void run_dax(struct dax_device *dax_dev) 424{ 425 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 426} 427EXPORT_SYMBOL_GPL(run_dax); 428 429static struct inode *dax_alloc_inode(struct super_block *sb) 430{ 431 struct dax_device *dax_dev; 432 struct inode *inode; 433 434 dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); 435 if (!dax_dev) 436 return NULL; 437 438 inode = &dax_dev->inode; 439 inode->i_rdev = 0; 440 return inode; 441} 442 443static struct dax_device *to_dax_dev(struct inode *inode) 444{ 445 return container_of(inode, struct dax_device, inode); 446} 447 448static void dax_free_inode(struct inode *inode) 449{ 450 struct dax_device *dax_dev = to_dax_dev(inode); 451 kfree(dax_dev->host); 452 dax_dev->host = NULL; 453 if (inode->i_rdev) 454 ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); 455 kmem_cache_free(dax_cache, dax_dev); 456} 457 458static void dax_destroy_inode(struct inode *inode) 459{ 460 struct dax_device *dax_dev = to_dax_dev(inode); 461 WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), 462 "kill_dax() must be called before final iput()\n"); 463} 464 465static const struct super_operations dax_sops = { 466 .statfs = simple_statfs, 467 .alloc_inode = dax_alloc_inode, 468 .destroy_inode = dax_destroy_inode, 469 .free_inode = dax_free_inode, 470 .drop_inode = generic_delete_inode, 471}; 472 473static int dax_init_fs_context(struct fs_context *fc) 474{ 475 struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); 476 if (!ctx) 477 return -ENOMEM; 478 ctx->ops = &dax_sops; 479 return 0; 480} 481 482static struct file_system_type dax_fs_type = { 483 .name = "dax", 484 .init_fs_context = dax_init_fs_context, 485 .kill_sb = kill_anon_super, 486}; 487 488static int dax_test(struct inode *inode, void *data) 489{ 490 dev_t devt = *(dev_t *) data; 491 492 return inode->i_rdev == devt; 493} 494 495static int dax_set(struct inode *inode, void *data) 496{ 497 dev_t devt = *(dev_t *) data; 498 499 inode->i_rdev = devt; 500 return 0; 501} 502 503static struct dax_device *dax_dev_get(dev_t devt) 504{ 505 struct dax_device *dax_dev; 506 struct inode *inode; 507 508 inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 509 dax_test, dax_set, &devt); 510 511 if (!inode) 512 return NULL; 513 514 dax_dev = to_dax_dev(inode); 515 if (inode->i_state & I_NEW) { 516 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 517 inode->i_cdev = &dax_dev->cdev; 518 inode->i_mode = S_IFCHR; 519 inode->i_flags = S_DAX; 520 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 521 unlock_new_inode(inode); 522 } 523 524 return dax_dev; 525} 526 527static void dax_add_host(struct dax_device *dax_dev, const char *host) 528{ 529 int hash; 530 531 /* 532 * Unconditionally init dax_dev since it's coming from a 533 * non-zeroed slab cache 534 */ 535 INIT_HLIST_NODE(&dax_dev->list); 536 dax_dev->host = host; 537 if (!host) 538 return; 539 540 hash = dax_host_hash(host); 541 spin_lock(&dax_host_lock); 542 hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 543 spin_unlock(&dax_host_lock); 544} 545 546struct dax_device *alloc_dax(void *private, const char *__host, 547 const struct dax_operations *ops, unsigned long flags) 548{ 549 struct dax_device *dax_dev; 550 const char *host; 551 dev_t devt; 552 int minor; 553 554 host = kstrdup(__host, GFP_KERNEL); 555 if (__host && !host) 556 return NULL; 557 558 minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); 559 if (minor < 0) 560 goto err_minor; 561 562 devt = MKDEV(MAJOR(dax_devt), minor); 563 dax_dev = dax_dev_get(devt); 564 if (!dax_dev) 565 goto err_dev; 566 567 dax_add_host(dax_dev, host); 568 dax_dev->ops = ops; 569 dax_dev->private = private; 570 if (flags & DAXDEV_F_SYNC) 571 set_dax_synchronous(dax_dev); 572 573 return dax_dev; 574 575 err_dev: 576 ida_simple_remove(&dax_minor_ida, minor); 577 err_minor: 578 kfree(host); 579 return NULL; 580} 581EXPORT_SYMBOL_GPL(alloc_dax); 582 583void put_dax(struct dax_device *dax_dev) 584{ 585 if (!dax_dev) 586 return; 587 iput(&dax_dev->inode); 588} 589EXPORT_SYMBOL_GPL(put_dax); 590 591/** 592 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 593 * @host: alternate name for the device registered by a dax driver 594 */ 595struct dax_device *dax_get_by_host(const char *host) 596{ 597 struct dax_device *dax_dev, *found = NULL; 598 int hash, id; 599 600 if (!host) 601 return NULL; 602 603 hash = dax_host_hash(host); 604 605 id = dax_read_lock(); 606 spin_lock(&dax_host_lock); 607 hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 608 if (!dax_alive(dax_dev) 609 || strcmp(host, dax_dev->host) != 0) 610 continue; 611 612 if (igrab(&dax_dev->inode)) 613 found = dax_dev; 614 break; 615 } 616 spin_unlock(&dax_host_lock); 617 dax_read_unlock(id); 618 619 return found; 620} 621EXPORT_SYMBOL_GPL(dax_get_by_host); 622 623/** 624 * inode_dax: convert a public inode into its dax_dev 625 * @inode: An inode with i_cdev pointing to a dax_dev 626 * 627 * Note this is not equivalent to to_dax_dev() which is for private 628 * internal use where we know the inode filesystem type == dax_fs_type. 629 */ 630struct dax_device *inode_dax(struct inode *inode) 631{ 632 struct cdev *cdev = inode->i_cdev; 633 634 return container_of(cdev, struct dax_device, cdev); 635} 636EXPORT_SYMBOL_GPL(inode_dax); 637 638struct inode *dax_inode(struct dax_device *dax_dev) 639{ 640 return &dax_dev->inode; 641} 642EXPORT_SYMBOL_GPL(dax_inode); 643 644void *dax_get_private(struct dax_device *dax_dev) 645{ 646 if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) 647 return NULL; 648 return dax_dev->private; 649} 650EXPORT_SYMBOL_GPL(dax_get_private); 651 652static void init_once(void *_dax_dev) 653{ 654 struct dax_device *dax_dev = _dax_dev; 655 struct inode *inode = &dax_dev->inode; 656 657 memset(dax_dev, 0, sizeof(*dax_dev)); 658 inode_init_once(inode); 659} 660 661static int dax_fs_init(void) 662{ 663 int rc; 664 665 dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, 666 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 667 SLAB_MEM_SPREAD|SLAB_ACCOUNT), 668 init_once); 669 if (!dax_cache) 670 return -ENOMEM; 671 672 dax_mnt = kern_mount(&dax_fs_type); 673 if (IS_ERR(dax_mnt)) { 674 rc = PTR_ERR(dax_mnt); 675 goto err_mount; 676 } 677 dax_superblock = dax_mnt->mnt_sb; 678 679 return 0; 680 681 err_mount: 682 kmem_cache_destroy(dax_cache); 683 684 return rc; 685} 686 687static void dax_fs_exit(void) 688{ 689 kern_unmount(dax_mnt); 690 kmem_cache_destroy(dax_cache); 691} 692 693static int __init dax_core_init(void) 694{ 695 int rc; 696 697 rc = dax_fs_init(); 698 if (rc) 699 return rc; 700 701 rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); 702 if (rc) 703 goto err_chrdev; 704 705 rc = dax_bus_init(); 706 if (rc) 707 goto err_bus; 708 return 0; 709 710err_bus: 711 unregister_chrdev_region(dax_devt, MINORMASK+1); 712err_chrdev: 713 dax_fs_exit(); 714 return 0; 715} 716 717static void __exit dax_core_exit(void) 718{ 719 unregister_chrdev_region(dax_devt, MINORMASK+1); 720 ida_destroy(&dax_minor_ida); 721 dax_fs_exit(); 722} 723 724MODULE_AUTHOR("Intel Corporation"); 725MODULE_LICENSE("GPL v2"); 726subsys_initcall(dax_core_init); 727module_exit(dax_core_exit);