Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

virtiofs: implement dax read/write operations

This patch implements basic DAX support. mmap() is not implemented
yet and will come in later patches. This patch looks into implementing
read/write.

We make use of an interval tree to keep track of per-inode dax mappings.

Do not use dax for file extending writes, instead just send WRITE message
to daemon (like we do for direct I/O path). This will keep write and
i_size change atomic w.r.t. crash.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>

authored by

Vivek Goyal and committed by
Miklos Szeredi
c2d0ad00 ceec02d4

+612 -6
+1
fs/fuse/Kconfig
··· 42 42 config FUSE_DAX 43 43 bool "Virtio Filesystem Direct Host Memory Access support" 44 44 default y 45 + select INTERVAL_TREE 45 46 depends on VIRTIO_FS 46 47 depends on FS_DAX 47 48 depends on DAX_DRIVER
+565
fs/fuse/dax.c
··· 7 7 #include "fuse_i.h" 8 8 9 9 #include <linux/dax.h> 10 + #include <linux/uio.h> 10 11 #include <linux/pfn_t.h> 12 + #include <linux/iomap.h> 13 + #include <linux/interval_tree.h> 11 14 12 15 /* 13 16 * Default memory range size. A power of 2 so it agrees with common FUSE_INIT ··· 25 22 /* Will connect in fcd->free_ranges to keep track of free memory */ 26 23 struct list_head list; 27 24 25 + /* For interval tree in file/inode */ 26 + struct interval_tree_node itn; 27 + 28 28 /** Position in DAX window */ 29 29 u64 window_offset; 30 30 31 31 /** Length of mapping, in bytes */ 32 32 loff_t length; 33 + 34 + /* Is this mapping read-only or read-write */ 35 + bool writable; 36 + }; 37 + 38 + /* Per-inode dax map */ 39 + struct fuse_inode_dax { 40 + /* Semaphore to protect modifications to the dmap tree */ 41 + struct rw_semaphore sem; 42 + 43 + /* Sorted rb tree of struct fuse_dax_mapping elements */ 44 + struct rb_root_cached tree; 45 + unsigned long nr; 33 46 }; 34 47 35 48 struct fuse_conn_dax { 36 49 /* DAX device */ 37 50 struct dax_device *dev; 38 51 52 + /* Lock protecting accessess to members of this structure */ 53 + spinlock_t lock; 54 + 39 55 /* DAX Window Free Ranges */ 40 56 long nr_free_ranges; 41 57 struct list_head free_ranges; 42 58 }; 59 + 60 + static inline struct fuse_dax_mapping * 61 + node_to_dmap(struct interval_tree_node *node) 62 + { 63 + if (!node) 64 + return NULL; 65 + 66 + return container_of(node, struct fuse_dax_mapping, itn); 67 + } 68 + 69 + static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) 70 + { 71 + struct fuse_dax_mapping *dmap; 72 + 73 + spin_lock(&fcd->lock); 74 + dmap = list_first_entry_or_null(&fcd->free_ranges, 75 + struct fuse_dax_mapping, list); 76 + if (dmap) { 77 + list_del_init(&dmap->list); 78 + WARN_ON(fcd->nr_free_ranges <= 0); 79 + fcd->nr_free_ranges--; 80 + } 81 + spin_unlock(&fcd->lock); 82 + return dmap; 83 + } 84 + 85 + /* This assumes fcd->lock is held */ 86 + static void 
__dmap_add_to_free_pool(struct fuse_conn_dax *fcd, 87 + struct fuse_dax_mapping *dmap) 88 + { 89 + list_add_tail(&dmap->list, &fcd->free_ranges); 90 + fcd->nr_free_ranges++; 91 + } 92 + 93 + static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, 94 + struct fuse_dax_mapping *dmap) 95 + { 96 + /* Return fuse_dax_mapping to free list */ 97 + spin_lock(&fcd->lock); 98 + __dmap_add_to_free_pool(fcd, dmap); 99 + spin_unlock(&fcd->lock); 100 + } 101 + 102 + static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, 103 + struct fuse_dax_mapping *dmap, bool writable, 104 + bool upgrade) 105 + { 106 + struct fuse_conn *fc = get_fuse_conn(inode); 107 + struct fuse_conn_dax *fcd = fc->dax; 108 + struct fuse_inode *fi = get_fuse_inode(inode); 109 + struct fuse_setupmapping_in inarg; 110 + loff_t offset = start_idx << FUSE_DAX_SHIFT; 111 + FUSE_ARGS(args); 112 + ssize_t err; 113 + 114 + WARN_ON(fcd->nr_free_ranges < 0); 115 + 116 + /* Ask fuse daemon to setup mapping */ 117 + memset(&inarg, 0, sizeof(inarg)); 118 + inarg.foffset = offset; 119 + inarg.fh = -1; 120 + inarg.moffset = dmap->window_offset; 121 + inarg.len = FUSE_DAX_SZ; 122 + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; 123 + if (writable) 124 + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; 125 + args.opcode = FUSE_SETUPMAPPING; 126 + args.nodeid = fi->nodeid; 127 + args.in_numargs = 1; 128 + args.in_args[0].size = sizeof(inarg); 129 + args.in_args[0].value = &inarg; 130 + err = fuse_simple_request(fc, &args); 131 + if (err < 0) 132 + return err; 133 + dmap->writable = writable; 134 + if (!upgrade) { 135 + dmap->itn.start = dmap->itn.last = start_idx; 136 + /* Protected by fi->dax->sem */ 137 + interval_tree_insert(&dmap->itn, &fi->dax->tree); 138 + fi->dax->nr++; 139 + } 140 + return 0; 141 + } 142 + 143 + static int fuse_send_removemapping(struct inode *inode, 144 + struct fuse_removemapping_in *inargp, 145 + struct fuse_removemapping_one *remove_one) 146 + { 147 + struct fuse_inode *fi = 
get_fuse_inode(inode); 148 + struct fuse_conn *fc = get_fuse_conn(inode); 149 + FUSE_ARGS(args); 150 + 151 + args.opcode = FUSE_REMOVEMAPPING; 152 + args.nodeid = fi->nodeid; 153 + args.in_numargs = 2; 154 + args.in_args[0].size = sizeof(*inargp); 155 + args.in_args[0].value = inargp; 156 + args.in_args[1].size = inargp->count * sizeof(*remove_one); 157 + args.in_args[1].value = remove_one; 158 + return fuse_simple_request(fc, &args); 159 + } 160 + 161 + static int dmap_removemapping_list(struct inode *inode, unsigned int num, 162 + struct list_head *to_remove) 163 + { 164 + struct fuse_removemapping_one *remove_one, *ptr; 165 + struct fuse_removemapping_in inarg; 166 + struct fuse_dax_mapping *dmap; 167 + int ret, i = 0, nr_alloc; 168 + 169 + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); 170 + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); 171 + if (!remove_one) 172 + return -ENOMEM; 173 + 174 + ptr = remove_one; 175 + list_for_each_entry(dmap, to_remove, list) { 176 + ptr->moffset = dmap->window_offset; 177 + ptr->len = dmap->length; 178 + ptr++; 179 + i++; 180 + num--; 181 + if (i >= nr_alloc || num == 0) { 182 + memset(&inarg, 0, sizeof(inarg)); 183 + inarg.count = i; 184 + ret = fuse_send_removemapping(inode, &inarg, 185 + remove_one); 186 + if (ret) 187 + goto out; 188 + ptr = remove_one; 189 + i = 0; 190 + } 191 + } 192 + out: 193 + kfree(remove_one); 194 + return ret; 195 + } 196 + 197 + /* 198 + * Cleanup dmap entry and add back to free list. This should be called with 199 + * fcd->lock held. 
200 + */ 201 + static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, 202 + struct fuse_dax_mapping *dmap) 203 + { 204 + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", 205 + dmap->itn.start, dmap->itn.last, dmap->window_offset, 206 + dmap->length); 207 + dmap->itn.start = dmap->itn.last = 0; 208 + __dmap_add_to_free_pool(fcd, dmap); 209 + } 210 + 211 + /* 212 + * Free inode dmap entries whose range falls inside [start, end]. 213 + * Does not take any locks. At this point of time it should only be 214 + * called from evict_inode() path where we know all dmap entries can be 215 + * reclaimed. 216 + */ 217 + static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, 218 + struct inode *inode, 219 + loff_t start, loff_t end) 220 + { 221 + struct fuse_inode *fi = get_fuse_inode(inode); 222 + struct fuse_dax_mapping *dmap, *n; 223 + int err, num = 0; 224 + LIST_HEAD(to_remove); 225 + unsigned long start_idx = start >> FUSE_DAX_SHIFT; 226 + unsigned long end_idx = end >> FUSE_DAX_SHIFT; 227 + struct interval_tree_node *node; 228 + 229 + while (1) { 230 + node = interval_tree_iter_first(&fi->dax->tree, start_idx, 231 + end_idx); 232 + if (!node) 233 + break; 234 + dmap = node_to_dmap(node); 235 + interval_tree_remove(&dmap->itn, &fi->dax->tree); 236 + num++; 237 + list_add(&dmap->list, &to_remove); 238 + } 239 + 240 + /* Nothing to remove */ 241 + if (list_empty(&to_remove)) 242 + return; 243 + 244 + WARN_ON(fi->dax->nr < num); 245 + fi->dax->nr -= num; 246 + err = dmap_removemapping_list(inode, num, &to_remove); 247 + if (err && err != -ENOTCONN) { 248 + pr_warn("Failed to removemappings. 
start=0x%llx end=0x%llx\n", 249 + start, end); 250 + } 251 + spin_lock(&fcd->lock); 252 + list_for_each_entry_safe(dmap, n, &to_remove, list) { 253 + list_del_init(&dmap->list); 254 + dmap_reinit_add_to_free_pool(fcd, dmap); 255 + } 256 + spin_unlock(&fcd->lock); 257 + } 258 + 259 + /* 260 + * It is called from evict_inode() and by that time inode is going away. So 261 + * this function does not take any locks like fi->dax->sem for traversing 262 + * that fuse inode interval tree. If that lock is taken then lock validator 263 + * complains of deadlock situation w.r.t fs_reclaim lock. 264 + */ 265 + void fuse_dax_inode_cleanup(struct inode *inode) 266 + { 267 + struct fuse_conn *fc = get_fuse_conn(inode); 268 + struct fuse_inode *fi = get_fuse_inode(inode); 269 + 270 + /* 271 + * fuse_evict_inode() has already called truncate_inode_pages_final() 272 + * before we arrive here. So we should not have to worry about any 273 + * pages/exception entries still associated with inode. 274 + */ 275 + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); 276 + WARN_ON(fi->dax->nr); 277 + } 278 + 279 + static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) 280 + { 281 + iomap->addr = IOMAP_NULL_ADDR; 282 + iomap->length = length; 283 + iomap->type = IOMAP_HOLE; 284 + } 285 + 286 + static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, 287 + struct iomap *iomap, struct fuse_dax_mapping *dmap, 288 + unsigned int flags) 289 + { 290 + loff_t offset, len; 291 + loff_t i_size = i_size_read(inode); 292 + 293 + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); 294 + len = min(length, dmap->length - offset); 295 + 296 + /* If length is beyond end of file, truncate further */ 297 + if (pos + len > i_size) 298 + len = i_size - pos; 299 + 300 + if (len > 0) { 301 + iomap->addr = dmap->window_offset + offset; 302 + iomap->length = len; 303 + if (flags & IOMAP_FAULT) 304 + iomap->length = ALIGN(len, PAGE_SIZE); 305 + iomap->type = IOMAP_MAPPED; 306 + } else { 
307 + /* Mapping beyond end of file is hole */ 308 + fuse_fill_iomap_hole(iomap, length); 309 + } 310 + } 311 + 312 + static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, 313 + loff_t length, unsigned int flags, 314 + struct iomap *iomap) 315 + { 316 + struct fuse_inode *fi = get_fuse_inode(inode); 317 + struct fuse_conn *fc = get_fuse_conn(inode); 318 + struct fuse_conn_dax *fcd = fc->dax; 319 + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; 320 + int ret; 321 + bool writable = flags & IOMAP_WRITE; 322 + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; 323 + struct interval_tree_node *node; 324 + 325 + alloc_dmap = alloc_dax_mapping(fcd); 326 + if (!alloc_dmap) 327 + return -EIO; 328 + 329 + /* 330 + * Take write lock so that only one caller can try to setup mapping 331 + * and other waits. 332 + */ 333 + down_write(&fi->dax->sem); 334 + /* 335 + * We dropped lock. Check again if somebody else setup 336 + * mapping already. 337 + */ 338 + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); 339 + if (node) { 340 + dmap = node_to_dmap(node); 341 + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); 342 + dmap_add_to_free_pool(fcd, alloc_dmap); 343 + up_write(&fi->dax->sem); 344 + return 0; 345 + } 346 + 347 + /* Setup one mapping */ 348 + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, 349 + writable, false); 350 + if (ret < 0) { 351 + dmap_add_to_free_pool(fcd, alloc_dmap); 352 + up_write(&fi->dax->sem); 353 + return ret; 354 + } 355 + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); 356 + up_write(&fi->dax->sem); 357 + return 0; 358 + } 359 + 360 + static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, 361 + loff_t length, unsigned int flags, 362 + struct iomap *iomap) 363 + { 364 + struct fuse_inode *fi = get_fuse_inode(inode); 365 + struct fuse_dax_mapping *dmap; 366 + int ret; 367 + unsigned long idx = pos >> FUSE_DAX_SHIFT; 368 + struct interval_tree_node *node; 
369 + 370 + /* 371 + * Take exclusive lock so that only one caller can try to setup 372 + * mapping and others wait. 373 + */ 374 + down_write(&fi->dax->sem); 375 + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); 376 + 377 + /* We are holding either inode lock or i_mmap_sem, and that should 378 + * ensure that dmap can't reclaimed or truncated and it should still 379 + * be there in tree despite the fact we dropped and re-acquired the 380 + * lock. 381 + */ 382 + ret = -EIO; 383 + if (WARN_ON(!node)) 384 + goto out_err; 385 + 386 + dmap = node_to_dmap(node); 387 + 388 + /* Maybe another thread already upgraded mapping while we were not 389 + * holding lock. 390 + */ 391 + if (dmap->writable) { 392 + ret = 0; 393 + goto out_fill_iomap; 394 + } 395 + 396 + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, 397 + true); 398 + if (ret < 0) 399 + goto out_err; 400 + out_fill_iomap: 401 + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); 402 + out_err: 403 + up_write(&fi->dax->sem); 404 + return ret; 405 + } 406 + 407 + /* This is just for DAX and the mapping is ephemeral, do not use it for other 408 + * purposes since there is no block device with a permanent mapping. 409 + */ 410 + static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, 411 + unsigned int flags, struct iomap *iomap, 412 + struct iomap *srcmap) 413 + { 414 + struct fuse_inode *fi = get_fuse_inode(inode); 415 + struct fuse_conn *fc = get_fuse_conn(inode); 416 + struct fuse_dax_mapping *dmap; 417 + bool writable = flags & IOMAP_WRITE; 418 + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; 419 + struct interval_tree_node *node; 420 + 421 + /* We don't support FIEMAP */ 422 + if (WARN_ON(flags & IOMAP_REPORT)) 423 + return -EIO; 424 + 425 + iomap->offset = pos; 426 + iomap->flags = 0; 427 + iomap->bdev = NULL; 428 + iomap->dax_dev = fc->dax->dev; 429 + 430 + /* 431 + * Both read/write and mmap path can race here. 
So we need something 432 + * to make sure if we are setting up mapping, then other path waits 433 + * 434 + * For now, use a semaphore for this. It probably needs to be 435 + * optimized later. 436 + */ 437 + down_read(&fi->dax->sem); 438 + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); 439 + if (node) { 440 + dmap = node_to_dmap(node); 441 + if (writable && !dmap->writable) { 442 + /* Upgrade read-only mapping to read-write. This will 443 + * require exclusive fi->dax->sem lock as we don't want 444 + * two threads to be trying to this simultaneously 445 + * for same dmap. So drop shared lock and acquire 446 + * exclusive lock. 447 + */ 448 + up_read(&fi->dax->sem); 449 + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", 450 + __func__, pos, length); 451 + return fuse_upgrade_dax_mapping(inode, pos, length, 452 + flags, iomap); 453 + } else { 454 + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); 455 + up_read(&fi->dax->sem); 456 + return 0; 457 + } 458 + } else { 459 + up_read(&fi->dax->sem); 460 + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", 461 + __func__, pos, length); 462 + if (pos >= i_size_read(inode)) 463 + goto iomap_hole; 464 + 465 + return fuse_setup_new_dax_mapping(inode, pos, length, flags, 466 + iomap); 467 + } 468 + 469 + /* 470 + * If read beyond end of file happnes, fs code seems to return 471 + * it as hole 472 + */ 473 + iomap_hole: 474 + fuse_fill_iomap_hole(iomap, length); 475 + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", 476 + __func__, pos, length, iomap->length); 477 + return 0; 478 + } 479 + 480 + static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, 481 + ssize_t written, unsigned int flags, 482 + struct iomap *iomap) 483 + { 484 + /* DAX writes beyond end-of-file aren't handled using iomap, so the 485 + * file size is unchanged and there is nothing to do here. 
486 + */ 487 + return 0; 488 + } 489 + 490 + static const struct iomap_ops fuse_iomap_ops = { 491 + .iomap_begin = fuse_iomap_begin, 492 + .iomap_end = fuse_iomap_end, 493 + }; 494 + 495 + ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) 496 + { 497 + struct inode *inode = file_inode(iocb->ki_filp); 498 + ssize_t ret; 499 + 500 + if (iocb->ki_flags & IOCB_NOWAIT) { 501 + if (!inode_trylock_shared(inode)) 502 + return -EAGAIN; 503 + } else { 504 + inode_lock_shared(inode); 505 + } 506 + 507 + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); 508 + inode_unlock_shared(inode); 509 + 510 + /* TODO file_accessed(iocb->f_filp) */ 511 + return ret; 512 + } 513 + 514 + static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) 515 + { 516 + struct inode *inode = file_inode(iocb->ki_filp); 517 + 518 + return (iov_iter_rw(from) == WRITE && 519 + ((iocb->ki_pos) >= i_size_read(inode) || 520 + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); 521 + } 522 + 523 + static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) 524 + { 525 + struct inode *inode = file_inode(iocb->ki_filp); 526 + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 527 + ssize_t ret; 528 + 529 + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); 530 + if (ret < 0) 531 + return ret; 532 + 533 + fuse_invalidate_attr(inode); 534 + fuse_write_update_size(inode, iocb->ki_pos); 535 + return ret; 536 + } 537 + 538 + ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) 539 + { 540 + struct inode *inode = file_inode(iocb->ki_filp); 541 + ssize_t ret; 542 + 543 + if (iocb->ki_flags & IOCB_NOWAIT) { 544 + if (!inode_trylock(inode)) 545 + return -EAGAIN; 546 + } else { 547 + inode_lock(inode); 548 + } 549 + 550 + ret = generic_write_checks(iocb, from); 551 + if (ret <= 0) 552 + goto out; 553 + 554 + ret = file_remove_privs(iocb->ki_filp); 555 + if (ret) 556 + goto out; 557 + /* TODO file_update_time() but we don't want 
metadata I/O */ 558 + 559 + /* Do not use dax for file extending writes as write and on 560 + * disk i_size increase are not atomic otherwise. 561 + */ 562 + if (file_extending_write(iocb, from)) 563 + ret = fuse_dax_direct_write(iocb, from); 564 + else 565 + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); 566 + 567 + out: 568 + inode_unlock(inode); 569 + 570 + if (ret > 0) 571 + ret = generic_write_sync(iocb, ret); 572 + return ret; 573 + } 43 574 44 575 static void fuse_free_dax_mem_ranges(struct list_head *mem_list) 45 576 { ··· 653 116 if (!fcd) 654 117 return -ENOMEM; 655 118 119 + spin_lock_init(&fcd->lock); 656 120 fcd->dev = dax_dev; 657 121 err = fuse_dax_mem_range_init(fcd); 658 122 if (err) { ··· 663 125 664 126 fc->dax = fcd; 665 127 return 0; 128 + } 129 + 130 + bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) 131 + { 132 + struct fuse_conn *fc = get_fuse_conn_super(sb); 133 + 134 + fi->dax = NULL; 135 + if (fc->dax) { 136 + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); 137 + if (!fi->dax) 138 + return false; 139 + 140 + init_rwsem(&fi->dax->sem); 141 + fi->dax->tree = RB_ROOT_CACHED; 142 + } 143 + 144 + return true; 145 + } 146 + 147 + void fuse_dax_inode_init(struct inode *inode) 148 + { 149 + struct fuse_conn *fc = get_fuse_conn(inode); 150 + 151 + if (!fc->dax) 152 + return; 153 + 154 + inode->i_flags |= S_DAX; 666 155 } 667 156 668 157 bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+13 -2
fs/fuse/file.c
··· 1539 1539 { 1540 1540 struct file *file = iocb->ki_filp; 1541 1541 struct fuse_file *ff = file->private_data; 1542 + struct inode *inode = file_inode(file); 1542 1543 1543 - if (is_bad_inode(file_inode(file))) 1544 + if (is_bad_inode(inode)) 1544 1545 return -EIO; 1546 + 1547 + if (FUSE_IS_DAX(inode)) 1548 + return fuse_dax_read_iter(iocb, to); 1545 1549 1546 1550 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1547 1551 return fuse_cache_read_iter(iocb, to); ··· 1557 1553 { 1558 1554 struct file *file = iocb->ki_filp; 1559 1555 struct fuse_file *ff = file->private_data; 1556 + struct inode *inode = file_inode(file); 1560 1557 1561 - if (is_bad_inode(file_inode(file))) 1558 + if (is_bad_inode(inode)) 1562 1559 return -EIO; 1560 + 1561 + if (FUSE_IS_DAX(inode)) 1562 + return fuse_dax_write_iter(iocb, from); 1563 1563 1564 1564 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1565 1565 return fuse_cache_write_iter(iocb, from); ··· 3448 3440 fi->writectr = 0; 3449 3441 init_waitqueue_head(&fi->page_waitq); 3450 3442 fi->writepages = RB_ROOT; 3443 + 3444 + if (IS_ENABLED(CONFIG_FUSE_DAX)) 3445 + fuse_dax_inode_init(inode); 3451 3446 }
+15
fs/fuse/fuse_i.h
··· 148 148 149 149 /** Lock to protect write related fields */ 150 150 spinlock_t lock; 151 + 152 + #ifdef CONFIG_FUSE_DAX 153 + /* 154 + * Dax specific inode data 155 + */ 156 + struct fuse_inode_dax *dax; 157 + #endif 151 158 }; 152 159 153 160 /** FUSE inode state bits */ ··· 1111 1104 1112 1105 /* dax.c */ 1113 1106 1107 + #define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) 1108 + 1109 + ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); 1110 + ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); 1111 + int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); 1114 1112 int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); 1115 1113 void fuse_dax_conn_free(struct fuse_conn *fc); 1114 + bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); 1115 + void fuse_dax_inode_init(struct inode *inode); 1116 + void fuse_dax_inode_cleanup(struct inode *inode); 1116 1117 bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); 1117 1118 1118 1119 #endif /* _FS_FUSE_I_H */
+17 -4
fs/fuse/inode.c
··· 87 87 mutex_init(&fi->mutex); 88 88 spin_lock_init(&fi->lock); 89 89 fi->forget = fuse_alloc_forget(); 90 - if (!fi->forget) { 91 - kmem_cache_free(fuse_inode_cachep, fi); 92 - return NULL; 93 - } 90 + if (!fi->forget) 91 + goto out_free; 92 + 93 + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) 94 + goto out_free_forget; 94 95 95 96 return &fi->inode; 97 + 98 + out_free_forget: 99 + kfree(fi->forget); 100 + out_free: 101 + kmem_cache_free(fuse_inode_cachep, fi); 102 + return NULL; 96 103 } 97 104 98 105 static void fuse_free_inode(struct inode *inode) ··· 108 101 109 102 mutex_destroy(&fi->mutex); 110 103 kfree(fi->forget); 104 + #ifdef CONFIG_FUSE_DAX 105 + kfree(fi->dax); 106 + #endif 111 107 kmem_cache_free(fuse_inode_cachep, fi); 112 108 } 113 109 ··· 122 112 clear_inode(inode); 123 113 if (inode->i_sb->s_flags & SB_ACTIVE) { 124 114 struct fuse_conn *fc = get_fuse_conn(inode); 115 + 116 + if (FUSE_IS_DAX(inode)) 117 + fuse_dax_inode_cleanup(inode); 125 118 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); 126 119 fi->forget = NULL; 127 120 }
+1
include/uapi/linux/fuse.h
··· 895 895 }; 896 896 897 897 #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) 898 + #define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) 898 899 struct fuse_setupmapping_in { 899 900 /* An already open handle */ 900 901 uint64_t fh;