Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'xfs-dax-support' into for-next

+333 -154
+27 -7
fs/dax.c
··· 309 309 out: 310 310 i_mmap_unlock_read(mapping); 311 311 312 - if (bh->b_end_io) 313 - bh->b_end_io(bh, 1); 314 - 315 312 return error; 316 313 } 317 314 318 - static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 319 - get_block_t get_block) 315 + /** 316 + * __dax_fault - handle a page fault on a DAX file 317 + * @vma: The virtual memory area where the fault occurred 318 + * @vmf: The description of the fault 319 + * @get_block: The filesystem method used to translate file offsets to blocks 320 + * 321 + * When a page fault occurs, filesystems may call this helper in their 322 + * fault handler for DAX files. __dax_fault() assumes the caller has done all 323 + * the necessary locking for the page fault to proceed successfully. 324 + */ 325 + int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 326 + get_block_t get_block, dax_iodone_t complete_unwritten) 320 327 { 321 328 struct file *file = vma->vm_file; 322 329 struct address_space *mapping = file->f_mapping; ··· 424 417 page_cache_release(page); 425 418 } 426 419 420 + /* 421 + * If we successfully insert the new mapping over an unwritten extent, 422 + * we need to ensure we convert the unwritten extent. If there is an 423 + * error inserting the mapping, the filesystem needs to leave it as 424 + * unwritten to prevent exposure of the stale underlying data to 425 + * userspace, but we still need to call the completion function so 426 + * the private resources on the mapping buffer can be released. We 427 + * indicate what the callback should do via the uptodate variable, same 428 + * as for normal BH based IO completions. 429 + */ 427 430 error = dax_insert_mapping(inode, &bh, vma, vmf); 431 + if (buffer_unwritten(&bh)) 432 + complete_unwritten(&bh, !error); 428 433 429 434 out: 430 435 if (error == -ENOMEM) ··· 453 434 } 454 435 goto out; 455 436 } 437 + EXPORT_SYMBOL(__dax_fault); 456 438 457 439 /** 458 440 * dax_fault - handle a page fault on a DAX file ··· 465 445 * fault handler for DAX files. 466 446 */ 467 447 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 468 - get_block_t get_block) 448 + get_block_t get_block, dax_iodone_t complete_unwritten) 469 449 { 470 450 int result; 471 451 struct super_block *sb = file_inode(vma->vm_file)->i_sb; ··· 474 454 sb_start_pagefault(sb); 475 455 file_update_time(vma->vm_file); 476 456 } 477 - result = do_dax_fault(vma, vmf, get_block); 457 + result = __dax_fault(vma, vmf, get_block, complete_unwritten); 478 458 if (vmf->flags & FAULT_FLAG_WRITE) 479 459 sb_end_pagefault(sb); 480 460
+2 -2
fs/ext2/file.c
··· 28 28 #ifdef CONFIG_FS_DAX 29 29 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 30 30 { 31 - return dax_fault(vma, vmf, ext2_get_block); 31 + return dax_fault(vma, vmf, ext2_get_block, NULL); 32 32 } 33 33 34 34 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 35 35 { 36 - return dax_mkwrite(vma, vmf, ext2_get_block); 36 + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); 37 37 } 38 38 39 39 static const struct vm_operations_struct ext2_dax_vm_ops = {
+14 -2
fs/ext4/file.c
··· 192 192 } 193 193 194 194 #ifdef CONFIG_FS_DAX 195 + static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) 196 + { 197 + struct inode *inode = bh->b_assoc_map->host; 198 + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ 199 + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; 200 + int err; 201 + if (!uptodate) 202 + return; 203 + WARN_ON(!buffer_unwritten(bh)); 204 + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); 205 + } 206 + 195 207 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 196 208 { 197 - return dax_fault(vma, vmf, ext4_get_block); 209 + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); 198 210 /* Is this the right get_block? */ 199 211 } 200 212 201 213 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 202 214 { 203 - return dax_mkwrite(vma, vmf, ext4_get_block); 215 + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); 204 216 } 205 217 206 218 static const struct vm_operations_struct ext4_dax_vm_ops = {
+7 -14
fs/ext4/inode.c
··· 656 656 return retval; 657 657 } 658 658 659 - static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) 660 - { 661 - struct inode *inode = bh->b_assoc_map->host; 662 - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ 663 - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; 664 - int err; 665 - if (!uptodate) 666 - return; 667 - WARN_ON(!buffer_unwritten(bh)); 668 - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); 669 - } 670 - 671 659 /* Maximum number of blocks we map for direct IO at once. */ 672 660 #define DIO_MAX_BLOCKS 4096 673 661 ··· 693 705 694 706 map_bh(bh, inode->i_sb, map.m_pblk); 695 707 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 696 - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { 708 + if (IS_DAX(inode) && buffer_unwritten(bh)) { 709 + /* 710 + * dgc: I suspect unwritten conversion on ext4+DAX is 711 + * fundamentally broken here when there are concurrent 712 + * read/write in progress on this inode. 713 + */ 714 + WARN_ON_ONCE(io_end); 697 715 bh->b_assoc_map = inode->i_mapping; 698 716 bh->b_private = (void *)(unsigned long)iblock; 699 - bh->b_end_io = ext4_end_io_unwritten; 700 717 } 701 718 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 702 719 set_buffer_defer_completion(bh);
+110 -42
fs/xfs/xfs_aops.c
··· 1349 1349 sector_t iblock, 1350 1350 struct buffer_head *bh_result, 1351 1351 int create, 1352 - int direct) 1352 + bool direct) 1353 1353 { 1354 1354 struct xfs_inode *ip = XFS_I(inode); 1355 1355 struct xfs_mount *mp = ip->i_mount; ··· 1414 1414 if (error) 1415 1415 return error; 1416 1416 new = 1; 1417 + 1417 1418 } else { 1418 1419 /* 1419 1420 * Delalloc reservations do not require a transaction, ··· 1509 1508 struct buffer_head *bh_result, 1510 1509 int create) 1511 1510 { 1512 - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); 1511 + return __xfs_get_blocks(inode, iblock, bh_result, create, false); 1513 1512 } 1514 1513 1515 - STATIC int 1514 + int 1516 1515 xfs_get_blocks_direct( 1517 1516 struct inode *inode, 1518 1517 sector_t iblock, 1519 1518 struct buffer_head *bh_result, 1520 1519 int create) 1521 1520 { 1522 - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); 1521 + return __xfs_get_blocks(inode, iblock, bh_result, create, true); 1523 1522 } 1524 1523 1525 - /* 1526 - * Complete a direct I/O write request. 1527 - * 1528 - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1529 - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1530 - * wholly within the EOF and so there is nothing for us to do. Note that in this 1531 - * case the completion can be called in interrupt context, whereas if we have an 1532 - * ioend we will always be called in task context (i.e. from a workqueue). 1533 - */ 1534 - STATIC void 1535 - xfs_end_io_direct_write( 1536 - struct kiocb *iocb, 1524 + static void 1525 + __xfs_end_io_direct_write( 1526 + struct inode *inode, 1527 + struct xfs_ioend *ioend, 1537 1528 loff_t offset, 1538 - ssize_t size, 1539 - void *private) 1529 + ssize_t size) 1540 1530 { 1541 - struct inode *inode = file_inode(iocb->ki_filp); 1542 - struct xfs_inode *ip = XFS_I(inode); 1543 - struct xfs_mount *mp = ip->i_mount; 1544 - struct xfs_ioend *ioend = private; 1531 + struct xfs_mount *mp = XFS_I(inode)->i_mount; 1545 1532 1546 - trace_xfs_gbmap_direct_endio(ip, offset, size, 1547 - ioend ? ioend->io_type : 0, NULL); 1548 - 1549 - if (!ioend) { 1550 - ASSERT(offset + size <= i_size_read(inode)); 1551 - return; 1552 - } 1553 - 1554 - if (XFS_FORCED_SHUTDOWN(mp)) 1533 + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1555 1534 goto out_end_io; 1556 1535 1557 1536 /* ··· 1568 1587 * here can result in EOF moving backwards and Bad Things Happen when 1569 1588 * that occurs. 1570 1589 */ 1571 - spin_lock(&ip->i_flags_lock); 1590 + spin_lock(&XFS_I(inode)->i_flags_lock); 1572 1591 if (offset + size > i_size_read(inode)) 1573 1592 i_size_write(inode, offset + size); 1574 - spin_unlock(&ip->i_flags_lock); 1593 + spin_unlock(&XFS_I(inode)->i_flags_lock); 1575 1594 1576 1595 /* 1577 1596 * If we are doing an append IO that needs to update the EOF on disk, ··· 1588 1607 return; 1589 1608 } 1590 1609 1610 + /* 1611 + * Complete a direct I/O write request. 1612 + * 1613 + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1614 + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1615 + * wholly within the EOF and so there is nothing for us to do. Note that in this 1616 + * case the completion can be called in interrupt context, whereas if we have an 1617 + * ioend we will always be called in task context (i.e. from a workqueue). 1618 + */ 1619 + STATIC void 1620 + xfs_end_io_direct_write( 1621 + struct kiocb *iocb, 1622 + loff_t offset, 1623 + ssize_t size, 1624 + void *private) 1625 + { 1626 + struct inode *inode = file_inode(iocb->ki_filp); 1627 + struct xfs_ioend *ioend = private; 1628 + 1629 + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1630 + ioend ? ioend->io_type : 0, NULL); 1631 + 1632 + if (!ioend) { 1633 + ASSERT(offset + size <= i_size_read(inode)); 1634 + return; 1635 + } 1636 + 1637 + __xfs_end_io_direct_write(inode, ioend, offset, size); 1638 + } 1639 + 1640 + /* 1641 + * For DAX we need a mapping buffer callback for unwritten extent conversion 1642 + * when page faults allocate blocks and then zero them. Note that in this 1643 + * case the mapping indicated by the ioend may extend beyond EOF. We most 1644 + * definitely do not want to extend EOF here, so we trim back the ioend size to 1645 + * EOF. 1646 + */ 1647 + #ifdef CONFIG_FS_DAX 1648 + void 1649 + xfs_end_io_dax_write( 1650 + struct buffer_head *bh, 1651 + int uptodate) 1652 + { 1653 + struct xfs_ioend *ioend = bh->b_private; 1654 + struct inode *inode = ioend->io_inode; 1655 + ssize_t size = ioend->io_size; 1656 + 1657 + ASSERT(IS_DAX(ioend->io_inode)); 1658 + 1659 + /* if there was an error zeroing, then don't convert it */ 1660 + if (!uptodate) 1661 + ioend->io_error = -EIO; 1662 + 1663 + /* 1664 + * Trim update to EOF, so we don't extend EOF during unwritten extent 1665 + * conversion of partial EOF blocks. 1666 + */ 1667 + spin_lock(&XFS_I(inode)->i_flags_lock); 1668 + if (ioend->io_offset + size > i_size_read(inode)) 1669 + size = i_size_read(inode) - ioend->io_offset; 1670 + spin_unlock(&XFS_I(inode)->i_flags_lock); 1671 + 1672 + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); 1673 + 1674 + } 1675 + #else 1676 + void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } 1677 + #endif 1678 + 1679 + static inline ssize_t 1680 + xfs_vm_do_dio( 1681 + struct inode *inode, 1682 + struct kiocb *iocb, 1683 + struct iov_iter *iter, 1684 + loff_t offset, 1685 + void (*endio)(struct kiocb *iocb, 1686 + loff_t offset, 1687 + ssize_t size, 1688 + void *private), 1689 + int flags) 1690 + { 1691 + struct block_device *bdev; 1692 + 1693 + if (IS_DAX(inode)) 1694 + return dax_do_io(iocb, inode, iter, offset, 1695 + xfs_get_blocks_direct, endio, 0); 1696 + 1697 + bdev = xfs_find_bdev_for_inode(inode); 1698 + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1699 + xfs_get_blocks_direct, endio, NULL, flags); 1700 + } 1701 + 1591 1702 STATIC ssize_t 1592 1703 xfs_vm_direct_IO( 1593 1704 struct kiocb *iocb, ··· 1687 1614 loff_t offset) 1688 1615 { 1689 1616 struct inode *inode = iocb->ki_filp->f_mapping->host; 1690 - struct block_device *bdev = xfs_find_bdev_for_inode(inode); 1691 1617 1692 - if (iov_iter_rw(iter) == WRITE) { 1693 - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1694 - xfs_get_blocks_direct, 1695 - xfs_end_io_direct_write, NULL, 1696 - DIO_ASYNC_EXTEND); 1697 - } 1698 - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1699 - xfs_get_blocks_direct, NULL, NULL, 0); 1618 + if (iov_iter_rw(iter) == WRITE) 1619 + return xfs_vm_do_dio(inode, iocb, iter, offset, 1620 + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); 1621 + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); 1700 1622 } 1701 1623 1702 1624 /*
+6 -1
fs/xfs/xfs_aops.h
··· 53 53 } xfs_ioend_t; 54 54 55 55 extern const struct address_space_operations xfs_address_space_operations; 56 - extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 56 + 57 + int xfs_get_blocks(struct inode *inode, sector_t offset, 58 + struct buffer_head *map_bh, int create); 59 + int xfs_get_blocks_direct(struct inode *inode, sector_t offset, 60 + struct buffer_head *map_bh, int create); 61 + void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); 57 62 58 63 extern void xfs_count_page_state(struct page *, int *, int *); 59 64
+19 -4
fs/xfs/xfs_bmap_util.c
··· 1133 1133 break; 1134 1134 ASSERT(imap.br_blockcount >= 1); 1135 1135 ASSERT(imap.br_startoff == offset_fsb); 1136 + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 1137 + 1138 + if (imap.br_startblock == HOLESTARTBLOCK || 1139 + imap.br_state == XFS_EXT_UNWRITTEN) { 1140 + /* skip the entire extent */ 1141 + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1142 + imap.br_blockcount) - 1; 1143 + continue; 1144 + } 1145 + 1136 1146 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 1137 1147 if (lastoffset > endoff) 1138 1148 lastoffset = endoff; 1139 - if (imap.br_startblock == HOLESTARTBLOCK) 1149 + 1150 + /* DAX can just zero the backing device directly */ 1151 + if (IS_DAX(VFS_I(ip))) { 1152 + error = dax_zero_page_range(VFS_I(ip), offset, 1153 + lastoffset - offset + 1, 1154 + xfs_get_blocks_direct); 1155 + if (error) 1156 + return error; 1140 1157 continue; 1141 - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 1142 - if (imap.br_state == XFS_EXT_UNWRITTEN) 1143 - continue; 1158 + } 1144 1159 1145 1160 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 1146 1161 mp->m_rtdev_targp : mp->m_ddev_targp,
+99 -65
fs/xfs/xfs_file.c
··· 79 79 } 80 80 81 81 /* 82 - * xfs_iozero 82 + * xfs_iozero clears the specified range supplied via the page cache (except in 83 + * the DAX case). Writes through the page cache will allocate blocks over holes, 84 + * though the callers usually map the holes first and avoid them. If a block is 85 + * not completely zeroed, then it will be read from disk before being partially 86 + * zeroed. 83 87 * 84 - * xfs_iozero clears the specified range of buffer supplied, 85 - * and marks all the affected blocks as valid and modified. If 86 - * an affected block is not allocated, it will be allocated. If 87 - * an affected block is not completely overwritten, and is not 88 - * valid before the operation, it will be read from disk before 89 - * being partially zeroed. 88 + * In the DAX case, we can just directly write to the underlying pages. This 89 + * will not allocate blocks, but will avoid holes and unwritten extents and so 90 + * not do unnecessary work. 90 91 */ 91 92 int 92 93 xfs_iozero( ··· 97 96 { 98 97 struct page *page; 99 98 struct address_space *mapping; 100 - int status; 99 + int status = 0; 100 + 101 101 102 102 mapping = VFS_I(ip)->i_mapping; 103 103 do { ··· 110 108 if (bytes > count) 111 109 bytes = count; 112 110 113 - status = pagecache_write_begin(NULL, mapping, pos, bytes, 114 - AOP_FLAG_UNINTERRUPTIBLE, 115 - &page, &fsdata); 116 - if (status) 117 - break; 111 + if (IS_DAX(VFS_I(ip))) { 112 + status = dax_zero_page_range(VFS_I(ip), pos, bytes, 113 + xfs_get_blocks_direct); 114 + if (status) 115 + break; 116 + } else { 117 + status = pagecache_write_begin(NULL, mapping, pos, bytes, 118 + AOP_FLAG_UNINTERRUPTIBLE, 119 + &page, &fsdata); 120 + if (status) 121 + break; 118 122 119 - zero_user(page, offset, bytes); 123 + zero_user(page, offset, bytes); 120 124 121 - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, 122 - page, fsdata); 123 - WARN_ON(status <= 0); /* can't return less than zero! */ 125 + status = pagecache_write_end(NULL, mapping, pos, bytes, 126 + bytes, page, fsdata); 127 + WARN_ON(status <= 0); /* can't return less than zero! */ 128 + status = 0; 129 + } 124 130 pos += bytes; 125 131 count -= bytes; 126 - status = 0; 127 132 } while (count); 128 133 129 134 return status; ··· 293 284 if (file->f_mode & FMODE_NOCMTIME) 294 285 ioflags |= XFS_IO_INVIS; 295 286 296 - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { 287 + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { 297 288 xfs_buftarg_t *target = 298 289 XFS_IS_REALTIME_INODE(ip) ? 299 290 mp->m_rtdev_targp : mp->m_ddev_targp; ··· 387 378 388 379 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 389 380 390 - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 381 + /* for dax, we need to avoid the page cache */ 382 + if (IS_DAX(VFS_I(ip))) 383 + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); 384 + else 385 + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 391 386 if (ret > 0) 392 387 XFS_STATS_ADD(xs_read_bytes, ret); 393 388 ··· 685 672 mp->m_rtdev_targp : mp->m_ddev_targp; 686 673 687 674 /* DIO must be aligned to device logical sector size */ 688 - if ((pos | count) & target->bt_logical_sectormask) 675 + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) 689 676 return -EINVAL; 690 677 691 678 /* "unaligned" here means not aligned to a filesystem block */ ··· 771 758 out: 772 759 xfs_rw_iunlock(ip, iolock); 773 760 774 - /* No fallback to buffered IO on errors for XFS. */ 775 - ASSERT(ret < 0 || ret == count); 761 + /* 762 + * No fallback to buffered IO on errors for XFS. DAX can result in 763 + * partial writes, but direct IO will either complete fully or fail. 764 + */ 765 + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); 776 766 return ret; 777 767 } 778 768 ··· 858 842 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 859 843 return -EIO; 860 844 861 - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) 845 + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) 862 846 ret = xfs_file_dio_aio_write(iocb, from); 863 847 else 864 848 ret = xfs_file_buffered_aio_write(iocb, from); ··· 1077 1061 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 1078 1062 1079 1063 return xfs_readdir(ip, ctx, bufsize); 1080 - } 1081 - 1082 - STATIC int 1083 - xfs_file_mmap( 1084 - struct file *filp, 1085 - struct vm_area_struct *vma) 1086 - { 1087 - vma->vm_ops = &xfs_file_vm_ops; 1088 - 1089 - file_accessed(filp); 1090 - return 0; 1091 1064 } 1092 1065 1093 1066 /* ··· 1459 1454 * ordering of: 1460 1455 * 1461 1456 * mmap_sem (MM) 1462 - * i_mmap_lock (XFS - truncate serialisation) 1463 - * page_lock (MM) 1464 - * i_lock (XFS - extent map serialisation) 1457 + * sb_start_pagefault(vfs, freeze) 1458 + * i_mmap_lock (XFS - truncate serialisation) 1459 + * page_lock (MM) 1460 + * i_lock (XFS - extent map serialisation) 1465 1461 */ 1466 - STATIC int 1467 - xfs_filemap_fault( 1468 - struct vm_area_struct *vma, 1469 - struct vm_fault *vmf) 1470 - { 1471 - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1472 - int error; 1473 - 1474 - trace_xfs_filemap_fault(ip); 1475 - 1476 - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1477 - error = filemap_fault(vma, vmf); 1478 - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1479 - 1480 - return error; 1481 - } 1482 1462 1483 1463 /* 1484 1464 * mmap()d file has taken write protection fault and is being made writable. We ··· 1476 1486 struct vm_area_struct *vma, 1477 1487 struct vm_fault *vmf) 1478 1488 { 1479 - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1480 - int error; 1489 + struct inode *inode = file_inode(vma->vm_file); 1490 + int ret; 1481 1491 1482 - trace_xfs_filemap_page_mkwrite(ip); 1492 + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); 1493 + 1494 + sb_start_pagefault(inode->i_sb); 1495 + file_update_time(vma->vm_file); 1496 + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1497 + 1498 + if (IS_DAX(inode)) { 1499 + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, 1500 + xfs_end_io_dax_write); 1501 + } else { 1502 + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); 1503 + ret = block_page_mkwrite_return(ret); 1504 + } 1505 + 1506 + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1507 + sb_end_pagefault(inode->i_sb); 1508 + 1509 + return ret; 1510 + } 1511 + 1512 + STATIC int 1513 + xfs_filemap_fault( 1514 + struct vm_area_struct *vma, 1515 + struct vm_fault *vmf) 1516 + { 1517 + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); 1518 + int ret; 1519 + 1520 + trace_xfs_filemap_fault(ip); 1521 + 1522 + /* DAX can shortcut the normal fault path on write faults! */ 1523 + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) 1524 + return xfs_filemap_page_mkwrite(vma, vmf); 1483 1525 1484 1526 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1485 - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1527 + ret = filemap_fault(vma, vmf); 1486 1528 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1487 1529 1488 - return error; 1530 + return ret; 1531 + } 1532 + 1533 + static const struct vm_operations_struct xfs_file_vm_ops = { 1534 + .fault = xfs_filemap_fault, 1535 + .map_pages = filemap_map_pages, 1536 + .page_mkwrite = xfs_filemap_page_mkwrite, 1537 + }; 1538 + 1539 + STATIC int 1540 + xfs_file_mmap( 1541 + struct file *filp, 1542 + struct vm_area_struct *vma) 1543 + { 1544 + file_accessed(filp); 1545 + vma->vm_ops = &xfs_file_vm_ops; 1546 + if (IS_DAX(file_inode(filp))) 1547 + vma->vm_flags |= VM_MIXEDMAP; 1548 + return 0; 1489 1549 } 1490 1550 1491 1551 const struct file_operations xfs_file_operations = { ··· 1565 1525 .compat_ioctl = xfs_file_compat_ioctl, 1566 1526 #endif 1567 1527 .fsync = xfs_dir_fsync, 1568 - }; 1569 - 1570 - static const struct vm_operations_struct xfs_file_vm_ops = { 1571 - .fault = xfs_filemap_fault, 1572 - .map_pages = filemap_map_pages, 1573 - .page_mkwrite = xfs_filemap_page_mkwrite, 1574 1528 };
+17 -13
fs/xfs/xfs_iops.c
··· 851 851 * to hope that the caller sees ENOMEM and retries the truncate 852 852 * operation. 853 853 */ 854 - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 854 + if (IS_DAX(inode)) 855 + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); 856 + else 857 + error = block_truncate_page(inode->i_mapping, newsize, 858 + xfs_get_blocks); 855 859 if (error) 856 860 return error; 857 861 truncate_setsize(inode, newsize); ··· 1195 1191 struct inode *inode, 1196 1192 struct xfs_inode *ip) 1197 1193 { 1198 - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) 1194 + uint16_t flags = ip->i_d.di_flags; 1195 + 1196 + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | 1197 + S_NOATIME | S_DAX); 1198 + 1199 + if (flags & XFS_DIFLAG_IMMUTABLE) 1199 1200 inode->i_flags |= S_IMMUTABLE; 1200 - else 1201 - inode->i_flags &= ~S_IMMUTABLE; 1202 - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) 1201 + if (flags & XFS_DIFLAG_APPEND) 1203 1202 inode->i_flags |= S_APPEND; 1204 - else 1205 - inode->i_flags &= ~S_APPEND; 1206 - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) 1203 + if (flags & XFS_DIFLAG_SYNC) 1207 1204 inode->i_flags |= S_SYNC; 1208 - else 1209 - inode->i_flags &= ~S_SYNC; 1210 - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) 1205 + if (flags & XFS_DIFLAG_NOATIME) 1211 1206 inode->i_flags |= S_NOATIME; 1212 - else 1213 - inode->i_flags &= ~S_NOATIME; 1207 + /* XXX: Also needs an on-disk per inode flag! */ 1208 + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) 1209 + inode->i_flags |= S_DAX; 1214 1210 } 1215 1211 1216 1212 /*
+2
fs/xfs/xfs_mount.h
··· 181 181 allocator */ 182 182 #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 183 183 184 + #define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ 185 + 184 186 185 187 /* 186 188 * Default minimum read and write sizes.
+23 -2
fs/xfs/xfs_super.c
··· 112 112 #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ 113 113 #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ 114 114 115 + #define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ 116 + 115 117 /* 116 118 * Table driven mount option parser. 117 119 * ··· 365 363 mp->m_flags |= XFS_MOUNT_DISCARD; 366 364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 367 365 mp->m_flags &= ~XFS_MOUNT_DISCARD; 366 + #ifdef CONFIG_FS_DAX 367 + } else if (!strcmp(this_char, MNTOPT_DAX)) { 368 + mp->m_flags |= XFS_MOUNT_DAX; 369 + #endif 368 370 } else { 369 371 xfs_warn(mp, "unknown mount option [%s].", this_char); 370 372 return -EINVAL; ··· 458 452 } 459 453 460 454 struct proc_xfs_info { 461 - int flag; 462 - char *str; 455 + uint64_t flag; 456 + char *str; 463 457 }; 464 458 465 459 STATIC int ··· 480 474 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 481 475 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, 482 476 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, 477 + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, 483 478 { 0, NULL } 484 479 }; 485 480 static struct proc_xfs_info xfs_info_unset[] = { ··· 1513 1506 /* version 5 superblocks support inode version counters. */ 1514 1507 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) 1515 1508 sb->s_flags |= MS_I_VERSION; 1509 + 1510 + if (mp->m_flags & XFS_MOUNT_DAX) { 1511 + xfs_warn(mp, 1512 + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); 1513 + if (sb->s_blocksize != PAGE_SIZE) { 1514 + xfs_alert(mp, 1515 + "Filesystem block size invalid for DAX Turning DAX off."); 1516 + mp->m_flags &= ~XFS_MOUNT_DAX; 1517 + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { 1518 + xfs_alert(mp, 1519 + "Block device does not support DAX Turning DAX off."); 1520 + mp->m_flags &= ~XFS_MOUNT_DAX; 1521 + } 1522 + } 1516 1523 1517 1524 error = xfs_mountfs(mp); 1518 1525 if (error)
+7 -2
include/linux/fs.h
··· 70 70 struct buffer_head *bh_result, int create); 71 71 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 72 72 ssize_t bytes, void *private); 73 + typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); 73 74 74 75 #define MAY_EXEC 0x00000001 75 76 #define MAY_WRITE 0x00000002 ··· 2628 2627 int dax_clear_blocks(struct inode *, sector_t block, long size); 2629 2628 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 2630 2629 int dax_truncate_page(struct inode *, loff_t from, get_block_t); 2631 - int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 2630 + int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, 2631 + dax_iodone_t); 2632 + int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, 2633 + dax_iodone_t); 2632 2634 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); 2633 - #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) 2635 + #define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) 2636 + #define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) 2634 2637 2635 2638 #ifdef CONFIG_BLOCK 2636 2639 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,