Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: add DAX file operations support

Add the initial support for DAX file operations to XFS. This
includes the necessary block allocation and mmap page fault hooks
for DAX to function.

Note that there are changes to the splice interfaces to ensure that,
for DAX, splice avoids direct page cache manipulation and instead
takes the DAX IO paths for read/write operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

authored by

Dave Chinner and committed by
Dave Chinner
6b698ede ce5c5d55

+159 -84
+83 -33
fs/xfs/xfs_aops.c
··· 1349 1349 sector_t iblock, 1350 1350 struct buffer_head *bh_result, 1351 1351 int create, 1352 - int direct) 1352 + bool direct) 1353 1353 { 1354 1354 struct xfs_inode *ip = XFS_I(inode); 1355 1355 struct xfs_mount *mp = ip->i_mount; ··· 1414 1414 if (error) 1415 1415 return error; 1416 1416 new = 1; 1417 + 1417 1418 } else { 1418 1419 /* 1419 1420 * Delalloc reservations do not require a transaction, ··· 1509 1508 struct buffer_head *bh_result, 1510 1509 int create) 1511 1510 { 1512 - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); 1511 + return __xfs_get_blocks(inode, iblock, bh_result, create, false); 1513 1512 } 1514 1513 1515 - STATIC int 1514 + int 1516 1515 xfs_get_blocks_direct( 1517 1516 struct inode *inode, 1518 1517 sector_t iblock, 1519 1518 struct buffer_head *bh_result, 1520 1519 int create) 1521 1520 { 1522 - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); 1521 + return __xfs_get_blocks(inode, iblock, bh_result, create, true); 1523 1522 } 1524 1523 1525 - /* 1526 - * Complete a direct I/O write request. 1527 - * 1528 - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1529 - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1530 - * wholly within the EOF and so there is nothing for us to do. Note that in this 1531 - * case the completion can be called in interrupt context, whereas if we have an 1532 - * ioend we will always be called in task context (i.e. from a workqueue). 
1533 - */ 1534 - STATIC void 1535 - xfs_end_io_direct_write( 1536 - struct kiocb *iocb, 1524 + static void 1525 + __xfs_end_io_direct_write( 1526 + struct inode *inode, 1527 + struct xfs_ioend *ioend, 1537 1528 loff_t offset, 1538 - ssize_t size, 1539 - void *private) 1529 + ssize_t size) 1540 1530 { 1541 - struct inode *inode = file_inode(iocb->ki_filp); 1542 - struct xfs_inode *ip = XFS_I(inode); 1543 - struct xfs_mount *mp = ip->i_mount; 1544 - struct xfs_ioend *ioend = private; 1531 + struct xfs_mount *mp = XFS_I(inode)->i_mount; 1545 1532 1546 - trace_xfs_gbmap_direct_endio(ip, offset, size, 1547 - ioend ? ioend->io_type : 0, NULL); 1548 - 1549 - if (!ioend) { 1550 - ASSERT(offset + size <= i_size_read(inode)); 1551 - return; 1552 - } 1553 - 1554 - if (XFS_FORCED_SHUTDOWN(mp)) 1533 + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1555 1534 goto out_end_io; 1556 1535 1557 1536 /* ··· 1568 1587 * here can result in EOF moving backwards and Bad Things Happen when 1569 1588 * that occurs. 1570 1589 */ 1571 - spin_lock(&ip->i_flags_lock); 1590 + spin_lock(&XFS_I(inode)->i_flags_lock); 1572 1591 if (offset + size > i_size_read(inode)) 1573 1592 i_size_write(inode, offset + size); 1574 - spin_unlock(&ip->i_flags_lock); 1593 + spin_unlock(&XFS_I(inode)->i_flags_lock); 1575 1594 1576 1595 /* 1577 1596 * If we are doing an append IO that needs to update the EOF on disk, ··· 1587 1606 xfs_end_io(&ioend->io_work); 1588 1607 return; 1589 1608 } 1609 + 1610 + /* 1611 + * Complete a direct I/O write request. 1612 + * 1613 + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1614 + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite 1615 + * wholly within the EOF and so there is nothing for us to do. Note that in this 1616 + * case the completion can be called in interrupt context, whereas if we have an 1617 + * ioend we will always be called in task context (i.e. from a workqueue). 
1618 + */ 1619 + STATIC void 1620 + xfs_end_io_direct_write( 1621 + struct kiocb *iocb, 1622 + loff_t offset, 1623 + ssize_t size, 1624 + void *private) 1625 + { 1626 + struct inode *inode = file_inode(iocb->ki_filp); 1627 + struct xfs_ioend *ioend = private; 1628 + 1629 + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, 1630 + ioend ? ioend->io_type : 0, NULL); 1631 + 1632 + if (!ioend) { 1633 + ASSERT(offset + size <= i_size_read(inode)); 1634 + return; 1635 + } 1636 + 1637 + __xfs_end_io_direct_write(inode, ioend, offset, size); 1638 + } 1639 + 1640 + /* 1641 + * For DAX we need a mapping buffer callback for unwritten extent conversion 1642 + * when page faults allocate blocks and then zero them. Note that in this 1643 + * case the mapping indicated by the ioend may extend beyond EOF. We most 1644 + * definitely do not want to extend EOF here, so we trim back the ioend size to 1645 + * EOF. 1646 + */ 1647 + #ifdef CONFIG_FS_DAX 1648 + void 1649 + xfs_end_io_dax_write( 1650 + struct buffer_head *bh, 1651 + int uptodate) 1652 + { 1653 + struct xfs_ioend *ioend = bh->b_private; 1654 + struct inode *inode = ioend->io_inode; 1655 + ssize_t size = ioend->io_size; 1656 + 1657 + ASSERT(IS_DAX(ioend->io_inode)); 1658 + 1659 + /* if there was an error zeroing, then don't convert it */ 1660 + if (!uptodate) 1661 + ioend->io_error = -EIO; 1662 + 1663 + /* 1664 + * Trim update to EOF, so we don't extend EOF during unwritten extent 1665 + * conversion of partial EOF blocks. 1666 + */ 1667 + spin_lock(&XFS_I(inode)->i_flags_lock); 1668 + if (ioend->io_offset + size > i_size_read(inode)) 1669 + size = i_size_read(inode) - ioend->io_offset; 1670 + spin_unlock(&XFS_I(inode)->i_flags_lock); 1671 + 1672 + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); 1673 + 1674 + } 1675 + #else 1676 + void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } 1677 + #endif 1590 1678 1591 1679 STATIC ssize_t 1592 1680 xfs_vm_direct_IO(
+6 -1
fs/xfs/xfs_aops.h
··· 53 53 } xfs_ioend_t; 54 54 55 55 extern const struct address_space_operations xfs_address_space_operations; 56 - extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 56 + 57 + int xfs_get_blocks(struct inode *inode, sector_t offset, 58 + struct buffer_head *map_bh, int create); 59 + int xfs_get_blocks_direct(struct inode *inode, sector_t offset, 60 + struct buffer_head *map_bh, int create); 61 + void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); 57 62 58 63 extern void xfs_count_page_state(struct page *, int *, int *); 59 64
+70 -50
fs/xfs/xfs_file.c
··· 284 284 if (file->f_mode & FMODE_NOCMTIME) 285 285 ioflags |= XFS_IO_INVIS; 286 286 287 - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { 287 + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { 288 288 xfs_buftarg_t *target = 289 289 XFS_IS_REALTIME_INODE(ip) ? 290 290 mp->m_rtdev_targp : mp->m_ddev_targp; ··· 378 378 379 379 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 380 380 381 - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 381 + /* for dax, we need to avoid the page cache */ 382 + if (IS_DAX(VFS_I(ip))) 383 + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); 384 + else 385 + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 382 386 if (ret > 0) 383 387 XFS_STATS_ADD(xs_read_bytes, ret); 384 388 ··· 676 672 mp->m_rtdev_targp : mp->m_ddev_targp; 677 673 678 674 /* DIO must be aligned to device logical sector size */ 679 - if ((pos | count) & target->bt_logical_sectormask) 675 + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) 680 676 return -EINVAL; 681 677 682 678 /* "unaligned" here means not aligned to a filesystem block */ ··· 762 758 out: 763 759 xfs_rw_iunlock(ip, iolock); 764 760 765 - /* No fallback to buffered IO on errors for XFS. */ 766 - ASSERT(ret < 0 || ret == count); 761 + /* 762 + * No fallback to buffered IO on errors for XFS. DAX can result in 763 + * partial writes, but direct IO will either complete fully or fail. 
764 + */ 765 + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); 767 766 return ret; 768 767 } 769 768 ··· 849 842 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 850 843 return -EIO; 851 844 852 - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) 845 + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) 853 846 ret = xfs_file_dio_aio_write(iocb, from); 854 847 else 855 848 ret = xfs_file_buffered_aio_write(iocb, from); ··· 1068 1061 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 1069 1062 1070 1063 return xfs_readdir(ip, ctx, bufsize); 1071 - } 1072 - 1073 - STATIC int 1074 - xfs_file_mmap( 1075 - struct file *filp, 1076 - struct vm_area_struct *vma) 1077 - { 1078 - vma->vm_ops = &xfs_file_vm_ops; 1079 - 1080 - file_accessed(filp); 1081 - return 0; 1082 1064 } 1083 1065 1084 1066 /* ··· 1450 1454 * ordering of: 1451 1455 * 1452 1456 * mmap_sem (MM) 1453 - * i_mmap_lock (XFS - truncate serialisation) 1454 - * page_lock (MM) 1455 - * i_lock (XFS - extent map serialisation) 1457 + * sb_start_pagefault(vfs, freeze) 1458 + * i_mmap_lock (XFS - truncate serialisation) 1459 + * page_lock (MM) 1460 + * i_lock (XFS - extent map serialisation) 1456 1461 */ 1457 - STATIC int 1458 - xfs_filemap_fault( 1459 - struct vm_area_struct *vma, 1460 - struct vm_fault *vmf) 1461 - { 1462 - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1463 - int error; 1464 - 1465 - trace_xfs_filemap_fault(ip); 1466 - 1467 - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1468 - error = filemap_fault(vma, vmf); 1469 - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1470 - 1471 - return error; 1472 - } 1473 1462 1474 1463 /* 1475 1464 * mmap()d file has taken write protection fault and is being made writable. 
We ··· 1467 1486 struct vm_area_struct *vma, 1468 1487 struct vm_fault *vmf) 1469 1488 { 1470 - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1489 + struct inode *inode = file_inode(vma->vm_file); 1471 1490 int ret; 1472 1491 1473 - trace_xfs_filemap_page_mkwrite(ip); 1492 + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); 1474 1493 1475 - sb_start_pagefault(VFS_I(ip)->i_sb); 1494 + sb_start_pagefault(inode->i_sb); 1476 1495 file_update_time(vma->vm_file); 1496 + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1497 + 1498 + if (IS_DAX(inode)) { 1499 + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, 1500 + xfs_end_io_dax_write); 1501 + } else { 1502 + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); 1503 + ret = block_page_mkwrite_return(ret); 1504 + } 1505 + 1506 + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1507 + sb_end_pagefault(inode->i_sb); 1508 + 1509 + return ret; 1510 + } 1511 + 1512 + STATIC int 1513 + xfs_filemap_fault( 1514 + struct vm_area_struct *vma, 1515 + struct vm_fault *vmf) 1516 + { 1517 + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); 1518 + int ret; 1519 + 1520 + trace_xfs_filemap_fault(ip); 1521 + 1522 + /* DAX can shortcut the normal fault path on write faults! 
*/ 1523 + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) 1524 + return xfs_filemap_page_mkwrite(vma, vmf); 1525 + 1477 1526 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1478 - 1479 - ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); 1480 - 1527 + ret = filemap_fault(vma, vmf); 1481 1528 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1482 - sb_end_pagefault(VFS_I(ip)->i_sb); 1483 1529 1484 - return block_page_mkwrite_return(ret); 1530 + return ret; 1531 + } 1532 + 1533 + static const struct vm_operations_struct xfs_file_vm_ops = { 1534 + .fault = xfs_filemap_fault, 1535 + .map_pages = filemap_map_pages, 1536 + .page_mkwrite = xfs_filemap_page_mkwrite, 1537 + }; 1538 + 1539 + STATIC int 1540 + xfs_file_mmap( 1541 + struct file *filp, 1542 + struct vm_area_struct *vma) 1543 + { 1544 + file_accessed(filp); 1545 + vma->vm_ops = &xfs_file_vm_ops; 1546 + if (IS_DAX(file_inode(filp))) 1547 + vma->vm_flags |= VM_MIXEDMAP; 1548 + return 0; 1485 1549 } 1486 1550 1487 1551 const struct file_operations xfs_file_operations = { ··· 1556 1530 .compat_ioctl = xfs_file_compat_ioctl, 1557 1531 #endif 1558 1532 .fsync = xfs_dir_fsync, 1559 - }; 1560 - 1561 - static const struct vm_operations_struct xfs_file_vm_ops = { 1562 - .fault = xfs_filemap_fault, 1563 - .map_pages = filemap_map_pages, 1564 - .page_mkwrite = xfs_filemap_page_mkwrite, 1565 1533 };