···309309 out:310310 i_mmap_unlock_read(mapping);311311312312- if (bh->b_end_io)313313- bh->b_end_io(bh, 1);314314-315312 return error;316313}317314318318-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,319319- get_block_t get_block)315315+/**316316+ * __dax_fault - handle a page fault on a DAX file317317+ * @vma: The virtual memory area where the fault occurred318318+ * @vmf: The description of the fault319319+ * @get_block: The filesystem method used to translate file offsets to blocks320320+ *321321+ * When a page fault occurs, filesystems may call this helper in their322322+ * fault handler for DAX files. __dax_fault() assumes the caller has done all323323+ * the necessary locking for the page fault to proceed successfully.324324+ */325325+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,326326+ get_block_t get_block, dax_iodone_t complete_unwritten)320327{321328 struct file *file = vma->vm_file;322329 struct address_space *mapping = file->f_mapping;···424417 page_cache_release(page);425418 }426419420420+ /*421421+ * If we successfully insert the new mapping over an unwritten extent,422422+ * we need to ensure we convert the unwritten extent. If there is an423423+ * error inserting the mapping, the filesystem needs to leave it as424424+ * unwritten to prevent exposure of the stale underlying data to425425+ * userspace, but we still need to call the completion function so426426+ * the private resources on the mapping buffer can be released. We427427+ * indicate what the callback should do via the uptodate variable, same428428+ * as for normal BH based IO completions.429429+ */427430 error = dax_insert_mapping(inode, &bh, vma, vmf);431431+ if (buffer_unwritten(&bh))432432+ complete_unwritten(&bh, !error);428433429434 out:430435 if (error == -ENOMEM)···453434 }454435 goto out;455436}437437+EXPORT_SYMBOL(__dax_fault);456438457439/**458440 * dax_fault - handle a page fault on a DAX file···465445 * fault handler for DAX files.466446 */467447int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,468468- get_block_t get_block)448448+ get_block_t get_block, dax_iodone_t complete_unwritten)469449{470450 int result;471451 struct super_block *sb = file_inode(vma->vm_file)->i_sb;···474454 sb_start_pagefault(sb);475455 file_update_time(vma->vm_file);476456 }477477- result = do_dax_fault(vma, vmf, get_block);457457+ result = __dax_fault(vma, vmf, get_block, complete_unwritten);478458 if (vmf->flags & FAULT_FLAG_WRITE)479459 sb_end_pagefault(sb);480460
···192192}193193194194#ifdef CONFIG_FS_DAX195195+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)196196+{197197+ struct inode *inode = bh->b_assoc_map->host;198198+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */199199+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;200200+ int err;201201+ if (!uptodate)202202+ return;203203+ WARN_ON(!buffer_unwritten(bh));204204+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);205205+}206206+195207static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)196208{197197- return dax_fault(vma, vmf, ext4_get_block);209209+ return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);198210 /* Is this the right get_block? */199211}200212201213static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)202214{203203- return dax_mkwrite(vma, vmf, ext4_get_block);215215+ return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);204216}205217206218static const struct vm_operations_struct ext4_dax_vm_ops = {
+7-14
fs/ext4/inode.c
···656656 return retval;657657}658658659659-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)660660-{661661- struct inode *inode = bh->b_assoc_map->host;662662- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */663663- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;664664- int err;665665- if (!uptodate)666666- return;667667- WARN_ON(!buffer_unwritten(bh));668668- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);669669-}670670-671659/* Maximum number of blocks we map for direct IO at once. */672660#define DIO_MAX_BLOCKS 4096673661···693705694706 map_bh(bh, inode->i_sb, map.m_pblk);695707 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;696696- if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {708708+ if (IS_DAX(inode) && buffer_unwritten(bh)) {709709+ /*710710+ * dgc: I suspect unwritten conversion on ext4+DAX is711711+ * fundamentally broken here when there are concurrent712712+ * read/write in progress on this inode.713713+ */714714+ WARN_ON_ONCE(io_end);697715 bh->b_assoc_map = inode->i_mapping;698716 bh->b_private = (void *)(unsigned long)iblock;699699- bh->b_end_io = ext4_end_io_unwritten;700717 }701718 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)702719 set_buffer_defer_completion(bh);
+110-42
fs/xfs/xfs_aops.c
···13491349 sector_t iblock,13501350 struct buffer_head *bh_result,13511351 int create,13521352- int direct)13521352+ bool direct)13531353{13541354 struct xfs_inode *ip = XFS_I(inode);13551355 struct xfs_mount *mp = ip->i_mount;···14141414 if (error)14151415 return error;14161416 new = 1;14171417+14171418 } else {14181419 /*14191420 * Delalloc reservations do not require a transaction,···15091508 struct buffer_head *bh_result,15101509 int create)15111510{15121512- return __xfs_get_blocks(inode, iblock, bh_result, create, 0);15111511+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);15131512}1514151315151515-STATIC int15141514+int15161515xfs_get_blocks_direct(15171516 struct inode *inode,15181517 sector_t iblock,15191518 struct buffer_head *bh_result,15201519 int create)15211520{15221522- return __xfs_get_blocks(inode, iblock, bh_result, create, 1);15211521+ return __xfs_get_blocks(inode, iblock, bh_result, create, true);15231522}1524152315251525-/*15261526- * Complete a direct I/O write request.15271527- *15281528- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.15291529- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite15301530- * wholly within the EOF and so there is nothing for us to do. Note that in this15311531- * case the completion can be called in interrupt context, whereas if we have an15321532- * ioend we will always be called in task context (i.e. from a workqueue).15331533- */15341534-STATIC void15351535-xfs_end_io_direct_write(15361536- struct kiocb *iocb,15241524+static void15251525+__xfs_end_io_direct_write(15261526+ struct inode *inode,15271527+ struct xfs_ioend *ioend,15371528 loff_t offset,15381538- ssize_t size,15391539- void *private)15291529+ ssize_t size)15401530{15411541- struct inode *inode = file_inode(iocb->ki_filp);15421542- struct xfs_inode *ip = XFS_I(inode);15431543- struct xfs_mount *mp = ip->i_mount;15441544- struct xfs_ioend *ioend = private;15311531+ struct xfs_mount *mp = XFS_I(inode)->i_mount;1545153215461546- trace_xfs_gbmap_direct_endio(ip, offset, size,15471547- ioend ? ioend->io_type : 0, NULL);15481548-15491549- if (!ioend) {15501550- ASSERT(offset + size <= i_size_read(inode));15511551- return;15521552- }15531553-15541554- if (XFS_FORCED_SHUTDOWN(mp))15331533+ if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)15551534 goto out_end_io;1556153515571536 /*···15681587 * here can result in EOF moving backwards and Bad Things Happen when15691588 * that occurs.15701589 */15711571- spin_lock(&ip->i_flags_lock);15901590+ spin_lock(&XFS_I(inode)->i_flags_lock);15721591 if (offset + size > i_size_read(inode))15731592 i_size_write(inode, offset + size);15741574- spin_unlock(&ip->i_flags_lock);15931593+ spin_unlock(&XFS_I(inode)->i_flags_lock);1575159415761595 /*15771596 * If we are doing an append IO that needs to update the EOF on disk,···15881607 return;15891608}1590160916101610+/*16111611+ * Complete a direct I/O write request.16121612+ *16131613+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.16141614+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite16151615+ * wholly within the EOF and so there is nothing for us to do. Note that in this16161616+ * case the completion can be called in interrupt context, whereas if we have an16171617+ * ioend we will always be called in task context (i.e. from a workqueue).16181618+ */16191619+STATIC void16201620+xfs_end_io_direct_write(16211621+ struct kiocb *iocb,16221622+ loff_t offset,16231623+ ssize_t size,16241624+ void *private)16251625+{16261626+ struct inode *inode = file_inode(iocb->ki_filp);16271627+ struct xfs_ioend *ioend = private;16281628+16291629+ trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,16301630+ ioend ? ioend->io_type : 0, NULL);16311631+16321632+ if (!ioend) {16331633+ ASSERT(offset + size <= i_size_read(inode));16341634+ return;16351635+ }16361636+16371637+ __xfs_end_io_direct_write(inode, ioend, offset, size);16381638+}16391639+16401640+/*16411641+ * For DAX we need a mapping buffer callback for unwritten extent conversion16421642+ * when page faults allocate blocks and then zero them. Note that in this16431643+ * case the mapping indicated by the ioend may extend beyond EOF. We most16441644+ * definitely do not want to extend EOF here, so we trim back the ioend size to16451645+ * EOF.16461646+ */16471647+#ifdef CONFIG_FS_DAX16481648+void16491649+xfs_end_io_dax_write(16501650+ struct buffer_head *bh,16511651+ int uptodate)16521652+{16531653+ struct xfs_ioend *ioend = bh->b_private;16541654+ struct inode *inode = ioend->io_inode;16551655+ ssize_t size = ioend->io_size;16561656+16571657+ ASSERT(IS_DAX(ioend->io_inode));16581658+16591659+ /* if there was an error zeroing, then don't convert it */16601660+ if (!uptodate)16611661+ ioend->io_error = -EIO;16621662+16631663+ /*16641664+ * Trim update to EOF, so we don't extend EOF during unwritten extent16651665+ * conversion of partial EOF blocks.16661666+ */16671667+ spin_lock(&XFS_I(inode)->i_flags_lock);16681668+ if (ioend->io_offset + size > i_size_read(inode))16691669+ size = i_size_read(inode) - ioend->io_offset;16701670+ spin_unlock(&XFS_I(inode)->i_flags_lock);16711671+16721672+ __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);16731673+16741674+}16751675+#else16761676+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }16771677+#endif16781678+16791679+static inline ssize_t16801680+xfs_vm_do_dio(16811681+ struct inode *inode,16821682+ struct kiocb *iocb,16831683+ struct iov_iter *iter,16841684+ loff_t offset,16851685+ void (*endio)(struct kiocb *iocb,16861686+ loff_t offset,16871687+ ssize_t size,16881688+ void *private),16891689+ int flags)16901690+{16911691+ struct block_device *bdev;16921692+16931693+ if (IS_DAX(inode))16941694+ return dax_do_io(iocb, inode, iter, offset,16951695+ xfs_get_blocks_direct, endio, 0);16961696+16971697+ bdev = xfs_find_bdev_for_inode(inode);16981698+ return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,16991699+ xfs_get_blocks_direct, endio, NULL, flags);17001700+}17011701+15911702STATIC ssize_t15921703xfs_vm_direct_IO(15931704 struct kiocb *iocb,···16871614 loff_t offset)16881615{16891616 struct inode *inode = iocb->ki_filp->f_mapping->host;16901690- struct block_device *bdev = xfs_find_bdev_for_inode(inode);1691161716921692- if (iov_iter_rw(iter) == WRITE) {16931693- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,16941694- xfs_get_blocks_direct,16951695- xfs_end_io_direct_write, NULL,16961696- DIO_ASYNC_EXTEND);16971697- }16981698- return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,16991699- xfs_get_blocks_direct, NULL, NULL, 0);16181618+ if (iov_iter_rw(iter) == WRITE)16191619+ return xfs_vm_do_dio(inode, iocb, iter, offset,16201620+ xfs_end_io_direct_write, DIO_ASYNC_EXTEND);16211621+ return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);17001622}1701162317021624/*
+6-1
fs/xfs/xfs_aops.h
···5353} xfs_ioend_t;54545555extern const struct address_space_operations xfs_address_space_operations;5656-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);5656+5757+int xfs_get_blocks(struct inode *inode, sector_t offset,5858+ struct buffer_head *map_bh, int create);5959+int xfs_get_blocks_direct(struct inode *inode, sector_t offset,6060+ struct buffer_head *map_bh, int create);6161+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);57625863extern void xfs_count_page_state(struct page *, int *, int *);5964
+19-4
fs/xfs/xfs_bmap_util.c
···11331133 break;11341134 ASSERT(imap.br_blockcount >= 1);11351135 ASSERT(imap.br_startoff == offset_fsb);11361136+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);11371137+11381138+ if (imap.br_startblock == HOLESTARTBLOCK ||11391139+ imap.br_state == XFS_EXT_UNWRITTEN) {11401140+ /* skip the entire extent */11411141+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +11421142+ imap.br_blockcount) - 1;11431143+ continue;11441144+ }11451145+11361146 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;11371147 if (lastoffset > endoff)11381148 lastoffset = endoff;11391139- if (imap.br_startblock == HOLESTARTBLOCK)11491149+11501150+ /* DAX can just zero the backing device directly */11511151+ if (IS_DAX(VFS_I(ip))) {11521152+ error = dax_zero_page_range(VFS_I(ip), offset,11531153+ lastoffset - offset + 1,11541154+ xfs_get_blocks_direct);11551155+ if (error)11561156+ return error;11401157 continue;11411141- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);11421142- if (imap.br_state == XFS_EXT_UNWRITTEN)11431143- continue;11581158+ }1144115911451160 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?11461161 mp->m_rtdev_targp : mp->m_ddev_targp,
+99-65
fs/xfs/xfs_file.c
···7979}80808181/*8282- * xfs_iozero8282+ * xfs_iozero clears the specified range supplied via the page cache (except in8383+ * the DAX case). Writes through the page cache will allocate blocks over holes,8484+ * though the callers usually map the holes first and avoid them. If a block is8585+ * not completely zeroed, then it will be read from disk before being partially8686+ * zeroed.8387 *8484- * xfs_iozero clears the specified range of buffer supplied,8585- * and marks all the affected blocks as valid and modified. If8686- * an affected block is not allocated, it will be allocated. If8787- * an affected block is not completely overwritten, and is not8888- * valid before the operation, it will be read from disk before8989- * being partially zeroed.8888+ * In the DAX case, we can just directly write to the underlying pages. This8989+ * will not allocate blocks, but will avoid holes and unwritten extents and so9090+ * not do unnecessary work.9091 */9192int9293xfs_iozero(···9796{9897 struct page *page;9998 struct address_space *mapping;100100- int status;9999+ int status = 0;100100+101101102102 mapping = VFS_I(ip)->i_mapping;103103 do {···110108 if (bytes > count)111109 bytes = count;112110113113- status = pagecache_write_begin(NULL, mapping, pos, bytes,114114- AOP_FLAG_UNINTERRUPTIBLE,115115- &page, &fsdata);116116- if (status)117117- break;111111+ if (IS_DAX(VFS_I(ip))) {112112+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,113113+ xfs_get_blocks_direct);114114+ if (status)115115+ break;116116+ } else {117117+ status = pagecache_write_begin(NULL, mapping, pos, bytes,118118+ AOP_FLAG_UNINTERRUPTIBLE,119119+ &page, &fsdata);120120+ if (status)121121+ break;118122119119- zero_user(page, offset, bytes);123123+ zero_user(page, offset, bytes);120124121121- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,122122- page, fsdata);123123- WARN_ON(status <= 0); /* can't return less than zero! */125125+ status = pagecache_write_end(NULL, mapping, pos, bytes,126126+ bytes, page, fsdata);127127+ WARN_ON(status <= 0); /* can't return less than zero! */128128+ status = 0;129129+ }124130 pos += bytes;125131 count -= bytes;126126- status = 0;127132 } while (count);128133129134 return status;···293284 if (file->f_mode & FMODE_NOCMTIME)294285 ioflags |= XFS_IO_INVIS;295286296296- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {287287+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {297288 xfs_buftarg_t *target =298289 XFS_IS_REALTIME_INODE(ip) ?299290 mp->m_rtdev_targp : mp->m_ddev_targp;···387378388379 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);389380390390- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);381381+ /* for dax, we need to avoid the page cache */382382+ if (IS_DAX(VFS_I(ip)))383383+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);384384+ else385385+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);391386 if (ret > 0)392387 XFS_STATS_ADD(xs_read_bytes, ret);393388···685672 mp->m_rtdev_targp : mp->m_ddev_targp;686673687674 /* DIO must be aligned to device logical sector size */688688- if ((pos | count) & target->bt_logical_sectormask)675675+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))689676 return -EINVAL;690677691678 /* "unaligned" here means not aligned to a filesystem block */···771758out:772759 xfs_rw_iunlock(ip, iolock);773760774774- /* No fallback to buffered IO on errors for XFS. */775775- ASSERT(ret < 0 || ret == count);761761+ /*762762+ * No fallback to buffered IO on errors for XFS. DAX can result in763763+ * partial writes, but direct IO will either complete fully or fail.764764+ */765765+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));776766 return ret;777767}778768···858842 if (XFS_FORCED_SHUTDOWN(ip->i_mount))859843 return -EIO;860844861861- if (unlikely(iocb->ki_flags & IOCB_DIRECT))845845+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))862846 ret = xfs_file_dio_aio_write(iocb, from);863847 else864848 ret = xfs_file_buffered_aio_write(iocb, from);···10771061 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);1078106210791063 return xfs_readdir(ip, ctx, bufsize);10801080-}10811081-10821082-STATIC int10831083-xfs_file_mmap(10841084- struct file *filp,10851085- struct vm_area_struct *vma)10861086-{10871087- vma->vm_ops = &xfs_file_vm_ops;10881088-10891089- file_accessed(filp);10901090- return 0;10911064}1092106510931066/*···14591454 * ordering of:14601455 *14611456 * mmap_sem (MM)14621462- * i_mmap_lock (XFS - truncate serialisation)14631463- * page_lock (MM)14641464- * i_lock (XFS - extent map serialisation)14571457+ * sb_start_pagefault(vfs, freeze)14581458+ * i_mmap_lock (XFS - truncate serialisation)14591459+ * page_lock (MM)14601460+ * i_lock (XFS - extent map serialisation)14651461 */14661466-STATIC int14671467-xfs_filemap_fault(14681468- struct vm_area_struct *vma,14691469- struct vm_fault *vmf)14701470-{14711471- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);14721472- int error;14731473-14741474- trace_xfs_filemap_fault(ip);14751475-14761476- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);14771477- error = filemap_fault(vma, vmf);14781478- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);14791479-14801480- return error;14811481-}1482146214831463/*14841464 * mmap()d file has taken write protection fault and is being made writable. We···14761486 struct vm_area_struct *vma,14771487 struct vm_fault *vmf)14781488{14791479- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);14801480- int error;14891489+ struct inode *inode = file_inode(vma->vm_file);14901490+ int ret;1481149114821482- trace_xfs_filemap_page_mkwrite(ip);14921492+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));14931493+14941494+ sb_start_pagefault(inode->i_sb);14951495+ file_update_time(vma->vm_file);14961496+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);14971497+14981498+ if (IS_DAX(inode)) {14991499+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,15001500+ xfs_end_io_dax_write);15011501+ } else {15021502+ ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);15031503+ ret = block_page_mkwrite_return(ret);15041504+ }15051505+15061506+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);15071507+ sb_end_pagefault(inode->i_sb);15081508+15091509+ return ret;15101510+}15111511+15121512+STATIC int15131513+xfs_filemap_fault(15141514+ struct vm_area_struct *vma,15151515+ struct vm_fault *vmf)15161516+{15171517+ struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));15181518+ int ret;15191519+15201520+ trace_xfs_filemap_fault(ip);15211521+15221522+ /* DAX can shortcut the normal fault path on write faults! */15231523+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))15241524+ return xfs_filemap_page_mkwrite(vma, vmf);1483152514841526 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);14851485- error = block_page_mkwrite(vma, vmf, xfs_get_blocks);15271527+ ret = filemap_fault(vma, vmf);14861528 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);1487152914881488- return error;15301530+ return ret;15311531+}15321532+15331533+static const struct vm_operations_struct xfs_file_vm_ops = {15341534+ .fault = xfs_filemap_fault,15351535+ .map_pages = filemap_map_pages,15361536+ .page_mkwrite = xfs_filemap_page_mkwrite,15371537+};15381538+15391539+STATIC int15401540+xfs_file_mmap(15411541+ struct file *filp,15421542+ struct vm_area_struct *vma)15431543+{15441544+ file_accessed(filp);15451545+ vma->vm_ops = &xfs_file_vm_ops;15461546+ if (IS_DAX(file_inode(filp)))15471547+ vma->vm_flags |= VM_MIXEDMAP;15481548+ return 0;14891549}1490155014911551const struct file_operations xfs_file_operations = {···15651525 .compat_ioctl = xfs_file_compat_ioctl,15661526#endif15671527 .fsync = xfs_dir_fsync,15681568-};15691569-15701570-static const struct vm_operations_struct xfs_file_vm_ops = {15711571- .fault = xfs_filemap_fault,15721572- .map_pages = filemap_map_pages,15731573- .page_mkwrite = xfs_filemap_page_mkwrite,15741528};
+17-13
fs/xfs/xfs_iops.c
···851851 * to hope that the caller sees ENOMEM and retries the truncate852852 * operation.853853 */854854- error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);854854+ if (IS_DAX(inode))855855+ error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);856856+ else857857+ error = block_truncate_page(inode->i_mapping, newsize,858858+ xfs_get_blocks);855859 if (error)856860 return error;857861 truncate_setsize(inode, newsize);···11951191 struct inode *inode,11961192 struct xfs_inode *ip)11971193{11981198- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)11941194+ uint16_t flags = ip->i_d.di_flags;11951195+11961196+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |11971197+ S_NOATIME | S_DAX);11981198+11991199+ if (flags & XFS_DIFLAG_IMMUTABLE)11991200 inode->i_flags |= S_IMMUTABLE;12001200- else12011201- inode->i_flags &= ~S_IMMUTABLE;12021202- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)12011201+ if (flags & XFS_DIFLAG_APPEND)12031202 inode->i_flags |= S_APPEND;12041204- else12051205- inode->i_flags &= ~S_APPEND;12061206- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)12031203+ if (flags & XFS_DIFLAG_SYNC)12071204 inode->i_flags |= S_SYNC;12081208- else12091209- inode->i_flags &= ~S_SYNC;12101210- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)12051205+ if (flags & XFS_DIFLAG_NOATIME)12111206 inode->i_flags |= S_NOATIME;12121212- else12131213- inode->i_flags &= ~S_NOATIME;12071207+ /* XXX: Also needs an on-disk per inode flag! */12081208+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX)12091209+ inode->i_flags |= S_DAX;12141210}1215121112161212/*
+2
fs/xfs/xfs_mount.h
···181181 allocator */182182#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */183183184184+#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */185185+184186185187/*186188 * Default minimum read and write sizes.
+23-2
fs/xfs/xfs_super.c
···112112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */113113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */114114115115+#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */116116+115117/*116118 * Table driven mount option parser.117119 *···365363 mp->m_flags |= XFS_MOUNT_DISCARD;366364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {367365 mp->m_flags &= ~XFS_MOUNT_DISCARD;366366+#ifdef CONFIG_FS_DAX367367+ } else if (!strcmp(this_char, MNTOPT_DAX)) {368368+ mp->m_flags |= XFS_MOUNT_DAX;369369+#endif368370 } else {369371 xfs_warn(mp, "unknown mount option [%s].", this_char);370372 return -EINVAL;···458452}459453460454struct proc_xfs_info {461461- int flag;462462- char *str;455455+ uint64_t flag;456456+ char *str;463457};464458465459STATIC int···480474 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },481475 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },482476 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },477477+ { XFS_MOUNT_DAX, "," MNTOPT_DAX },483478 { 0, NULL }484479 };485480 static struct proc_xfs_info xfs_info_unset[] = {···15131506 /* version 5 superblocks support inode version counters. */15141507 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)15151508 sb->s_flags |= MS_I_VERSION;15091509+15101510+ if (mp->m_flags & XFS_MOUNT_DAX) {15111511+ xfs_warn(mp,15121512+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");15131513+ if (sb->s_blocksize != PAGE_SIZE) {15141514+ xfs_alert(mp,15151515+ "Filesystem block size invalid for DAX Turning DAX off.");15161516+ mp->m_flags &= ~XFS_MOUNT_DAX;15171517+ } else if (!sb->s_bdev->bd_disk->fops->direct_access) {15181518+ xfs_alert(mp,15191519+ "Block device does not support DAX Turning DAX off.");15201520+ mp->m_flags &= ~XFS_MOUNT_DAX;15211521+ }15221522+ }1516152315171524 error = xfs_mountfs(mp);15181525 if (error)