Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind

Now, if we suddenly remove a PMEM device (by calling unbind) that
contains an FSDAX filesystem while programs are still accessing data on
the device, e.g.:
```
$FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 &
# $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 &
echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind
```
the system can end up in an unacceptable state:
1. device has gone but mount point still exists, and umount will fail
with "target is busy"
2. programs will hang and cannot be killed
3. may crash with NULL pointer dereference

To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let the filesystem
know that we are going to remove the whole device, and make sure all
related processes are notified so that they can end gracefully.

This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1]. With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
on it to unmap all files in use, and notify processes who are using
those files.

Call trace:
trigger unbind
-> unbind_store()
-> ... (skip)
-> devres_release_all()
-> kill_dax()
-> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
-> xfs_dax_notify_failure()
`-> freeze_super() // freeze (kernel call)
`-> do xfs rmap
` -> mf_dax_kill_procs()
` -> collect_procs_fsdax() // all associated processes
` -> unmap_and_kill()
` -> invalidate_inode_pages2_range() // drop file's cache
`-> thaw_super() // thaw (both kernel & user call)

Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
event. Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
new dax mapping from being created. Do not shutdown filesystem directly
if configuration is not supported, or if failure range includes metadata
area. Make sure all files and processes (not only the current process)
are handled correctly. Also drop the cache of associated files before
pmem is removed.

[1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
[2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

authored by

Shiyang Ruan and committed by
Chandan Babu R
fa422b35 49391d13

+122 -11
+2 -1
drivers/dax/super.c
··· 326 326 return; 327 327 328 328 if (dax_dev->holder_data != NULL) 329 - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); 329 + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 330 + MF_MEM_PRE_REMOVE); 330 331 331 332 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 332 333 synchronize_srcu(&dax_srcu);
+102 -6
fs/xfs/xfs_notify_failure.c
··· 22 22 23 23 #include <linux/mm.h> 24 24 #include <linux/dax.h> 25 + #include <linux/fs.h> 25 26 26 27 struct xfs_failure_info { 27 28 xfs_agblock_t startblock; ··· 74 73 struct xfs_mount *mp = cur->bc_mp; 75 74 struct xfs_inode *ip; 76 75 struct xfs_failure_info *notify = data; 76 + struct address_space *mapping; 77 + pgoff_t pgoff; 78 + unsigned long pgcnt; 77 79 int error = 0; 78 80 79 81 if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || 80 82 (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { 83 + /* Continue the query because this isn't a failure. */ 84 + if (notify->mf_flags & MF_MEM_PRE_REMOVE) 85 + return 0; 81 86 notify->want_shutdown = true; 82 87 return 0; 83 88 } ··· 99 92 return 0; 100 93 } 101 94 102 - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, 103 - xfs_failure_pgoff(mp, rec, notify), 104 - xfs_failure_pgcnt(mp, rec, notify), 105 - notify->mf_flags); 95 + mapping = VFS_I(ip)->i_mapping; 96 + pgoff = xfs_failure_pgoff(mp, rec, notify); 97 + pgcnt = xfs_failure_pgcnt(mp, rec, notify); 98 + 99 + /* Continue the rmap query if the inode isn't a dax file. */ 100 + if (dax_mapping(mapping)) 101 + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, 102 + notify->mf_flags); 103 + 104 + /* Invalidate the cache in dax pages. 
*/ 105 + if (notify->mf_flags & MF_MEM_PRE_REMOVE) 106 + invalidate_inode_pages2_range(mapping, pgoff, 107 + pgoff + pgcnt - 1); 108 + 106 109 xfs_irele(ip); 107 110 return error; 111 + } 112 + 113 + static int 114 + xfs_dax_notify_failure_freeze( 115 + struct xfs_mount *mp) 116 + { 117 + struct super_block *sb = mp->m_super; 118 + int error; 119 + 120 + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); 121 + if (error) 122 + xfs_emerg(mp, "already frozen by kernel, err=%d", error); 123 + 124 + return error; 125 + } 126 + 127 + static void 128 + xfs_dax_notify_failure_thaw( 129 + struct xfs_mount *mp, 130 + bool kernel_frozen) 131 + { 132 + struct super_block *sb = mp->m_super; 133 + int error; 134 + 135 + if (kernel_frozen) { 136 + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); 137 + if (error) 138 + xfs_emerg(mp, "still frozen after notify failure, err=%d", 139 + error); 140 + } 141 + 142 + /* 143 + * Also thaw userspace call anyway because the device is about to be 144 + * removed immediately. 145 + */ 146 + thaw_super(sb, FREEZE_HOLDER_USERSPACE); 108 147 } 109 148 110 149 static int ··· 165 112 struct xfs_btree_cur *cur = NULL; 166 113 struct xfs_buf *agf_bp = NULL; 167 114 int error = 0; 115 + bool kernel_frozen = false; 168 116 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); 169 117 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); 170 118 xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, 171 119 daddr + bblen - 1); 172 120 xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); 173 121 122 + if (mf_flags & MF_MEM_PRE_REMOVE) { 123 + xfs_info(mp, "Device is about to be removed!"); 124 + /* 125 + * Freeze fs to prevent new mappings from being created. 126 + * - Keep going on if others already hold the kernel forzen. 127 + * - Keep going on if other errors too because this device is 128 + * starting to fail. 129 + * - If kernel frozen state is hold successfully here, thaw it 130 + * here as well at the end. 
131 + */ 132 + kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; 133 + } 134 + 174 135 error = xfs_trans_alloc_empty(mp, &tp); 175 136 if (error) 176 - return error; 137 + goto out; 177 138 178 139 for (; agno <= end_agno; agno++) { 179 140 struct xfs_rmap_irec ri_low = { }; ··· 232 165 } 233 166 234 167 xfs_trans_cancel(tp); 235 - if (error || notify.want_shutdown) { 168 + 169 + /* 170 + * Shutdown fs from a force umount in pre-remove case which won't fail, 171 + * so errors can be ignored. Otherwise, shutdown the filesystem with 172 + * CORRUPT flag if error occured or notify.want_shutdown was set during 173 + * RMAP querying. 174 + */ 175 + if (mf_flags & MF_MEM_PRE_REMOVE) 176 + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); 177 + else if (error || notify.want_shutdown) { 236 178 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 237 179 if (!error) 238 180 error = -EFSCORRUPTED; 239 181 } 182 + 183 + out: 184 + /* Thaw the fs if it has been frozen before. */ 185 + if (mf_flags & MF_MEM_PRE_REMOVE) 186 + xfs_dax_notify_failure_thaw(mp, kernel_frozen); 187 + 240 188 return error; 241 189 } 242 190 ··· 279 197 280 198 if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && 281 199 mp->m_logdev_targp != mp->m_ddev_targp) { 200 + /* 201 + * In the pre-remove case the failure notification is attempting 202 + * to trigger a force unmount. The expectation is that the 203 + * device is still present, but its removal is in progress and 204 + * can not be cancelled, proceed with accessing the log device. 205 + */ 206 + if (mf_flags & MF_MEM_PRE_REMOVE) 207 + return 0; 282 208 xfs_err(mp, "ondisk log corrupt, shutting down fs!"); 283 209 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 284 210 return -EFSCORRUPTED; ··· 299 209 300 210 ddev_start = mp->m_ddev_targp->bt_dax_part_off; 301 211 ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; 212 + 213 + /* Notify failure on the whole device. 
*/ 214 + if (offset == 0 && len == U64_MAX) { 215 + offset = ddev_start; 216 + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); 217 + } 302 218 303 219 /* Ignore the range out of filesystem area */ 304 220 if (offset + len - 1 < ddev_start)
+1
include/linux/mm.h
··· 3904 3904 MF_UNPOISON = 1 << 4, 3905 3905 MF_SW_SIMULATED = 1 << 5, 3906 3906 MF_NO_RETRY = 1 << 6, 3907 + MF_MEM_PRE_REMOVE = 1 << 7, 3907 3908 }; 3908 3909 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, 3909 3910 unsigned long count, int mf_flags);
+17 -4
mm/memory-failure.c
··· 679 679 */ 680 680 static void collect_procs_fsdax(struct page *page, 681 681 struct address_space *mapping, pgoff_t pgoff, 682 - struct list_head *to_kill) 682 + struct list_head *to_kill, bool pre_remove) 683 683 { 684 684 struct vm_area_struct *vma; 685 685 struct task_struct *tsk; ··· 687 687 i_mmap_lock_read(mapping); 688 688 rcu_read_lock(); 689 689 for_each_process(tsk) { 690 - struct task_struct *t = task_early_kill(tsk, true); 690 + struct task_struct *t = tsk; 691 691 692 + /* 693 + * Search for all tasks while MF_MEM_PRE_REMOVE is set, because 694 + * the current may not be the one accessing the fsdax page. 695 + * Otherwise, search for the current task. 696 + */ 697 + if (!pre_remove) 698 + t = task_early_kill(tsk, true); 692 699 if (!t) 693 700 continue; 694 701 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { ··· 1802 1795 dax_entry_t cookie; 1803 1796 struct page *page; 1804 1797 size_t end = index + count; 1798 + bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE; 1805 1799 1806 1800 mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; 1807 1801 ··· 1814 1806 if (!page) 1815 1807 goto unlock; 1816 1808 1817 - SetPageHWPoison(page); 1809 + if (!pre_remove) 1810 + SetPageHWPoison(page); 1818 1811 1819 - collect_procs_fsdax(page, mapping, index, &to_kill); 1812 + /* 1813 + * The pre_remove case is revoking access, the memory is still 1814 + * good and could theoretically be put back into service. 1815 + */ 1816 + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); 1820 1817 unmap_and_kill(&to_kill, page_to_pfn(page), mapping, 1821 1818 index, mf_flags); 1822 1819 unlock: