mm/memory-failure.c at v6.11-rc4

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / memory-failure.c
at v6.11-rc4 2843 lines 77 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2008, 2009 Intel Corporation
   4 * Authors: Andi Kleen, Fengguang Wu
   5 *
   6 * High level machine check handler. Handles pages reported by the
   7 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
   8 * failure.
   9 *
   10 * In addition there is a "soft offline" entry point that allows one to stop
   11 * using not-yet-corrupted-but-suspicious pages without killing anything.
  12 *
  13 * Handles page cache pages in various states.	The tricky part
  14 * here is that we can access any page asynchronously in respect to
  15 * other VM users, because memory failures could happen anytime and
  16 * anywhere. This could violate some of their assumptions. This is why
  17 * this code has to be extremely careful. Generally it tries to use
  18 * normal locking rules, as in get the standard locks, even if that means
  19 * the error handling takes potentially a long time.
  20 *
  21 * It can be very tempting to add handling for obscure cases here.
  22 * In general any code for handling new cases should only be added iff:
  23 * - You know how to test it.
  24 * - You have a test that can be added to mce-test
  25 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
  26 * - The case actually shows up as a frequent (top 10) page state in
  27 *   tools/mm/page-types when running a real workload.
  28 *
  29 * There are several operations here with exponential complexity because
  30 * of unsuitable VM data structures. For example the operation to map back
  31 * from RMAP chains to processes has to walk the complete process list and
  32 * has non linear complexity with the number. But since memory corruptions
  33 * are rare we hope to get away with this. This avoids impacting the core
  34 * VM.
  35 */
  36
  37#define pr_fmt(fmt) "Memory failure: " fmt
  38
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
  66
  67static int sysctl_memory_failure_early_kill __read_mostly;
  68
  69static int sysctl_memory_failure_recovery __read_mostly = 1;
  70
  71static int sysctl_enable_soft_offline __read_mostly = 1;
  72
  73atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  74
  75static bool hw_memory_failure __read_mostly = false;
  76
  77static DEFINE_MUTEX(mf_mutex);
  78
  79void num_poisoned_pages_inc(unsigned long pfn)
  80{
  81	atomic_long_inc(&num_poisoned_pages);
  82	memblk_nr_poison_inc(pfn);
  83}
  84
  85void num_poisoned_pages_sub(unsigned long pfn, long i)
  86{
  87	atomic_long_sub(i, &num_poisoned_pages);
  88	if (pfn != -1UL)
  89		memblk_nr_poison_sub(pfn, i);
  90}
  91
  92/**
  93 * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
  94 * @_name: name of the file in the per NUMA sysfs directory.
  95 */
  96#define MF_ATTR_RO(_name)					\
  97static ssize_t _name##_show(struct device *dev,			\
  98			    struct device_attribute *attr,	\
  99			    char *buf)				\
 100{								\
 101	struct memory_failure_stats *mf_stats =			\
 102		&NODE_DATA(dev->id)->mf_stats;			\
 103	return sprintf(buf, "%lu\n", mf_stats->_name);		\
 104}								\
 105static DEVICE_ATTR_RO(_name)
 106
 107MF_ATTR_RO(total);
 108MF_ATTR_RO(ignored);
 109MF_ATTR_RO(failed);
 110MF_ATTR_RO(delayed);
 111MF_ATTR_RO(recovered);
 112
 113static struct attribute *memory_failure_attr[] = {
 114	&dev_attr_total.attr,
 115	&dev_attr_ignored.attr,
 116	&dev_attr_failed.attr,
 117	&dev_attr_delayed.attr,
 118	&dev_attr_recovered.attr,
 119	NULL,
 120};
 121
 122const struct attribute_group memory_failure_attr_group = {
 123	.name = "memory_failure",
 124	.attrs = memory_failure_attr,
 125};
 126
 127static struct ctl_table memory_failure_table[] = {
 128	{
 129		.procname	= "memory_failure_early_kill",
 130		.data		= &sysctl_memory_failure_early_kill,
 131		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
 132		.mode		= 0644,
 133		.proc_handler	= proc_dointvec_minmax,
 134		.extra1		= SYSCTL_ZERO,
 135		.extra2		= SYSCTL_ONE,
 136	},
 137	{
 138		.procname	= "memory_failure_recovery",
 139		.data		= &sysctl_memory_failure_recovery,
 140		.maxlen		= sizeof(sysctl_memory_failure_recovery),
 141		.mode		= 0644,
 142		.proc_handler	= proc_dointvec_minmax,
 143		.extra1		= SYSCTL_ZERO,
 144		.extra2		= SYSCTL_ONE,
 145	},
 146	{
 147		.procname	= "enable_soft_offline",
 148		.data		= &sysctl_enable_soft_offline,
 149		.maxlen		= sizeof(sysctl_enable_soft_offline),
 150		.mode		= 0644,
 151		.proc_handler	= proc_dointvec_minmax,
 152		.extra1		= SYSCTL_ZERO,
 153		.extra2		= SYSCTL_ONE,
 154	}
 155};
 156
 157/*
 158 * Return values:
 159 *   1:   the page is dissolved (if needed) and taken off from buddy,
 160 *   0:   the page is dissolved (if needed) and not taken off from buddy,
 161 *   < 0: failed to dissolve.
 162 */
 163static int __page_handle_poison(struct page *page)
 164{
 165	int ret;
 166
 167	/*
 168	 * zone_pcp_disable() can't be used here. It will
 169	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
 170	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
 171	 * optimization is enabled. This will break current lock dependency
 172	 * chain and leads to deadlock.
 173	 * Disabling pcp before dissolving the page was a deterministic
 174	 * approach because we made sure that those pages cannot end up in any
 175	 * PCP list. Draining PCP lists expels those pages to the buddy system,
 176	 * but nothing guarantees that those pages do not get back to a PCP
 177	 * queue if we need to refill those.
 178	 */
 179	ret = dissolve_free_hugetlb_folio(page_folio(page));
 180	if (!ret) {
 181		drain_all_pages(page_zone(page));
 182		ret = take_page_off_buddy(page);
 183	}
 184
 185	return ret;
 186}
 187
 188static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
 189{
 190	if (hugepage_or_freepage) {
 191		/*
 192		 * Doing this check for free pages is also fine since
 193		 * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
 194		 */
 195		if (__page_handle_poison(page) <= 0)
 196			/*
 197			 * We could fail to take off the target page from buddy
 198			 * for example due to racy page allocation, but that's
 199			 * acceptable because soft-offlined page is not broken
 200			 * and if someone really want to use it, they should
 201			 * take it.
 202			 */
 203			return false;
 204	}
 205
 206	SetPageHWPoison(page);
 207	if (release)
 208		put_page(page);
 209	page_ref_inc(page);
 210	num_poisoned_pages_inc(page_to_pfn(page));
 211
 212	return true;
 213}
 214
 215#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
 216
 217u32 hwpoison_filter_enable = 0;
 218u32 hwpoison_filter_dev_major = ~0U;
 219u32 hwpoison_filter_dev_minor = ~0U;
 220u64 hwpoison_filter_flags_mask;
 221u64 hwpoison_filter_flags_value;
 222EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
 223EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
 224EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
 225EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
 226EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
 227
 228static int hwpoison_filter_dev(struct page *p)
 229{
 230	struct folio *folio = page_folio(p);
 231	struct address_space *mapping;
 232	dev_t dev;
 233
 234	if (hwpoison_filter_dev_major == ~0U &&
 235	    hwpoison_filter_dev_minor == ~0U)
 236		return 0;
 237
 238	mapping = folio_mapping(folio);
 239	if (mapping == NULL || mapping->host == NULL)
 240		return -EINVAL;
 241
 242	dev = mapping->host->i_sb->s_dev;
 243	if (hwpoison_filter_dev_major != ~0U &&
 244	    hwpoison_filter_dev_major != MAJOR(dev))
 245		return -EINVAL;
 246	if (hwpoison_filter_dev_minor != ~0U &&
 247	    hwpoison_filter_dev_minor != MINOR(dev))
 248		return -EINVAL;
 249
 250	return 0;
 251}
 252
 253static int hwpoison_filter_flags(struct page *p)
 254{
 255	if (!hwpoison_filter_flags_mask)
 256		return 0;
 257
 258	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
 259				    hwpoison_filter_flags_value)
 260		return 0;
 261	else
 262		return -EINVAL;
 263}
 264
 265/*
 266 * This allows stress tests to limit test scope to a collection of tasks
 267 * by putting them under some memcg. This prevents killing unrelated/important
 268 * processes such as /sbin/init. Note that the target task may share clean
 269 * pages with init (eg. libc text), which is harmless. If the target task
 270 * share _dirty_ pages with another task B, the test scheme must make sure B
 271 * is also included in the memcg. At last, due to race conditions this filter
 272 * can only guarantee that the page either belongs to the memcg tasks, or is
 273 * a freed page.
 274 */
 275#ifdef CONFIG_MEMCG
 276u64 hwpoison_filter_memcg;
 277EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 278static int hwpoison_filter_task(struct page *p)
 279{
 280	if (!hwpoison_filter_memcg)
 281		return 0;
 282
 283	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
 284		return -EINVAL;
 285
 286	return 0;
 287}
 288#else
 289static int hwpoison_filter_task(struct page *p) { return 0; }
 290#endif
 291
 292int hwpoison_filter(struct page *p)
 293{
 294	if (!hwpoison_filter_enable)
 295		return 0;
 296
 297	if (hwpoison_filter_dev(p))
 298		return -EINVAL;
 299
 300	if (hwpoison_filter_flags(p))
 301		return -EINVAL;
 302
 303	if (hwpoison_filter_task(p))
 304		return -EINVAL;
 305
 306	return 0;
 307}
 308EXPORT_SYMBOL_GPL(hwpoison_filter);
 309#else
 310int hwpoison_filter(struct page *p)
 311{
 312	return 0;
 313}
 314#endif
 315
 316/*
 317 * Kill all processes that have a poisoned page mapped and then isolate
 318 * the page.
 319 *
 320 * General strategy:
 321 * Find all processes having the page mapped and kill them.
 322 * But we keep a page reference around so that the page is not
 323 * actually freed yet.
 324 * Then stash the page away
 325 *
 326 * There's no convenient way to get back to mapped processes
 327 * from the VMAs. So do a brute-force search over all
 328 * running processes.
 329 *
 330 * Remember that machine checks are not common (or rather
 331 * if they are common you have other problems), so this shouldn't
 332 * be a performance issue.
 333 *
 334 * Also there are some races possible while we get from the
 335 * error detection to actually handle it.
 336 */
 337
 338struct to_kill {
 339	struct list_head nd;
 340	struct task_struct *tsk;
 341	unsigned long addr;
 342	short size_shift;
 343};
 344
 345/*
 346 * Send all the processes who have the page mapped a signal.
 347 * ``action optional'' if they are not immediately affected by the error
 348 * ``action required'' if error happened in current execution context
 349 */
 350static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 351{
 352	struct task_struct *t = tk->tsk;
 353	short addr_lsb = tk->size_shift;
 354	int ret = 0;
 355
 356	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
 357			pfn, t->comm, task_pid_nr(t));
 358
 359	if ((flags & MF_ACTION_REQUIRED) && (t == current))
 360		ret = force_sig_mceerr(BUS_MCEERR_AR,
 361				 (void __user *)tk->addr, addr_lsb);
 362	else
 363		/*
 364		 * Signal other processes sharing the page if they have
 365		 * PF_MCE_EARLY set.
 366		 * Don't use force here, it's convenient if the signal
 367		 * can be temporarily blocked.
 368		 */
 369		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
 370				      addr_lsb, t);
 371	if (ret < 0)
 372		pr_info("Error sending signal to %s:%d: %d\n",
 373			t->comm, task_pid_nr(t), ret);
 374	return ret;
 375}
 376
 377/*
 378 * Unknown page type encountered. Try to check whether it can turn PageLRU by
 379 * lru_add_drain_all.
 380 */
 381void shake_folio(struct folio *folio)
 382{
 383	if (folio_test_hugetlb(folio))
 384		return;
 385	/*
 386	 * TODO: Could shrink slab caches here if a lightweight range-based
 387	 * shrinker will be available.
 388	 */
 389	if (folio_test_slab(folio))
 390		return;
 391
 392	lru_add_drain_all();
 393}
 394EXPORT_SYMBOL_GPL(shake_folio);
 395
 396static void shake_page(struct page *page)
 397{
 398	shake_folio(page_folio(page));
 399}
 400
 401static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 402		unsigned long address)
 403{
 404	unsigned long ret = 0;
 405	pgd_t *pgd;
 406	p4d_t *p4d;
 407	pud_t *pud;
 408	pmd_t *pmd;
 409	pte_t *pte;
 410	pte_t ptent;
 411
 412	VM_BUG_ON_VMA(address == -EFAULT, vma);
 413	pgd = pgd_offset(vma->vm_mm, address);
 414	if (!pgd_present(*pgd))
 415		return 0;
 416	p4d = p4d_offset(pgd, address);
 417	if (!p4d_present(*p4d))
 418		return 0;
 419	pud = pud_offset(p4d, address);
 420	if (!pud_present(*pud))
 421		return 0;
 422	if (pud_devmap(*pud))
 423		return PUD_SHIFT;
 424	pmd = pmd_offset(pud, address);
 425	if (!pmd_present(*pmd))
 426		return 0;
 427	if (pmd_devmap(*pmd))
 428		return PMD_SHIFT;
 429	pte = pte_offset_map(pmd, address);
 430	if (!pte)
 431		return 0;
 432	ptent = ptep_get(pte);
 433	if (pte_present(ptent) && pte_devmap(ptent))
 434		ret = PAGE_SHIFT;
 435	pte_unmap(pte);
 436	return ret;
 437}
 438
 439/*
 440 * Failure handling: if we can't find or can't kill a process there's
 441 * not much we can do.	We just print a message and ignore otherwise.
 442 */
 443
 444/*
 445 * Schedule a process for later kill.
 446 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 447 */
 448static void __add_to_kill(struct task_struct *tsk, struct page *p,
 449			  struct vm_area_struct *vma, struct list_head *to_kill,
 450			  unsigned long addr)
 451{
 452	struct to_kill *tk;
 453
 454	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 455	if (!tk) {
 456		pr_err("Out of memory while machine check handling\n");
 457		return;
 458	}
 459
 460	tk->addr = addr;
 461	if (is_zone_device_page(p))
 462		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
 463	else
 464		tk->size_shift = page_shift(compound_head(p));
 465
 466	/*
 467	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
 468	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
 469	 * so "tk->size_shift == 0" effectively checks no mapping on
 470	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
 471	 * to a process' address space, it's possible not all N VMAs
 472	 * contain mappings for the page, but at least one VMA does.
 473	 * Only deliver SIGBUS with payload derived from the VMA that
 474	 * has a mapping for the page.
 475	 */
 476	if (tk->addr == -EFAULT) {
 477		pr_info("Unable to find user space address %lx in %s\n",
 478			page_to_pfn(p), tsk->comm);
 479	} else if (tk->size_shift == 0) {
 480		kfree(tk);
 481		return;
 482	}
 483
 484	get_task_struct(tsk);
 485	tk->tsk = tsk;
 486	list_add_tail(&tk->nd, to_kill);
 487}
 488
 489static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
 490		struct vm_area_struct *vma, struct list_head *to_kill,
 491		unsigned long addr)
 492{
 493	if (addr == -EFAULT)
 494		return;
 495	__add_to_kill(tsk, p, vma, to_kill, addr);
 496}
 497
 498#ifdef CONFIG_KSM
 499static bool task_in_to_kill_list(struct list_head *to_kill,
 500				 struct task_struct *tsk)
 501{
 502	struct to_kill *tk, *next;
 503
 504	list_for_each_entry_safe(tk, next, to_kill, nd) {
 505		if (tk->tsk == tsk)
 506			return true;
 507	}
 508
 509	return false;
 510}
 511
 512void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 513		     struct vm_area_struct *vma, struct list_head *to_kill,
 514		     unsigned long addr)
 515{
 516	if (!task_in_to_kill_list(to_kill, tsk))
 517		__add_to_kill(tsk, p, vma, to_kill, addr);
 518}
 519#endif
 520/*
 521 * Kill the processes that have been collected earlier.
 522 *
 523 * Only do anything when FORCEKILL is set, otherwise just free the
 524 * list (this is used for clean pages which do not need killing)
 525 */
 526static void kill_procs(struct list_head *to_kill, int forcekill,
 527		unsigned long pfn, int flags)
 528{
 529	struct to_kill *tk, *next;
 530
 531	list_for_each_entry_safe(tk, next, to_kill, nd) {
 532		if (forcekill) {
 533			if (tk->addr == -EFAULT) {
 534				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 535				       pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
 536				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
 537						 tk->tsk, PIDTYPE_PID);
 538			}
 539
 540			/*
 541			 * In theory the process could have mapped
 542			 * something else on the address in-between. We could
 543			 * check for that, but we need to tell the
 544			 * process anyways.
 545			 */
 546			else if (kill_proc(tk, pfn, flags) < 0)
 547				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
 548				       pfn, tk->tsk->comm, task_pid_nr(tk->tsk));
 549		}
 550		list_del(&tk->nd);
 551		put_task_struct(tk->tsk);
 552		kfree(tk);
 553	}
 554}
 555
 556/*
 557 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 558 * on behalf of the thread group. Return task_struct of the (first found)
 559 * dedicated thread if found, and return NULL otherwise.
 560 *
 561 * We already hold rcu lock in the caller, so we don't have to call
 562 * rcu_read_lock/unlock() in this function.
 563 */
 564static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 565{
 566	struct task_struct *t;
 567
 568	for_each_thread(tsk, t) {
 569		if (t->flags & PF_MCE_PROCESS) {
 570			if (t->flags & PF_MCE_EARLY)
 571				return t;
 572		} else {
 573			if (sysctl_memory_failure_early_kill)
 574				return t;
 575		}
 576	}
 577	return NULL;
 578}
 579
 580/*
 581 * Determine whether a given process is "early kill" process which expects
 582 * to be signaled when some page under the process is hwpoisoned.
 583 * Return task_struct of the dedicated thread (main thread unless explicitly
 584 * specified) if the process is "early kill" and otherwise returns NULL.
 585 *
 586 * Note that the above is true for Action Optional case. For Action Required
 587 * case, it's only meaningful to the current thread which need to be signaled
 588 * with SIGBUS, this error is Action Optional for other non current
 589 * processes sharing the same error page,if the process is "early kill", the
 590 * task_struct of the dedicated thread will also be returned.
 591 */
 592struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
 593{
 594	if (!tsk->mm)
 595		return NULL;
 596	/*
 597	 * Comparing ->mm here because current task might represent
 598	 * a subthread, while tsk always points to the main thread.
 599	 */
 600	if (force_early && tsk->mm == current->mm)
 601		return current;
 602
 603	return find_early_kill_thread(tsk);
 604}
 605
 606/*
 607 * Collect processes when the error hit an anonymous page.
 608 */
 609static void collect_procs_anon(struct folio *folio, struct page *page,
 610		struct list_head *to_kill, int force_early)
 611{
 612	struct task_struct *tsk;
 613	struct anon_vma *av;
 614	pgoff_t pgoff;
 615
 616	av = folio_lock_anon_vma_read(folio, NULL);
 617	if (av == NULL)	/* Not actually mapped anymore */
 618		return;
 619
 620	pgoff = page_to_pgoff(page);
 621	rcu_read_lock();
 622	for_each_process(tsk) {
 623		struct vm_area_struct *vma;
 624		struct anon_vma_chain *vmac;
 625		struct task_struct *t = task_early_kill(tsk, force_early);
 626		unsigned long addr;
 627
 628		if (!t)
 629			continue;
 630		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 631					       pgoff, pgoff) {
 632			vma = vmac->vma;
 633			if (vma->vm_mm != t->mm)
 634				continue;
 635			addr = page_mapped_in_vma(page, vma);
 636			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 637		}
 638	}
 639	rcu_read_unlock();
 640	anon_vma_unlock_read(av);
 641}
 642
 643/*
 644 * Collect processes when the error hit a file mapped page.
 645 */
 646static void collect_procs_file(struct folio *folio, struct page *page,
 647		struct list_head *to_kill, int force_early)
 648{
 649	struct vm_area_struct *vma;
 650	struct task_struct *tsk;
 651	struct address_space *mapping = folio->mapping;
 652	pgoff_t pgoff;
 653
 654	i_mmap_lock_read(mapping);
 655	rcu_read_lock();
 656	pgoff = page_to_pgoff(page);
 657	for_each_process(tsk) {
 658		struct task_struct *t = task_early_kill(tsk, force_early);
 659		unsigned long addr;
 660
 661		if (!t)
 662			continue;
 663		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 664				      pgoff) {
 665			/*
 666			 * Send early kill signal to tasks where a vma covers
 667			 * the page but the corrupted page is not necessarily
 668			 * mapped in its pte.
 669			 * Assume applications who requested early kill want
 670			 * to be informed of all such data corruptions.
 671			 */
 672			if (vma->vm_mm != t->mm)
 673				continue;
 674			addr = page_address_in_vma(page, vma);
 675			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 676		}
 677	}
 678	rcu_read_unlock();
 679	i_mmap_unlock_read(mapping);
 680}
 681
 682#ifdef CONFIG_FS_DAX
 683static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
 684			      struct vm_area_struct *vma,
 685			      struct list_head *to_kill, pgoff_t pgoff)
 686{
 687	unsigned long addr = vma_address(vma, pgoff, 1);
 688	__add_to_kill(tsk, p, vma, to_kill, addr);
 689}
 690
 691/*
 692 * Collect processes when the error hit a fsdax page.
 693 */
 694static void collect_procs_fsdax(struct page *page,
 695		struct address_space *mapping, pgoff_t pgoff,
 696		struct list_head *to_kill, bool pre_remove)
 697{
 698	struct vm_area_struct *vma;
 699	struct task_struct *tsk;
 700
 701	i_mmap_lock_read(mapping);
 702	rcu_read_lock();
 703	for_each_process(tsk) {
 704		struct task_struct *t = tsk;
 705
 706		/*
 707		 * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
 708		 * the current may not be the one accessing the fsdax page.
 709		 * Otherwise, search for the current task.
 710		 */
 711		if (!pre_remove)
 712			t = task_early_kill(tsk, true);
 713		if (!t)
 714			continue;
 715		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 716			if (vma->vm_mm == t->mm)
 717				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 718		}
 719	}
 720	rcu_read_unlock();
 721	i_mmap_unlock_read(mapping);
 722}
 723#endif /* CONFIG_FS_DAX */
 724
 725/*
 726 * Collect the processes who have the corrupted page mapped to kill.
 727 */
 728static void collect_procs(struct folio *folio, struct page *page,
 729		struct list_head *tokill, int force_early)
 730{
 731	if (!folio->mapping)
 732		return;
 733	if (unlikely(folio_test_ksm(folio)))
 734		collect_procs_ksm(folio, page, tokill, force_early);
 735	else if (folio_test_anon(folio))
 736		collect_procs_anon(folio, page, tokill, force_early);
 737	else
 738		collect_procs_file(folio, page, tokill, force_early);
 739}
 740
 741struct hwpoison_walk {
 742	struct to_kill tk;
 743	unsigned long pfn;
 744	int flags;
 745};
 746
 747static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
 748{
 749	tk->addr = addr;
 750	tk->size_shift = shift;
 751}
 752
 753static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
 754				unsigned long poisoned_pfn, struct to_kill *tk)
 755{
 756	unsigned long pfn = 0;
 757
 758	if (pte_present(pte)) {
 759		pfn = pte_pfn(pte);
 760	} else {
 761		swp_entry_t swp = pte_to_swp_entry(pte);
 762
 763		if (is_hwpoison_entry(swp))
 764			pfn = swp_offset_pfn(swp);
 765	}
 766
 767	if (!pfn || pfn != poisoned_pfn)
 768		return 0;
 769
 770	set_to_kill(tk, addr, shift);
 771	return 1;
 772}
 773
 774#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 775static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 776				      struct hwpoison_walk *hwp)
 777{
 778	pmd_t pmd = *pmdp;
 779	unsigned long pfn;
 780	unsigned long hwpoison_vaddr;
 781
 782	if (!pmd_present(pmd))
 783		return 0;
 784	pfn = pmd_pfn(pmd);
 785	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
 786		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
 787		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
 788		return 1;
 789	}
 790	return 0;
 791}
 792#else
 793static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 794				      struct hwpoison_walk *hwp)
 795{
 796	return 0;
 797}
 798#endif
 799
 800static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
 801			      unsigned long end, struct mm_walk *walk)
 802{
 803	struct hwpoison_walk *hwp = walk->private;
 804	int ret = 0;
 805	pte_t *ptep, *mapped_pte;
 806	spinlock_t *ptl;
 807
 808	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
 809	if (ptl) {
 810		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
 811		spin_unlock(ptl);
 812		goto out;
 813	}
 814
 815	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
 816						addr, &ptl);
 817	if (!ptep)
 818		goto out;
 819
 820	for (; addr != end; ptep++, addr += PAGE_SIZE) {
 821		ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
 822					     hwp->pfn, &hwp->tk);
 823		if (ret == 1)
 824			break;
 825	}
 826	pte_unmap_unlock(mapped_pte, ptl);
 827out:
 828	cond_resched();
 829	return ret;
 830}
 831
 832#ifdef CONFIG_HUGETLB_PAGE
 833static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 834			    unsigned long addr, unsigned long end,
 835			    struct mm_walk *walk)
 836{
 837	struct hwpoison_walk *hwp = walk->private;
 838	pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
 839	struct hstate *h = hstate_vma(walk->vma);
 840
 841	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
 842				      hwp->pfn, &hwp->tk);
 843}
 844#else
 845#define hwpoison_hugetlb_range	NULL
 846#endif
 847
 848static const struct mm_walk_ops hwpoison_walk_ops = {
 849	.pmd_entry = hwpoison_pte_range,
 850	.hugetlb_entry = hwpoison_hugetlb_range,
 851	.walk_lock = PGWALK_RDLOCK,
 852};
 853
 854/*
 855 * Sends SIGBUS to the current process with error info.
 856 *
 857 * This function is intended to handle "Action Required" MCEs on already
 858 * hardware poisoned pages. They could happen, for example, when
 859 * memory_failure() failed to unmap the error page at the first call, or
 860 * when multiple local machine checks happened on different CPUs.
 861 *
 862 * MCE handler currently has no easy access to the error virtual address,
 863 * so this function walks page table to find it. The returned virtual address
 864 * is proper in most cases, but it could be wrong when the application
 865 * process has multiple entries mapping the error page.
 866 */
 867static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 868				  int flags)
 869{
 870	int ret;
 871	struct hwpoison_walk priv = {
 872		.pfn = pfn,
 873	};
 874	priv.tk.tsk = p;
 875
 876	if (!p->mm)
 877		return -EFAULT;
 878
 879	mmap_read_lock(p->mm);
 880	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
 881			      (void *)&priv);
 882	if (ret == 1 && priv.tk.addr)
 883		kill_proc(&priv.tk, pfn, flags);
 884	else
 885		ret = 0;
 886	mmap_read_unlock(p->mm);
 887	return ret > 0 ? -EHWPOISON : -EFAULT;
 888}
 889
 890/*
 891 * MF_IGNORED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 892 * But it could not do more to isolate the page from being accessed again,
 893 * nor does it kill the process. This is extremely rare and one of the
 894 * potential causes is that the page state has been changed due to
 895 * underlying race condition. This is the most severe outcomes.
 896 *
 897 * MF_FAILED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 898 * It should have killed the process, but it can't isolate the page,
 899 * due to conditions such as extra pin, unmap failure, etc. Accessing
 900 * the page again may trigger another MCE and the process will be killed
 901 * by the m-f() handler immediately.
 902 *
 903 * MF_DELAYED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 904 * The page is unmapped, and is removed from the LRU or file mapping.
 905 * An attempt to access the page again will trigger page fault and the
 906 * PF handler will kill the process.
 907 *
 908 * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 909 * The page has been completely isolated, that is, unmapped, taken out of
 910 * the buddy system, or hole-punnched out of the file mapping.
 911 */
 912static const char *action_name[] = {
 913	[MF_IGNORED] = "Ignored",
 914	[MF_FAILED] = "Failed",
 915	[MF_DELAYED] = "Delayed",
 916	[MF_RECOVERED] = "Recovered",
 917};
 918
 919static const char * const action_page_types[] = {
 920	[MF_MSG_KERNEL]			= "reserved kernel page",
 921	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
 922	[MF_MSG_HUGE]			= "huge page",
 923	[MF_MSG_FREE_HUGE]		= "free huge page",
 924	[MF_MSG_GET_HWPOISON]		= "get hwpoison page",
 925	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
 926	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
 927	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
 928	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
 929	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
 930	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
 931	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
 932	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
 933	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
 934	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
 935	[MF_MSG_BUDDY]			= "free buddy page",
 936	[MF_MSG_DAX]			= "dax page",
 937	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
 938	[MF_MSG_ALREADY_POISONED]	= "already poisoned",
 939	[MF_MSG_UNKNOWN]		= "unknown page",
 940};
 941
 942/*
 943 * XXX: It is possible that a page is isolated from LRU cache,
 944 * and then kept in swap cache or failed to remove from page cache.
 945 * The page count will stop it from being freed by unpoison.
 946 * Stress tests should be aware of this memory leak problem.
 947 */
 948static int delete_from_lru_cache(struct folio *folio)
 949{
 950	if (folio_isolate_lru(folio)) {
 951		/*
 952		 * Clear sensible page flags, so that the buddy system won't
 953		 * complain when the folio is unpoison-and-freed.
 954		 */
 955		folio_clear_active(folio);
 956		folio_clear_unevictable(folio);
 957
 958		/*
 959		 * Poisoned page might never drop its ref count to 0 so we have
 960		 * to uncharge it manually from its memcg.
 961		 */
 962		mem_cgroup_uncharge(folio);
 963
 964		/*
 965		 * drop the refcount elevated by folio_isolate_lru()
 966		 */
 967		folio_put(folio);
 968		return 0;
 969	}
 970	return -EIO;
 971}
 972
 973static int truncate_error_folio(struct folio *folio, unsigned long pfn,
 974				struct address_space *mapping)
 975{
 976	int ret = MF_FAILED;
 977
 978	if (mapping->a_ops->error_remove_folio) {
 979		int err = mapping->a_ops->error_remove_folio(mapping, folio);
 980
 981		if (err != 0)
 982			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
 983		else if (!filemap_release_folio(folio, GFP_NOIO))
 984			pr_info("%#lx: failed to release buffers\n", pfn);
 985		else
 986			ret = MF_RECOVERED;
 987	} else {
 988		/*
 989		 * If the file system doesn't support it just invalidate
 990		 * This fails on dirty or anything with private pages
 991		 */
 992		if (mapping_evict_folio(mapping, folio))
 993			ret = MF_RECOVERED;
 994		else
 995			pr_info("%#lx: Failed to invalidate\n",	pfn);
 996	}
 997
 998	return ret;
 999}
1000
/*
 * One row of the page-state table: a page matches the row when
 * (flags & mask) == res, and the row's ->action() is then run on it.
 */
struct page_state {
	/* Page flag bits that are examined for this state. */
	unsigned long mask;
	/* Value the masked flags must equal for this state to match. */
	unsigned long res;
	/* Page type reported for this state (index into action_page_types[]). */
	enum mf_action_page_type type;

	/* Callback ->action() has to unlock the relevant page inside it. */
	int (*action)(struct page_state *ps, struct page *p);
};
1009
1010/*
1011 * Return true if page is still referenced by others, otherwise return
1012 * false.
1013 *
1014 * The extra_pins is true when one extra refcount is expected.
1015 */
1016static bool has_extra_refcount(struct page_state *ps, struct page *p,
1017			       bool extra_pins)
1018{
1019	int count = page_count(p) - 1;
1020
1021	if (extra_pins)
1022		count -= folio_nr_pages(page_folio(p));
1023
1024	if (count > 0) {
1025		pr_err("%#lx: %s still referenced by %d users\n",
1026		       page_to_pfn(p), action_page_types[ps->type], count);
1027		return true;
1028	}
1029
1030	return false;
1031}
1032
1033/*
1034 * Error hit kernel page.
1035 * Do nothing, try to be lucky and not touch this instead. For a few cases we
1036 * could be more sophisticated.
1037 */
1038static int me_kernel(struct page_state *ps, struct page *p)
1039{
1040	unlock_page(p);
1041	return MF_IGNORED;
1042}
1043
1044/*
1045 * Page in unknown state. Do nothing.
1046 * This is a catch-all in case we fail to make sense of the page state.
1047 */
1048static int me_unknown(struct page_state *ps, struct page *p)
1049{
1050	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
1051	unlock_page(p);
1052	return MF_IGNORED;
1053}
1054
1055/*
1056 * Clean (or cleaned) page cache page.
1057 */
1058static int me_pagecache_clean(struct page_state *ps, struct page *p)
1059{
1060	struct folio *folio = page_folio(p);
1061	int ret;
1062	struct address_space *mapping;
1063	bool extra_pins;
1064
1065	delete_from_lru_cache(folio);
1066
1067	/*
1068	 * For anonymous folios the only reference left
1069	 * should be the one m_f() holds.
1070	 */
1071	if (folio_test_anon(folio)) {
1072		ret = MF_RECOVERED;
1073		goto out;
1074	}
1075
1076	/*
1077	 * Now truncate the page in the page cache. This is really
1078	 * more like a "temporary hole punch"
1079	 * Don't do this for block devices when someone else
1080	 * has a reference, because it could be file system metadata
1081	 * and that's not safe to truncate.
1082	 */
1083	mapping = folio_mapping(folio);
1084	if (!mapping) {
1085		/* Folio has been torn down in the meantime */
1086		ret = MF_FAILED;
1087		goto out;
1088	}
1089
1090	/*
1091	 * The shmem page is kept in page cache instead of truncating
1092	 * so is expected to have an extra refcount after error-handling.
1093	 */
1094	extra_pins = shmem_mapping(mapping);
1095
1096	/*
1097	 * Truncation is a bit tricky. Enable it per file system for now.
1098	 *
1099	 * Open: to take i_rwsem or not for this? Right now we don't.
1100	 */
1101	ret = truncate_error_folio(folio, page_to_pfn(p), mapping);
1102	if (has_extra_refcount(ps, p, extra_pins))
1103		ret = MF_FAILED;
1104
1105out:
1106	folio_unlock(folio);
1107
1108	return ret;
1109}
1110
1111/*
1112 * Dirty pagecache page
1113 * Issues: when the error hit a hole page the error is not properly
1114 * propagated.
1115 */
1116static int me_pagecache_dirty(struct page_state *ps, struct page *p)
1117{
1118	struct folio *folio = page_folio(p);
1119	struct address_space *mapping = folio_mapping(folio);
1120
1121	/* TBD: print more information about the file. */
1122	if (mapping) {
1123		/*
1124		 * IO error will be reported by write(), fsync(), etc.
1125		 * who check the mapping.
1126		 * This way the application knows that something went
1127		 * wrong with its dirty file data.
1128		 */
1129		mapping_set_error(mapping, -EIO);
1130	}
1131
1132	return me_pagecache_clean(ps, p);
1133}
1134
1135/*
1136 * Clean and dirty swap cache.
1137 *
1138 * Dirty swap cache page is tricky to handle. The page could live both in page
1139 * table and swap cache(ie. page is freshly swapped in). So it could be
1140 * referenced concurrently by 2 types of PTEs:
1141 * normal PTEs and swap PTEs. We try to handle them consistently by calling
1142 * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
1143 * and then
1144 *      - clear dirty bit to prevent IO
1145 *      - remove from LRU
1146 *      - but keep in the swap cache, so that when we return to it on
1147 *        a later page fault, we know the application is accessing
1148 *        corrupted data and shall be killed (we installed simple
1149 *        interception code in do_swap_page to catch it).
1150 *
1151 * Clean swap cache pages can be directly isolated. A later page fault will
1152 * bring in the known good data from disk.
1153 */
1154static int me_swapcache_dirty(struct page_state *ps, struct page *p)
1155{
1156	struct folio *folio = page_folio(p);
1157	int ret;
1158	bool extra_pins = false;
1159
1160	folio_clear_dirty(folio);
1161	/* Trigger EIO in shmem: */
1162	folio_clear_uptodate(folio);
1163
1164	ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED;
1165	folio_unlock(folio);
1166
1167	if (ret == MF_DELAYED)
1168		extra_pins = true;
1169
1170	if (has_extra_refcount(ps, p, extra_pins))
1171		ret = MF_FAILED;
1172
1173	return ret;
1174}
1175
1176static int me_swapcache_clean(struct page_state *ps, struct page *p)
1177{
1178	struct folio *folio = page_folio(p);
1179	int ret;
1180
1181	delete_from_swap_cache(folio);
1182
1183	ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
1184	folio_unlock(folio);
1185
1186	if (has_extra_refcount(ps, p, false))
1187		ret = MF_FAILED;
1188
1189	return ret;
1190}
1191
1192/*
1193 * Huge pages. Needs work.
1194 * Issues:
1195 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
1196 *   To narrow down kill region to one page, we need to break up pmd.
1197 */
1198static int me_huge_page(struct page_state *ps, struct page *p)
1199{
1200	struct folio *folio = page_folio(p);
1201	int res;
1202	struct address_space *mapping;
1203	bool extra_pins = false;
1204
1205	mapping = folio_mapping(folio);
1206	if (mapping) {
1207		res = truncate_error_folio(folio, page_to_pfn(p), mapping);
1208		/* The page is kept in page cache. */
1209		extra_pins = true;
1210		folio_unlock(folio);
1211	} else {
1212		folio_unlock(folio);
1213		/*
1214		 * migration entry prevents later access on error hugepage,
1215		 * so we can free and dissolve it into buddy to save healthy
1216		 * subpages.
1217		 */
1218		folio_put(folio);
1219		if (__page_handle_poison(p) > 0) {
1220			page_ref_inc(p);
1221			res = MF_RECOVERED;
1222		} else {
1223			res = MF_FAILED;
1224		}
1225	}
1226
1227	if (has_extra_refcount(ps, p, extra_pins))
1228		res = MF_FAILED;
1229
1230	return res;
1231}
1232
1233/*
1234 * Various page states we can handle.
1235 *
1236 * A page state is defined by its current page->flags bits.
1237 * The table matches them in order and calls the right handler.
1238 *
1239 * This is quite tricky because we can access page at any time
1240 * in its live cycle, so all accesses have to be extremely careful.
1241 *
1242 * This is not complete. More states could be added.
1243 * For any missing state don't attempt recovery.
1244 */
1245
1246#define dirty		(1UL << PG_dirty)
1247#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
1248#define unevict		(1UL << PG_unevictable)
1249#define mlock		(1UL << PG_mlocked)
1250#define lru		(1UL << PG_lru)
1251#define head		(1UL << PG_head)
1252#define reserved	(1UL << PG_reserved)
1253
1254static struct page_state error_states[] = {
1255	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
1256	/*
1257	 * free pages are specially detected outside this table:
1258	 * PG_buddy pages only make a small fraction of all free pages.
1259	 */
1260
1261	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
1262
1263	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
1264	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
1265
1266	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
1267	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
1268
1269	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
1270	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
1271
1272	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
1273	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
1274
1275	/*
1276	 * Catchall entry: must be at end.
1277	 */
1278	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
1279};
1280
1281#undef dirty
1282#undef sc
1283#undef unevict
1284#undef mlock
1285#undef lru
1286#undef head
1287#undef reserved
1288
1289static void update_per_node_mf_stats(unsigned long pfn,
1290				     enum mf_result result)
1291{
1292	int nid = MAX_NUMNODES;
1293	struct memory_failure_stats *mf_stats = NULL;
1294
1295	nid = pfn_to_nid(pfn);
1296	if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
1297		WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
1298		return;
1299	}
1300
1301	mf_stats = &NODE_DATA(nid)->mf_stats;
1302	switch (result) {
1303	case MF_IGNORED:
1304		++mf_stats->ignored;
1305		break;
1306	case MF_FAILED:
1307		++mf_stats->failed;
1308		break;
1309	case MF_DELAYED:
1310		++mf_stats->delayed;
1311		break;
1312	case MF_RECOVERED:
1313		++mf_stats->recovered;
1314		break;
1315	default:
1316		WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
1317		break;
1318	}
1319	++mf_stats->total;
1320}
1321
1322/*
1323 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
1324 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
1325 */
1326static int action_result(unsigned long pfn, enum mf_action_page_type type,
1327			 enum mf_result result)
1328{
1329	trace_memory_failure_event(pfn, type, result);
1330
1331	num_poisoned_pages_inc(pfn);
1332
1333	update_per_node_mf_stats(pfn, result);
1334
1335	pr_err("%#lx: recovery action for %s: %s\n",
1336		pfn, action_page_types[type], action_name[result]);
1337
1338	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
1339}
1340
1341static int page_action(struct page_state *ps, struct page *p,
1342			unsigned long pfn)
1343{
1344	int result;
1345
1346	/* page p should be unlocked after returning from ps->action().  */
1347	result = ps->action(ps, p);
1348
1349	/* Could do more checks here if page looks ok */
1350	/*
1351	 * Could adjust zone counters here to correct for the missing page.
1352	 */
1353
1354	return action_result(pfn, ps->type, result);
1355}
1356
1357static inline bool PageHWPoisonTakenOff(struct page *page)
1358{
1359	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
1360}
1361
/*
 * Tag @page with the MAGIC_HWPOISON marker in its private field; this is
 * the marker PageHWPoisonTakenOff() checks for.
 */
void SetPageHWPoisonTakenOff(struct page *page)
{
	set_page_private(page, MAGIC_HWPOISON);
}
1366
/*
 * Reset the private field of @page to 0, but only if the page is
 * hwpoisoned; other pages are left untouched.
 */
void ClearPageHWPoisonTakenOff(struct page *page)
{
	if (!PageHWPoison(page))
		return;

	set_page_private(page, 0);
}
1372
1373/*
1374 * Return true if a page type of a given page is supported by hwpoison
1375 * mechanism (while handling could fail), otherwise false.  This function
1376 * does not return true for hugetlb or device memory pages, so it's assumed
1377 * to be called only in the context where we never have such pages.
1378 */
1379static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
1380{
1381	if (PageSlab(page))
1382		return false;
1383
1384	/* Soft offline could migrate non-LRU movable pages */
1385	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
1386		return true;
1387
1388	return PageLRU(page) || is_free_buddy_page(page);
1389}
1390
1391static int __get_hwpoison_page(struct page *page, unsigned long flags)
1392{
1393	struct folio *folio = page_folio(page);
1394	int ret = 0;
1395	bool hugetlb = false;
1396
1397	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
1398	if (hugetlb) {
1399		/* Make sure hugetlb demotion did not happen from under us. */
1400		if (folio == page_folio(page))
1401			return ret;
1402		if (ret > 0) {
1403			folio_put(folio);
1404			folio = page_folio(page);
1405		}
1406	}
1407
1408	/*
1409	 * This check prevents from calling folio_try_get() for any
1410	 * unsupported type of folio in order to reduce the risk of unexpected
1411	 * races caused by taking a folio refcount.
1412	 */
1413	if (!HWPoisonHandlable(&folio->page, flags))
1414		return -EBUSY;
1415
1416	if (folio_try_get(folio)) {
1417		if (folio == page_folio(page))
1418			return 1;
1419
1420		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
1421		folio_put(folio);
1422	}
1423
1424	return 0;
1425}
1426
1427#define GET_PAGE_MAX_RETRY_NUM 3
1428
1429static int get_any_page(struct page *p, unsigned long flags)
1430{
1431	int ret = 0, pass = 0;
1432	bool count_increased = false;
1433
1434	if (flags & MF_COUNT_INCREASED)
1435		count_increased = true;
1436
1437try_again:
1438	if (!count_increased) {
1439		ret = __get_hwpoison_page(p, flags);
1440		if (!ret) {
1441			if (page_count(p)) {
1442				/* We raced with an allocation, retry. */
1443				if (pass++ < GET_PAGE_MAX_RETRY_NUM)
1444					goto try_again;
1445				ret = -EBUSY;
1446			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1447				/* We raced with put_page, retry. */
1448				if (pass++ < GET_PAGE_MAX_RETRY_NUM)
1449					goto try_again;
1450				ret = -EIO;
1451			}
1452			goto out;
1453		} else if (ret == -EBUSY) {
1454			/*
1455			 * We raced with (possibly temporary) unhandlable
1456			 * page, retry.
1457			 */
1458			if (pass++ < 3) {
1459				shake_page(p);
1460				goto try_again;
1461			}
1462			ret = -EIO;
1463			goto out;
1464		}
1465	}
1466
1467	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1468		ret = 1;
1469	} else {
1470		/*
1471		 * A page we cannot handle. Check whether we can turn
1472		 * it into something we can handle.
1473		 */
1474		if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
1475			put_page(p);
1476			shake_page(p);
1477			count_increased = false;
1478			goto try_again;
1479		}
1480		put_page(p);
1481		ret = -EIO;
1482	}
1483out:
1484	if (ret == -EIO)
1485		pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
1486
1487	return ret;
1488}
1489
1490static int __get_unpoison_page(struct page *page)
1491{
1492	struct folio *folio = page_folio(page);
1493	int ret = 0;
1494	bool hugetlb = false;
1495
1496	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
1497	if (hugetlb) {
1498		/* Make sure hugetlb demotion did not happen from under us. */
1499		if (folio == page_folio(page))
1500			return ret;
1501		if (ret > 0)
1502			folio_put(folio);
1503	}
1504
1505	/*
1506	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
1507	 * but also isolated from buddy freelist, so need to identify the
1508	 * state and have to cancel both operations to unpoison.
1509	 */
1510	if (PageHWPoisonTakenOff(page))
1511		return -EHWPOISON;
1512
1513	return get_page_unless_zero(page) ? 1 : 0;
1514}
1515
1516/**
1517 * get_hwpoison_page() - Get refcount for memory error handling
1518 * @p:		Raw error page (hit by memory error)
1519 * @flags:	Flags controlling behavior of error handling
1520 *
1521 * get_hwpoison_page() takes a page refcount of an error page to handle memory
1522 * error on it, after checking that the error page is in a well-defined state
1523 * (defined as a page-type we can successfully handle the memory error on it,
1524 * such as LRU page and hugetlb page).
1525 *
1526 * Memory error handling could be triggered at any time on any type of page,
1527 * so it's prone to race with typical memory management lifecycle (like
1528 * allocation and free).  So to avoid such races, get_hwpoison_page() takes
1529 * extra care for the error page's state (as done in __get_hwpoison_page()),
1530 * and has some retry logic in get_any_page().
1531 *
1532 * When called from unpoison_memory(), the caller should already ensure that
1533 * the given page has PG_hwpoison. So it's never reused for other page
1534 * allocations, and __get_unpoison_page() never races with them.
1535 *
1536 * Return: 0 on failure or free buddy (hugetlb) page,
1537 *         1 on success for in-use pages in a well-defined state,
1538 *         -EIO for pages on which we can not handle memory errors,
1539 *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
1540 *         operations like allocation and free,
1541 *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
1542 */
1543static int get_hwpoison_page(struct page *p, unsigned long flags)
1544{
1545	int ret;
1546
1547	zone_pcp_disable(page_zone(p));
1548	if (flags & MF_UNPOISON)
1549		ret = __get_unpoison_page(p);
1550	else
1551		ret = get_any_page(p, flags);
1552	zone_pcp_enable(page_zone(p));
1553
1554	return ret;
1555}
1556
/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 *
 * @folio: folio containing the error page
 * @p:     the error page itself, handed to collect_procs()
 * @pfn:   pfn used in the log messages and passed to kill_procs()
 * @flags: MF_* flags (MF_MUST_KILL and MF_ACTION_REQUIRED are consulted)
 *
 * Return: true when nothing had to be unmapped or the folio ended up
 * unmapped, false when the folio is still mapped after try_to_unmap().
 */
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
		unsigned long pfn, int flags)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int forcekill;
	/* Sampled up front, before try_to_unmap() runs. */
	bool mlocked = folio_test_mlocked(folio);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (folio_test_reserved(folio) || folio_test_slab(folio) ||
	    folio_test_pgtable(folio) || folio_test_offline(folio))
		return true;
	if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!folio_mapped(folio))
		return true;

	if (folio_test_swapcache(folio)) {
		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
		ttu &= ~TTU_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = folio_mapping(folio);
	if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
	    mapping_can_writeback(mapping)) {
		if (folio_mkclean(folio)) {
			folio_set_dirty(folio);
		} else {
			ttu &= ~TTU_HWPOISON;
			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 */
	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);

	if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
		/*
		 * For hugetlb pages in shared mappings, try_to_unmap
		 * could potentially call huge_pmd_unshare.  Because of
		 * this, take semaphore in write mode here and set
		 * TTU_RMAP_LOCKED to indicate we have taken the lock
		 * at this higher level.
		 */
		mapping = hugetlb_folio_mapping_lock_write(folio);
		if (mapping) {
			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
			i_mmap_unlock_write(mapping);
		} else
			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
	} else {
		try_to_unmap(folio, ttu);
	}

	unmap_success = !folio_mapped(folio);
	if (!unmap_success)
		pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
		       pfn, folio_mapcount(folio));

	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_folio() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_folio(folio);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
		    !unmap_success;
	kill_procs(&tokill, forcekill, pfn, flags);

	return unmap_success;
}
1664
1665static int identify_page_state(unsigned long pfn, struct page *p,
1666				unsigned long page_flags)
1667{
1668	struct page_state *ps;
1669
1670	/*
1671	 * The first check uses the current page flags which may not have any
1672	 * relevant information. The second check with the saved page flags is
1673	 * carried out only if the first check can't determine the page status.
1674	 */
1675	for (ps = error_states;; ps++)
1676		if ((p->flags & ps->mask) == ps->res)
1677			break;
1678
1679	page_flags |= (p->flags & (1UL << PG_dirty));
1680
1681	if (!ps->mask)
1682		for (ps = error_states;; ps++)
1683			if ((page_flags & ps->mask) == ps->res)
1684				break;
1685	return page_action(ps, p, pfn);
1686}
1687
1688/*
1689 * When 'release' is 'false', it means that if thp split has failed,
1690 * there is still more to do, hence the page refcount we took earlier
1691 * is still needed.
1692 */
1693static int try_to_split_thp_page(struct page *page, bool release)
1694{
1695	int ret;
1696
1697	lock_page(page);
1698	ret = split_huge_page(page);
1699	unlock_page(page);
1700
1701	if (ret && release)
1702		put_page(page);
1703
1704	return ret;
1705}
1706
1707static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
1708		struct address_space *mapping, pgoff_t index, int flags)
1709{
1710	struct to_kill *tk;
1711	unsigned long size = 0;
1712
1713	list_for_each_entry(tk, to_kill, nd)
1714		if (tk->size_shift)
1715			size = max(size, 1UL << tk->size_shift);
1716
1717	if (size) {
1718		/*
1719		 * Unmap the largest mapping to avoid breaking up device-dax
1720		 * mappings which are constant size. The actual size of the
1721		 * mapping being torn down is communicated in siginfo, see
1722		 * kill_proc()
1723		 */
1724		loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
1725
1726		unmap_mapping_range(mapping, start, size, 0);
1727	}
1728
1729	kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
1730}
1731
1732/*
1733 * Only dev_pagemap pages get here, such as fsdax when the filesystem
1734 * either do not claim or fails to claim a hwpoison event, or devdax.
1735 * The fsdax pages are initialized per base page, and the devdax pages
1736 * could be initialized either as base pages, or as compound pages with
1737 * vmemmap optimization enabled. Devdax is simplistic in its dealing with
1738 * hwpoison, such that, if a subpage of a compound page is poisoned,
1739 * simply mark the compound head page is by far sufficient.
1740 */
1741static int mf_generic_kill_procs(unsigned long long pfn, int flags,
1742		struct dev_pagemap *pgmap)
1743{
1744	struct folio *folio = pfn_folio(pfn);
1745	LIST_HEAD(to_kill);
1746	dax_entry_t cookie;
1747	int rc = 0;
1748
1749	/*
1750	 * Prevent the inode from being freed while we are interrogating
1751	 * the address_space, typically this would be handled by
1752	 * lock_page(), but dax pages do not use the page lock. This
1753	 * also prevents changes to the mapping of this pfn until
1754	 * poison signaling is complete.
1755	 */
1756	cookie = dax_lock_folio(folio);
1757	if (!cookie)
1758		return -EBUSY;
1759
1760	if (hwpoison_filter(&folio->page)) {
1761		rc = -EOPNOTSUPP;
1762		goto unlock;
1763	}
1764
1765	switch (pgmap->type) {
1766	case MEMORY_DEVICE_PRIVATE:
1767	case MEMORY_DEVICE_COHERENT:
1768		/*
1769		 * TODO: Handle device pages which may need coordination
1770		 * with device-side memory.
1771		 */
1772		rc = -ENXIO;
1773		goto unlock;
1774	default:
1775		break;
1776	}
1777
1778	/*
1779	 * Use this flag as an indication that the dax page has been
1780	 * remapped UC to prevent speculative consumption of poison.
1781	 */
1782	SetPageHWPoison(&folio->page);
1783
1784	/*
1785	 * Unlike System-RAM there is no possibility to swap in a
1786	 * different physical page at a given virtual address, so all
1787	 * userspace consumption of ZONE_DEVICE memory necessitates
1788	 * SIGBUS (i.e. MF_MUST_KILL)
1789	 */
1790	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1791	collect_procs(folio, &folio->page, &to_kill, true);
1792
1793	unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
1794unlock:
1795	dax_unlock_folio(folio, cookie);
1796	return rc;
1797}
1798
1799#ifdef CONFIG_FS_DAX
1800/**
1801 * mf_dax_kill_procs - Collect and kill processes who are using this file range
1802 * @mapping:	address_space of the file in use
1803 * @index:	start pgoff of the range within the file
1804 * @count:	length of the range, in unit of PAGE_SIZE
1805 * @mf_flags:	memory failure flags
1806 */
1807int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
1808		unsigned long count, int mf_flags)
1809{
1810	LIST_HEAD(to_kill);
1811	dax_entry_t cookie;
1812	struct page *page;
1813	size_t end = index + count;
1814	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
1815
1816	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1817
1818	for (; index < end; index++) {
1819		page = NULL;
1820		cookie = dax_lock_mapping_entry(mapping, index, &page);
1821		if (!cookie)
1822			return -EBUSY;
1823		if (!page)
1824			goto unlock;
1825
1826		if (!pre_remove)
1827			SetPageHWPoison(page);
1828
1829		/*
1830		 * The pre_remove case is revoking access, the memory is still
1831		 * good and could theoretically be put back into service.
1832		 */
1833		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
1834		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
1835				index, mf_flags);
1836unlock:
1837		dax_unlock_mapping_entry(mapping, index, cookie);
1838	}
1839	return 0;
1840}
1841EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
1842#endif /* CONFIG_FS_DAX */
1843
1844#ifdef CONFIG_HUGETLB_PAGE
1845
1846/*
1847 * Struct raw_hwp_page represents information about "raw error page",
1848 * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
1849 */
struct raw_hwp_page {
	/* Link in the per-folio list reached through raw_hwp_list_head(). */
	struct llist_node node;
	/* The raw error page this entry records. */
	struct page *page;
};
1854
1855static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
1856{
1857	return (struct llist_head *)&folio->_hugetlb_hwpoison;
1858}
1859
1860bool is_raw_hwpoison_page_in_hugepage(struct page *page)
1861{
1862	struct llist_head *raw_hwp_head;
1863	struct raw_hwp_page *p;
1864	struct folio *folio = page_folio(page);
1865	bool ret = false;
1866
1867	if (!folio_test_hwpoison(folio))
1868		return false;
1869
1870	if (!folio_test_hugetlb(folio))
1871		return PageHWPoison(page);
1872
1873	/*
1874	 * When RawHwpUnreliable is set, kernel lost track of which subpages
1875	 * are HWPOISON. So return as if ALL subpages are HWPOISONed.
1876	 */
1877	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1878		return true;
1879
1880	mutex_lock(&mf_mutex);
1881
1882	raw_hwp_head = raw_hwp_list_head(folio);
1883	llist_for_each_entry(p, raw_hwp_head->first, node) {
1884		if (page == p->page) {
1885			ret = true;
1886			break;
1887		}
1888	}
1889
1890	mutex_unlock(&mf_mutex);
1891
1892	return ret;
1893}
1894
1895static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
1896{
1897	struct llist_node *head;
1898	struct raw_hwp_page *p, *next;
1899	unsigned long count = 0;
1900
1901	head = llist_del_all(raw_hwp_list_head(folio));
1902	llist_for_each_entry_safe(p, next, head, node) {
1903		if (move_flag)
1904			SetPageHWPoison(p->page);
1905		else
1906			num_poisoned_pages_sub(page_to_pfn(p->page), 1);
1907		kfree(p);
1908		count++;
1909	}
1910	return count;
1911}
1912
1913static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
1914{
1915	struct llist_head *head;
1916	struct raw_hwp_page *raw_hwp;
1917	struct raw_hwp_page *p;
1918	int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
1919
1920	/*
1921	 * Once the hwpoison hugepage has lost reliable raw error info,
1922	 * there is little meaning to keep additional error info precisely,
1923	 * so skip to add additional raw error info.
1924	 */
1925	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1926		return -EHWPOISON;
1927	head = raw_hwp_list_head(folio);
1928	llist_for_each_entry(p, head->first, node) {
1929		if (p->page == page)
1930			return -EHWPOISON;
1931	}
1932
1933	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
1934	if (raw_hwp) {
1935		raw_hwp->page = page;
1936		llist_add(&raw_hwp->node, head);
1937		/* the first error event will be counted in action_result(). */
1938		if (ret)
1939			num_poisoned_pages_inc(page_to_pfn(page));
1940	} else {
1941		/*
1942		 * Failed to save raw error info.  We no longer trace all
1943		 * hwpoisoned subpages, and we need refuse to free/dissolve
1944		 * this hwpoisoned hugepage.
1945		 */
1946		folio_set_hugetlb_raw_hwp_unreliable(folio);
1947		/*
1948		 * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
1949		 * used any more, so free it.
1950		 */
1951		__folio_free_raw_hwp(folio, false);
1952	}
1953	return ret;
1954}
1955
1956static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
1957{
1958	/*
1959	 * hugetlb_vmemmap_optimized hugepages can't be freed because struct
1960	 * pages for tail pages are required but they don't exist.
1961	 */
1962	if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
1963		return 0;
1964
1965	/*
1966	 * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
1967	 * definition.
1968	 */
1969	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1970		return 0;
1971
1972	return __folio_free_raw_hwp(folio, move_flag);
1973}
1974
1975void folio_clear_hugetlb_hwpoison(struct folio *folio)
1976{
1977	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1978		return;
1979	if (folio_test_hugetlb_vmemmap_optimized(folio))
1980		return;
1981	folio_clear_hwpoison(folio);
1982	folio_free_raw_hwp(folio, true);
1983}
1984
1985/*
1986 * Called from hugetlb code with hugetlb_lock held.
1987 *
1988 * Return values:
1989 *   0             - free hugepage
1990 *   1             - in-use hugepage
1991 *   2             - not a hugepage
1992 *   -EBUSY        - the hugepage is busy (try to retry)
1993 *   -EHWPOISON    - the hugepage is already hwpoisoned
1994 */
1995int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
1996				 bool *migratable_cleared)
1997{
1998	struct page *page = pfn_to_page(pfn);
1999	struct folio *folio = page_folio(page);
2000	int ret = 2;	/* fallback to normal page handling */
2001	bool count_increased = false;
2002
2003	if (!folio_test_hugetlb(folio))
2004		goto out;
2005
2006	if (flags & MF_COUNT_INCREASED) {
2007		ret = 1;
2008		count_increased = true;
2009	} else if (folio_test_hugetlb_freed(folio)) {
2010		ret = 0;
2011	} else if (folio_test_hugetlb_migratable(folio)) {
2012		ret = folio_try_get(folio);
2013		if (ret)
2014			count_increased = true;
2015	} else {
2016		ret = -EBUSY;
2017		if (!(flags & MF_NO_RETRY))
2018			goto out;
2019	}
2020
2021	if (folio_set_hugetlb_hwpoison(folio, page)) {
2022		ret = -EHWPOISON;
2023		goto out;
2024	}
2025
2026	/*
2027	 * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them
2028	 * from being migrated by memory hotremove.
2029	 */
2030	if (count_increased && folio_test_hugetlb_migratable(folio)) {
2031		folio_clear_hugetlb_migratable(folio);
2032		*migratable_cleared = true;
2033	}
2034
2035	return ret;
2036out:
2037	if (count_increased)
2038		folio_put(folio);
2039	return ret;
2040}
2041
2042/*
2043 * Taking refcount of hugetlb pages needs extra care about race conditions
2044 * with basic operations like hugepage allocation/free/demotion.
2045 * So some of prechecks for hwpoison (pinning, and testing/setting
2046 * PageHWPoison) should be done in single hugetlb_lock range.
2047 */
2048static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
2049{
2050	int res;
2051	struct page *p = pfn_to_page(pfn);
2052	struct folio *folio;
2053	unsigned long page_flags;
2054	bool migratable_cleared = false;
2055
2056	*hugetlb = 1;
2057retry:
2058	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
2059	if (res == 2) { /* fallback to normal page handling */
2060		*hugetlb = 0;
2061		return 0;
2062	} else if (res == -EHWPOISON) {
2063		pr_err("%#lx: already hardware poisoned\n", pfn);
2064		if (flags & MF_ACTION_REQUIRED) {
2065			folio = page_folio(p);
2066			res = kill_accessing_process(current, folio_pfn(folio), flags);
2067			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
2068		}
2069		return res;
2070	} else if (res == -EBUSY) {
2071		if (!(flags & MF_NO_RETRY)) {
2072			flags |= MF_NO_RETRY;
2073			goto retry;
2074		}
2075		return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
2076	}
2077
2078	folio = page_folio(p);
2079	folio_lock(folio);
2080
2081	if (hwpoison_filter(p)) {
2082		folio_clear_hugetlb_hwpoison(folio);
2083		if (migratable_cleared)
2084			folio_set_hugetlb_migratable(folio);
2085		folio_unlock(folio);
2086		if (res == 1)
2087			folio_put(folio);
2088		return -EOPNOTSUPP;
2089	}
2090
2091	/*
2092	 * Handling free hugepage.  The possible race with hugepage allocation
2093	 * or demotion can be prevented by PageHWPoison flag.
2094	 */
2095	if (res == 0) {
2096		folio_unlock(folio);
2097		if (__page_handle_poison(p) > 0) {
2098			page_ref_inc(p);
2099			res = MF_RECOVERED;
2100		} else {
2101			res = MF_FAILED;
2102		}
2103		return action_result(pfn, MF_MSG_FREE_HUGE, res);
2104	}
2105
2106	page_flags = folio->flags;
2107
2108	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
2109		folio_unlock(folio);
2110		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
2111	}
2112
2113	return identify_page_state(pfn, p, page_flags);
2114}
2115
2116#else
/*
 * !CONFIG_HUGETLB_PAGE stub: nothing is ever handled as a hugetlb page.
 * *@hugetlb is deliberately left as the caller initialised it.
 */
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags,
					     int *hugetlb)
{
	return 0;
}
2121
2122static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
2123{
2124	return 0;
2125}
2126#endif	/* CONFIG_HUGETLB_PAGE */
2127
2128/* Drop the extra refcount in case we come from madvise() */
2129static void put_ref_page(unsigned long pfn, int flags)
2130{
2131	if (!(flags & MF_COUNT_INCREASED))
2132		return;
2133
2134	put_page(pfn_to_page(pfn));
2135}
2136
2137static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
2138		struct dev_pagemap *pgmap)
2139{
2140	int rc = -ENXIO;
2141
2142	/* device metadata space is not recoverable */
2143	if (!pgmap_pfn_valid(pgmap, pfn))
2144		goto out;
2145
2146	/*
2147	 * Call driver's implementation to handle the memory failure, otherwise
2148	 * fall back to generic handler.
2149	 */
2150	if (pgmap_has_memory_failure(pgmap)) {
2151		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
2152		/*
2153		 * Fall back to generic handler too if operation is not
2154		 * supported inside the driver/device/filesystem.
2155		 */
2156		if (rc != -EOPNOTSUPP)
2157			goto out;
2158	}
2159
2160	rc = mf_generic_kill_procs(pfn, flags, pgmap);
2161out:
2162	/* drop pgmap ref acquired in caller */
2163	put_dev_pagemap(pgmap);
2164	if (rc != -EOPNOTSUPP)
2165		action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
2166	return rc;
2167}
2168
2169/*
2170 * The calling condition is as such: thp split failed, page might have
2171 * been RDMA pinned, not much can be done for recovery.
2172 * But a SIGBUS should be delivered with vaddr provided so that the user
2173 * application has a chance to recover. Also, application processes'
2174 * election for MCE early killed will be honored.
2175 */
2176static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
2177				struct folio *folio)
2178{
2179	LIST_HEAD(tokill);
2180
2181	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
2182	kill_procs(&tokill, true, pfn, flags);
2183}
2184
2185/**
2186 * memory_failure - Handle memory failure of a page.
2187 * @pfn: Page Number of the corrupted page
2188 * @flags: fine tune action taken
2189 *
2190 * This function is called by the low level machine check code
2191 * of an architecture when it detects hardware memory corruption
2192 * of a page. It tries its best to recover, which includes
2193 * dropping pages, killing processes etc.
2194 *
2195 * The function is primarily of use for corruptions that
2196 * happen outside the current execution context (e.g. when
2197 * detected by a background scrubber)
2198 *
2199 * Must run in process context (e.g. a work queue) with interrupts
2200 * enabled and no spinlocks held.
2201 *
2202 * Return: 0 for successfully handled the memory error,
2203 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
2204 *         < 0(except -EOPNOTSUPP) on failure.
2205 */
2206int memory_failure(unsigned long pfn, int flags)
2207{
2208	struct page *p;
2209	struct folio *folio;
2210	struct dev_pagemap *pgmap;
2211	int res = 0;
2212	unsigned long page_flags;
2213	bool retry = true;
2214	int hugetlb = 0;
2215
2216	if (!sysctl_memory_failure_recovery)
2217		panic("Memory failure on page %lx", pfn);
2218
2219	mutex_lock(&mf_mutex);
2220
2221	if (!(flags & MF_SW_SIMULATED))
2222		hw_memory_failure = true;
2223
2224	p = pfn_to_online_page(pfn);
2225	if (!p) {
2226		res = arch_memory_failure(pfn, flags);
2227		if (res == 0)
2228			goto unlock_mutex;
2229
2230		if (pfn_valid(pfn)) {
2231			pgmap = get_dev_pagemap(pfn, NULL);
2232			put_ref_page(pfn, flags);
2233			if (pgmap) {
2234				res = memory_failure_dev_pagemap(pfn, flags,
2235								 pgmap);
2236				goto unlock_mutex;
2237			}
2238		}
2239		pr_err("%#lx: memory outside kernel control\n", pfn);
2240		res = -ENXIO;
2241		goto unlock_mutex;
2242	}
2243
2244try_again:
2245	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
2246	if (hugetlb)
2247		goto unlock_mutex;
2248
2249	if (TestSetPageHWPoison(p)) {
2250		pr_err("%#lx: already hardware poisoned\n", pfn);
2251		res = -EHWPOISON;
2252		if (flags & MF_ACTION_REQUIRED)
2253			res = kill_accessing_process(current, pfn, flags);
2254		if (flags & MF_COUNT_INCREASED)
2255			put_page(p);
2256		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
2257		goto unlock_mutex;
2258	}
2259
2260	/*
2261	 * We need/can do nothing about count=0 pages.
2262	 * 1) it's a free page, and therefore in safe hand:
2263	 *    check_new_page() will be the gate keeper.
2264	 * 2) it's part of a non-compound high order page.
2265	 *    Implies some kernel user: cannot stop them from
2266	 *    R/W the page; let's pray that the page has been
2267	 *    used and will be freed some time later.
2268	 * In fact it's dangerous to directly bump up page count from 0,
2269	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
2270	 */
2271	if (!(flags & MF_COUNT_INCREASED)) {
2272		res = get_hwpoison_page(p, flags);
2273		if (!res) {
2274			if (is_free_buddy_page(p)) {
2275				if (take_page_off_buddy(p)) {
2276					page_ref_inc(p);
2277					res = MF_RECOVERED;
2278				} else {
2279					/* We lost the race, try again */
2280					if (retry) {
2281						ClearPageHWPoison(p);
2282						retry = false;
2283						goto try_again;
2284					}
2285					res = MF_FAILED;
2286				}
2287				res = action_result(pfn, MF_MSG_BUDDY, res);
2288			} else {
2289				res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
2290			}
2291			goto unlock_mutex;
2292		} else if (res < 0) {
2293			res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
2294			goto unlock_mutex;
2295		}
2296	}
2297
2298	folio = page_folio(p);
2299
2300	/* filter pages that are protected from hwpoison test by users */
2301	folio_lock(folio);
2302	if (hwpoison_filter(p)) {
2303		ClearPageHWPoison(p);
2304		folio_unlock(folio);
2305		folio_put(folio);
2306		res = -EOPNOTSUPP;
2307		goto unlock_mutex;
2308	}
2309	folio_unlock(folio);
2310
2311	if (folio_test_large(folio)) {
2312		/*
2313		 * The flag must be set after the refcount is bumped
2314		 * otherwise it may race with THP split.
2315		 * And the flag can't be set in get_hwpoison_page() since
2316		 * it is called by soft offline too and it is just called
2317		 * for !MF_COUNT_INCREASED.  So here seems to be the best
2318		 * place.
2319		 *
2320		 * Don't need care about the above error handling paths for
2321		 * get_hwpoison_page() since they handle either free page
2322		 * or unhandlable page.  The refcount is bumped iff the
2323		 * page is a valid handlable page.
2324		 */
2325		folio_set_has_hwpoisoned(folio);
2326		if (try_to_split_thp_page(p, false) < 0) {
2327			res = -EHWPOISON;
2328			kill_procs_now(p, pfn, flags, folio);
2329			put_page(p);
2330			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED);
2331			goto unlock_mutex;
2332		}
2333		VM_BUG_ON_PAGE(!page_count(p), p);
2334		folio = page_folio(p);
2335	}
2336
2337	/*
2338	 * We ignore non-LRU pages for good reasons.
2339	 * - PG_locked is only well defined for LRU pages and a few others
2340	 * - to avoid races with __SetPageLocked()
2341	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
2342	 * The check (unnecessarily) ignores LRU pages being isolated and
2343	 * walked by the page reclaim code, however that's not a big loss.
2344	 */
2345	shake_folio(folio);
2346
2347	folio_lock(folio);
2348
2349	/*
2350	 * We're only intended to deal with the non-Compound page here.
2351	 * The page cannot become compound pages again as folio has been
2352	 * splited and extra refcnt is held.
2353	 */
2354	WARN_ON(folio_test_large(folio));
2355
2356	/*
2357	 * We use page flags to determine what action should be taken, but
2358	 * the flags can be modified by the error containment action.  One
2359	 * example is an mlocked page, where PG_mlocked is cleared by
2360	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
2361	 * status correctly, we save a copy of the page flags at this time.
2362	 */
2363	page_flags = folio->flags;
2364
2365	/*
2366	 * __munlock_folio() may clear a writeback folio's LRU flag without
2367	 * the folio lock. We need to wait for writeback completion for this
2368	 * folio or it may trigger a vfs BUG while evicting inode.
2369	 */
2370	if (!folio_test_lru(folio) && !folio_test_writeback(folio))
2371		goto identify_page_state;
2372
2373	/*
2374	 * It's very difficult to mess with pages currently under IO
2375	 * and in many cases impossible, so we just avoid it here.
2376	 */
2377	folio_wait_writeback(folio);
2378
2379	/*
2380	 * Now take care of user space mappings.
2381	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
2382	 */
2383	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
2384		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
2385		goto unlock_page;
2386	}
2387
2388	/*
2389	 * Torn down by someone else?
2390	 */
2391	if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
2392	    folio->mapping == NULL) {
2393		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
2394		goto unlock_page;
2395	}
2396
2397identify_page_state:
2398	res = identify_page_state(pfn, p, page_flags);
2399	mutex_unlock(&mf_mutex);
2400	return res;
2401unlock_page:
2402	folio_unlock(folio);
2403unlock_mutex:
2404	mutex_unlock(&mf_mutex);
2405	return res;
2406}
2407EXPORT_SYMBOL_GPL(memory_failure);
2408
/*
 * Capacity of each per-CPU memory failure FIFO, expressed as a power of
 * two: 1 << 4 = 16 entries.
 */
#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)
2411
/* One queued memory failure event, as passed to memory_failure_queue(). */
struct memory_failure_entry {
	unsigned long pfn;	/* page frame number the event is about */
	int flags;		/* MF_* flags forwarded to the handler */
};
2416
2417struct memory_failure_cpu {
2418	DECLARE_KFIFO(fifo, struct memory_failure_entry,
2419		      MEMORY_FAILURE_FIFO_SIZE);
2420	raw_spinlock_t lock;
2421	struct work_struct work;
2422};
2423
2424static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
2425
2426/**
2427 * memory_failure_queue - Schedule handling memory failure of a page.
2428 * @pfn: Page Number of the corrupted page
2429 * @flags: Flags for memory failure handling
2430 *
2431 * This function is called by the low level hardware error handler
2432 * when it detects hardware memory corruption of a page. It schedules
2433 * the recovering of error page, including dropping pages, killing
2434 * processes etc.
2435 *
2436 * The function is primarily of use for corruptions that
2437 * happen outside the current execution context (e.g. when
2438 * detected by a background scrubber)
2439 *
2440 * Can run in IRQ context.
2441 */
2442void memory_failure_queue(unsigned long pfn, int flags)
2443{
2444	struct memory_failure_cpu *mf_cpu;
2445	unsigned long proc_flags;
2446	bool buffer_overflow;
2447	struct memory_failure_entry entry = {
2448		.pfn =		pfn,
2449		.flags =	flags,
2450	};
2451
2452	mf_cpu = &get_cpu_var(memory_failure_cpu);
2453	raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2454	buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry);
2455	if (!buffer_overflow)
2456		schedule_work_on(smp_processor_id(), &mf_cpu->work);
2457	raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2458	put_cpu_var(memory_failure_cpu);
2459	if (buffer_overflow)
2460		pr_err("buffer overflow when queuing memory failure at %#lx\n",
2461		       pfn);
2462}
2463EXPORT_SYMBOL_GPL(memory_failure_queue);
2464
2465static void memory_failure_work_func(struct work_struct *work)
2466{
2467	struct memory_failure_cpu *mf_cpu;
2468	struct memory_failure_entry entry = { 0, };
2469	unsigned long proc_flags;
2470	int gotten;
2471
2472	mf_cpu = container_of(work, struct memory_failure_cpu, work);
2473	for (;;) {
2474		raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2475		gotten = kfifo_get(&mf_cpu->fifo, &entry);
2476		raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2477		if (!gotten)
2478			break;
2479		if (entry.flags & MF_SOFT_OFFLINE)
2480			soft_offline_page(entry.pfn, entry.flags);
2481		else
2482			memory_failure(entry.pfn, entry.flags);
2483	}
2484}
2485
2486/*
2487 * Process memory_failure work queued on the specified CPU.
2488 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
2489 */
2490void memory_failure_queue_kick(int cpu)
2491{
2492	struct memory_failure_cpu *mf_cpu;
2493
2494	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2495	cancel_work_sync(&mf_cpu->work);
2496	memory_failure_work_func(&mf_cpu->work);
2497}
2498
2499static int __init memory_failure_init(void)
2500{
2501	struct memory_failure_cpu *mf_cpu;
2502	int cpu;
2503
2504	for_each_possible_cpu(cpu) {
2505		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2506		raw_spin_lock_init(&mf_cpu->lock);
2507		INIT_KFIFO(mf_cpu->fifo);
2508		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
2509	}
2510
2511	register_sysctl_init("vm", memory_failure_table);
2512
2513	return 0;
2514}
2515core_initcall(memory_failure_init);
2516
/* Messages printed from here on carry the "Unpoison: " prefix. */
#undef pr_fmt
#define pr_fmt(fmt)	"Unpoison: " fmt
/* pr_info() for a single pfn, emitted only when ratelimit state @rs allows. */
#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})
2524
2525/**
2526 * unpoison_memory - Unpoison a previously poisoned page
2527 * @pfn: Page number of the to be unpoisoned page
2528 *
2529 * Software-unpoison a page that has been poisoned by
2530 * memory_failure() earlier.
2531 *
2532 * This is only done on the software-level, so it only works
2533 * for linux injected failures, not real hardware failures
2534 *
2535 * Returns 0 for success, otherwise -errno.
2536 */
2537int unpoison_memory(unsigned long pfn)
2538{
2539	struct folio *folio;
2540	struct page *p;
2541	int ret = -EBUSY, ghp;
2542	unsigned long count;
2543	bool huge = false;
2544	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
2545					DEFAULT_RATELIMIT_BURST);
2546
2547	if (!pfn_valid(pfn))
2548		return -ENXIO;
2549
2550	p = pfn_to_page(pfn);
2551	folio = page_folio(p);
2552
2553	mutex_lock(&mf_mutex);
2554
2555	if (hw_memory_failure) {
2556		unpoison_pr_info("%#lx: disabled after HW memory failure\n",
2557				 pfn, &unpoison_rs);
2558		ret = -EOPNOTSUPP;
2559		goto unlock_mutex;
2560	}
2561
2562	if (is_huge_zero_folio(folio)) {
2563		unpoison_pr_info("%#lx: huge zero page is not supported\n",
2564				 pfn, &unpoison_rs);
2565		ret = -EOPNOTSUPP;
2566		goto unlock_mutex;
2567	}
2568
2569	if (!PageHWPoison(p)) {
2570		unpoison_pr_info("%#lx: page was already unpoisoned\n",
2571				 pfn, &unpoison_rs);
2572		goto unlock_mutex;
2573	}
2574
2575	if (folio_ref_count(folio) > 1) {
2576		unpoison_pr_info("%#lx: someone grabs the hwpoison page\n",
2577				 pfn, &unpoison_rs);
2578		goto unlock_mutex;
2579	}
2580
2581	if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
2582	    folio_test_reserved(folio) || folio_test_offline(folio))
2583		goto unlock_mutex;
2584
2585	if (folio_mapped(folio)) {
2586		unpoison_pr_info("%#lx: someone maps the hwpoison page\n",
2587				 pfn, &unpoison_rs);
2588		goto unlock_mutex;
2589	}
2590
2591	if (folio_mapping(folio)) {
2592		unpoison_pr_info("%#lx: the hwpoison page has non-NULL mapping\n",
2593				 pfn, &unpoison_rs);
2594		goto unlock_mutex;
2595	}
2596
2597	ghp = get_hwpoison_page(p, MF_UNPOISON);
2598	if (!ghp) {
2599		if (folio_test_hugetlb(folio)) {
2600			huge = true;
2601			count = folio_free_raw_hwp(folio, false);
2602			if (count == 0)
2603				goto unlock_mutex;
2604		}
2605		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
2606	} else if (ghp < 0) {
2607		if (ghp == -EHWPOISON) {
2608			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
2609		} else {
2610			ret = ghp;
2611			unpoison_pr_info("%#lx: failed to grab page\n",
2612					 pfn, &unpoison_rs);
2613		}
2614	} else {
2615		if (folio_test_hugetlb(folio)) {
2616			huge = true;
2617			count = folio_free_raw_hwp(folio, false);
2618			if (count == 0) {
2619				folio_put(folio);
2620				goto unlock_mutex;
2621			}
2622		}
2623
2624		folio_put(folio);
2625		if (TestClearPageHWPoison(p)) {
2626			folio_put(folio);
2627			ret = 0;
2628		}
2629	}
2630
2631unlock_mutex:
2632	mutex_unlock(&mf_mutex);
2633	if (!ret) {
2634		if (!huge)
2635			num_poisoned_pages_sub(pfn, 1);
2636		unpoison_pr_info("%#lx: software-unpoisoned page\n",
2637				 page_to_pfn(p), &unpoison_rs);
2638	}
2639	return ret;
2640}
2641EXPORT_SYMBOL(unpoison_memory);
2642
/* Messages printed from here on carry the "Soft offline: " prefix. */
#undef pr_fmt
#define pr_fmt(fmt) "Soft offline: " fmt
2645
2646static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist)
2647{
2648	bool isolated = false;
2649
2650	if (folio_test_hugetlb(folio)) {
2651		isolated = isolate_hugetlb(folio, pagelist);
2652	} else {
2653		bool lru = !__folio_test_movable(folio);
2654
2655		if (lru)
2656			isolated = folio_isolate_lru(folio);
2657		else
2658			isolated = isolate_movable_page(&folio->page,
2659							ISOLATE_UNEVICTABLE);
2660
2661		if (isolated) {
2662			list_add(&folio->lru, pagelist);
2663			if (lru)
2664				node_stat_add_folio(folio, NR_ISOLATED_ANON +
2665						    folio_is_file_lru(folio));
2666		}
2667	}
2668
2669	/*
2670	 * If we succeed to isolate the folio, we grabbed another refcount on
2671	 * the folio, so we can safely drop the one we got from get_any_page().
2672	 * If we failed to isolate the folio, it means that we cannot go further
2673	 * and we will return an error, so drop the reference we got from
2674	 * get_any_page() as well.
2675	 */
2676	folio_put(folio);
2677	return isolated;
2678}
2679
2680/*
2681 * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
2682 * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
2683 * If the page is mapped, it migrates the contents over.
2684 */
2685static int soft_offline_in_use_page(struct page *page)
2686{
2687	long ret = 0;
2688	unsigned long pfn = page_to_pfn(page);
2689	struct folio *folio = page_folio(page);
2690	char const *msg_page[] = {"page", "hugepage"};
2691	bool huge = folio_test_hugetlb(folio);
2692	LIST_HEAD(pagelist);
2693	struct migration_target_control mtc = {
2694		.nid = NUMA_NO_NODE,
2695		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
2696		.reason = MR_MEMORY_FAILURE,
2697	};
2698
2699	if (!huge && folio_test_large(folio)) {
2700		if (try_to_split_thp_page(page, true)) {
2701			pr_info("%#lx: thp split failed\n", pfn);
2702			return -EBUSY;
2703		}
2704		folio = page_folio(page);
2705	}
2706
2707	folio_lock(folio);
2708	if (!huge)
2709		folio_wait_writeback(folio);
2710	if (PageHWPoison(page)) {
2711		folio_unlock(folio);
2712		folio_put(folio);
2713		pr_info("%#lx: page already poisoned\n", pfn);
2714		return 0;
2715	}
2716
2717	if (!huge && folio_test_lru(folio) && !folio_test_swapcache(folio))
2718		/*
2719		 * Try to invalidate first. This should work for
2720		 * non dirty unmapped page cache pages.
2721		 */
2722		ret = mapping_evict_folio(folio_mapping(folio), folio);
2723	folio_unlock(folio);
2724
2725	if (ret) {
2726		pr_info("%#lx: invalidated\n", pfn);
2727		page_handle_poison(page, false, true);
2728		return 0;
2729	}
2730
2731	if (mf_isolate_folio(folio, &pagelist)) {
2732		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
2733			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
2734		if (!ret) {
2735			bool release = !huge;
2736
2737			if (!page_handle_poison(page, huge, release))
2738				ret = -EBUSY;
2739		} else {
2740			if (!list_empty(&pagelist))
2741				putback_movable_pages(&pagelist);
2742
2743			pr_info("%#lx: %s migration failed %ld, type %pGp\n",
2744				pfn, msg_page[huge], ret, &page->flags);
2745			if (ret > 0)
2746				ret = -EBUSY;
2747		}
2748	} else {
2749		pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
2750			pfn, msg_page[huge], page_count(page), &page->flags);
2751		ret = -EBUSY;
2752	}
2753	return ret;
2754}
2755
2756/**
2757 * soft_offline_page - Soft offline a page.
2758 * @pfn: pfn to soft-offline
2759 * @flags: flags. Same as memory_failure().
2760 *
2761 * Returns 0 on success,
2762 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event, or
2763 *         disabled by /proc/sys/vm/enable_soft_offline,
2764 *         < 0 otherwise negated errno.
2765 *
2766 * Soft offline a page, by migration or invalidation,
2767 * without killing anything. This is for the case when
2768 * a page is not corrupted yet (so it's still valid to access),
2769 * but has had a number of corrected errors and is better taken
2770 * out.
2771 *
2772 * The actual policy on when to do that is maintained by
2773 * user space.
2774 *
2775 * This should never impact any application or cause data loss,
2776 * however it might take some time.
2777 *
2778 * This is not a 100% solution for all memory, but tries to be
2779 * ``good enough'' for the majority of memory.
2780 */
2781int soft_offline_page(unsigned long pfn, int flags)
2782{
2783	int ret;
2784	bool try_again = true;
2785	struct page *page;
2786
2787	if (!pfn_valid(pfn)) {
2788		WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
2789		return -ENXIO;
2790	}
2791
2792	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
2793	page = pfn_to_online_page(pfn);
2794	if (!page) {
2795		put_ref_page(pfn, flags);
2796		return -EIO;
2797	}
2798
2799	if (!sysctl_enable_soft_offline) {
2800		pr_info_once("disabled by /proc/sys/vm/enable_soft_offline\n");
2801		put_ref_page(pfn, flags);
2802		return -EOPNOTSUPP;
2803	}
2804
2805	mutex_lock(&mf_mutex);
2806
2807	if (PageHWPoison(page)) {
2808		pr_info("%#lx: page already poisoned\n", pfn);
2809		put_ref_page(pfn, flags);
2810		mutex_unlock(&mf_mutex);
2811		return 0;
2812	}
2813
2814retry:
2815	get_online_mems();
2816	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
2817	put_online_mems();
2818
2819	if (hwpoison_filter(page)) {
2820		if (ret > 0)
2821			put_page(page);
2822
2823		mutex_unlock(&mf_mutex);
2824		return -EOPNOTSUPP;
2825	}
2826
2827	if (ret > 0) {
2828		ret = soft_offline_in_use_page(page);
2829	} else if (ret == 0) {
2830		if (!page_handle_poison(page, true, false)) {
2831			if (try_again) {
2832				try_again = false;
2833				flags &= ~MF_COUNT_INCREASED;
2834				goto retry;
2835			}
2836			ret = -EBUSY;
2837		}
2838	}
2839
2840	mutex_unlock(&mf_mutex);
2841
2842	return ret;
2843}