Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (34 commits)
HWPOISON: Remove stray phrase in a comment
HWPOISON: Try to allocate migration page on the same node
HWPOISON: Don't do early filtering if filter is disabled
HWPOISON: Add a madvise() injector for soft page offlining
HWPOISON: Add soft page offline support
HWPOISON: Undefine short-hand macros after use to avoid namespace conflict
HWPOISON: Use new shake_page in memory_failure
HWPOISON: Use correct name for MADV_HWPOISON in documentation
HWPOISON: mention HWPoison in Kconfig entry
HWPOISON: Use get_user_page_fast in hwpoison madvise
HWPOISON: add an interface to switch off/on all the page filters
HWPOISON: add memory cgroup filter
memcg: add accessor to mem_cgroup.css
memcg: rename and export try_get_mem_cgroup_from_page()
HWPOISON: add page flags filter
mm: export stable page flags
HWPOISON: limit hwpoison injector to known page types
HWPOISON: add fs/device filters
HWPOISON: return 0 to indicate success reliably
HWPOISON: make semantics of IGNORED/DELAYED clear
...

+923 -127
+44
Documentation/ABI/testing/sysfs-memory-page-offline
··· 1 + What: /sys/devices/system/memory/soft_offline_page 2 + Date: Sep 2009 3 + KernelVersion: 2.6.33 4 + Contact: andi@firstfloor.org 5 + Description: 6 + Soft-offline the memory page containing the physical address 7 + written into this file. Input is a hex number specifying the 8 + physical address of the page. The kernel will then attempt 9 + to soft-offline it, by moving the contents elsewhere or 10 + dropping it if possible. The page will then be placed 11 + on the bad page list and never be reused. 12 + 13 + The offlining is done in kernel-specific granularity. 14 + Normally it's the base page size of the kernel, but 15 + this might change. 16 + 17 + The page must still be accessible, not poisoned. The 18 + kernel will never kill anything for this, but rather 19 + fail the offline. Return value is the size of the 20 + number, or an error when the offlining failed. Reading 21 + the file is not allowed. 22 + 23 + What: /sys/devices/system/memory/hard_offline_page 24 + Date: Sep 2009 25 + KernelVersion: 2.6.33 26 + Contact: andi@firstfloor.org 27 + Description: 28 + Hard-offline the memory page containing the physical 29 + address written into this file. Input is a hex number 30 + specifying the physical address of the page. The 31 + kernel will then attempt to hard-offline the page, by 32 + trying to drop the page, killing any owner, or 33 + triggering IO errors if needed. Note this may kill 34 + any processes owning the page. The kernel will avoid 35 + accessing this page, assuming it's poisoned by the 36 + hardware. 37 + 38 + The offlining is done in kernel-specific granularity. 39 + Normally it's the base page size of the kernel, but 40 + this might change. 41 + 42 + Return value is the size of the number, or an error when 43 + the offlining failed. 44 + Reading the file is not allowed.
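As a usage sketch (not part of the merge; assumes root and CONFIG_MEMORY_FAILURE), a minimal C program that soft-offlines the page behind one physical address via the new sysfs file. The address value is purely hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical physical address; any address inside the target page works */
	unsigned long long paddr = 0x100000000ULL;
	char buf[32];
	int fd, len;

	fd = open("/sys/devices/system/memory/soft_offline_page", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	len = snprintf(buf, sizeof(buf), "0x%llx", paddr);
	if (write(fd, buf, len) != len)
		perror("soft offline");	/* an error return means the offlining failed */
	close(fd);
	return 0;
}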
+49 -3
Documentation/vm/hwpoison.txt
··· 92 92 93 93 Testing: 94 94 95 - madvise(MADV_POISON, ....) 95 + madvise(MADV_HWPOISON, ....) 96 96 (as root) 97 97 Poison a page in the process for testing 98 98 99 99 100 100 hwpoison-inject module through debugfs 101 - /sys/debug/hwpoison/corrupt-pfn 102 101 103 - Inject hwpoison fault at PFN echoed into this file 102 + /sys/debug/hwpoison/ 104 103 104 + corrupt-pfn 105 + 106 + Inject hwpoison fault at PFN echoed into this file. This does 107 + some early filtering to avoid corrupting unintended pages in test suites. 108 + 109 + unpoison-pfn 110 + 111 + Software-unpoison page at PFN echoed into this file. This 112 + way a page can be reused again. 113 + This only works for Linux injected failures, not for real 114 + memory failures. 115 + 116 + Note these injection interfaces are not stable and might change between 117 + kernel versions. 118 + 119 + corrupt-filter-dev-major 120 + corrupt-filter-dev-minor 121 + 122 + Only handle memory failures to pages associated with the file system defined 123 + by block device major/minor. -1U is the wildcard value. 124 + This should only be used for testing with artificial injection. 125 + 126 + corrupt-filter-memcg 127 + 128 + Limit injection to pages owned by the memory cgroup. Specified by inode number 129 + of the memcg. 130 + 131 + Example: 132 + mkdir /cgroup/hwpoison 133 + 134 + usemem -m 100 -s 1000 & 135 + echo `jobs -p` > /cgroup/hwpoison/tasks 136 + 137 + memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') 138 + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg 139 + 140 + page-types -p `pidof init` --hwpoison # shall do nothing 141 + page-types -p `pidof usemem` --hwpoison # poison its pages 142 + 143 + corrupt-filter-flags-mask 144 + corrupt-filter-flags-value 145 + 146 + When specified, only poison pages if ((page_flags & mask) == value). 147 + This allows stress testing of many kinds of pages. The page_flags 148 + are the same as in /proc/kpageflags. The flag bits are defined in 149 + include/linux/kernel-page-flags.h and documented in 150 + Documentation/vm/pagemap.txt 105 151 106 152 Architecture specific MCE injector 107 153
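To make the madvise() injector concrete, here is a hedged sketch of a test program poisoning one of its own anonymous pages (run as root; the MADV_HWPOISON value below comes from asm-generic/mman-common.h in this series and may be absent from older libc headers):

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON	100	/* from asm-generic/mman-common.h */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, pagesize);	/* fault the page in first */

	/* hard-poison the page; touching p afterwards would raise SIGBUS */
	if (madvise(p, pagesize, MADV_HWPOISON) != 0)
		perror("MADV_HWPOISON");
	return 0;
}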
+13 -2
Documentation/vm/page-types.c
··· 1 1 /* 2 2 * page-types: Tool for querying page flags 3 3 * 4 + * This program is free software; you can redistribute it and/or modify it 5 + * under the terms of the GNU General Public License as published by the Free 6 + * Software Foundation; version 2. 7 + * 8 + * This program is distributed in the hope that it will be useful, but WITHOUT 9 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 + * more details. 12 + * 13 + * You should find a copy of v2 of the GNU General Public License somewhere on 14 + * your Linux system; if not, write to the Free Software Foundation, Inc., 59 15 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA. 16 + * 4 17 * Copyright (C) 2009 Intel corporation 5 18 * 6 19 * Authors: Wu Fengguang <fengguang.wu@intel.com> 7 - * 8 - * Released under the General Public License (GPL). 9 20 */ 10 21 11 22 #define _LARGEFILE64_SOURCE
+9
MAINTAINERS
··· 2377 2377 S: Maintained 2378 2378 F: drivers/hwmon/hdaps.c 2379 2379 2380 + HWPOISON MEMORY FAILURE HANDLING 2381 + M: Andi Kleen <andi@firstfloor.org> 2382 + L: linux-mm@kvack.org 2383 + L: linux-kernel@vger.kernel.org 2384 + T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison 2385 + S: Maintained 2386 + F: mm/memory-failure.c 2387 + F: mm/hwpoison-inject.c 2388 + 2380 2389 HYPERVISOR VIRTUAL CONSOLE DRIVER 2381 2390 L: linuxppc-dev@ozlabs.org 2382 2391 S: Odd Fixes
+61
drivers/base/memory.c
··· 341 341 } 342 342 #endif 343 343 344 + #ifdef CONFIG_MEMORY_FAILURE 345 + /* 346 + * Support for offlining pages of memory 347 + */ 348 + 349 + /* Soft offline a page */ 350 + static ssize_t 351 + store_soft_offline_page(struct class *class, const char *buf, size_t count) 352 + { 353 + int ret; 354 + u64 pfn; 355 + if (!capable(CAP_SYS_ADMIN)) 356 + return -EPERM; 357 + if (strict_strtoull(buf, 0, &pfn) < 0) 358 + return -EINVAL; 359 + pfn >>= PAGE_SHIFT; 360 + if (!pfn_valid(pfn)) 361 + return -ENXIO; 362 + ret = soft_offline_page(pfn_to_page(pfn), 0); 363 + return ret == 0 ? count : ret; 364 + } 365 + 366 + /* Forcibly offline a page, including killing processes. */ 367 + static ssize_t 368 + store_hard_offline_page(struct class *class, const char *buf, size_t count) 369 + { 370 + int ret; 371 + u64 pfn; 372 + if (!capable(CAP_SYS_ADMIN)) 373 + return -EPERM; 374 + if (strict_strtoull(buf, 0, &pfn) < 0) 375 + return -EINVAL; 376 + pfn >>= PAGE_SHIFT; 377 + ret = __memory_failure(pfn, 0, 0); 378 + return ret ? ret : count; 379 + } 380 + 381 + static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 382 + static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 383 + 384 + static __init int memory_fail_init(void) 385 + { 386 + int err; 387 + 388 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 389 + &class_attr_soft_offline_page.attr); 390 + if (!err) 391 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 392 + &class_attr_hard_offline_page.attr); 393 + return err; 394 + } 395 + #else 396 + static inline int memory_fail_init(void) 397 + { 398 + return 0; 399 + } 400 + #endif 401 + 344 402 /* 345 403 * Note that phys_device is optional. It is here to allow for 346 404 * differentiation between which *physical* devices each ··· 529 471 } 530 472 531 473 err = memory_probe_init(); 474 + if (!ret) 475 + ret = err; 476 + err = memory_fail_init(); 532 477 if (!ret) 533 478 ret = err; 534 479 err = block_size_init();
+3 -42
fs/proc/page.c
··· 8 8 #include <linux/proc_fs.h> 9 9 #include <linux/seq_file.h> 10 10 #include <linux/hugetlb.h> 11 + #include <linux/kernel-page-flags.h> 11 12 #include <asm/uaccess.h> 12 13 #include "internal.h" 13 14 ··· 72 71 * physical page flags. 73 72 */ 74 73 75 - /* These macros are used to decouple internal flags from exported ones */ 76 - 77 - #define KPF_LOCKED 0 78 - #define KPF_ERROR 1 79 - #define KPF_REFERENCED 2 80 - #define KPF_UPTODATE 3 81 - #define KPF_DIRTY 4 82 - #define KPF_LRU 5 83 - #define KPF_ACTIVE 6 84 - #define KPF_SLAB 7 85 - #define KPF_WRITEBACK 8 86 - #define KPF_RECLAIM 9 87 - #define KPF_BUDDY 10 88 - 89 - /* 11-20: new additions in 2.6.31 */ 90 - #define KPF_MMAP 11 91 - #define KPF_ANON 12 92 - #define KPF_SWAPCACHE 13 93 - #define KPF_SWAPBACKED 14 94 - #define KPF_COMPOUND_HEAD 15 95 - #define KPF_COMPOUND_TAIL 16 96 - #define KPF_HUGE 17 97 - #define KPF_UNEVICTABLE 18 98 - #define KPF_HWPOISON 19 99 - #define KPF_NOPAGE 20 100 - 101 - #define KPF_KSM 21 102 - 103 - /* kernel hacking assistances 104 - * WARNING: subject to change, never rely on them! 105 - */ 106 - #define KPF_RESERVED 32 107 - #define KPF_MLOCKED 33 108 - #define KPF_MAPPEDTODISK 34 109 - #define KPF_PRIVATE 35 110 - #define KPF_PRIVATE_2 36 111 - #define KPF_OWNER_PRIVATE 37 112 - #define KPF_ARCH 38 113 - #define KPF_UNCACHED 39 114 - 115 74 static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 116 75 { 117 76 return ((kflags >> kbit) & 1) << ubit; 118 77 } 119 78 120 - static u64 get_uflags(struct page *page) 79 + u64 stable_page_flags(struct page *page) 121 80 { 122 81 u64 k; 123 82 u64 u; ··· 180 219 else 181 220 ppage = NULL; 182 221 183 - if (put_user(get_uflags(ppage), out)) { 222 + if (put_user(stable_page_flags(ppage), out)) { 184 223 ret = -EFAULT; 185 224 break; 186 225 }
+1
include/asm-generic/mman-common.h
··· 40 40 #define MADV_DONTFORK 10 /* don't inherit across fork */ 41 41 #define MADV_DOFORK 11 /* do inherit across fork */ 42 42 #define MADV_HWPOISON 100 /* poison a page for testing */ 43 + #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ 43 44 44 45 #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 45 46 #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+46
include/linux/kernel-page-flags.h
··· 1 + #ifndef LINUX_KERNEL_PAGE_FLAGS_H 2 + #define LINUX_KERNEL_PAGE_FLAGS_H 3 + 4 + /* 5 + * Stable page flag bits exported to user space 6 + */ 7 + 8 + #define KPF_LOCKED 0 9 + #define KPF_ERROR 1 10 + #define KPF_REFERENCED 2 11 + #define KPF_UPTODATE 3 12 + #define KPF_DIRTY 4 13 + #define KPF_LRU 5 14 + #define KPF_ACTIVE 6 15 + #define KPF_SLAB 7 16 + #define KPF_WRITEBACK 8 17 + #define KPF_RECLAIM 9 18 + #define KPF_BUDDY 10 19 + 20 + /* 11-20: new additions in 2.6.31 */ 21 + #define KPF_MMAP 11 22 + #define KPF_ANON 12 23 + #define KPF_SWAPCACHE 13 24 + #define KPF_SWAPBACKED 14 25 + #define KPF_COMPOUND_HEAD 15 26 + #define KPF_COMPOUND_TAIL 16 27 + #define KPF_HUGE 17 28 + #define KPF_UNEVICTABLE 18 29 + #define KPF_HWPOISON 19 30 + #define KPF_NOPAGE 20 31 + 32 + #define KPF_KSM 21 33 + 34 + /* kernel hacking assistances 35 + * WARNING: subject to change, never rely on them! 36 + */ 37 + #define KPF_RESERVED 32 38 + #define KPF_MLOCKED 33 39 + #define KPF_MAPPEDTODISK 34 40 + #define KPF_PRIVATE 35 41 + #define KPF_PRIVATE_2 36 42 + #define KPF_OWNER_PRIVATE 37 43 + #define KPF_ARCH 38 44 + #define KPF_UNCACHED 39 45 + 46 + #endif /* LINUX_KERNEL_PAGE_FLAGS_H */
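A hedged sketch of how user space consumes these stable bits: /proc/kpageflags returns one u64 bitfield per page, indexed by pfn, so a reader seeks to pfn * 8 and tests individual KPF_* bits (the page-types tool above does this at much greater length):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_LRU		5	/* bit numbers from linux/kernel-page-flags.h */
#define KPF_HWPOISON	19

int main(int argc, char **argv)
{
	uint64_t pfn, flags;
	int fd;

	if (argc != 2)
		return 1;
	pfn = strtoull(argv[1], NULL, 0);

	fd = open("/proc/kpageflags", O_RDONLY);
	if (fd < 0 || pread(fd, &flags, 8, pfn * 8) != 8) {
		perror("kpageflags");
		return 1;
	}
	printf("pfn %#llx: lru=%d hwpoison=%d\n",
	       (unsigned long long)pfn,
	       (int)((flags >> KPF_LRU) & 1),
	       (int)((flags >> KPF_HWPOISON) & 1));
	return 0;
}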
+13
include/linux/memcontrol.h
··· 73 73 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); 74 74 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); 75 75 76 + extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 76 77 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 77 78 78 79 static inline ··· 85 84 rcu_read_unlock(); 86 85 return cgroup == mem; 87 86 } 87 + 88 + extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); 88 89 89 90 extern int 90 91 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); ··· 205 202 { 206 203 } 207 204 205 + static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 206 + { 207 + return NULL; 208 + } 209 + 208 210 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) 209 211 { 210 212 return 1; ··· 219 211 const struct mem_cgroup *mem) 220 212 { 221 213 return 1; 214 + } 215 + 216 + static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 217 + { 218 + return NULL; 222 219 } 223 220 224 221 static inline int
+7 -1
include/linux/mm.h
··· 1331 1331 size_t size); 1332 1332 extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1333 1333 1334 + enum mf_flags { 1335 + MF_COUNT_INCREASED = 1 << 0, 1336 + }; 1334 1337 extern void memory_failure(unsigned long pfn, int trapno); 1335 - extern int __memory_failure(unsigned long pfn, int trapno, int ref); 1338 + extern int __memory_failure(unsigned long pfn, int trapno, int flags); 1339 + extern int unpoison_memory(unsigned long pfn); 1336 1340 extern int sysctl_memory_failure_early_kill; 1337 1341 extern int sysctl_memory_failure_recovery; 1342 + extern void shake_page(struct page *p, int access); 1338 1343 extern atomic_long_t mce_bad_pages; 1344 + extern int soft_offline_page(struct page *page, int flags); 1339 1345 1340 1346 #endif /* __KERNEL__ */ 1341 1347 #endif /* _LINUX_MM_H */
+3 -1
include/linux/page-flags.h
··· 275 275 276 276 #ifdef CONFIG_MEMORY_FAILURE 277 277 PAGEFLAG(HWPoison, hwpoison) 278 - TESTSETFLAG(HWPoison, hwpoison) 278 + TESTSCFLAG(HWPoison, hwpoison) 279 279 #define __PG_HWPOISON (1UL << PG_hwpoison) 280 280 #else 281 281 PAGEFLAG_FALSE(HWPoison) 282 282 #define __PG_HWPOISON 0 283 283 #endif 284 + 285 + u64 stable_page_flags(struct page *page); 284 286 285 287 static inline int PageUptodate(struct page *page) 286 288 {
+2 -1
mm/Kconfig
··· 251 251 special hardware support and typically ECC memory. 252 252 253 253 config HWPOISON_INJECT 254 - tristate "Poison pages injector" 254 + tristate "HWPoison pages injector" 255 255 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 + select PROC_PAGE_MONITOR 256 257 257 258 config NOMMU_INITIAL_TRIM_EXCESS 258 259 int "Turn on mmap() excess space trimming before booting"
+105 -8
mm/hwpoison-inject.c
··· 3 3 #include <linux/debugfs.h> 4 4 #include <linux/kernel.h> 5 5 #include <linux/mm.h> 6 + #include <linux/swap.h> 7 + #include <linux/pagemap.h> 8 + #include "internal.h" 6 9 7 - static struct dentry *hwpoison_dir, *corrupt_pfn; 10 + static struct dentry *hwpoison_dir; 8 11 9 12 static int hwpoison_inject(void *data, u64 val) 10 13 { 14 + unsigned long pfn = val; 15 + struct page *p; 16 + int err; 17 + 11 18 if (!capable(CAP_SYS_ADMIN)) 12 19 return -EPERM; 13 - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 14 - return __memory_failure(val, 18, 0); 20 + 21 + if (!hwpoison_filter_enable) 22 + goto inject; 23 + if (!pfn_valid(pfn)) 24 + return -ENXIO; 25 + 26 + p = pfn_to_page(pfn); 27 + /* 28 + * This implies unable to support free buddy pages. 29 + */ 30 + if (!get_page_unless_zero(p)) 31 + return 0; 32 + 33 + if (!PageLRU(p)) 34 + shake_page(p, 0); 35 + /* 36 + * This implies unable to support non-LRU pages. 37 + */ 38 + if (!PageLRU(p)) 39 + return 0; 40 + 41 + /* 42 + * do a racy check with elevated page count, to make sure PG_hwpoison 43 + * will only be set for the targeted owner (or on a free page). 44 + * We temporarily take page lock for try_get_mem_cgroup_from_page(). 45 + * __memory_failure() will redo the check reliably inside page lock. 46 + */ 47 + lock_page(p); 48 + err = hwpoison_filter(p); 49 + unlock_page(p); 50 + if (err) 51 + return 0; 52 + 53 + inject: 54 + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 55 + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); 56 + } 57 + 58 + static int hwpoison_unpoison(void *data, u64 val) 59 + { 60 + if (!capable(CAP_SYS_ADMIN)) 61 + return -EPERM; 62 + 63 + return unpoison_memory(val); 15 64 } 16 65 17 66 DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 67 + DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); 18 68 19 69 static void pfn_inject_exit(void) 20 70 { ··· 74 24 75 25 static int pfn_inject_init(void) 76 26 { 27 + struct dentry *dentry; 28 + 77 29 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 78 30 if (hwpoison_dir == NULL) 79 31 return -ENOMEM; 80 - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 32 + 33 + /* 34 + * Note that the below poison/unpoison interfaces do not involve 35 + * hardware status change, hence do not require hardware support. 36 + * They are mainly for testing hwpoison in software level. 
37 + */ 38 + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 81 39 NULL, &hwpoison_fops); 82 - if (corrupt_pfn == NULL) { 83 - pfn_inject_exit(); 84 - return -ENOMEM; 85 - } 40 + if (!dentry) 41 + goto fail; 42 + 43 + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, 44 + NULL, &unpoison_fops); 45 + if (!dentry) 46 + goto fail; 47 + 48 + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, 49 + hwpoison_dir, &hwpoison_filter_enable); 50 + if (!dentry) 51 + goto fail; 52 + 53 + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, 54 + hwpoison_dir, &hwpoison_filter_dev_major); 55 + if (!dentry) 56 + goto fail; 57 + 58 + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, 59 + hwpoison_dir, &hwpoison_filter_dev_minor); 60 + if (!dentry) 61 + goto fail; 62 + 63 + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, 64 + hwpoison_dir, &hwpoison_filter_flags_mask); 65 + if (!dentry) 66 + goto fail; 67 + 68 + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, 69 + hwpoison_dir, &hwpoison_filter_flags_value); 70 + if (!dentry) 71 + goto fail; 72 + 73 + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 74 + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 75 + hwpoison_dir, &hwpoison_filter_memcg); 76 + if (!dentry) 77 + goto fail; 78 + #endif 79 + 86 80 return 0; 81 + fail: 82 + pfn_inject_exit(); 83 + return -ENOMEM; 87 84 } 88 85 89 86 module_init(pfn_inject_init);
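For illustration only, a hedged sketch driving the corrupt-pfn and unpoison-pfn files created above; the /sys/kernel/debug mount point is an assumption (the documentation in this merge also refers to /sys/debug and /debug), and the pfn is hypothetical:

#include <stdio.h>

static int echo_pfn(const char *file, unsigned long pfn)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/hwpoison/%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", pfn);
	return fclose(f);
}

int main(void)
{
	unsigned long pfn = 0x1234;	/* hypothetical pfn to test with */

	echo_pfn("corrupt-pfn", pfn);	/* software-inject a poison */
	echo_pfn("unpoison-pfn", pfn);	/* then undo it so the page can be reused */
	return 0;
}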
+12
mm/internal.h
··· 50 50 */ 51 51 extern void __free_pages_bootmem(struct page *page, unsigned int order); 52 52 extern void prep_compound_page(struct page *page, unsigned long order); 53 + #ifdef CONFIG_MEMORY_FAILURE 54 + extern bool is_free_buddy_page(struct page *page); 55 + #endif 53 56 54 57 55 58 /* ··· 250 247 #define ZONE_RECLAIM_SOME 0 251 248 #define ZONE_RECLAIM_SUCCESS 1 252 249 #endif 250 + 251 + extern int hwpoison_filter(struct page *p); 252 + 253 + extern u32 hwpoison_filter_dev_major; 254 + extern u32 hwpoison_filter_dev_minor; 255 + extern u64 hwpoison_filter_flags_mask; 256 + extern u64 hwpoison_filter_flags_value; 257 + extern u64 hwpoison_filter_memcg; 258 + extern u32 hwpoison_filter_enable;
+14 -7
mm/madvise.c
··· 9 9 #include <linux/pagemap.h> 10 10 #include <linux/syscalls.h> 11 11 #include <linux/mempolicy.h> 12 + #include <linux/page-isolation.h> 12 13 #include <linux/hugetlb.h> 13 14 #include <linux/sched.h> 14 15 #include <linux/ksm.h> ··· 223 222 /* 224 223 * Error injection support for memory error handling. 225 224 */ 226 - static int madvise_hwpoison(unsigned long start, unsigned long end) 225 + static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 227 226 { 228 227 int ret = 0; 229 228 ··· 231 230 return -EPERM; 232 231 for (; start < end; start += PAGE_SIZE) { 233 232 struct page *p; 234 - int ret = get_user_pages(current, current->mm, start, 1, 235 - 0, 0, &p, NULL); 233 + int ret = get_user_pages_fast(start, 1, 0, &p); 236 234 if (ret != 1) 237 235 return ret; 236 + if (bhv == MADV_SOFT_OFFLINE) { 237 + printk(KERN_INFO "Soft offlining page %lx at %lx\n", 238 + page_to_pfn(p), start); 239 + ret = soft_offline_page(p, MF_COUNT_INCREASED); 240 + if (ret) 241 + break; 242 + continue; 243 + } 238 244 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 239 245 page_to_pfn(p), start); 240 246 /* Ignore return value for now */ 241 - __memory_failure(page_to_pfn(p), 0, 1); 242 - put_page(p); 247 + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 243 248 } 244 249 return ret; 245 250 } ··· 342 335 size_t len; 343 336 344 337 #ifdef CONFIG_MEMORY_FAILURE 345 - if (behavior == MADV_HWPOISON) 346 - return madvise_hwpoison(start, start+len_in); 338 + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 339 + return madvise_hwpoison(behavior, start, start+len_in); 347 340 #endif 348 341 if (!madvise_behavior_valid(behavior)) 349 342 return error;
+9 -7
mm/memcontrol.c
··· 283 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 284 284 } 285 285 286 + struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 287 + { 288 + return &mem->css; 289 + } 290 + 286 291 static struct mem_cgroup_per_zone * 287 292 page_cgroup_zoneinfo(struct page_cgroup *pc) 288 293 { ··· 1541 1536 return container_of(css, struct mem_cgroup, css); 1542 1537 } 1543 1538 1544 - static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1539 + struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 1545 1540 { 1546 - struct mem_cgroup *mem; 1541 + struct mem_cgroup *mem = NULL; 1547 1542 struct page_cgroup *pc; 1548 1543 unsigned short id; 1549 1544 swp_entry_t ent; 1550 1545 1551 1546 VM_BUG_ON(!PageLocked(page)); 1552 - 1553 - if (!PageSwapCache(page)) 1554 - return NULL; 1555 1547 1556 1548 pc = lookup_page_cgroup(page); 1557 1549 lock_page_cgroup(pc); ··· 1556 1554 mem = pc->mem_cgroup; 1557 1555 if (mem && !css_tryget(&mem->css)) 1558 1556 mem = NULL; 1559 - } else { 1557 + } else if (PageSwapCache(page)) { 1560 1558 ent.val = page_private(page); 1561 1559 id = lookup_swap_cgroup(ent); 1562 1560 rcu_read_lock(); ··· 1876 1874 */ 1877 1875 if (!PageSwapCache(page)) 1878 1876 goto charge_cur_mm; 1879 - mem = try_get_mem_cgroup_from_swapcache(page); 1877 + mem = try_get_mem_cgroup_from_page(page); 1880 1878 if (!mem) 1881 1879 goto charge_cur_mm; 1882 1880 *ptr = mem;
+507 -55
mm/memory-failure.c
··· 34 34 #include <linux/kernel.h> 35 35 #include <linux/mm.h> 36 36 #include <linux/page-flags.h> 37 + #include <linux/kernel-page-flags.h> 37 38 #include <linux/sched.h> 38 39 #include <linux/ksm.h> 39 40 #include <linux/rmap.h> 40 41 #include <linux/pagemap.h> 41 42 #include <linux/swap.h> 42 43 #include <linux/backing-dev.h> 44 + #include <linux/migrate.h> 45 + #include <linux/page-isolation.h> 46 + #include <linux/suspend.h> 43 47 #include "internal.h" 44 48 45 49 int sysctl_memory_failure_early_kill __read_mostly = 0; ··· 51 47 int sysctl_memory_failure_recovery __read_mostly = 1; 52 48 53 49 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 50 + 51 + u32 hwpoison_filter_enable = 0; 52 + u32 hwpoison_filter_dev_major = ~0U; 53 + u32 hwpoison_filter_dev_minor = ~0U; 54 + u64 hwpoison_filter_flags_mask; 55 + u64 hwpoison_filter_flags_value; 56 + EXPORT_SYMBOL_GPL(hwpoison_filter_enable); 57 + EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); 58 + EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); 59 + EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); 60 + EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); 61 + 62 + static int hwpoison_filter_dev(struct page *p) 63 + { 64 + struct address_space *mapping; 65 + dev_t dev; 66 + 67 + if (hwpoison_filter_dev_major == ~0U && 68 + hwpoison_filter_dev_minor == ~0U) 69 + return 0; 70 + 71 + /* 72 + * page_mapping() does not accept slab pages 73 + */ 74 + if (PageSlab(p)) 75 + return -EINVAL; 76 + 77 + mapping = page_mapping(p); 78 + if (mapping == NULL || mapping->host == NULL) 79 + return -EINVAL; 80 + 81 + dev = mapping->host->i_sb->s_dev; 82 + if (hwpoison_filter_dev_major != ~0U && 83 + hwpoison_filter_dev_major != MAJOR(dev)) 84 + return -EINVAL; 85 + if (hwpoison_filter_dev_minor != ~0U && 86 + hwpoison_filter_dev_minor != MINOR(dev)) 87 + return -EINVAL; 88 + 89 + return 0; 90 + } 91 + 92 + static int hwpoison_filter_flags(struct page *p) 93 + { 94 + if (!hwpoison_filter_flags_mask) 95 + return 0; 96 + 97 + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == 98 + hwpoison_filter_flags_value) 99 + return 0; 100 + else 101 + return -EINVAL; 102 + } 103 + 104 + /* 105 + * This allows stress tests to limit test scope to a collection of tasks 106 + * by putting them under some memcg. This prevents killing unrelated/important 107 + * processes such as /sbin/init. Note that the target task may share clean 108 + * pages with init (e.g. libc text), which is harmless. If the target task 109 + * shares _dirty_ pages with another task B, the test scheme must make sure B 110 + * is also included in the memcg. Finally, due to race conditions this filter 111 + * can only guarantee that the page either belongs to the memcg tasks, or is 112 + * a freed page. 
113 + */ 114 + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 115 + u64 hwpoison_filter_memcg; 116 + EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 117 + static int hwpoison_filter_task(struct page *p) 118 + { 119 + struct mem_cgroup *mem; 120 + struct cgroup_subsys_state *css; 121 + unsigned long ino; 122 + 123 + if (!hwpoison_filter_memcg) 124 + return 0; 125 + 126 + mem = try_get_mem_cgroup_from_page(p); 127 + if (!mem) 128 + return -EINVAL; 129 + 130 + css = mem_cgroup_css(mem); 131 + /* root_mem_cgroup has NULL dentries */ 132 + if (!css->cgroup->dentry) 133 + return -EINVAL; 134 + 135 + ino = css->cgroup->dentry->d_inode->i_ino; 136 + css_put(css); 137 + 138 + if (ino != hwpoison_filter_memcg) 139 + return -EINVAL; 140 + 141 + return 0; 142 + } 143 + #else 144 + static int hwpoison_filter_task(struct page *p) { return 0; } 145 + #endif 146 + 147 + int hwpoison_filter(struct page *p) 148 + { 149 + if (!hwpoison_filter_enable) 150 + return 0; 151 + 152 + if (hwpoison_filter_dev(p)) 153 + return -EINVAL; 154 + 155 + if (hwpoison_filter_flags(p)) 156 + return -EINVAL; 157 + 158 + if (hwpoison_filter_task(p)) 159 + return -EINVAL; 160 + 161 + return 0; 162 + } 163 + EXPORT_SYMBOL_GPL(hwpoison_filter); 54 164 55 165 /* 56 166 * Send all the processes who have the page mapped an ``action optional'' ··· 199 81 t->comm, t->pid, ret); 200 82 return ret; 201 83 } 84 + 85 + /* 86 + * When an unknown page type is encountered, drain as many buffers as possible 87 + * in the hope of turning the page into an LRU or free page, which we can handle. 88 + */ 89 + void shake_page(struct page *p, int access) 90 + { 91 + if (!PageSlab(p)) { 92 + lru_add_drain_all(); 93 + if (PageLRU(p)) 94 + return; 95 + drain_all_pages(); 96 + if (PageLRU(p) || is_free_buddy_page(p)) 97 + return; 98 + } 99 + 100 + /* 101 + * Only call shrink_slab here (which would also 102 + * shrink other caches) if access is not potentially fatal. 103 + */ 104 + if (access) { 105 + int nr; 106 + do { 107 + nr = shrink_slab(1000, GFP_KERNEL, 1000); 108 + if (page_count(p) == 0) 109 + break; 110 + } while (nr > 10); 111 + } 112 + } 113 + EXPORT_SYMBOL_GPL(shake_page); 202 114 203 115 /* 204 116 * Kill all processes that have a poisoned page mapped and then isolate ··· 325 177 * In case something went wrong with munmapping 326 178 * make sure the process doesn't catch the 327 179 * signal and then access the memory. Just kill it. 328 - * the signal handlers 329 180 */ 330 181 if (fail || tk->addr_valid == 0) { 331 182 printk(KERN_ERR ··· 461 314 */ 462 315 463 316 enum outcome { 464 - FAILED, /* Error handling failed */ 317 + IGNORED, /* Error: cannot be handled */ 318 + FAILED, /* Error: handling failed */ 465 319 DELAYED, /* Will be handled later */ 466 - IGNORED, /* Error safely ignored */ 467 320 RECOVERED, /* Successfully recovered */ 468 321 }; 469 322 470 323 static const char *action_name[] = { 324 + [IGNORED] = "Ignored", 471 325 [FAILED] = "Failed", 472 326 [DELAYED] = "Delayed", 473 - [IGNORED] = "Ignored", 474 327 [RECOVERED] = "Recovered", 475 328 }; 329 + 330 + /* 331 + * XXX: It is possible that a page is isolated from LRU cache, 332 + * and then kept in swap cache or failed to remove from page cache. 333 + * The page count will stop it from being freed by unpoison. 334 + * Stress tests should be aware of this memory leak problem. 
335 + */ 336 + static int delete_from_lru_cache(struct page *p) 337 + { 338 + if (!isolate_lru_page(p)) { 339 + /* 340 + * Clear sensible page flags, so that the buddy system won't 341 + * complain when the page is unpoison-and-freed. 342 + */ 343 + ClearPageActive(p); 344 + ClearPageUnevictable(p); 345 + /* 346 + * drop the page count elevated by isolate_lru_page() 347 + */ 348 + page_cache_release(p); 349 + return 0; 350 + } 351 + return -EIO; 352 + } 476 353 477 354 /* 478 355 * Error hit kernel page. ··· 504 333 * could be more sophisticated. 505 334 */ 506 335 static int me_kernel(struct page *p, unsigned long pfn) 507 - { 508 - return DELAYED; 509 - } 510 - 511 - /* 512 - * Already poisoned page. 513 - */ 514 - static int me_ignore(struct page *p, unsigned long pfn) 515 336 { 516 337 return IGNORED; 517 338 } ··· 518 355 } 519 356 520 357 /* 521 - * Free memory 522 - */ 523 - static int me_free(struct page *p, unsigned long pfn) 524 - { 525 - return DELAYED; 526 - } 527 - 528 - /* 529 358 * Clean (or cleaned) page cache page. 530 359 */ 531 360 static int me_pagecache_clean(struct page *p, unsigned long pfn) ··· 525 370 int err; 526 371 int ret = FAILED; 527 372 struct address_space *mapping; 373 + 374 + delete_from_lru_cache(p); 528 375 529 376 /* 530 377 * For anonymous pages we're done the only reference left ··· 657 500 /* Trigger EIO in shmem: */ 658 501 ClearPageUptodate(p); 659 502 660 - return DELAYED; 503 + if (!delete_from_lru_cache(p)) 504 + return DELAYED; 505 + else 506 + return FAILED; 661 507 } 662 508 663 509 static int me_swapcache_clean(struct page *p, unsigned long pfn) 664 510 { 665 511 delete_from_swap_cache(p); 666 512 667 - return RECOVERED; 513 + if (!delete_from_lru_cache(p)) 514 + return RECOVERED; 515 + else 516 + return FAILED; 668 517 } 669 518 670 519 /* ··· 713 550 #define tail (1UL << PG_tail) 714 551 #define compound (1UL << PG_compound) 715 552 #define slab (1UL << PG_slab) 716 - #define buddy (1UL << PG_buddy) 717 553 #define reserved (1UL << PG_reserved) 718 554 719 555 static struct page_state { ··· 721 559 char *msg; 722 560 int (*action)(struct page *p, unsigned long pfn); 723 561 } error_states[] = { 724 - { reserved, reserved, "reserved kernel", me_ignore }, 725 - { buddy, buddy, "free kernel", me_free }, 562 + { reserved, reserved, "reserved kernel", me_kernel }, 563 + /* 564 + * free pages are specially detected outside this table: 565 + * PG_buddy pages only make a small fraction of all free pages. 566 + */ 726 567 727 568 /* 728 569 * Could in theory check if slab page is free or if we can drop ··· 752 587 753 588 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 754 589 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 755 - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, 756 590 757 591 /* 758 592 * Catchall entry: must be at end. ··· 759 595 { 0, 0, "unknown page state", me_unknown }, 760 596 }; 761 597 598 + #undef dirty 599 + #undef sc 600 + #undef unevict 601 + #undef mlock 602 + #undef writeback 603 + #undef lru 604 + #undef swapbacked 605 + #undef head 606 + #undef tail 607 + #undef compound 608 + #undef slab 609 + #undef reserved 610 + 762 611 static void action_result(unsigned long pfn, char *msg, int result) 763 612 { 764 - struct page *page = NULL; 765 - if (pfn_valid(pfn)) 766 - page = pfn_to_page(pfn); 613 + struct page *page = pfn_to_page(pfn); 767 614 768 615 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 769 616 pfn, 770 - page && PageDirty(page) ? "dirty " : "", 617 + PageDirty(page) ? 
"dirty " : "", 771 618 msg, action_name[result]); 772 619 } 773 620 774 621 static int page_action(struct page_state *ps, struct page *p, 775 - unsigned long pfn, int ref) 622 + unsigned long pfn) 776 623 { 777 624 int result; 778 625 int count; ··· 791 616 result = ps->action(p, pfn); 792 617 action_result(pfn, ps->msg, result); 793 618 794 - count = page_count(p) - 1 - ref; 795 - if (count != 0) 619 + count = page_count(p) - 1; 620 + if (ps->action == me_swapcache_dirty && result == DELAYED) 621 + count--; 622 + if (count != 0) { 796 623 printk(KERN_ERR 797 624 "MCE %#lx: %s page still referenced by %d users\n", 798 625 pfn, ps->msg, count); 626 + result = FAILED; 627 + } 799 628 800 629 /* Could do more checks here if page looks ok */ 801 630 /* 802 631 * Could adjust zone counters here to correct for the missing page. 803 632 */ 804 633 805 - return result == RECOVERED ? 0 : -EBUSY; 634 + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 806 635 } 807 636 808 637 #define N_UNMAP_TRIES 5 ··· 815 636 * Do all that is necessary to remove user space mappings. Unmap 816 637 * the pages and send SIGBUS to the processes if the data was dirty. 817 638 */ 818 - static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 639 + static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 819 640 int trapno) 820 641 { 821 642 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; ··· 825 646 int i; 826 647 int kill = 1; 827 648 828 - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 829 - return; 649 + if (PageReserved(p) || PageSlab(p)) 650 + return SWAP_SUCCESS; 830 651 831 652 /* 832 653 * This check implies we don't kill processes if their pages 833 654 * are in the swap cache early. Those are always late kills. 834 655 */ 835 656 if (!page_mapped(p)) 836 - return; 657 + return SWAP_SUCCESS; 658 + 659 + if (PageCompound(p) || PageKsm(p)) 660 + return SWAP_FAIL; 837 661 838 662 if (PageSwapCache(p)) { 839 663 printk(KERN_ERR ··· 847 665 /* 848 666 * Propagate the dirty bit from PTEs to struct page first, because we 849 667 * need this to decide if we should kill or just drop the page. 668 + * XXX: the dirty test could be racy: set_page_dirty() may not always 669 + * be called inside page lock (it's recommended but not enforced). 
850 670 */ 851 671 mapping = page_mapping(p); 852 672 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { ··· 900 716 */ 901 717 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 902 718 ret != SWAP_SUCCESS, pfn); 719 + 720 + return ret; 903 721 } 904 722 905 - int __memory_failure(unsigned long pfn, int trapno, int ref) 723 + int __memory_failure(unsigned long pfn, int trapno, int flags) 906 724 { 907 - unsigned long lru_flag; 908 725 struct page_state *ps; 909 726 struct page *p; 910 727 int res; ··· 914 729 panic("Memory failure from trap %d on page %lx", trapno, pfn); 915 730 916 731 if (!pfn_valid(pfn)) { 917 - action_result(pfn, "memory outside kernel control", IGNORED); 918 - return -EIO; 732 + printk(KERN_ERR 733 + "MCE %#lx: memory outside kernel control\n", 734 + pfn); 735 + return -ENXIO; 919 736 } 920 737 921 738 p = pfn_to_page(pfn); 922 739 if (TestSetPageHWPoison(p)) { 923 - action_result(pfn, "already hardware poisoned", IGNORED); 740 + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 924 741 return 0; 925 742 } 926 743 ··· 939 752 * In fact it's dangerous to directly bump up page count from 0, 940 753 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 941 754 */ 942 - if (!get_page_unless_zero(compound_head(p))) { 943 - action_result(pfn, "free or high order kernel", IGNORED); 944 - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 755 + if (!(flags & MF_COUNT_INCREASED) && 756 + !get_page_unless_zero(compound_head(p))) { 757 + if (is_free_buddy_page(p)) { 758 + action_result(pfn, "free buddy", DELAYED); 759 + return 0; 760 + } else { 761 + action_result(pfn, "high order kernel", IGNORED); 762 + return -EBUSY; 763 + } 945 764 } 946 765 947 766 /* ··· 959 766 * walked by the page reclaim code, however that's not a big loss. 960 767 */ 961 768 if (!PageLRU(p)) 962 - lru_add_drain_all(); 963 - lru_flag = p->flags & lru; 964 - if (isolate_lru_page(p)) { 769 + shake_page(p, 0); 770 + if (!PageLRU(p)) { 771 + /* 772 + * shake_page could have turned it free. 773 + */ 774 + if (is_free_buddy_page(p)) { 775 + action_result(pfn, "free buddy, 2nd try", DELAYED); 776 + return 0; 777 + } 965 778 action_result(pfn, "non LRU", IGNORED); 966 779 put_page(p); 967 780 return -EBUSY; 968 781 } 969 - page_cache_release(p); 970 782 971 783 /* 972 784 * Lock the page and wait for writeback to finish. ··· 979 781 * and in many cases impossible, so we just avoid it here. 980 782 */ 981 783 lock_page_nosync(p); 784 + 785 + /* 786 + * unpoison always clear PG_hwpoison inside page lock 787 + */ 788 + if (!PageHWPoison(p)) { 789 + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 790 + res = 0; 791 + goto out; 792 + } 793 + if (hwpoison_filter(p)) { 794 + if (TestClearPageHWPoison(p)) 795 + atomic_long_dec(&mce_bad_pages); 796 + unlock_page(p); 797 + put_page(p); 798 + return 0; 799 + } 800 + 982 801 wait_on_page_writeback(p); 983 802 984 803 /* 985 804 * Now take care of user space mappings. 805 + * Abort on fail: __remove_from_page_cache() assumes unmapped page. 986 806 */ 987 - hwpoison_user_mappings(p, pfn, trapno); 807 + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 808 + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 809 + res = -EBUSY; 810 + goto out; 811 + } 988 812 989 813 /* 990 814 * Torn down by someone else? 
991 815 */ 992 - if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 816 + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 993 817 action_result(pfn, "already truncated LRU", IGNORED); 994 - res = 0; 818 + res = -EBUSY; 995 819 goto out; 996 820 } 997 821 998 822 res = -EBUSY; 999 823 for (ps = error_states;; ps++) { 1000 - if (((p->flags | lru_flag)& ps->mask) == ps->res) { 824 + if ((p->flags & ps->mask) == ps->res) { 1001 - res = page_action(ps, p, pfn, ref); 825 + res = page_action(ps, p, pfn); 1002 826 break; 1003 827 } 1004 828 } ··· 1050 830 void memory_failure(unsigned long pfn, int trapno) 1051 831 { 1052 832 __memory_failure(pfn, trapno, 0); 833 + } 834 + 835 + /** 836 + * unpoison_memory - Unpoison a previously poisoned page 837 + * @pfn: Page number of the page to be unpoisoned 838 + * 839 + * Software-unpoison a page that has been poisoned by 840 + * memory_failure() earlier. 841 + * 842 + * This is only done at the software level, so it only works 843 + * for Linux-injected failures, not real hardware failures. 844 + * 845 + * Returns 0 for success, otherwise -errno. 846 + */ 847 + int unpoison_memory(unsigned long pfn) 848 + { 849 + struct page *page; 850 + struct page *p; 851 + int freeit = 0; 852 + 853 + if (!pfn_valid(pfn)) 854 + return -ENXIO; 855 + 856 + p = pfn_to_page(pfn); 857 + page = compound_head(p); 858 + 859 + if (!PageHWPoison(p)) { 860 + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 861 + return 0; 862 + } 863 + 864 + if (!get_page_unless_zero(page)) { 865 + if (TestClearPageHWPoison(p)) 866 + atomic_long_dec(&mce_bad_pages); 867 + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 868 + return 0; 869 + } 870 + 871 + lock_page_nosync(page); 872 + /* 873 + * This test is racy because PG_hwpoison is set outside of page lock. 874 + * That's acceptable because that won't trigger kernel panic. Instead, 875 + * the PG_hwpoison page will be caught and isolated on the entrance to 876 + * the free buddy page pool. 877 + */ 878 + if (TestClearPageHWPoison(p)) { 879 + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 880 + atomic_long_dec(&mce_bad_pages); 881 + freeit = 1; 882 + } 883 + unlock_page(page); 884 + 885 + put_page(page); 886 + if (freeit) 887 + put_page(page); 888 + 889 + return 0; 890 + } 891 + EXPORT_SYMBOL(unpoison_memory); 892 + 893 + static struct page *new_page(struct page *p, unsigned long private, int **x) 894 + { 895 + int nid = page_to_nid(p); 896 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 897 + } 898 + 899 + /* 900 + * Safely get reference count of an arbitrary page. 901 + * Returns 0 for a free page, -EIO for a zero refcount page 902 + * that is not free, and 1 for any other page type. 903 + * For 1 the page is returned with increased page count, otherwise not. 904 + */ 905 + static int get_any_page(struct page *p, unsigned long pfn, int flags) 906 + { 907 + int ret; 908 + 909 + if (flags & MF_COUNT_INCREASED) 910 + return 1; 911 + 912 + /* 913 + * The lock_system_sleep prevents a race with memory hotplug, 914 + * because the isolation assumes there's only a single user. 915 + * This is a big hammer, a better one would be nicer. 916 + */ 917 + lock_system_sleep(); 918 + 919 + /* 920 + * Isolate the page, so that it doesn't get reallocated if it 921 + * was free. 
922 + */ 923 + set_migratetype_isolate(p); 924 + if (!get_page_unless_zero(compound_head(p))) { 925 + if (is_free_buddy_page(p)) { 926 + pr_debug("get_any_page: %#lx free buddy page\n", pfn); 927 + /* Set hwpoison bit while page is still isolated */ 928 + SetPageHWPoison(p); 929 + ret = 0; 930 + } else { 931 + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 932 + pfn, p->flags); 933 + ret = -EIO; 934 + } 935 + } else { 936 + /* Not a free page */ 937 + ret = 1; 938 + } 939 + unset_migratetype_isolate(p); 940 + unlock_system_sleep(); 941 + return ret; 942 + } 943 + 944 + /** 945 + * soft_offline_page - Soft offline a page. 946 + * @page: page to offline 947 + * @flags: flags. Same as memory_failure(). 948 + * 949 + * Returns 0 on success, otherwise negated errno. 950 + * 951 + * Soft offline a page, by migration or invalidation, 952 + * without killing anything. This is for the case when 953 + * a page is not corrupted yet (so it's still valid to access), 954 + * but has had a number of corrected errors and is better taken 955 + * out. 956 + * 957 + * The actual policy on when to do that is maintained by 958 + * user space. 959 + * 960 + * This should never impact any application or cause data loss, 961 + * however it might take some time. 962 + * 963 + * This is not a 100% solution for all memory, but tries to be 964 + * ``good enough'' for the majority of memory. 965 + */ 966 + int soft_offline_page(struct page *page, int flags) 967 + { 968 + int ret; 969 + unsigned long pfn = page_to_pfn(page); 970 + 971 + ret = get_any_page(page, pfn, flags); 972 + if (ret < 0) 973 + return ret; 974 + if (ret == 0) 975 + goto done; 976 + 977 + /* 978 + * Page cache page we can handle? 979 + */ 980 + if (!PageLRU(page)) { 981 + /* 982 + * Try to free it. 983 + */ 984 + put_page(page); 985 + shake_page(page, 1); 986 + 987 + /* 988 + * Did it turn free? 989 + */ 990 + ret = get_any_page(page, pfn, 0); 991 + if (ret < 0) 992 + return ret; 993 + if (ret == 0) 994 + goto done; 995 + } 996 + if (!PageLRU(page)) { 997 + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 998 + pfn, page->flags); 999 + return -EIO; 1000 + } 1001 + 1002 + lock_page(page); 1003 + wait_on_page_writeback(page); 1004 + 1005 + /* 1006 + * Synchronized using the page lock with memory_failure() 1007 + */ 1008 + if (PageHWPoison(page)) { 1009 + unlock_page(page); 1010 + put_page(page); 1011 + pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1012 + return -EBUSY; 1013 + } 1014 + 1015 + /* 1016 + * Try to invalidate first. This should work for 1017 + * non dirty unmapped page cache pages. 1018 + */ 1019 + ret = invalidate_inode_page(page); 1020 + unlock_page(page); 1021 + 1022 + /* 1023 + * Drop count because page migration doesn't like raised 1024 + * counts. The page could get re-allocated, but if it becomes 1025 + * LRU the isolation will just fail. 1026 + * RED-PEN would be better to keep it isolated here, but we 1027 + * would need to fix isolation locking first. 1028 + */ 1029 + put_page(page); 1030 + if (ret == 1) { 1031 + ret = 0; 1032 + pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1033 + goto done; 1034 + } 1035 + 1036 + /* 1037 + * Simple invalidation didn't work. 1038 + * Try to migrate to a new page instead. migrate.c 1039 + * handles a large number of cases for us. 
1040 + */ 1041 + ret = isolate_lru_page(page); 1042 + if (!ret) { 1043 + LIST_HEAD(pagelist); 1044 + 1045 + list_add(&page->lru, &pagelist); 1046 + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1047 + if (ret) { 1048 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1049 + pfn, ret, page->flags); 1050 + if (ret > 0) 1051 + ret = -EIO; 1052 + } 1053 + } else { 1054 + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1055 + pfn, ret, page_count(page), page->flags); 1056 + } 1057 + if (ret) 1058 + return ret; 1059 + 1060 + done: 1061 + atomic_long_add(1, &mce_bad_pages); 1062 + SetPageHWPoison(page); 1063 + /* keep elevated page count for bad page */ 1064 + return ret; 1053 1065 }
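One way to picture the flags filter above: hwpoison_filter_flags() compares stable_page_flags(p) & mask against value, so a tester who only wants to hit dirty LRU pages can program mask and value from the KPF_* bit numbers. A hedged sketch, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

#define KPF_DIRTY	4	/* bit numbers from linux/kernel-page-flags.h */
#define KPF_LRU		5

static int write_val(const char *file, unsigned long long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/hwpoison/%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(void)
{
	unsigned long long mask = (1ULL << KPF_DIRTY) | (1ULL << KPF_LRU);

	write_val("corrupt-filter-enable", 1);
	/* inject only when the page is both dirty and on the LRU */
	write_val("corrupt-filter-flags-mask", mask);
	write_val("corrupt-filter-flags-value", mask);
	return 0;
}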
+4
mm/memory.c
··· 2555 2555 ret = VM_FAULT_MAJOR; 2556 2556 count_vm_event(PGMAJFAULT); 2557 2557 } else if (PageHWPoison(page)) { 2558 + /* 2559 + * hwpoisoned dirty swapcache pages are kept for killing 2560 + * owner processes (which may be unknown at hwpoison time) 2561 + */ 2558 2562 ret = VM_FAULT_HWPOISON; 2559 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2560 2564 goto out_release;
+21
mm/page_alloc.c
··· 5091 5091 spin_unlock_irqrestore(&zone->lock, flags); 5092 5092 } 5093 5093 #endif 5094 + 5095 + #ifdef CONFIG_MEMORY_FAILURE 5096 + bool is_free_buddy_page(struct page *page) 5097 + { 5098 + struct zone *zone = page_zone(page); 5099 + unsigned long pfn = page_to_pfn(page); 5100 + unsigned long flags; 5101 + int order; 5102 + 5103 + spin_lock_irqsave(&zone->lock, flags); 5104 + for (order = 0; order < MAX_ORDER; order++) { 5105 + struct page *page_head = page - (pfn & ((1 << order) - 1)); 5106 + 5107 + if (PageBuddy(page_head) && page_order(page_head) >= order) 5108 + break; 5109 + } 5110 + spin_unlock_irqrestore(&zone->lock, flags); 5111 + 5112 + return order < MAX_ORDER; 5113 + } 5114 + #endif
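The loop above leans on the buddy allocator invariant that a free block's head page has the low `order` bits of its pfn clear, so masking those bits yields the only possible head at each order. A standalone illustration of that alignment step (the pfn and MAX_ORDER values here are assumptions for demonstration):

#include <stdio.h>

#define MAX_ORDER 11	/* typical configuration; an assumption here */

int main(void)
{
	unsigned long pfn = 0x12345;	/* hypothetical pfn */
	int order;

	/* candidate buddy head at each order: pfn with the low `order` bits cleared */
	for (order = 0; order < MAX_ORDER; order++)
		printf("order %2d: head pfn %#lx\n",
		       order, pfn & ~((1UL << order) - 1));
	return 0;
}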