Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (34 commits)
HWPOISON: Remove stray phrase in a comment
HWPOISON: Try to allocate migration page on the same node
HWPOISON: Don't do early filtering if filter is disabled
HWPOISON: Add a madvise() injector for soft page offlining
HWPOISON: Add soft page offline support
HWPOISON: Undefine short-hand macros after use to avoid namespace conflict
HWPOISON: Use new shake_page in memory_failure
HWPOISON: Use correct name for MADV_HWPOISON in documentation
HWPOISON: mention HWPoison in Kconfig entry
HWPOISON: Use get_user_page_fast in hwpoison madvise
HWPOISON: add an interface to switch off/on all the page filters
HWPOISON: add memory cgroup filter
memcg: add accessor to mem_cgroup.css
memcg: rename and export try_get_mem_cgroup_from_page()
HWPOISON: add page flags filter
mm: export stable page flags
HWPOISON: limit hwpoison injector to known page types
HWPOISON: add fs/device filters
HWPOISON: return 0 to indicate success reliably
HWPOISON: make semantics of IGNORED/DELAYED clear
...

+923 -127
+44
Documentation/ABI/testing/sysfs-memory-page-offline
···
··· 1 + What: /sys/devices/system/memory/soft_offline_page 2 + Date: Sep 2009 3 + KernelVersion: 2.6.33 4 + Contact: andi@firstfloor.org 5 + Description: 6 + Soft-offline the memory page containing the physical address 7 + written into this file. Input is a hex number specifying the 8 + physical address of the page. The kernel will then attempt 9 + to soft-offline it, by moving the contents elsewhere or 10 + dropping it if possible. The page will then be placed 11 + on the bad page list and never be reused. 12 + 13 + The offlining is done in kernel-specific granularity. 14 + Normally it's the base page size of the kernel, but 15 + this might change. 16 + 17 + The page must still be accessible, not poisoned. The 18 + kernel will never kill anything for this, but rather 19 + fail the offline. Return value is the size of the 20 + number, or an error when the offlining failed. Reading 21 + the file is not allowed. 22 + 23 + What: /sys/devices/system/memory/hard_offline_page 24 + Date: Sep 2009 25 + KernelVersion: 2.6.33 26 + Contact: andi@firstfloor.org 27 + Description: 28 + Hard-offline the memory page containing the physical 29 + address written into this file. Input is a hex number 30 + specifying the physical address of the page. The 31 + kernel will then attempt to hard-offline the page, by 32 + trying to drop the page, killing any owner, or 33 + triggering IO errors if needed. Note this may kill 34 + any processes owning the page. The kernel will avoid 35 + accessing this page, assuming it's poisoned by the 36 + hardware. 37 + 38 + The offlining is done in kernel-specific granularity. 39 + Normally it's the base page size of the kernel, but 40 + this might change. 41 + 42 + Return value is the size of the number, or an error when 43 + the offlining failed. 44 + Reading the file is not allowed.
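For illustration only, a minimal user-space sketch of driving the new soft_offline_page file (assumptions: root privileges, a kernel built with CONFIG_MEMORY_FAILURE, and a physical address obtained elsewhere, e.g. from mcelog or /proc/pid/pagemap; the default address below is purely hypothetical):

/* Sketch: ask the kernel to soft-offline the page at a physical address. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* The physical address is an assumption; pass the real one as argv[1]. */
	const char *paddr = argc > 1 ? argv[1] : "0x12345000";
	char buf[64];
	int fd, len;

	fd = open("/sys/devices/system/memory/soft_offline_page", O_WRONLY);
	if (fd < 0) {
		perror("soft_offline_page");
		return 1;
	}
	len = snprintf(buf, sizeof(buf), "%s\n", paddr);
	/* A successful write returns the number of bytes written;
	   an offlining failure comes back as an error from write(). */
	if (write(fd, buf, len) != len) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

hard_offline_page is driven the same way, but may kill processes owning the page, so it is only meant for pages already known to be bad.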
+49 -3
Documentation/vm/hwpoison.txt
··· 92 93 Testing: 94 95 - madvise(MADV_POISON, ....) 96 (as root) 97 Poison a page in the process for testing 98 99 100 hwpoison-inject module through debugfs 101 - /sys/debug/hwpoison/corrupt-pfn 102 103 - Inject hwpoison fault at PFN echoed into this file 104 105 106 Architecture specific MCE injector 107
··· 92 93 Testing: 94 95 + madvise(MADV_HWPOISON, ....) 96 (as root) 97 Poison a page in the process for testing 98 99 100 hwpoison-inject module through debugfs 101 102 + /sys/debug/hwpoison/ 103 104 + corrupt-pfn 105 + 106 + Inject hwpoison fault at PFN echoed into this file. This does 107 + some early filtering to avoid corrupting unintended pages in test suites. 108 + 109 + unpoison-pfn 110 + 111 + Software-unpoison page at PFN echoed into this file. This 112 + way a page can be reused again. 113 + This only works for Linux-injected failures, not for real 114 + memory failures. 115 + 116 + Note these injection interfaces are not stable and might change between 117 + kernel versions. 118 + 119 + corrupt-filter-dev-major 120 + corrupt-filter-dev-minor 121 + 122 + Only handle memory failures to pages associated with the file system defined 123 + by block device major/minor. -1U is the wildcard value. 124 + This should only be used for testing with artificial injection. 125 + 126 + corrupt-filter-memcg 127 + 128 + Limit injection to pages owned by a memory cgroup. Specified by inode number 129 + of the memcg. 130 + 131 + Example: 132 + mkdir /cgroup/hwpoison 133 + 134 + usemem -m 100 -s 1000 & 135 + echo `jobs -p` > /cgroup/hwpoison/tasks 136 + 137 + memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') 138 + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg 139 + 140 + page-types -p `pidof init` --hwpoison # shall do nothing 141 + page-types -p `pidof usemem` --hwpoison # poison its pages 142 + 143 + corrupt-filter-flags-mask 144 + corrupt-filter-flags-value 145 + 146 + When specified, only poison pages if ((page_flags & mask) == value). 147 + This allows stress testing of many kinds of pages. The page_flags 148 + are the same as in /proc/kpageflags. The flag bits are defined in 149 + include/linux/kernel-page-flags.h and documented in 150 + Documentation/vm/pagemap.txt 151 152 Architecture specific MCE injector 153
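As a rough companion to the testing section above, a sketch of the madvise() injector applied to one private test page (assumptions: root/CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE=y, and installed headers that may not yet define MADV_HWPOISON, hence the fallback define):

/* Sketch: inject a memory failure into one anonymous page for testing. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* mirrors asm-generic/mman-common.h */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, psz);		/* make sure the page is instantiated */
	if (madvise(p, psz, MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}
	/* From here on, touching p should deliver SIGBUS to this process. */
	return 0;
}

After the injection the page really is treated as poisoned, so test programs should expect SIGBUS on further access or exit promptly.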
+13 -2
Documentation/vm/page-types.c
··· 1 /* 2 * page-types: Tool for querying page flags 3 * 4 * Copyright (C) 2009 Intel corporation 5 * 6 * Authors: Wu Fengguang <fengguang.wu@intel.com> 7 - * 8 - * Released under the General Public License (GPL). 9 */ 10 11 #define _LARGEFILE64_SOURCE
··· 1 /* 2 * page-types: Tool for querying page flags 3 * 4 + * This program is free software; you can redistribute it and/or modify it 5 + * under the terms of the GNU General Public License as published by the Free 6 + * Software Foundation; version 2. 7 + * 8 + * This program is distributed in the hope that it will be useful, but WITHOUT 9 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 + * more details. 12 + * 13 + * You should find a copy of v2 of the GNU General Public License somewhere on 14 + * your Linux system; if not, write to the Free Software Foundation, Inc., 59 15 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA. 16 + * 17 * Copyright (C) 2009 Intel corporation 18 * 19 * Authors: Wu Fengguang <fengguang.wu@intel.com> 20 */ 21 22 #define _LARGEFILE64_SOURCE
+9
MAINTAINERS
··· 2377 S: Maintained 2378 F: drivers/hwmon/hdaps.c 2379 2380 HYPERVISOR VIRTUAL CONSOLE DRIVER 2381 L: linuxppc-dev@ozlabs.org 2382 S: Odd Fixes
··· 2377 S: Maintained 2378 F: drivers/hwmon/hdaps.c 2379 2380 + HWPOISON MEMORY FAILURE HANDLING 2381 + M: Andi Kleen <andi@firstfloor.org> 2382 + L: linux-mm@kvack.org 2383 + L: linux-kernel@vger.kernel.org 2384 + T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison 2385 + S: Maintained 2386 + F: mm/memory-failure.c 2387 + F: mm/hwpoison-inject.c 2388 + 2389 HYPERVISOR VIRTUAL CONSOLE DRIVER 2390 L: linuxppc-dev@ozlabs.org 2391 S: Odd Fixes
+61
drivers/base/memory.c
··· 341 } 342 #endif 343 344 /* 345 * Note that phys_device is optional. It is here to allow for 346 * differentiation between which *physical* devices each ··· 529 } 530 531 err = memory_probe_init(); 532 if (!ret) 533 ret = err; 534 err = block_size_init();
··· 341 } 342 #endif 343 344 + #ifdef CONFIG_MEMORY_FAILURE 345 + /* 346 + * Support for offlining pages of memory 347 + */ 348 + 349 + /* Soft offline a page */ 350 + static ssize_t 351 + store_soft_offline_page(struct class *class, const char *buf, size_t count) 352 + { 353 + int ret; 354 + u64 pfn; 355 + if (!capable(CAP_SYS_ADMIN)) 356 + return -EPERM; 357 + if (strict_strtoull(buf, 0, &pfn) < 0) 358 + return -EINVAL; 359 + pfn >>= PAGE_SHIFT; 360 + if (!pfn_valid(pfn)) 361 + return -ENXIO; 362 + ret = soft_offline_page(pfn_to_page(pfn), 0); 363 + return ret == 0 ? count : ret; 364 + } 365 + 366 + /* Forcibly offline a page, including killing processes. */ 367 + static ssize_t 368 + store_hard_offline_page(struct class *class, const char *buf, size_t count) 369 + { 370 + int ret; 371 + u64 pfn; 372 + if (!capable(CAP_SYS_ADMIN)) 373 + return -EPERM; 374 + if (strict_strtoull(buf, 0, &pfn) < 0) 375 + return -EINVAL; 376 + pfn >>= PAGE_SHIFT; 377 + ret = __memory_failure(pfn, 0, 0); 378 + return ret ? ret : count; 379 + } 380 + 381 + static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 382 + static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 383 + 384 + static __init int memory_fail_init(void) 385 + { 386 + int err; 387 + 388 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 389 + &class_attr_soft_offline_page.attr); 390 + if (!err) 391 + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 392 + &class_attr_hard_offline_page.attr); 393 + return err; 394 + } 395 + #else 396 + static inline int memory_fail_init(void) 397 + { 398 + return 0; 399 + } 400 + #endif 401 + 402 /* 403 * Note that phys_device is optional. It is here to allow for 404 * differentiation between which *physical* devices each ··· 471 } 472 473 err = memory_probe_init(); 474 + if (!ret) 475 + ret = err; 476 + err = memory_fail_init(); 477 if (!ret) 478 ret = err; 479 err = block_size_init();
+3 -42
fs/proc/page.c
··· 8 #include <linux/proc_fs.h> 9 #include <linux/seq_file.h> 10 #include <linux/hugetlb.h> 11 #include <asm/uaccess.h> 12 #include "internal.h" 13 ··· 72 * physical page flags. 73 */ 74 75 - /* These macros are used to decouple internal flags from exported ones */ 76 - 77 - #define KPF_LOCKED 0 78 - #define KPF_ERROR 1 79 - #define KPF_REFERENCED 2 80 - #define KPF_UPTODATE 3 81 - #define KPF_DIRTY 4 82 - #define KPF_LRU 5 83 - #define KPF_ACTIVE 6 84 - #define KPF_SLAB 7 85 - #define KPF_WRITEBACK 8 86 - #define KPF_RECLAIM 9 87 - #define KPF_BUDDY 10 88 - 89 - /* 11-20: new additions in 2.6.31 */ 90 - #define KPF_MMAP 11 91 - #define KPF_ANON 12 92 - #define KPF_SWAPCACHE 13 93 - #define KPF_SWAPBACKED 14 94 - #define KPF_COMPOUND_HEAD 15 95 - #define KPF_COMPOUND_TAIL 16 96 - #define KPF_HUGE 17 97 - #define KPF_UNEVICTABLE 18 98 - #define KPF_HWPOISON 19 99 - #define KPF_NOPAGE 20 100 - 101 - #define KPF_KSM 21 102 - 103 - /* kernel hacking assistances 104 - * WARNING: subject to change, never rely on them! 105 - */ 106 - #define KPF_RESERVED 32 107 - #define KPF_MLOCKED 33 108 - #define KPF_MAPPEDTODISK 34 109 - #define KPF_PRIVATE 35 110 - #define KPF_PRIVATE_2 36 111 - #define KPF_OWNER_PRIVATE 37 112 - #define KPF_ARCH 38 113 - #define KPF_UNCACHED 39 114 - 115 static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 116 { 117 return ((kflags >> kbit) & 1) << ubit; 118 } 119 120 - static u64 get_uflags(struct page *page) 121 { 122 u64 k; 123 u64 u; ··· 180 else 181 ppage = NULL; 182 183 - if (put_user(get_uflags(ppage), out)) { 184 ret = -EFAULT; 185 break; 186 }
··· 8 #include <linux/proc_fs.h> 9 #include <linux/seq_file.h> 10 #include <linux/hugetlb.h> 11 + #include <linux/kernel-page-flags.h> 12 #include <asm/uaccess.h> 13 #include "internal.h" 14 ··· 71 * physical page flags. 72 */ 73 74 static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 75 { 76 return ((kflags >> kbit) & 1) << ubit; 77 } 78 79 + u64 stable_page_flags(struct page *page) 80 { 81 u64 k; 82 u64 u; ··· 219 else 220 ppage = NULL; 221 222 + if (put_user(stable_page_flags(ppage), out)) { 223 ret = -EFAULT; 224 break; 225 }
+1
include/asm-generic/mman-common.h
··· 40 #define MADV_DONTFORK 10 /* don't inherit across fork */ 41 #define MADV_DOFORK 11 /* do inherit across fork */ 42 #define MADV_HWPOISON 100 /* poison a page for testing */ 43 44 #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 45 #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
··· 40 #define MADV_DONTFORK 10 /* don't inherit across fork */ 41 #define MADV_DOFORK 11 /* do inherit across fork */ 42 #define MADV_HWPOISON 100 /* poison a page for testing */ 43 + #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ 44 45 #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 46 #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+46
include/linux/kernel-page-flags.h
···
··· 1 + #ifndef LINUX_KERNEL_PAGE_FLAGS_H 2 + #define LINUX_KERNEL_PAGE_FLAGS_H 3 + 4 + /* 5 + * Stable page flag bits exported to user space 6 + */ 7 + 8 + #define KPF_LOCKED 0 9 + #define KPF_ERROR 1 10 + #define KPF_REFERENCED 2 11 + #define KPF_UPTODATE 3 12 + #define KPF_DIRTY 4 13 + #define KPF_LRU 5 14 + #define KPF_ACTIVE 6 15 + #define KPF_SLAB 7 16 + #define KPF_WRITEBACK 8 17 + #define KPF_RECLAIM 9 18 + #define KPF_BUDDY 10 19 + 20 + /* 11-20: new additions in 2.6.31 */ 21 + #define KPF_MMAP 11 22 + #define KPF_ANON 12 23 + #define KPF_SWAPCACHE 13 24 + #define KPF_SWAPBACKED 14 25 + #define KPF_COMPOUND_HEAD 15 26 + #define KPF_COMPOUND_TAIL 16 27 + #define KPF_HUGE 17 28 + #define KPF_UNEVICTABLE 18 29 + #define KPF_HWPOISON 19 30 + #define KPF_NOPAGE 20 31 + 32 + #define KPF_KSM 21 33 + 34 + /* kernel hacking assistances 35 + * WARNING: subject to change, never rely on them! 36 + */ 37 + #define KPF_RESERVED 32 38 + #define KPF_MLOCKED 33 39 + #define KPF_MAPPEDTODISK 34 40 + #define KPF_PRIVATE 35 41 + #define KPF_PRIVATE_2 36 42 + #define KPF_OWNER_PRIVATE 37 43 + #define KPF_ARCH 38 44 + #define KPF_UNCACHED 39 45 + 46 + #endif /* LINUX_KERNEL_PAGE_FLAGS_H */
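Since the corrupt-filter-flags-mask/-value pair described in hwpoison.txt compares against these exported bits, here is a small sketch (a hypothetical helper, not part of the patch) of computing filter values that limit injection to dirty LRU pages:

/* Sketch: derive hwpoison flags-filter values from the stable KPF_* bits. */
#include <stdio.h>

/* Bit numbers mirror the new include/linux/kernel-page-flags.h. */
#define KPF_DIRTY	4
#define KPF_LRU		5

int main(void)
{
	unsigned long long mask  = (1ULL << KPF_DIRTY) | (1ULL << KPF_LRU);
	unsigned long long value = mask;	/* require both bits to be set */

	/* Echo these into corrupt-filter-flags-mask/-value under debugfs. */
	printf("mask=%#llx value=%#llx\n", mask, value);
	return 0;
}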
+13
include/linux/memcontrol.h
··· 73 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); 74 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); 75 76 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 77 78 static inline ··· 85 rcu_read_unlock(); 86 return cgroup == mem; 87 } 88 89 extern int 90 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); ··· 205 { 206 } 207 208 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) 209 { 210 return 1; ··· 219 const struct mem_cgroup *mem) 220 { 221 return 1; 222 } 223 224 static inline int
··· 73 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); 74 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); 75 76 + extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 77 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 78 79 static inline ··· 84 rcu_read_unlock(); 85 return cgroup == mem; 86 } 87 + 88 + extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); 89 90 extern int 91 mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); ··· 202 { 203 } 204 205 + static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 206 + { 207 + return NULL; 208 + } 209 + 210 static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) 211 { 212 return 1; ··· 211 const struct mem_cgroup *mem) 212 { 213 return 1; 214 + } 215 + 216 + static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 217 + { 218 + return NULL; 219 } 220 221 static inline int
+7 -1
include/linux/mm.h
··· 1331 size_t size); 1332 extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1333 1334 extern void memory_failure(unsigned long pfn, int trapno); 1335 - extern int __memory_failure(unsigned long pfn, int trapno, int ref); 1336 extern int sysctl_memory_failure_early_kill; 1337 extern int sysctl_memory_failure_recovery; 1338 extern atomic_long_t mce_bad_pages; 1339 1340 #endif /* __KERNEL__ */ 1341 #endif /* _LINUX_MM_H */
··· 1331 size_t size); 1332 extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1333 1334 + enum mf_flags { 1335 + MF_COUNT_INCREASED = 1 << 0, 1336 + }; 1337 extern void memory_failure(unsigned long pfn, int trapno); 1338 + extern int __memory_failure(unsigned long pfn, int trapno, int flags); 1339 + extern int unpoison_memory(unsigned long pfn); 1340 extern int sysctl_memory_failure_early_kill; 1341 extern int sysctl_memory_failure_recovery; 1342 + extern void shake_page(struct page *p, int access); 1343 extern atomic_long_t mce_bad_pages; 1344 + extern int soft_offline_page(struct page *page, int flags); 1345 1346 #endif /* __KERNEL__ */ 1347 #endif /* _LINUX_MM_H */
+3 -1
include/linux/page-flags.h
··· 275 276 #ifdef CONFIG_MEMORY_FAILURE 277 PAGEFLAG(HWPoison, hwpoison) 278 - TESTSETFLAG(HWPoison, hwpoison) 279 #define __PG_HWPOISON (1UL << PG_hwpoison) 280 #else 281 PAGEFLAG_FALSE(HWPoison) 282 #define __PG_HWPOISON 0 283 #endif 284 285 static inline int PageUptodate(struct page *page) 286 {
··· 275 276 #ifdef CONFIG_MEMORY_FAILURE 277 PAGEFLAG(HWPoison, hwpoison) 278 + TESTSCFLAG(HWPoison, hwpoison) 279 #define __PG_HWPOISON (1UL << PG_hwpoison) 280 #else 281 PAGEFLAG_FALSE(HWPoison) 282 #define __PG_HWPOISON 0 283 #endif 284 + 285 + u64 stable_page_flags(struct page *page); 286 287 static inline int PageUptodate(struct page *page) 288 {
+2 -1
mm/Kconfig
··· 251 special hardware support and typically ECC memory. 252 253 config HWPOISON_INJECT 254 - tristate "Poison pages injector" 255 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 257 config NOMMU_INITIAL_TRIM_EXCESS 258 int "Turn on mmap() excess space trimming before booting"
··· 251 special hardware support and typically ECC memory. 252 253 config HWPOISON_INJECT 254 + tristate "HWPoison pages injector" 255 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 + select PROC_PAGE_MONITOR 257 258 config NOMMU_INITIAL_TRIM_EXCESS 259 int "Turn on mmap() excess space trimming before booting"
+105 -8
mm/hwpoison-inject.c
··· 3 #include <linux/debugfs.h> 4 #include <linux/kernel.h> 5 #include <linux/mm.h> 6 7 - static struct dentry *hwpoison_dir, *corrupt_pfn; 8 9 static int hwpoison_inject(void *data, u64 val) 10 { 11 if (!capable(CAP_SYS_ADMIN)) 12 return -EPERM; 13 - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 14 - return __memory_failure(val, 18, 0); 15 } 16 17 DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 18 19 static void pfn_inject_exit(void) 20 { ··· 74 75 static int pfn_inject_init(void) 76 { 77 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 78 if (hwpoison_dir == NULL) 79 return -ENOMEM; 80 - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 81 NULL, &hwpoison_fops); 82 - if (corrupt_pfn == NULL) { 83 - pfn_inject_exit(); 84 - return -ENOMEM; 85 - } 86 return 0; 87 } 88 89 module_init(pfn_inject_init);
··· 3 #include <linux/debugfs.h> 4 #include <linux/kernel.h> 5 #include <linux/mm.h> 6 + #include <linux/swap.h> 7 + #include <linux/pagemap.h> 8 + #include "internal.h" 9 10 + static struct dentry *hwpoison_dir; 11 12 static int hwpoison_inject(void *data, u64 val) 13 { 14 + unsigned long pfn = val; 15 + struct page *p; 16 + int err; 17 + 18 if (!capable(CAP_SYS_ADMIN)) 19 return -EPERM; 20 + 21 + if (!hwpoison_filter_enable) 22 + goto inject; 23 + if (!pfn_valid(pfn)) 24 + return -ENXIO; 25 + 26 + p = pfn_to_page(pfn); 27 + /* 28 + * This implies unable to support free buddy pages. 29 + */ 30 + if (!get_page_unless_zero(p)) 31 + return 0; 32 + 33 + if (!PageLRU(p)) 34 + shake_page(p, 0); 35 + /* 36 + * This implies unable to support non-LRU pages. 37 + */ 38 + if (!PageLRU(p)) 39 + return 0; 40 + 41 + /* 42 + * do a racy check with elevated page count, to make sure PG_hwpoison 43 + * will only be set for the targeted owner (or on a free page). 44 + * We temporarily take page lock for try_get_mem_cgroup_from_page(). 45 + * __memory_failure() will redo the check reliably inside page lock. 46 + */ 47 + lock_page(p); 48 + err = hwpoison_filter(p); 49 + unlock_page(p); 50 + if (err) 51 + return 0; 52 + 53 + inject: 54 + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 55 + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); 56 + } 57 + 58 + static int hwpoison_unpoison(void *data, u64 val) 59 + { 60 + if (!capable(CAP_SYS_ADMIN)) 61 + return -EPERM; 62 + 63 + return unpoison_memory(val); 64 } 65 66 DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 67 + DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); 68 69 static void pfn_inject_exit(void) 70 { ··· 24 25 static int pfn_inject_init(void) 26 { 27 + struct dentry *dentry; 28 + 29 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 30 if (hwpoison_dir == NULL) 31 return -ENOMEM; 32 + 33 + /* 34 + * Note that the below poison/unpoison interfaces do not involve 35 + * hardware status change, hence do not require hardware support. 36 + * They are mainly for testing hwpoison in software level. 37 + */ 38 + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 39 NULL, &hwpoison_fops); 40 + if (!dentry) 41 + goto fail; 42 + 43 + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, 44 + NULL, &unpoison_fops); 45 + if (!dentry) 46 + goto fail; 47 + 48 + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, 49 + hwpoison_dir, &hwpoison_filter_enable); 50 + if (!dentry) 51 + goto fail; 52 + 53 + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, 54 + hwpoison_dir, &hwpoison_filter_dev_major); 55 + if (!dentry) 56 + goto fail; 57 + 58 + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, 59 + hwpoison_dir, &hwpoison_filter_dev_minor); 60 + if (!dentry) 61 + goto fail; 62 + 63 + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, 64 + hwpoison_dir, &hwpoison_filter_flags_mask); 65 + if (!dentry) 66 + goto fail; 67 + 68 + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, 69 + hwpoison_dir, &hwpoison_filter_flags_value); 70 + if (!dentry) 71 + goto fail; 72 + 73 + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 74 + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 75 + hwpoison_dir, &hwpoison_filter_memcg); 76 + if (!dentry) 77 + goto fail; 78 + #endif 79 + 80 return 0; 81 + fail: 82 + pfn_inject_exit(); 83 + return -ENOMEM; 84 } 85 86 module_init(pfn_inject_init);
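A sketch of exercising the injector files created above from user space (assumptions: the hwpoison-inject module is loaded, debugfs is mounted at /sys/kernel/debug, and the default pfn is purely illustrative):

/* Sketch: enable early filtering and inject a failure at one pfn. */
#include <stdio.h>
#include <stdlib.h>

/* Debugfs is assumed to be mounted at /sys/kernel/debug. */
#define HWPOISON_DIR "/sys/kernel/debug/hwpoison/"

static int write_val(const char *file, unsigned long long val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), HWPOISON_DIR "%s", file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(int argc, char **argv)
{
	/* The pfn is illustrative; pass a real one as argv[1]. */
	unsigned long long pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0x12345;

	if (write_val("corrupt-filter-enable", 1))
		return 1;
	if (write_val("corrupt-pfn", pfn))
		return 1;
	return 0;
}

A matching write to unpoison-pfn undoes a software-injected poison, which is what keeps repeated test runs from exhausting memory.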
+12
mm/internal.h
··· 50 */ 51 extern void __free_pages_bootmem(struct page *page, unsigned int order); 52 extern void prep_compound_page(struct page *page, unsigned long order); 53 54 55 /* ··· 250 #define ZONE_RECLAIM_SOME 0 251 #define ZONE_RECLAIM_SUCCESS 1 252 #endif
··· 50 */ 51 extern void __free_pages_bootmem(struct page *page, unsigned int order); 52 extern void prep_compound_page(struct page *page, unsigned long order); 53 + #ifdef CONFIG_MEMORY_FAILURE 54 + extern bool is_free_buddy_page(struct page *page); 55 + #endif 56 57 58 /* ··· 247 #define ZONE_RECLAIM_SOME 0 248 #define ZONE_RECLAIM_SUCCESS 1 249 #endif 250 + 251 + extern int hwpoison_filter(struct page *p); 252 + 253 + extern u32 hwpoison_filter_dev_major; 254 + extern u32 hwpoison_filter_dev_minor; 255 + extern u64 hwpoison_filter_flags_mask; 256 + extern u64 hwpoison_filter_flags_value; 257 + extern u64 hwpoison_filter_memcg; 258 + extern u32 hwpoison_filter_enable;
+14 -7
mm/madvise.c
··· 9 #include <linux/pagemap.h> 10 #include <linux/syscalls.h> 11 #include <linux/mempolicy.h> 12 #include <linux/hugetlb.h> 13 #include <linux/sched.h> 14 #include <linux/ksm.h> ··· 223 /* 224 * Error injection support for memory error handling. 225 */ 226 - static int madvise_hwpoison(unsigned long start, unsigned long end) 227 { 228 int ret = 0; 229 ··· 231 return -EPERM; 232 for (; start < end; start += PAGE_SIZE) { 233 struct page *p; 234 - int ret = get_user_pages(current, current->mm, start, 1, 235 - 0, 0, &p, NULL); 236 if (ret != 1) 237 return ret; 238 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 239 page_to_pfn(p), start); 240 /* Ignore return value for now */ 241 - __memory_failure(page_to_pfn(p), 0, 1); 242 - put_page(p); 243 } 244 return ret; 245 } ··· 342 size_t len; 343 344 #ifdef CONFIG_MEMORY_FAILURE 345 - if (behavior == MADV_HWPOISON) 346 - return madvise_hwpoison(start, start+len_in); 347 #endif 348 if (!madvise_behavior_valid(behavior)) 349 return error;
··· 9 #include <linux/pagemap.h> 10 #include <linux/syscalls.h> 11 #include <linux/mempolicy.h> 12 + #include <linux/page-isolation.h> 13 #include <linux/hugetlb.h> 14 #include <linux/sched.h> 15 #include <linux/ksm.h> ··· 222 /* 223 * Error injection support for memory error handling. 224 */ 225 + static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 226 { 227 int ret = 0; 228 ··· 230 return -EPERM; 231 for (; start < end; start += PAGE_SIZE) { 232 struct page *p; 233 + int ret = get_user_pages_fast(start, 1, 0, &p); 234 if (ret != 1) 235 return ret; 236 + if (bhv == MADV_SOFT_OFFLINE) { 237 + printk(KERN_INFO "Soft offlining page %lx at %lx\n", 238 + page_to_pfn(p), start); 239 + ret = soft_offline_page(p, MF_COUNT_INCREASED); 240 + if (ret) 241 + break; 242 + continue; 243 + } 244 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 page_to_pfn(p), start); 246 /* Ignore return value for now */ 247 + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 248 } 249 return ret; 250 } ··· 335 size_t len; 336 337 #ifdef CONFIG_MEMORY_FAILURE 338 + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 339 + return madvise_hwpoison(behavior, start, start+len_in); 340 #endif 341 if (!madvise_behavior_valid(behavior)) 342 return error;
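The user-space counterpart of the new MADV_SOFT_OFFLINE branch above might look like the following sketch (root/CAP_SYS_ADMIN is required; the constant is defined locally in case the installed headers predate this patch):

/* Sketch: soft-offline a buffer's pages without losing their contents. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* mirrors asm-generic/mman-common.h */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0x5a, 4 * psz);
	if (madvise(p, 4 * psz, MADV_SOFT_OFFLINE)) {
		perror("madvise(MADV_SOFT_OFFLINE)");
		return 1;
	}
	/* Unlike MADV_HWPOISON the data stays valid: the contents were
	   migrated (or invalidated) and the old pages went on the bad list. */
	return p[0] == 0x5a ? 0 : 1;
}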
+9 -7
mm/memcontrol.c
··· 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 284 } 285 286 static struct mem_cgroup_per_zone * 287 page_cgroup_zoneinfo(struct page_cgroup *pc) 288 { ··· 1541 return container_of(css, struct mem_cgroup, css); 1542 } 1543 1544 - static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1545 { 1546 - struct mem_cgroup *mem; 1547 struct page_cgroup *pc; 1548 unsigned short id; 1549 swp_entry_t ent; 1550 1551 VM_BUG_ON(!PageLocked(page)); 1552 - 1553 - if (!PageSwapCache(page)) 1554 - return NULL; 1555 1556 pc = lookup_page_cgroup(page); 1557 lock_page_cgroup(pc); ··· 1556 mem = pc->mem_cgroup; 1557 if (mem && !css_tryget(&mem->css)) 1558 mem = NULL; 1559 - } else { 1560 ent.val = page_private(page); 1561 id = lookup_swap_cgroup(ent); 1562 rcu_read_lock(); ··· 1876 */ 1877 if (!PageSwapCache(page)) 1878 goto charge_cur_mm; 1879 - mem = try_get_mem_cgroup_from_swapcache(page); 1880 if (!mem) 1881 goto charge_cur_mm; 1882 *ptr = mem;
··· 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 284 } 285 286 + struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 287 + { 288 + return &mem->css; 289 + } 290 + 291 static struct mem_cgroup_per_zone * 292 page_cgroup_zoneinfo(struct page_cgroup *pc) 293 { ··· 1536 return container_of(css, struct mem_cgroup, css); 1537 } 1538 1539 + struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 1540 { 1541 + struct mem_cgroup *mem = NULL; 1542 struct page_cgroup *pc; 1543 unsigned short id; 1544 swp_entry_t ent; 1545 1546 VM_BUG_ON(!PageLocked(page)); 1547 1548 pc = lookup_page_cgroup(page); 1549 lock_page_cgroup(pc); ··· 1554 mem = pc->mem_cgroup; 1555 if (mem && !css_tryget(&mem->css)) 1556 mem = NULL; 1557 + } else if (PageSwapCache(page)) { 1558 ent.val = page_private(page); 1559 id = lookup_swap_cgroup(ent); 1560 rcu_read_lock(); ··· 1874 */ 1875 if (!PageSwapCache(page)) 1876 goto charge_cur_mm; 1877 + mem = try_get_mem_cgroup_from_page(page); 1878 if (!mem) 1879 goto charge_cur_mm; 1880 *ptr = mem;
+507 -55
mm/memory-failure.c
··· 34 #include <linux/kernel.h> 35 #include <linux/mm.h> 36 #include <linux/page-flags.h> 37 #include <linux/sched.h> 38 #include <linux/ksm.h> 39 #include <linux/rmap.h> 40 #include <linux/pagemap.h> 41 #include <linux/swap.h> 42 #include <linux/backing-dev.h> 43 #include "internal.h" 44 45 int sysctl_memory_failure_early_kill __read_mostly = 0; ··· 51 int sysctl_memory_failure_recovery __read_mostly = 1; 52 53 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 54 55 /* 56 * Send all the processes who have the page mapped an ``action optional'' ··· 199 t->comm, t->pid, ret); 200 return ret; 201 } 202 203 /* 204 * Kill all processes that have a poisoned page mapped and then isolate ··· 325 * In case something went wrong with munmapping 326 * make sure the process doesn't catch the 327 * signal and then access the memory. Just kill it. 328 - * the signal handlers 329 */ 330 if (fail || tk->addr_valid == 0) { 331 printk(KERN_ERR ··· 461 */ 462 463 enum outcome { 464 - FAILED, /* Error handling failed */ 465 DELAYED, /* Will be handled later */ 466 - IGNORED, /* Error safely ignored */ 467 RECOVERED, /* Successfully recovered */ 468 }; 469 470 static const char *action_name[] = { 471 [FAILED] = "Failed", 472 [DELAYED] = "Delayed", 473 - [IGNORED] = "Ignored", 474 [RECOVERED] = "Recovered", 475 }; 476 477 /* 478 * Error hit kernel page. ··· 504 * could be more sophisticated. 505 */ 506 static int me_kernel(struct page *p, unsigned long pfn) 507 - { 508 - return DELAYED; 509 - } 510 - 511 - /* 512 - * Already poisoned page. 513 - */ 514 - static int me_ignore(struct page *p, unsigned long pfn) 515 { 516 return IGNORED; 517 } ··· 518 } 519 520 /* 521 - * Free memory 522 - */ 523 - static int me_free(struct page *p, unsigned long pfn) 524 - { 525 - return DELAYED; 526 - } 527 - 528 - /* 529 * Clean (or cleaned) page cache page. 530 */ 531 static int me_pagecache_clean(struct page *p, unsigned long pfn) ··· 525 int err; 526 int ret = FAILED; 527 struct address_space *mapping; 528 529 /* 530 * For anonymous pages we're done the only reference left ··· 657 /* Trigger EIO in shmem: */ 658 ClearPageUptodate(p); 659 660 - return DELAYED; 661 } 662 663 static int me_swapcache_clean(struct page *p, unsigned long pfn) 664 { 665 delete_from_swap_cache(p); 666 667 - return RECOVERED; 668 } 669 670 /* ··· 713 #define tail (1UL << PG_tail) 714 #define compound (1UL << PG_compound) 715 #define slab (1UL << PG_slab) 716 - #define buddy (1UL << PG_buddy) 717 #define reserved (1UL << PG_reserved) 718 719 static struct page_state { ··· 721 char *msg; 722 int (*action)(struct page *p, unsigned long pfn); 723 } error_states[] = { 724 - { reserved, reserved, "reserved kernel", me_ignore }, 725 - { buddy, buddy, "free kernel", me_free }, 726 727 /* 728 * Could in theory check if slab page is free or if we can drop ··· 752 753 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 754 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 755 - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, 756 757 /* 758 * Catchall entry: must be at end. ··· 759 { 0, 0, "unknown page state", me_unknown }, 760 }; 761 762 static void action_result(unsigned long pfn, char *msg, int result) 763 { 764 - struct page *page = NULL; 765 - if (pfn_valid(pfn)) 766 - page = pfn_to_page(pfn); 767 768 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 769 pfn, 770 - page && PageDirty(page) ? 
"dirty " : "", 771 msg, action_name[result]); 772 } 773 774 static int page_action(struct page_state *ps, struct page *p, 775 - unsigned long pfn, int ref) 776 { 777 int result; 778 int count; ··· 791 result = ps->action(p, pfn); 792 action_result(pfn, ps->msg, result); 793 794 - count = page_count(p) - 1 - ref; 795 - if (count != 0) 796 printk(KERN_ERR 797 "MCE %#lx: %s page still referenced by %d users\n", 798 pfn, ps->msg, count); 799 800 /* Could do more checks here if page looks ok */ 801 /* 802 * Could adjust zone counters here to correct for the missing page. 803 */ 804 805 - return result == RECOVERED ? 0 : -EBUSY; 806 } 807 808 #define N_UNMAP_TRIES 5 ··· 815 * Do all that is necessary to remove user space mappings. Unmap 816 * the pages and send SIGBUS to the processes if the data was dirty. 817 */ 818 - static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 819 int trapno) 820 { 821 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; ··· 825 int i; 826 int kill = 1; 827 828 - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 829 - return; 830 831 /* 832 * This check implies we don't kill processes if their pages 833 * are in the swap cache early. Those are always late kills. 834 */ 835 if (!page_mapped(p)) 836 - return; 837 838 if (PageSwapCache(p)) { 839 printk(KERN_ERR ··· 847 /* 848 * Propagate the dirty bit from PTEs to struct page first, because we 849 * need this to decide if we should kill or just drop the page. 850 */ 851 mapping = page_mapping(p); 852 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { ··· 900 */ 901 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 902 ret != SWAP_SUCCESS, pfn); 903 } 904 905 - int __memory_failure(unsigned long pfn, int trapno, int ref) 906 { 907 - unsigned long lru_flag; 908 struct page_state *ps; 909 struct page *p; 910 int res; ··· 914 panic("Memory failure from trap %d on page %lx", trapno, pfn); 915 916 if (!pfn_valid(pfn)) { 917 - action_result(pfn, "memory outside kernel control", IGNORED); 918 - return -EIO; 919 } 920 921 p = pfn_to_page(pfn); 922 if (TestSetPageHWPoison(p)) { 923 - action_result(pfn, "already hardware poisoned", IGNORED); 924 return 0; 925 } 926 ··· 939 * In fact it's dangerous to directly bump up page count from 0, 940 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 941 */ 942 - if (!get_page_unless_zero(compound_head(p))) { 943 - action_result(pfn, "free or high order kernel", IGNORED); 944 - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 945 } 946 947 /* ··· 959 * walked by the page reclaim code, however that's not a big loss. 960 */ 961 if (!PageLRU(p)) 962 - lru_add_drain_all(); 963 - lru_flag = p->flags & lru; 964 - if (isolate_lru_page(p)) { 965 action_result(pfn, "non LRU", IGNORED); 966 put_page(p); 967 return -EBUSY; 968 } 969 - page_cache_release(p); 970 971 /* 972 * Lock the page and wait for writeback to finish. ··· 979 * and in many cases impossible, so we just avoid it here. 980 */ 981 lock_page_nosync(p); 982 wait_on_page_writeback(p); 983 984 /* 985 * Now take care of user space mappings. 986 */ 987 - hwpoison_user_mappings(p, pfn, trapno); 988 989 /* 990 * Torn down by someone else? 
991 */ 992 - if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 993 action_result(pfn, "already truncated LRU", IGNORED); 994 - res = 0; 995 goto out; 996 } 997 998 res = -EBUSY; 999 for (ps = error_states;; ps++) { 1000 - if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1001 - res = page_action(ps, p, pfn, ref); 1002 break; 1003 } 1004 } ··· 1050 void memory_failure(unsigned long pfn, int trapno) 1051 { 1052 __memory_failure(pfn, trapno, 0); 1053 }
··· 34 #include <linux/kernel.h> 35 #include <linux/mm.h> 36 #include <linux/page-flags.h> 37 + #include <linux/kernel-page-flags.h> 38 #include <linux/sched.h> 39 #include <linux/ksm.h> 40 #include <linux/rmap.h> 41 #include <linux/pagemap.h> 42 #include <linux/swap.h> 43 #include <linux/backing-dev.h> 44 + #include <linux/migrate.h> 45 + #include <linux/page-isolation.h> 46 + #include <linux/suspend.h> 47 #include "internal.h" 48 49 int sysctl_memory_failure_early_kill __read_mostly = 0; ··· 47 int sysctl_memory_failure_recovery __read_mostly = 1; 48 49 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 50 + 51 + u32 hwpoison_filter_enable = 0; 52 + u32 hwpoison_filter_dev_major = ~0U; 53 + u32 hwpoison_filter_dev_minor = ~0U; 54 + u64 hwpoison_filter_flags_mask; 55 + u64 hwpoison_filter_flags_value; 56 + EXPORT_SYMBOL_GPL(hwpoison_filter_enable); 57 + EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); 58 + EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); 59 + EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); 60 + EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); 61 + 62 + static int hwpoison_filter_dev(struct page *p) 63 + { 64 + struct address_space *mapping; 65 + dev_t dev; 66 + 67 + if (hwpoison_filter_dev_major == ~0U && 68 + hwpoison_filter_dev_minor == ~0U) 69 + return 0; 70 + 71 + /* 72 + * page_mapping() does not accept slab page 73 + */ 74 + if (PageSlab(p)) 75 + return -EINVAL; 76 + 77 + mapping = page_mapping(p); 78 + if (mapping == NULL || mapping->host == NULL) 79 + return -EINVAL; 80 + 81 + dev = mapping->host->i_sb->s_dev; 82 + if (hwpoison_filter_dev_major != ~0U && 83 + hwpoison_filter_dev_major != MAJOR(dev)) 84 + return -EINVAL; 85 + if (hwpoison_filter_dev_minor != ~0U && 86 + hwpoison_filter_dev_minor != MINOR(dev)) 87 + return -EINVAL; 88 + 89 + return 0; 90 + } 91 + 92 + static int hwpoison_filter_flags(struct page *p) 93 + { 94 + if (!hwpoison_filter_flags_mask) 95 + return 0; 96 + 97 + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == 98 + hwpoison_filter_flags_value) 99 + return 0; 100 + else 101 + return -EINVAL; 102 + } 103 + 104 + /* 105 + * This allows stress tests to limit test scope to a collection of tasks 106 + * by putting them under some memcg. This prevents killing unrelated/important 107 + * processes such as /sbin/init. Note that the target task may share clean 108 + * pages with init (eg. libc text), which is harmless. If the target task 109 + * share _dirty_ pages with another task B, the test scheme must make sure B 110 + * is also included in the memcg. At last, due to race conditions this filter 111 + * can only guarantee that the page either belongs to the memcg tasks, or is 112 + * a freed page. 
113 + */ 114 + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 115 + u64 hwpoison_filter_memcg; 116 + EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 117 + static int hwpoison_filter_task(struct page *p) 118 + { 119 + struct mem_cgroup *mem; 120 + struct cgroup_subsys_state *css; 121 + unsigned long ino; 122 + 123 + if (!hwpoison_filter_memcg) 124 + return 0; 125 + 126 + mem = try_get_mem_cgroup_from_page(p); 127 + if (!mem) 128 + return -EINVAL; 129 + 130 + css = mem_cgroup_css(mem); 131 + /* root_mem_cgroup has NULL dentries */ 132 + if (!css->cgroup->dentry) 133 + return -EINVAL; 134 + 135 + ino = css->cgroup->dentry->d_inode->i_ino; 136 + css_put(css); 137 + 138 + if (ino != hwpoison_filter_memcg) 139 + return -EINVAL; 140 + 141 + return 0; 142 + } 143 + #else 144 + static int hwpoison_filter_task(struct page *p) { return 0; } 145 + #endif 146 + 147 + int hwpoison_filter(struct page *p) 148 + { 149 + if (!hwpoison_filter_enable) 150 + return 0; 151 + 152 + if (hwpoison_filter_dev(p)) 153 + return -EINVAL; 154 + 155 + if (hwpoison_filter_flags(p)) 156 + return -EINVAL; 157 + 158 + if (hwpoison_filter_task(p)) 159 + return -EINVAL; 160 + 161 + return 0; 162 + } 163 + EXPORT_SYMBOL_GPL(hwpoison_filter); 164 165 /* 166 * Send all the processes who have the page mapped an ``action optional'' ··· 81 t->comm, t->pid, ret); 82 return ret; 83 } 84 + 85 + /* 86 + * When a unknown page type is encountered drain as many buffers as possible 87 + * in the hope to turn the page into a LRU or free page, which we can handle. 88 + */ 89 + void shake_page(struct page *p, int access) 90 + { 91 + if (!PageSlab(p)) { 92 + lru_add_drain_all(); 93 + if (PageLRU(p)) 94 + return; 95 + drain_all_pages(); 96 + if (PageLRU(p) || is_free_buddy_page(p)) 97 + return; 98 + } 99 + 100 + /* 101 + * Only all shrink_slab here (which would also 102 + * shrink other caches) if access is not potentially fatal. 103 + */ 104 + if (access) { 105 + int nr; 106 + do { 107 + nr = shrink_slab(1000, GFP_KERNEL, 1000); 108 + if (page_count(p) == 0) 109 + break; 110 + } while (nr > 10); 111 + } 112 + } 113 + EXPORT_SYMBOL_GPL(shake_page); 114 115 /* 116 * Kill all processes that have a poisoned page mapped and then isolate ··· 177 * In case something went wrong with munmapping 178 * make sure the process doesn't catch the 179 * signal and then access the memory. Just kill it. 180 */ 181 if (fail || tk->addr_valid == 0) { 182 printk(KERN_ERR ··· 314 */ 315 316 enum outcome { 317 + IGNORED, /* Error: cannot be handled */ 318 + FAILED, /* Error: handling failed */ 319 DELAYED, /* Will be handled later */ 320 RECOVERED, /* Successfully recovered */ 321 }; 322 323 static const char *action_name[] = { 324 + [IGNORED] = "Ignored", 325 [FAILED] = "Failed", 326 [DELAYED] = "Delayed", 327 [RECOVERED] = "Recovered", 328 }; 329 + 330 + /* 331 + * XXX: It is possible that a page is isolated from LRU cache, 332 + * and then kept in swap cache or failed to remove from page cache. 333 + * The page count will stop it from being freed by unpoison. 334 + * Stress tests should be aware of this memory leak problem. 335 + */ 336 + static int delete_from_lru_cache(struct page *p) 337 + { 338 + if (!isolate_lru_page(p)) { 339 + /* 340 + * Clear sensible page flags, so that the buddy system won't 341 + * complain when the page is unpoison-and-freed. 
342 + */ 343 + ClearPageActive(p); 344 + ClearPageUnevictable(p); 345 + /* 346 + * drop the page count elevated by isolate_lru_page() 347 + */ 348 + page_cache_release(p); 349 + return 0; 350 + } 351 + return -EIO; 352 + } 353 354 /* 355 * Error hit kernel page. ··· 333 * could be more sophisticated. 334 */ 335 static int me_kernel(struct page *p, unsigned long pfn) 336 { 337 return IGNORED; 338 } ··· 355 } 356 357 /* 358 * Clean (or cleaned) page cache page. 359 */ 360 static int me_pagecache_clean(struct page *p, unsigned long pfn) ··· 370 int err; 371 int ret = FAILED; 372 struct address_space *mapping; 373 + 374 + delete_from_lru_cache(p); 375 376 /* 377 * For anonymous pages we're done the only reference left ··· 500 /* Trigger EIO in shmem: */ 501 ClearPageUptodate(p); 502 503 + if (!delete_from_lru_cache(p)) 504 + return DELAYED; 505 + else 506 + return FAILED; 507 } 508 509 static int me_swapcache_clean(struct page *p, unsigned long pfn) 510 { 511 delete_from_swap_cache(p); 512 513 + if (!delete_from_lru_cache(p)) 514 + return RECOVERED; 515 + else 516 + return FAILED; 517 } 518 519 /* ··· 550 #define tail (1UL << PG_tail) 551 #define compound (1UL << PG_compound) 552 #define slab (1UL << PG_slab) 553 #define reserved (1UL << PG_reserved) 554 555 static struct page_state { ··· 559 char *msg; 560 int (*action)(struct page *p, unsigned long pfn); 561 } error_states[] = { 562 + { reserved, reserved, "reserved kernel", me_kernel }, 563 + /* 564 + * free pages are specially detected outside this table: 565 + * PG_buddy pages only make a small fraction of all free pages. 566 + */ 567 568 /* 569 * Could in theory check if slab page is free or if we can drop ··· 587 588 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 589 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 590 591 /* 592 * Catchall entry: must be at end. ··· 595 { 0, 0, "unknown page state", me_unknown }, 596 }; 597 598 + #undef dirty 599 + #undef sc 600 + #undef unevict 601 + #undef mlock 602 + #undef writeback 603 + #undef lru 604 + #undef swapbacked 605 + #undef head 606 + #undef tail 607 + #undef compound 608 + #undef slab 609 + #undef reserved 610 + 611 static void action_result(unsigned long pfn, char *msg, int result) 612 { 613 + struct page *page = pfn_to_page(pfn); 614 615 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 616 pfn, 617 + PageDirty(page) ? "dirty " : "", 618 msg, action_name[result]); 619 } 620 621 static int page_action(struct page_state *ps, struct page *p, 622 + unsigned long pfn) 623 { 624 int result; 625 int count; ··· 616 result = ps->action(p, pfn); 617 action_result(pfn, ps->msg, result); 618 619 + count = page_count(p) - 1; 620 + if (ps->action == me_swapcache_dirty && result == DELAYED) 621 + count--; 622 + if (count != 0) { 623 printk(KERN_ERR 624 "MCE %#lx: %s page still referenced by %d users\n", 625 pfn, ps->msg, count); 626 + result = FAILED; 627 + } 628 629 /* Could do more checks here if page looks ok */ 630 /* 631 * Could adjust zone counters here to correct for the missing page. 632 */ 633 634 + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 635 } 636 637 #define N_UNMAP_TRIES 5 ··· 636 * Do all that is necessary to remove user space mappings. Unmap 637 * the pages and send SIGBUS to the processes if the data was dirty. 
638 */ 639 + static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 640 int trapno) 641 { 642 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; ··· 646 int i; 647 int kill = 1; 648 649 + if (PageReserved(p) || PageSlab(p)) 650 + return SWAP_SUCCESS; 651 652 /* 653 * This check implies we don't kill processes if their pages 654 * are in the swap cache early. Those are always late kills. 655 */ 656 if (!page_mapped(p)) 657 + return SWAP_SUCCESS; 658 + 659 + if (PageCompound(p) || PageKsm(p)) 660 + return SWAP_FAIL; 661 662 if (PageSwapCache(p)) { 663 printk(KERN_ERR ··· 665 /* 666 * Propagate the dirty bit from PTEs to struct page first, because we 667 * need this to decide if we should kill or just drop the page. 668 + * XXX: the dirty test could be racy: set_page_dirty() may not always 669 + * be called inside page lock (it's recommended but not enforced). 670 */ 671 mapping = page_mapping(p); 672 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { ··· 716 */ 717 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 718 ret != SWAP_SUCCESS, pfn); 719 + 720 + return ret; 721 } 722 723 + int __memory_failure(unsigned long pfn, int trapno, int flags) 724 { 725 struct page_state *ps; 726 struct page *p; 727 int res; ··· 729 panic("Memory failure from trap %d on page %lx", trapno, pfn); 730 731 if (!pfn_valid(pfn)) { 732 + printk(KERN_ERR 733 + "MCE %#lx: memory outside kernel control\n", 734 + pfn); 735 + return -ENXIO; 736 } 737 738 p = pfn_to_page(pfn); 739 if (TestSetPageHWPoison(p)) { 740 + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 741 return 0; 742 } 743 ··· 752 * In fact it's dangerous to directly bump up page count from 0, 753 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 754 */ 755 + if (!(flags & MF_COUNT_INCREASED) && 756 + !get_page_unless_zero(compound_head(p))) { 757 + if (is_free_buddy_page(p)) { 758 + action_result(pfn, "free buddy", DELAYED); 759 + return 0; 760 + } else { 761 + action_result(pfn, "high order kernel", IGNORED); 762 + return -EBUSY; 763 + } 764 } 765 766 /* ··· 766 * walked by the page reclaim code, however that's not a big loss. 767 */ 768 if (!PageLRU(p)) 769 + shake_page(p, 0); 770 + if (!PageLRU(p)) { 771 + /* 772 + * shake_page could have turned it free. 773 + */ 774 + if (is_free_buddy_page(p)) { 775 + action_result(pfn, "free buddy, 2nd try", DELAYED); 776 + return 0; 777 + } 778 action_result(pfn, "non LRU", IGNORED); 779 put_page(p); 780 return -EBUSY; 781 } 782 783 /* 784 * Lock the page and wait for writeback to finish. ··· 781 * and in many cases impossible, so we just avoid it here. 782 */ 783 lock_page_nosync(p); 784 + 785 + /* 786 + * unpoison always clear PG_hwpoison inside page lock 787 + */ 788 + if (!PageHWPoison(p)) { 789 + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 790 + res = 0; 791 + goto out; 792 + } 793 + if (hwpoison_filter(p)) { 794 + if (TestClearPageHWPoison(p)) 795 + atomic_long_dec(&mce_bad_pages); 796 + unlock_page(p); 797 + put_page(p); 798 + return 0; 799 + } 800 + 801 wait_on_page_writeback(p); 802 803 /* 804 * Now take care of user space mappings. 805 + * Abort on fail: __remove_from_page_cache() assumes unmapped page. 806 */ 807 + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 808 + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 809 + res = -EBUSY; 810 + goto out; 811 + } 812 813 /* 814 * Torn down by someone else? 
815 */ 816 + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 817 action_result(pfn, "already truncated LRU", IGNORED); 818 + res = -EBUSY; 819 goto out; 820 } 821 822 res = -EBUSY; 823 for (ps = error_states;; ps++) { 824 + if ((p->flags & ps->mask) == ps->res) { 825 + res = page_action(ps, p, pfn); 826 break; 827 } 828 } ··· 830 void memory_failure(unsigned long pfn, int trapno) 831 { 832 __memory_failure(pfn, trapno, 0); 833 + } 834 + 835 + /** 836 + * unpoison_memory - Unpoison a previously poisoned page 837 + * @pfn: Page number of the to be unpoisoned page 838 + * 839 + * Software-unpoison a page that has been poisoned by 840 + * memory_failure() earlier. 841 + * 842 + * This is only done on the software-level, so it only works 843 + * for linux injected failures, not real hardware failures 844 + * 845 + * Returns 0 for success, otherwise -errno. 846 + */ 847 + int unpoison_memory(unsigned long pfn) 848 + { 849 + struct page *page; 850 + struct page *p; 851 + int freeit = 0; 852 + 853 + if (!pfn_valid(pfn)) 854 + return -ENXIO; 855 + 856 + p = pfn_to_page(pfn); 857 + page = compound_head(p); 858 + 859 + if (!PageHWPoison(p)) { 860 + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 861 + return 0; 862 + } 863 + 864 + if (!get_page_unless_zero(page)) { 865 + if (TestClearPageHWPoison(p)) 866 + atomic_long_dec(&mce_bad_pages); 867 + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 868 + return 0; 869 + } 870 + 871 + lock_page_nosync(page); 872 + /* 873 + * This test is racy because PG_hwpoison is set outside of page lock. 874 + * That's acceptable because that won't trigger kernel panic. Instead, 875 + * the PG_hwpoison page will be caught and isolated on the entrance to 876 + * the free buddy page pool. 877 + */ 878 + if (TestClearPageHWPoison(p)) { 879 + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 880 + atomic_long_dec(&mce_bad_pages); 881 + freeit = 1; 882 + } 883 + unlock_page(page); 884 + 885 + put_page(page); 886 + if (freeit) 887 + put_page(page); 888 + 889 + return 0; 890 + } 891 + EXPORT_SYMBOL(unpoison_memory); 892 + 893 + static struct page *new_page(struct page *p, unsigned long private, int **x) 894 + { 895 + int nid = page_to_nid(p); 896 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 897 + } 898 + 899 + /* 900 + * Safely get reference count of an arbitrary page. 901 + * Returns 0 for a free page, -EIO for a zero refcount page 902 + * that is not free, and 1 for any other page type. 903 + * For 1 the page is returned with increased page count, otherwise not. 904 + */ 905 + static int get_any_page(struct page *p, unsigned long pfn, int flags) 906 + { 907 + int ret; 908 + 909 + if (flags & MF_COUNT_INCREASED) 910 + return 1; 911 + 912 + /* 913 + * The lock_system_sleep prevents a race with memory hotplug, 914 + * because the isolation assumes there's only a single user. 915 + * This is a big hammer, a better would be nicer. 916 + */ 917 + lock_system_sleep(); 918 + 919 + /* 920 + * Isolate the page, so that it doesn't get reallocated if it 921 + * was free. 
922 + */ 923 + set_migratetype_isolate(p); 924 + if (!get_page_unless_zero(compound_head(p))) { 925 + if (is_free_buddy_page(p)) { 926 + pr_debug("get_any_page: %#lx free buddy page\n", pfn); 927 + /* Set hwpoison bit while page is still isolated */ 928 + SetPageHWPoison(p); 929 + ret = 0; 930 + } else { 931 + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 932 + pfn, p->flags); 933 + ret = -EIO; 934 + } 935 + } else { 936 + /* Not a free page */ 937 + ret = 1; 938 + } 939 + unset_migratetype_isolate(p); 940 + unlock_system_sleep(); 941 + return ret; 942 + } 943 + 944 + /** 945 + * soft_offline_page - Soft offline a page. 946 + * @page: page to offline 947 + * @flags: flags. Same as memory_failure(). 948 + * 949 + * Returns 0 on success, otherwise negated errno. 950 + * 951 + * Soft offline a page, by migration or invalidation, 952 + * without killing anything. This is for the case when 953 + * a page is not corrupted yet (so it's still valid to access), 954 + * but has had a number of corrected errors and is better taken 955 + * out. 956 + * 957 + * The actual policy on when to do that is maintained by 958 + * user space. 959 + * 960 + * This should never impact any application or cause data loss, 961 + * however it might take some time. 962 + * 963 + * This is not a 100% solution for all memory, but tries to be 964 + * ``good enough'' for the majority of memory. 965 + */ 966 + int soft_offline_page(struct page *page, int flags) 967 + { 968 + int ret; 969 + unsigned long pfn = page_to_pfn(page); 970 + 971 + ret = get_any_page(page, pfn, flags); 972 + if (ret < 0) 973 + return ret; 974 + if (ret == 0) 975 + goto done; 976 + 977 + /* 978 + * Page cache page we can handle? 979 + */ 980 + if (!PageLRU(page)) { 981 + /* 982 + * Try to free it. 983 + */ 984 + put_page(page); 985 + shake_page(page, 1); 986 + 987 + /* 988 + * Did it turn free? 989 + */ 990 + ret = get_any_page(page, pfn, 0); 991 + if (ret < 0) 992 + return ret; 993 + if (ret == 0) 994 + goto done; 995 + } 996 + if (!PageLRU(page)) { 997 + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 998 + pfn, page->flags); 999 + return -EIO; 1000 + } 1001 + 1002 + lock_page(page); 1003 + wait_on_page_writeback(page); 1004 + 1005 + /* 1006 + * Synchronized using the page lock with memory_failure() 1007 + */ 1008 + if (PageHWPoison(page)) { 1009 + unlock_page(page); 1010 + put_page(page); 1011 + pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1012 + return -EBUSY; 1013 + } 1014 + 1015 + /* 1016 + * Try to invalidate first. This should work for 1017 + * non dirty unmapped page cache pages. 1018 + */ 1019 + ret = invalidate_inode_page(page); 1020 + unlock_page(page); 1021 + 1022 + /* 1023 + * Drop count because page migration doesn't like raised 1024 + * counts. The page could get re-allocated, but if it becomes 1025 + * LRU the isolation will just fail. 1026 + * RED-PEN would be better to keep it isolated here, but we 1027 + * would need to fix isolation locking first. 1028 + */ 1029 + put_page(page); 1030 + if (ret == 1) { 1031 + ret = 0; 1032 + pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1033 + goto done; 1034 + } 1035 + 1036 + /* 1037 + * Simple invalidation didn't work. 1038 + * Try to migrate to a new page instead. migrate.c 1039 + * handles a large number of cases for us. 
1040 + */ 1041 + ret = isolate_lru_page(page); 1042 + if (!ret) { 1043 + LIST_HEAD(pagelist); 1044 + 1045 + list_add(&page->lru, &pagelist); 1046 + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1047 + if (ret) { 1048 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1049 + pfn, ret, page->flags); 1050 + if (ret > 0) 1051 + ret = -EIO; 1052 + } 1053 + } else { 1054 + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1055 + pfn, ret, page_count(page), page->flags); 1056 + } 1057 + if (ret) 1058 + return ret; 1059 + 1060 + done: 1061 + atomic_long_add(1, &mce_bad_pages); 1062 + SetPageHWPoison(page); 1063 + /* keep elevated page count for bad page */ 1064 + return ret; 1065 }
+4
mm/memory.c
··· 2555 ret = VM_FAULT_MAJOR; 2556 count_vm_event(PGMAJFAULT); 2557 } else if (PageHWPoison(page)) { 2558 ret = VM_FAULT_HWPOISON; 2559 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2560 goto out_release;
··· 2555 ret = VM_FAULT_MAJOR; 2556 count_vm_event(PGMAJFAULT); 2557 } else if (PageHWPoison(page)) { 2558 + /* 2559 + * hwpoisoned dirty swapcache pages are kept for killing 2560 + * owner processes (which may be unknown at hwpoison time) 2561 + */ 2562 ret = VM_FAULT_HWPOISON; 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2564 goto out_release;
+21
mm/page_alloc.c
··· 5091 spin_unlock_irqrestore(&zone->lock, flags); 5092 } 5093 #endif
··· 5091 spin_unlock_irqrestore(&zone->lock, flags); 5092 } 5093 #endif 5094 + 5095 + #ifdef CONFIG_MEMORY_FAILURE 5096 + bool is_free_buddy_page(struct page *page) 5097 + { 5098 + struct zone *zone = page_zone(page); 5099 + unsigned long pfn = page_to_pfn(page); 5100 + unsigned long flags; 5101 + int order; 5102 + 5103 + spin_lock_irqsave(&zone->lock, flags); 5104 + for (order = 0; order < MAX_ORDER; order++) { 5105 + struct page *page_head = page - (pfn & ((1 << order) - 1)); 5106 + 5107 + if (PageBuddy(page_head) && page_order(page_head) >= order) 5108 + break; 5109 + } 5110 + spin_unlock_irqrestore(&zone->lock, flags); 5111 + 5112 + return order < MAX_ORDER; 5113 + } 5114 + #endif