Merge branch 'apei' into apei-release · tjh.dev/kernel@d0e323b

+9 -2

Documentation/acpi/apei/einj.txt

··· 48 48 - param1 49 49 This file is used to set the first error parameter value. Effect of 50 50 parameter depends on error_type specified. For memory error, this is 51 - physical memory address. 51 + physical memory address. Only available if param_extension module 52 + parameter is specified. 52 53 53 54 - param2 54 55 This file is used to set the second error parameter value. Effect of 55 56 parameter depends on error_type specified. For memory error, this is 56 - physical memory address mask. 57 + physical memory address mask. Only available if param_extension 58 + module parameter is specified. 59 + 60 + Injecting parameter support is a BIOS version specific extension, that 61 + is, it only works on some BIOS versions. If you want to use it, please 62 + make sure your BIOS version has the proper support and specify 63 + "param_extension=y" as a module parameter. 57 64 58 65 For more information about EINJ, please refer to ACPI specification 59 66 version 4.0, section 17.5.

+3

arch/Kconfig

··· 178 178 config HAVE_RCU_TABLE_FREE 179 179 bool 180 180 181 + config ARCH_HAVE_NMI_SAFE_CMPXCHG 182 + bool 183 + 181 184 source "kernel/gcov/Kconfig"

+1

arch/alpha/Kconfig

··· 14 14 select AUTO_IRQ_AFFINITY if SMP 15 15 select GENERIC_IRQ_SHOW 16 16 select ARCH_WANT_OPTIONAL_GPIOLIB 17 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 17 18 help 18 19 The Alpha is a 64-bit general-purpose processor designed and 19 20 marketed by the Digital Equipment Corporation of blessed memory,

+1

arch/avr32/Kconfig

··· 10 10 select GENERIC_IRQ_PROBE 11 11 select HARDIRQS_SW_RESEND 12 12 select GENERIC_IRQ_SHOW 13 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 13 14 help 14 15 AVR32 is a high-performance 32-bit RISC microprocessor core, 15 16 designed for cost-sensitive embedded applications, with particular

+1

arch/frv/Kconfig

··· 7 7 select HAVE_PERF_EVENTS 8 8 select HAVE_GENERIC_HARDIRQS 9 9 select GENERIC_IRQ_SHOW 10 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 10 11 11 12 config ZONE_DMA 12 13 bool

+1

arch/ia64/Kconfig

··· 28 28 select IRQ_PER_CPU 29 29 select GENERIC_IRQ_SHOW 30 30 select ARCH_WANT_OPTIONAL_GPIOLIB 31 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 31 32 default y 32 33 help 33 34 The Itanium Processor Family is Intel's 64-bit successor to

+1

arch/m68k/Kconfig

··· 6 6 select GENERIC_ATOMIC64 if MMU 7 7 select HAVE_GENERIC_HARDIRQS if !MMU 8 8 select GENERIC_IRQ_SHOW if !MMU 9 + select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS 9 10 10 11 config RWSEM_GENERIC_SPINLOCK 11 12 bool

+1

arch/parisc/Kconfig

··· 15 15 select HAVE_GENERIC_HARDIRQS 16 16 select GENERIC_IRQ_PROBE 17 17 select IRQ_PER_CPU 18 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 18 19 19 20 help 20 21 The PA-RISC microprocessor is designed by Hewlett-Packard and used

+1

arch/powerpc/Kconfig

··· 136 136 select HAVE_SYSCALL_TRACEPOINTS 137 137 select HAVE_BPF_JIT if (PPC64 && NET) 138 138 select HAVE_ARCH_JUMP_LABEL 139 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 139 140 140 141 config EARLY_PRINTK 141 142 bool

+1

arch/s390/Kconfig

··· 81 81 select INIT_ALL_POSSIBLE 82 82 select HAVE_IRQ_WORK 83 83 select HAVE_PERF_EVENTS 84 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 84 85 select HAVE_KERNEL_GZIP 85 86 select HAVE_KERNEL_BZIP2 86 87 select HAVE_KERNEL_LZMA

+1

arch/sh/Kconfig

··· 11 11 select HAVE_DMA_ATTRS 12 12 select HAVE_IRQ_WORK 13 13 select HAVE_PERF_EVENTS 14 + select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) 14 15 select PERF_USE_VMALLOC 15 16 select HAVE_KERNEL_GZIP 16 17 select HAVE_KERNEL_BZIP2

+1

arch/sparc/Kconfig

··· 54 54 select HAVE_PERF_EVENTS 55 55 select PERF_USE_VMALLOC 56 56 select IRQ_PREFLOW_FASTEOI 57 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 57 58 58 59 config ARCH_DEFCONFIG 59 60 string

+1

arch/tile/Kconfig

··· 12 12 select GENERIC_PENDING_IRQ if SMP 13 13 select GENERIC_IRQ_SHOW 14 14 select SYS_HYPERVISOR 15 + select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386 15 16 16 17 # FIXME: investigate whether we need/want these options. 17 18 # select HAVE_IOREMAP_PROT

+1

arch/x86/Kconfig

··· 72 72 select USE_GENERIC_SMP_HELPERS if SMP 73 73 select HAVE_BPF_JIT if (X86_64 && NET) 74 74 select CLKEVT_I8253 75 + select ARCH_HAVE_NMI_SAFE_CMPXCHG 75 76 76 77 config INSTRUCTION_DECODER 77 78 def_bool (KPROBES || PERF_EVENTS)

+10 -1

drivers/acpi/apei/Kconfig

··· 10 10 error injection. 11 11 12 12 config ACPI_APEI_GHES 13 - tristate "APEI Generic Hardware Error Source" 13 + bool "APEI Generic Hardware Error Source" 14 14 depends on ACPI_APEI && X86 15 15 select ACPI_HED 16 + select LLIST 17 + select GENERIC_ALLOCATOR 16 18 help 17 19 Generic Hardware Error Source provides a way to report 18 20 platform hardware errors (such as that from chipset). It ··· 31 29 help 32 30 PCIe AER errors may be reported via APEI firmware first mode. 33 31 Turn on this option to enable the corresponding support. 32 + 33 + config ACPI_APEI_MEMORY_FAILURE 34 + bool "APEI memory error recovering support" 35 + depends on ACPI_APEI && MEMORY_FAILURE 36 + help 37 + Memory errors may be reported via APEI firmware first mode. 38 + Turn on this option to enable the memory recovering support. 34 39 35 40 config ACPI_APEI_EINJ 36 41 tristate "APEI Error INJection (EINJ)"

+31 -4

drivers/acpi/apei/apei-base.c

··· 157 157 * Interpret the specified action. Go through whole action table, 158 158 * execute all instructions belong to the action. 159 159 */ 160 - int apei_exec_run(struct apei_exec_context *ctx, u8 action) 160 + int __apei_exec_run(struct apei_exec_context *ctx, u8 action, 161 + bool optional) 161 162 { 162 - int rc; 163 + int rc = -ENOENT; 163 164 u32 i, ip; 164 165 struct acpi_whea_header *entry; 165 166 apei_exec_ins_func_t run; ··· 199 198 goto rewind; 200 199 } 201 200 202 - return 0; 201 + return !optional && rc < 0 ? rc : 0; 203 202 } 204 - EXPORT_SYMBOL_GPL(apei_exec_run); 203 + EXPORT_SYMBOL_GPL(__apei_exec_run); 205 204 206 205 typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx, 207 206 struct acpi_whea_header *entry, ··· 604 603 return dapei; 605 604 } 606 605 EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); 606 + 607 + int apei_osc_setup(void) 608 + { 609 + static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c"; 610 + acpi_handle handle; 611 + u32 capbuf[3]; 612 + struct acpi_osc_context context = { 613 + .uuid_str = whea_uuid_str, 614 + .rev = 1, 615 + .cap.length = sizeof(capbuf), 616 + .cap.pointer = capbuf, 617 + }; 618 + 619 + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; 620 + capbuf[OSC_SUPPORT_TYPE] = 0; 621 + capbuf[OSC_CONTROL_TYPE] = 0; 622 + 623 + if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)) 624 + || ACPI_FAILURE(acpi_run_osc(handle, &context))) 625 + return -EIO; 626 + else { 627 + kfree(context.ret.pointer); 628 + return 0; 629 + } 630 + } 631 + EXPORT_SYMBOL_GPL(apei_osc_setup);

+14 -1

drivers/acpi/apei/apei-internal.h

··· 50 50 return ctx->value; 51 51 } 52 52 53 - int apei_exec_run(struct apei_exec_context *ctx, u8 action); 53 + int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional); 54 + 55 + static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action) 56 + { 57 + return __apei_exec_run(ctx, action, 0); 58 + } 59 + 60 + /* It is optional whether the firmware provides the action */ 61 + static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action) 62 + { 63 + return __apei_exec_run(ctx, action, 1); 64 + } 54 65 55 66 /* Common instruction implementation */ 56 67 ··· 124 113 const struct acpi_hest_generic_status *estatus); 125 114 int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus); 126 115 int apei_estatus_check(const struct acpi_hest_generic_status *estatus); 116 + 117 + int apei_osc_setup(void); 127 118 #endif

+26 -17

drivers/acpi/apei/einj.c

··· 46 46 * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the 47 47 * EINJ table through an unpublished extension. Use with caution as 48 48 * most will ignore the parameter and make their own choice of address 49 - * for error injection. 49 + * for error injection. This extension is used only if 50 + * param_extension module parameter is specified. 50 51 */ 51 52 struct einj_parameter { 52 53 u64 type; ··· 65 64 #define EINJ_TAB_ENTRY(tab) \ 66 65 ((struct acpi_whea_header *)((char *)(tab) + \ 67 66 sizeof(struct acpi_table_einj))) 67 + 68 + static bool param_extension; 69 + module_param(param_extension, bool, 0); 68 70 69 71 static struct acpi_table_einj *einj_tab; 70 72 ··· 289 285 290 286 einj_exec_ctx_init(&ctx); 291 287 292 - rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION); 288 + rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION); 293 289 if (rc) 294 290 return rc; 295 291 apei_exec_ctx_set_input(&ctx, type); ··· 327 323 rc = __einj_error_trigger(trigger_paddr); 328 324 if (rc) 329 325 return rc; 330 - rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION); 326 + rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION); 331 327 332 328 return rc; 333 329 } ··· 493 489 einj_debug_dir, NULL, &error_type_fops); 494 490 if (!fentry) 495 491 goto err_cleanup; 496 - fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, 497 - einj_debug_dir, &error_param1); 498 - if (!fentry) 499 - goto err_cleanup; 500 - fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, 501 - einj_debug_dir, &error_param2); 502 - if (!fentry) 503 - goto err_cleanup; 504 492 fentry = debugfs_create_file("error_inject", S_IWUSR, 505 493 einj_debug_dir, NULL, &error_inject_fops); 506 494 if (!fentry) ··· 509 513 rc = apei_exec_pre_map_gars(&ctx); 510 514 if (rc) 511 515 goto err_release; 512 - param_paddr = einj_get_parameter_address(); 513 - if (param_paddr) { 514 - einj_param = ioremap(param_paddr, sizeof(*einj_param)); 515 - rc = -ENOMEM; 516 - if (!einj_param) 
517 - goto err_unmap; 516 + if (param_extension) { 517 + param_paddr = einj_get_parameter_address(); 518 + if (param_paddr) { 519 + einj_param = ioremap(param_paddr, sizeof(*einj_param)); 520 + rc = -ENOMEM; 521 + if (!einj_param) 522 + goto err_unmap; 523 + fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, 524 + einj_debug_dir, &error_param1); 525 + if (!fentry) 526 + goto err_unmap; 527 + fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, 528 + einj_debug_dir, &error_param2); 529 + if (!fentry) 530 + goto err_unmap; 531 + } else 532 + pr_warn(EINJ_PFX "Parameter extension is not supported.\n"); 518 533 } 519 534 520 535 pr_info(EINJ_PFX "Error INJection is initialized.\n"); ··· 533 526 return 0; 534 527 535 528 err_unmap: 529 + if (einj_param) 530 + iounmap(einj_param); 536 531 apei_exec_post_unmap_gars(&ctx); 537 532 err_release: 538 533 apei_resources_release(&einj_resources);

+5 -1

drivers/acpi/apei/erst-dbg.c

··· 33 33 34 34 #define ERST_DBG_PFX "ERST DBG: " 35 35 36 - #define ERST_DBG_RECORD_LEN_MAX 4096 36 + #define ERST_DBG_RECORD_LEN_MAX 0x4000 37 37 38 38 static void *erst_dbg_buf; 39 39 static unsigned int erst_dbg_buf_len; ··· 213 213 214 214 static __init int erst_dbg_init(void) 215 215 { 216 + if (erst_disable) { 217 + pr_info(ERST_DBG_PFX "ERST support is disabled.\n"); 218 + return -ENODEV; 219 + } 216 220 return misc_register(&erst_dbg_dev); 217 221 } 218 222

+6 -6

drivers/acpi/apei/erst.c

··· 642 642 int rc; 643 643 644 644 erst_exec_ctx_init(&ctx); 645 - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE); 645 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE); 646 646 if (rc) 647 647 return rc; 648 648 apei_exec_ctx_set_input(&ctx, offset); ··· 666 666 if (rc) 667 667 return rc; 668 668 val = apei_exec_ctx_get_output(&ctx); 669 - rc = apei_exec_run(&ctx, ACPI_ERST_END); 669 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); 670 670 if (rc) 671 671 return rc; 672 672 ··· 681 681 int rc; 682 682 683 683 erst_exec_ctx_init(&ctx); 684 - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ); 684 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ); 685 685 if (rc) 686 686 return rc; 687 687 apei_exec_ctx_set_input(&ctx, offset); ··· 709 709 if (rc) 710 710 return rc; 711 711 val = apei_exec_ctx_get_output(&ctx); 712 - rc = apei_exec_run(&ctx, ACPI_ERST_END); 712 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); 713 713 if (rc) 714 714 return rc; 715 715 ··· 724 724 int rc; 725 725 726 726 erst_exec_ctx_init(&ctx); 727 - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR); 727 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR); 728 728 if (rc) 729 729 return rc; 730 730 apei_exec_ctx_set_input(&ctx, record_id); ··· 748 748 if (rc) 749 749 return rc; 750 750 val = apei_exec_ctx_get_output(&ctx); 751 - rc = apei_exec_run(&ctx, ACPI_ERST_END); 751 + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); 752 752 if (rc) 753 753 return rc; 754 754

+403 -28

drivers/acpi/apei/ghes.c

··· 12 12 * For more information about Generic Hardware Error Source, please 13 13 * refer to ACPI Specification version 4.0, section 17.3.2.6 14 14 * 15 - * Copyright 2010 Intel Corp. 15 + * Copyright 2010,2011 Intel Corp. 16 16 * Author: Huang Ying <ying.huang@intel.com> 17 17 * 18 18 * This program is free software; you can redistribute it and/or ··· 42 42 #include <linux/mutex.h> 43 43 #include <linux/ratelimit.h> 44 44 #include <linux/vmalloc.h> 45 + #include <linux/irq_work.h> 46 + #include <linux/llist.h> 47 + #include <linux/genalloc.h> 45 48 #include <acpi/apei.h> 46 49 #include <acpi/atomicio.h> 47 50 #include <acpi/hed.h> ··· 56 53 #define GHES_PFX "GHES: " 57 54 58 55 #define GHES_ESTATUS_MAX_SIZE 65536 56 + #define GHES_ESOURCE_PREALLOC_MAX_SIZE 65536 57 + 58 + #define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3 59 + 60 + /* This is just an estimation for memory pool allocation */ 61 + #define GHES_ESTATUS_CACHE_AVG_SIZE 512 62 + 63 + #define GHES_ESTATUS_CACHES_SIZE 4 64 + 65 + #define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL 66 + /* Prevent too many caches are allocated because of RCU */ 67 + #define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2) 68 + 69 + #define GHES_ESTATUS_CACHE_LEN(estatus_len) \ 70 + (sizeof(struct ghes_estatus_cache) + (estatus_len)) 71 + #define GHES_ESTATUS_FROM_CACHE(estatus_cache) \ 72 + ((struct acpi_hest_generic_status *) \ 73 + ((struct ghes_estatus_cache *)(estatus_cache) + 1)) 74 + 75 + #define GHES_ESTATUS_NODE_LEN(estatus_len) \ 76 + (sizeof(struct ghes_estatus_node) + (estatus_len)) 77 + #define GHES_ESTATUS_FROM_NODE(estatus_node) \ 78 + ((struct acpi_hest_generic_status *) \ 79 + ((struct ghes_estatus_node *)(estatus_node) + 1)) 59 80 60 81 /* 61 82 * One struct ghes is created for each generic hardware error source. 
··· 103 76 unsigned int irq; 104 77 }; 105 78 }; 79 + 80 + struct ghes_estatus_node { 81 + struct llist_node llnode; 82 + struct acpi_hest_generic *generic; 83 + }; 84 + 85 + struct ghes_estatus_cache { 86 + u32 estatus_len; 87 + atomic_t count; 88 + struct acpi_hest_generic *generic; 89 + unsigned long long time_in; 90 + struct rcu_head rcu; 91 + }; 92 + 93 + int ghes_disable; 94 + module_param_named(disable, ghes_disable, bool, 0); 106 95 107 96 static int ghes_panic_timeout __read_mostly = 30; 108 97 ··· 163 120 */ 164 121 static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); 165 122 static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); 123 + 124 + /* 125 + * printk is not safe in NMI context. So in NMI handler, we allocate 126 + * required memory from lock-less memory allocator 127 + * (ghes_estatus_pool), save estatus into it, put them into lock-less 128 + * list (ghes_estatus_llist), then delay printk into IRQ context via 129 + * irq_work (ghes_proc_irq_work). ghes_estatus_pool_size_request records 130 + * the pool size required by all NMI error sources. 
131 + */ 132 + static struct gen_pool *ghes_estatus_pool; 133 + static unsigned long ghes_estatus_pool_size_request; 134 + static struct llist_head ghes_estatus_llist; 135 + static struct irq_work ghes_proc_irq_work; 136 + 137 + struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; 138 + static atomic_t ghes_estatus_cache_alloced; 166 139 167 140 static int ghes_ioremap_init(void) 168 141 { ··· 237 178 BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); 238 179 unmap_kernel_range_noflush(vaddr, PAGE_SIZE); 239 180 __flush_tlb_one(vaddr); 181 + } 182 + 183 + static int ghes_estatus_pool_init(void) 184 + { 185 + ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1); 186 + if (!ghes_estatus_pool) 187 + return -ENOMEM; 188 + return 0; 189 + } 190 + 191 + static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool, 192 + struct gen_pool_chunk *chunk, 193 + void *data) 194 + { 195 + free_page(chunk->start_addr); 196 + } 197 + 198 + static void ghes_estatus_pool_exit(void) 199 + { 200 + gen_pool_for_each_chunk(ghes_estatus_pool, 201 + ghes_estatus_pool_free_chunk_page, NULL); 202 + gen_pool_destroy(ghes_estatus_pool); 203 + } 204 + 205 + static int ghes_estatus_pool_expand(unsigned long len) 206 + { 207 + unsigned long i, pages, size, addr; 208 + int ret; 209 + 210 + ghes_estatus_pool_size_request += PAGE_ALIGN(len); 211 + size = gen_pool_size(ghes_estatus_pool); 212 + if (size >= ghes_estatus_pool_size_request) 213 + return 0; 214 + pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE; 215 + for (i = 0; i < pages; i++) { 216 + addr = __get_free_page(GFP_KERNEL); 217 + if (!addr) 218 + return -ENOMEM; 219 + ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1); 220 + if (ret) 221 + return ret; 222 + } 223 + 224 + return 0; 225 + } 226 + 227 + static void ghes_estatus_pool_shrink(unsigned long len) 228 + { 229 + ghes_estatus_pool_size_request -= PAGE_ALIGN(len); 240 230 } 241 231 242 232 static struct 
ghes *ghes_new(struct acpi_hest_generic *generic) ··· 449 341 ghes->flags &= ~GHES_TO_CLEAR; 450 342 } 451 343 452 - static void ghes_do_proc(struct ghes *ghes) 344 + static void ghes_do_proc(const struct acpi_hest_generic_status *estatus) 453 345 { 454 - int sev, processed = 0; 346 + int sev, sec_sev; 455 347 struct acpi_hest_generic_data *gdata; 456 348 457 - sev = ghes_severity(ghes->estatus->error_severity); 458 - apei_estatus_for_each_section(ghes->estatus, gdata) { 459 - #ifdef CONFIG_X86_MCE 349 + sev = ghes_severity(estatus->error_severity); 350 + apei_estatus_for_each_section(estatus, gdata) { 351 + sec_sev = ghes_severity(gdata->error_severity); 460 352 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, 461 353 CPER_SEC_PLATFORM_MEM)) { 462 - apei_mce_report_mem_error( 463 - sev == GHES_SEV_CORRECTED, 464 - (struct cper_sec_mem_err *)(gdata+1)); 465 - processed = 1; 466 - } 354 + struct cper_sec_mem_err *mem_err; 355 + mem_err = (struct cper_sec_mem_err *)(gdata+1); 356 + #ifdef CONFIG_X86_MCE 357 + apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, 358 + mem_err); 467 359 #endif 360 + #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE 361 + if (sev == GHES_SEV_RECOVERABLE && 362 + sec_sev == GHES_SEV_RECOVERABLE && 363 + mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { 364 + unsigned long pfn; 365 + pfn = mem_err->physical_addr >> PAGE_SHIFT; 366 + memory_failure_queue(pfn, 0, 0); 367 + } 368 + #endif 369 + } 468 370 } 469 371 } 470 372 471 - static void ghes_print_estatus(const char *pfx, struct ghes *ghes) 373 + static void __ghes_print_estatus(const char *pfx, 374 + const struct acpi_hest_generic *generic, 375 + const struct acpi_hest_generic_status *estatus) 472 376 { 473 - /* Not more than 2 messages every 5 seconds */ 474 - static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); 475 - 476 377 if (pfx == NULL) { 477 - if (ghes_severity(ghes->estatus->error_severity) <= 378 + if (ghes_severity(estatus->error_severity) <= 478 379 GHES_SEV_CORRECTED) 479 
380 pfx = KERN_WARNING HW_ERR; 480 381 else 481 382 pfx = KERN_ERR HW_ERR; 482 383 } 483 - if (__ratelimit(&ratelimit)) { 484 - printk( 485 - "%s""Hardware error from APEI Generic Hardware Error Source: %d\n", 486 - pfx, ghes->generic->header.source_id); 487 - apei_estatus_print(pfx, ghes->estatus); 384 + printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n", 385 + pfx, generic->header.source_id); 386 + apei_estatus_print(pfx, estatus); 387 + } 388 + 389 + static int ghes_print_estatus(const char *pfx, 390 + const struct acpi_hest_generic *generic, 391 + const struct acpi_hest_generic_status *estatus) 392 + { 393 + /* Not more than 2 messages every 5 seconds */ 394 + static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); 395 + static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2); 396 + struct ratelimit_state *ratelimit; 397 + 398 + if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED) 399 + ratelimit = &ratelimit_corrected; 400 + else 401 + ratelimit = &ratelimit_uncorrected; 402 + if (__ratelimit(ratelimit)) { 403 + __ghes_print_estatus(pfx, generic, estatus); 404 + return 1; 488 405 } 406 + return 0; 407 + } 408 + 409 + /* 410 + * GHES error status reporting throttle, to report more kinds of 411 + * errors, instead of just most frequently occurred errors. 
412 + */ 413 + static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus) 414 + { 415 + u32 len; 416 + int i, cached = 0; 417 + unsigned long long now; 418 + struct ghes_estatus_cache *cache; 419 + struct acpi_hest_generic_status *cache_estatus; 420 + 421 + len = apei_estatus_len(estatus); 422 + rcu_read_lock(); 423 + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { 424 + cache = rcu_dereference(ghes_estatus_caches[i]); 425 + if (cache == NULL) 426 + continue; 427 + if (len != cache->estatus_len) 428 + continue; 429 + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); 430 + if (memcmp(estatus, cache_estatus, len)) 431 + continue; 432 + atomic_inc(&cache->count); 433 + now = sched_clock(); 434 + if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC) 435 + cached = 1; 436 + break; 437 + } 438 + rcu_read_unlock(); 439 + return cached; 440 + } 441 + 442 + static struct ghes_estatus_cache *ghes_estatus_cache_alloc( 443 + struct acpi_hest_generic *generic, 444 + struct acpi_hest_generic_status *estatus) 445 + { 446 + int alloced; 447 + u32 len, cache_len; 448 + struct ghes_estatus_cache *cache; 449 + struct acpi_hest_generic_status *cache_estatus; 450 + 451 + alloced = atomic_add_return(1, &ghes_estatus_cache_alloced); 452 + if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) { 453 + atomic_dec(&ghes_estatus_cache_alloced); 454 + return NULL; 455 + } 456 + len = apei_estatus_len(estatus); 457 + cache_len = GHES_ESTATUS_CACHE_LEN(len); 458 + cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len); 459 + if (!cache) { 460 + atomic_dec(&ghes_estatus_cache_alloced); 461 + return NULL; 462 + } 463 + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); 464 + memcpy(cache_estatus, estatus, len); 465 + cache->estatus_len = len; 466 + atomic_set(&cache->count, 0); 467 + cache->generic = generic; 468 + cache->time_in = sched_clock(); 469 + return cache; 470 + } 471 + 472 + static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) 473 + { 474 + u32 len; 475 + 
476 + len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); 477 + len = GHES_ESTATUS_CACHE_LEN(len); 478 + gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); 479 + atomic_dec(&ghes_estatus_cache_alloced); 480 + } 481 + 482 + static void ghes_estatus_cache_rcu_free(struct rcu_head *head) 483 + { 484 + struct ghes_estatus_cache *cache; 485 + 486 + cache = container_of(head, struct ghes_estatus_cache, rcu); 487 + ghes_estatus_cache_free(cache); 488 + } 489 + 490 + static void ghes_estatus_cache_add( 491 + struct acpi_hest_generic *generic, 492 + struct acpi_hest_generic_status *estatus) 493 + { 494 + int i, slot = -1, count; 495 + unsigned long long now, duration, period, max_period = 0; 496 + struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; 497 + 498 + new_cache = ghes_estatus_cache_alloc(generic, estatus); 499 + if (new_cache == NULL) 500 + return; 501 + rcu_read_lock(); 502 + now = sched_clock(); 503 + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { 504 + cache = rcu_dereference(ghes_estatus_caches[i]); 505 + if (cache == NULL) { 506 + slot = i; 507 + slot_cache = NULL; 508 + break; 509 + } 510 + duration = now - cache->time_in; 511 + if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { 512 + slot = i; 513 + slot_cache = cache; 514 + break; 515 + } 516 + count = atomic_read(&cache->count); 517 + period = duration; 518 + do_div(period, (count + 1)); 519 + if (period > max_period) { 520 + max_period = period; 521 + slot = i; 522 + slot_cache = cache; 523 + } 524 + } 525 + /* new_cache must be put into array after its contents are written */ 526 + smp_wmb(); 527 + if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, 528 + slot_cache, new_cache) == slot_cache) { 529 + if (slot_cache) 530 + call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); 531 + } else 532 + ghes_estatus_cache_free(new_cache); 533 + rcu_read_unlock(); 489 534 } 490 535 491 536 static int ghes_proc(struct ghes *ghes) ··· 648 387 rc = ghes_read_estatus(ghes, 0); 649 388 
if (rc) 650 389 goto out; 651 - ghes_print_estatus(NULL, ghes); 652 - ghes_do_proc(ghes); 653 - 390 + if (!ghes_estatus_cached(ghes->estatus)) { 391 + if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus)) 392 + ghes_estatus_cache_add(ghes->generic, ghes->estatus); 393 + } 394 + ghes_do_proc(ghes->estatus); 654 395 out: 655 396 ghes_clear_estatus(ghes); 656 397 return 0; ··· 710 447 return ret; 711 448 } 712 449 450 + static void ghes_proc_in_irq(struct irq_work *irq_work) 451 + { 452 + struct llist_node *llnode, *next, *tail = NULL; 453 + struct ghes_estatus_node *estatus_node; 454 + struct acpi_hest_generic *generic; 455 + struct acpi_hest_generic_status *estatus; 456 + u32 len, node_len; 457 + 458 + /* 459 + * Because the time order of estatus in list is reversed, 460 + * revert it back to proper order. 461 + */ 462 + llnode = llist_del_all(&ghes_estatus_llist); 463 + while (llnode) { 464 + next = llnode->next; 465 + llnode->next = tail; 466 + tail = llnode; 467 + llnode = next; 468 + } 469 + llnode = tail; 470 + while (llnode) { 471 + next = llnode->next; 472 + estatus_node = llist_entry(llnode, struct ghes_estatus_node, 473 + llnode); 474 + estatus = GHES_ESTATUS_FROM_NODE(estatus_node); 475 + len = apei_estatus_len(estatus); 476 + node_len = GHES_ESTATUS_NODE_LEN(len); 477 + ghes_do_proc(estatus); 478 + if (!ghes_estatus_cached(estatus)) { 479 + generic = estatus_node->generic; 480 + if (ghes_print_estatus(NULL, generic, estatus)) 481 + ghes_estatus_cache_add(generic, estatus); 482 + } 483 + gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, 484 + node_len); 485 + llnode = next; 486 + } 487 + } 488 + 713 489 static int ghes_notify_nmi(struct notifier_block *this, 714 490 unsigned long cmd, void *data) 715 491 { ··· 778 476 779 477 if (sev_global >= GHES_SEV_PANIC) { 780 478 oops_begin(); 781 - ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); 479 + __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic, 480 + ghes_global->estatus); 782 
481 /* reboot to log the error! */ 783 482 if (panic_timeout == 0) 784 483 panic_timeout = ghes_panic_timeout; ··· 787 484 } 788 485 789 486 list_for_each_entry_rcu(ghes, &ghes_nmi, list) { 487 + #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 488 + u32 len, node_len; 489 + struct ghes_estatus_node *estatus_node; 490 + struct acpi_hest_generic_status *estatus; 491 + #endif 790 492 if (!(ghes->flags & GHES_TO_CLEAR)) 791 493 continue; 792 - /* Do not print estatus because printk is not NMI safe */ 793 - ghes_do_proc(ghes); 494 + #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 495 + if (ghes_estatus_cached(ghes->estatus)) 496 + goto next; 497 + /* Save estatus for further processing in IRQ context */ 498 + len = apei_estatus_len(ghes->estatus); 499 + node_len = GHES_ESTATUS_NODE_LEN(len); 500 + estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, 501 + node_len); 502 + if (estatus_node) { 503 + estatus_node->generic = ghes->generic; 504 + estatus = GHES_ESTATUS_FROM_NODE(estatus_node); 505 + memcpy(estatus, ghes->estatus, len); 506 + llist_add(&estatus_node->llnode, &ghes_estatus_llist); 507 + } 508 + next: 509 + #endif 794 510 ghes_clear_estatus(ghes); 795 511 } 512 + #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 513 + irq_work_queue(&ghes_proc_irq_work); 514 + #endif 796 515 797 516 out: 798 517 raw_spin_unlock(&ghes_nmi_lock); ··· 829 504 .notifier_call = ghes_notify_nmi, 830 505 }; 831 506 507 + static unsigned long ghes_esource_prealloc_size( 508 + const struct acpi_hest_generic *generic) 509 + { 510 + unsigned long block_length, prealloc_records, prealloc_size; 511 + 512 + block_length = min_t(unsigned long, generic->error_block_length, 513 + GHES_ESTATUS_MAX_SIZE); 514 + prealloc_records = max_t(unsigned long, 515 + generic->records_to_preallocate, 1); 516 + prealloc_size = min_t(unsigned long, block_length * prealloc_records, 517 + GHES_ESOURCE_PREALLOC_MAX_SIZE); 518 + 519 + return prealloc_size; 520 + } 521 + 832 522 static int __devinit ghes_probe(struct platform_device 
*ghes_dev) 833 523 { 834 524 struct acpi_hest_generic *generic; 835 525 struct ghes *ghes = NULL; 526 + unsigned long len; 836 527 int rc = -EINVAL; 837 528 838 529 generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; ··· 914 573 mutex_unlock(&ghes_list_mutex); 915 574 break; 916 575 case ACPI_HEST_NOTIFY_NMI: 576 + len = ghes_esource_prealloc_size(generic); 577 + ghes_estatus_pool_expand(len); 917 578 mutex_lock(&ghes_list_mutex); 918 579 if (list_empty(&ghes_nmi)) 919 580 register_die_notifier(&ghes_notifier_nmi); ··· 940 597 { 941 598 struct ghes *ghes; 942 599 struct acpi_hest_generic *generic; 600 + unsigned long len; 943 601 944 602 ghes = platform_get_drvdata(ghes_dev); 945 603 generic = ghes->generic; ··· 971 627 * freed after NMI handler finishes. 972 628 */ 973 629 synchronize_rcu(); 630 + len = ghes_esource_prealloc_size(generic); 631 + ghes_estatus_pool_shrink(len); 974 632 break; 975 633 default: 976 634 BUG(); ··· 1008 662 return -EINVAL; 1009 663 } 1010 664 665 + if (ghes_disable) { 666 + pr_info(GHES_PFX "GHES is not enabled!\n"); 667 + return -EINVAL; 668 + } 669 + 670 + init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); 671 + 1011 672 rc = ghes_ioremap_init(); 1012 673 if (rc) 1013 674 goto err; 1014 675 1015 - rc = platform_driver_register(&ghes_platform_driver); 676 + rc = ghes_estatus_pool_init(); 1016 677 if (rc) 1017 678 goto err_ioremap_exit; 1018 679 680 + rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE * 681 + GHES_ESTATUS_CACHE_ALLOCED_MAX); 682 + if (rc) 683 + goto err_pool_exit; 684 + 685 + rc = platform_driver_register(&ghes_platform_driver); 686 + if (rc) 687 + goto err_pool_exit; 688 + 689 + rc = apei_osc_setup(); 690 + if (rc == 0 && osc_sb_apei_support_acked) 691 + pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n"); 692 + else if (rc == 0 && !osc_sb_apei_support_acked) 693 + pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n"); 694 + else if (rc && 
osc_sb_apei_support_acked) 695 + pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n"); 696 + else 697 + pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); 698 + 1019 699 return 0; 700 + err_pool_exit: 701 + ghes_estatus_pool_exit(); 1020 702 err_ioremap_exit: 1021 703 ghes_ioremap_exit(); 1022 704 err: ··· 1054 680 static void __exit ghes_exit(void) 1055 681 { 1056 682 platform_driver_unregister(&ghes_platform_driver); 683 + ghes_estatus_pool_exit(); 1057 684 ghes_ioremap_exit(); 1058 685 } 1059 686

+9 -8

drivers/acpi/apei/hest.c

··· 231 231 goto err; 232 232 } 233 233 234 - rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); 235 - if (rc) 236 - goto err; 237 - 238 - rc = hest_ghes_dev_register(ghes_count); 239 - if (!rc) { 240 - pr_info(HEST_PFX "Table parsing has been initialized.\n"); 241 - return; 234 + if (!ghes_disable) { 235 + rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); 236 + if (rc) 237 + goto err; 238 + rc = hest_ghes_dev_register(ghes_count); 239 + if (rc) 240 + goto err; 242 241 } 243 242 243 + pr_info(HEST_PFX "Table parsing has been initialized.\n"); 244 + return; 244 245 err: 245 246 hest_disable = 1; 246 247 }

+12 -2

drivers/acpi/bus.c

··· 39 39 #include <linux/pci.h> 40 40 #include <acpi/acpi_bus.h> 41 41 #include <acpi/acpi_drivers.h> 42 + #include <acpi/apei.h> 42 43 #include <linux/dmi.h> 43 44 #include <linux/suspend.h> 44 45 ··· 520 519 } 521 520 EXPORT_SYMBOL(acpi_run_osc); 522 521 522 + bool osc_sb_apei_support_acked; 523 523 static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; 524 524 static void acpi_bus_osc_support(void) 525 525 { ··· 543 541 #if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE) 544 542 capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT; 545 543 #endif 544 + 545 + if (!ghes_disable) 546 + capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT; 546 547 if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) 547 548 return; 548 - if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) 549 + if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) { 550 + u32 *capbuf_ret = context.ret.pointer; 551 + if (context.ret.length > OSC_SUPPORT_TYPE) 552 + osc_sb_apei_support_acked = 553 + capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT; 549 554 kfree(context.ret.pointer); 550 - /* do we need to check the returned cap? Sounds no */ 555 + } 556 + /* do we need to check other returned cap? Sounds no */ 551 557 } 552 558 553 559 /* --------------------------------------------------------------------------

+5

include/acpi/apei.h

··· 18 18 19 19 extern int hest_disable; 20 20 extern int erst_disable; 21 + #ifdef CONFIG_ACPI_APEI_GHES 22 + extern int ghes_disable; 23 + #else 24 + #define ghes_disable 1 25 + #endif 21 26 22 27 #ifdef CONFIG_ACPI_APEI 23 28 void __init acpi_hest_init(void);

+2

include/linux/acpi.h

··· 280 280 #define OSC_SB_CPUHP_OST_SUPPORT 8 281 281 #define OSC_SB_APEI_SUPPORT 16 282 282 283 + extern bool osc_sb_apei_support_acked; 284 + 283 285 /* PCI defined _OSC bits */ 284 286 /* _OSC DW1 Definition (OS Support Fields) */ 285 287 #define OSC_EXT_PCI_CONFIG_SUPPORT 1

+1

include/linux/bitmap.h

··· 146 146 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); 147 147 extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); 148 148 149 + #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) 149 150 #define BITMAP_LAST_WORD_MASK(nbits) \ 150 151 ( \ 151 152 ((nbits) % BITS_PER_LONG) ? \

+28 -6

include/linux/genalloc.h

··· 1 1 /* 2 - * Basic general purpose allocator for managing special purpose memory 3 - * not managed by the regular kmalloc/kfree interface. 4 - * Uses for this includes on-device special memory, uncached memory 5 - * etc. 2 + * Basic general purpose allocator for managing special purpose 3 + * memory, for example, memory that is not managed by the regular 4 + * kmalloc/kfree interface. Uses for this includes on-device special 5 + * memory, uncached memory etc. 6 + * 7 + * It is safe to use the allocator in NMI handlers and other special 8 + * unblockable contexts that could otherwise deadlock on locks. This 9 + * is implemented by using atomic operations and retries on any 10 + * conflicts. The disadvantage is that there may be livelocks in 11 + * extreme cases. For better scalability, one allocator can be used 12 + * for each CPU. 13 + * 14 + * The lockless operation only works if there is enough memory 15 + * available. If new memory is added to the pool a lock has to be 16 + * still taken. So any user relying on locklessness has to ensure 17 + * that sufficient memory is preallocated. 18 + * 19 + * The basic atomic operation of this allocator is cmpxchg on long. 20 + * On architectures that don't have NMI-safe cmpxchg implementation, 21 + * the allocator can NOT be used in NMI handler. So code uses the 22 + * allocator in NMI handler should depend on 23 + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 6 24 * 7 25 * This source code is licensed under the GNU General Public License, 8 26 * Version 2. See the file COPYING for more details. ··· 33 15 * General purpose special memory pool descriptor. 34 16 */ 35 17 struct gen_pool { 36 - rwlock_t lock; 18 + spinlock_t lock; 37 19 struct list_head chunks; /* list of chunks in this pool */ 38 20 int min_alloc_order; /* minimum allocation order */ 39 21 }; ··· 42 24 * General purpose special memory pool chunk descriptor. 
43 25 */ 44 26 struct gen_pool_chunk { 45 - spinlock_t lock; 46 27 struct list_head next_chunk; /* next chunk in pool */ 28 + atomic_t avail; 47 29 phys_addr_t phys_addr; /* physical starting address of memory chunk */ 48 30 unsigned long start_addr; /* starting address of memory chunk */ 49 31 unsigned long end_addr; /* ending address of memory chunk */ ··· 74 56 extern void gen_pool_destroy(struct gen_pool *); 75 57 extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); 76 58 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); 59 + extern void gen_pool_for_each_chunk(struct gen_pool *, 60 + void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); 61 + extern size_t gen_pool_avail(struct gen_pool *); 62 + extern size_t gen_pool_size(struct gen_pool *); 77 63 #endif /* __GENALLOC_H__ */

+126

include/linux/llist.h

··· 1 + #ifndef LLIST_H 2 + #define LLIST_H 3 + /* 4 + * Lock-less NULL terminated single linked list 5 + * 6 + * If there are multiple producers and multiple consumers, llist_add 7 + * can be used in producers and llist_del_all can be used in 8 + * consumers. They can work simultaneously without lock. But 9 + * llist_del_first can not be used here. Because llist_del_first 10 + * depends on list->first->next not changing if list->first is not 11 + * changed during its operation, but llist_del_first, llist_add, 12 + * llist_add (or llist_del_all, llist_add, llist_add) sequence in 13 + * another consumer may violate that. 14 + * 15 + * If there are multiple producers and one consumer, llist_add can be 16 + * used in producers and llist_del_all or llist_del_first can be used 17 + * in the consumer. 18 + * 19 + * This can be summarized as follows: 20 + * 21 + * | add | del_first | del_all 22 + * add | - | - | - 23 + * del_first | | L | L 24 + * del_all | | | - 25 + * 26 + * Where "-" stands for no lock is needed, while "L" stands for lock 27 + * is needed. 28 + * 29 + * The list entries deleted via llist_del_all can be traversed with 30 + * traversing function such as llist_for_each etc. But the list 31 + * entries can not be traversed safely before being deleted from the list. 32 + * The order of deleted entries is from the newest to the oldest added 33 + * one. If you want to traverse from the oldest to the newest, you 34 + * must reverse the order by yourself before traversing. 35 + * 36 + * The basic atomic operation of this list is cmpxchg on long. On 37 + * architectures that don't have NMI-safe cmpxchg implementation, the 38 + * list can NOT be used in NMI handler. So code that uses the list in NMI 39 + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 
40 + */ 41 + 42 + struct llist_head { 43 + struct llist_node *first; 44 + }; 45 + 46 + struct llist_node { 47 + struct llist_node *next; 48 + }; 49 + 50 + #define LLIST_HEAD_INIT(name) { NULL } 51 + #define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) 52 + 53 + /** 54 + * init_llist_head - initialize lock-less list head 55 + * @list: the head for your lock-less list 56 + */ 57 + static inline void init_llist_head(struct llist_head *list) 58 + { 59 + list->first = NULL; 60 + } 61 + 62 + /** 63 + * llist_entry - get the struct of this entry 64 + * @ptr: the &struct llist_node pointer. 65 + * @type: the type of the struct this is embedded in. 66 + * @member: the name of the llist_node within the struct. 67 + */ 68 + #define llist_entry(ptr, type, member) \ 69 + container_of(ptr, type, member) 70 + 71 + /** 72 + * llist_for_each - iterate over some deleted entries of a lock-less list 73 + * @pos: the &struct llist_node to use as a loop cursor 74 + * @node: the first entry of deleted list entries 75 + * 76 + * In general, some entries of the lock-less list can be traversed 77 + * safely only after being deleted from list, so start with an entry 78 + * instead of list head. 79 + * 80 + * If being used on entries deleted from lock-less list directly, the 81 + * traverse order is from the newest to the oldest added entry. If 82 + * you want to traverse from the oldest to the newest, you must 83 + * reverse the order by yourself before traversing. 84 + */ 85 + #define llist_for_each(pos, node) \ 86 + for ((pos) = (node); pos; (pos) = (pos)->next) 87 + 88 + /** 89 + * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type 90 + * @pos: the type * to use as a loop cursor. 91 + * @node: the first entry of deleted list entries. 92 + * @member: the name of the llist_node within the struct. 
93 + * 94 + * In general, some entries of the lock-less list can be traversed 95 + * safely only after being removed from list, so start with an entry 96 + * instead of list head. 97 + * 98 + * If being used on entries deleted from lock-less list directly, the 99 + * traverse order is from the newest to the oldest added entry. If 100 + * you want to traverse from the oldest to the newest, you must 101 + * reverse the order by yourself before traversing. 102 + */ 103 + #define llist_for_each_entry(pos, node, member) \ 104 + for ((pos) = llist_entry((node), typeof(*(pos)), member); \ 105 + &(pos)->member != NULL; \ 106 + (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) 107 + 108 + /** 109 + * llist_empty - tests whether a lock-less list is empty 110 + * @head: the list to test 111 + * 112 + * Not guaranteed to be accurate or up to date. Just a quick way to 113 + * test whether the list is empty without deleting something from the 114 + * list. 115 + */ 116 + static inline int llist_empty(const struct llist_head *head) 117 + { 118 + return ACCESS_ONCE(head->first) == NULL; 119 + } 120 + 121 + void llist_add(struct llist_node *new, struct llist_head *head); 122 + void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, 123 + struct llist_head *head); 124 + struct llist_node *llist_del_first(struct llist_head *head); 125 + struct llist_node *llist_del_all(struct llist_head *head); 126 + #endif /* LLIST_H */

+1

include/linux/mm.h

··· 1600 1600 }; 1601 1601 extern void memory_failure(unsigned long pfn, int trapno); 1602 1602 extern int __memory_failure(unsigned long pfn, int trapno, int flags); 1603 + extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 1603 1604 extern int unpoison_memory(unsigned long pfn); 1604 1605 extern int sysctl_memory_failure_early_kill; 1605 1606 extern int sysctl_memory_failure_recovery;

+3

lib/Kconfig

··· 276 276 so its calculations are in fixed point. Modules can select this 277 277 when they require this function. Module will be called cordic. 278 278 279 + config LLIST 280 + bool 281 + 279 282 endmenu

+2

lib/Makefile

··· 115 115 116 116 obj-$(CONFIG_CORDIC) += cordic.o 117 117 118 + obj-$(CONFIG_LLIST) += llist.o 119 + 118 120 hostprogs-y := gen_crc32table 119 121 clean-files := crc32table.h 120 122

-2

lib/bitmap.c

··· 271 271 } 272 272 EXPORT_SYMBOL(__bitmap_weight); 273 273 274 - #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) 275 - 276 274 void bitmap_set(unsigned long *map, int start, int nr) 277 275 { 278 276 unsigned long *p = map + BIT_WORD(start);

+243 -57

lib/genalloc.c

··· 1 1 /* 2 - * Basic general purpose allocator for managing special purpose memory 3 - * not managed by the regular kmalloc/kfree interface. 4 - * Uses for this includes on-device special memory, uncached memory 5 - * etc. 2 + * Basic general purpose allocator for managing special purpose 3 + * memory, for example, memory that is not managed by the regular 4 + * kmalloc/kfree interface. Uses for this includes on-device special 5 + * memory, uncached memory etc. 6 + * 7 + * It is safe to use the allocator in NMI handlers and other special 8 + * unblockable contexts that could otherwise deadlock on locks. This 9 + * is implemented by using atomic operations and retries on any 10 + * conflicts. The disadvantage is that there may be livelocks in 11 + * extreme cases. For better scalability, one allocator can be used 12 + * for each CPU. 13 + * 14 + * The lockless operation only works if there is enough memory 15 + * available. If new memory is added to the pool a lock has to be 16 + * still taken. So any user relying on locklessness has to ensure 17 + * that sufficient memory is preallocated. 18 + * 19 + * The basic atomic operation of this allocator is cmpxchg on long. 20 + * On architectures that don't have NMI-safe cmpxchg implementation, 21 + * the allocator can NOT be used in NMI handler. So code uses the 22 + * allocator in NMI handler should depend on 23 + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 
6 24 * 7 25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> 8 26 * ··· 31 13 #include <linux/slab.h> 32 14 #include <linux/module.h> 33 15 #include <linux/bitmap.h> 16 + #include <linux/rculist.h> 17 + #include <linux/interrupt.h> 34 18 #include <linux/genalloc.h> 35 19 20 + static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) 21 + { 22 + unsigned long val, nval; 23 + 24 + nval = *addr; 25 + do { 26 + val = nval; 27 + if (val & mask_to_set) 28 + return -EBUSY; 29 + cpu_relax(); 30 + } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); 31 + 32 + return 0; 33 + } 34 + 35 + static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) 36 + { 37 + unsigned long val, nval; 38 + 39 + nval = *addr; 40 + do { 41 + val = nval; 42 + if ((val & mask_to_clear) != mask_to_clear) 43 + return -EBUSY; 44 + cpu_relax(); 45 + } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); 46 + 47 + return 0; 48 + } 49 + 50 + /* 51 + * bitmap_set_ll - set the specified number of bits at the specified position 52 + * @map: pointer to a bitmap 53 + * @start: a bit position in @map 54 + * @nr: number of bits to set 55 + * 56 + * Set @nr bits start from @start in @map lock-lessly. Several users 57 + * can set/clear the same bitmap simultaneously without lock. If two 58 + * users set the same bit, one user will return remain bits, otherwise 59 + * return 0. 
60 + */ 61 + static int bitmap_set_ll(unsigned long *map, int start, int nr) 62 + { 63 + unsigned long *p = map + BIT_WORD(start); 64 + const int size = start + nr; 65 + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); 66 + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); 67 + 68 + while (nr - bits_to_set >= 0) { 69 + if (set_bits_ll(p, mask_to_set)) 70 + return nr; 71 + nr -= bits_to_set; 72 + bits_to_set = BITS_PER_LONG; 73 + mask_to_set = ~0UL; 74 + p++; 75 + } 76 + if (nr) { 77 + mask_to_set &= BITMAP_LAST_WORD_MASK(size); 78 + if (set_bits_ll(p, mask_to_set)) 79 + return nr; 80 + } 81 + 82 + return 0; 83 + } 84 + 85 + /* 86 + * bitmap_clear_ll - clear the specified number of bits at the specified position 87 + * @map: pointer to a bitmap 88 + * @start: a bit position in @map 89 + * @nr: number of bits to clear 90 + * 91 + * Clear @nr bits start from @start in @map lock-lessly. Several users 92 + * can set/clear the same bitmap simultaneously without lock. If two 93 + * users clear the same bit, one user will return remain bits, 94 + * otherwise return 0. 
95 + */ 96 + static int bitmap_clear_ll(unsigned long *map, int start, int nr) 97 + { 98 + unsigned long *p = map + BIT_WORD(start); 99 + const int size = start + nr; 100 + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); 101 + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); 102 + 103 + while (nr - bits_to_clear >= 0) { 104 + if (clear_bits_ll(p, mask_to_clear)) 105 + return nr; 106 + nr -= bits_to_clear; 107 + bits_to_clear = BITS_PER_LONG; 108 + mask_to_clear = ~0UL; 109 + p++; 110 + } 111 + if (nr) { 112 + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); 113 + if (clear_bits_ll(p, mask_to_clear)) 114 + return nr; 115 + } 116 + 117 + return 0; 118 + } 36 119 37 120 /** 38 121 * gen_pool_create - create a new special memory pool ··· 149 30 150 31 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); 151 32 if (pool != NULL) { 152 - rwlock_init(&pool->lock); 33 + spin_lock_init(&pool->lock); 153 34 INIT_LIST_HEAD(&pool->chunks); 154 35 pool->min_alloc_order = min_alloc_order; 155 36 } ··· 182 63 if (unlikely(chunk == NULL)) 183 64 return -ENOMEM; 184 65 185 - spin_lock_init(&chunk->lock); 186 66 chunk->phys_addr = phys; 187 67 chunk->start_addr = virt; 188 68 chunk->end_addr = virt + size; 69 + atomic_set(&chunk->avail, size); 189 70 190 - write_lock(&pool->lock); 191 - list_add(&chunk->next_chunk, &pool->chunks); 192 - write_unlock(&pool->lock); 71 + spin_lock(&pool->lock); 72 + list_add_rcu(&chunk->next_chunk, &pool->chunks); 73 + spin_unlock(&pool->lock); 193 74 194 75 return 0; 195 76 } ··· 204 85 */ 205 86 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) 206 87 { 207 - struct list_head *_chunk; 208 88 struct gen_pool_chunk *chunk; 89 + phys_addr_t paddr = -1; 209 90 210 - read_lock(&pool->lock); 211 - list_for_each(_chunk, &pool->chunks) { 212 - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 213 - 214 - if (addr >= chunk->start_addr && addr < chunk->end_addr) 215 - return 
chunk->phys_addr + addr - chunk->start_addr; 91 + rcu_read_lock(); 92 + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 93 + if (addr >= chunk->start_addr && addr < chunk->end_addr) { 94 + paddr = chunk->phys_addr + (addr - chunk->start_addr); 95 + break; 96 + } 216 97 } 217 - read_unlock(&pool->lock); 98 + rcu_read_unlock(); 218 99 219 - return -1; 100 + return paddr; 220 101 } 221 102 EXPORT_SYMBOL(gen_pool_virt_to_phys); 222 103 ··· 233 114 struct gen_pool_chunk *chunk; 234 115 int order = pool->min_alloc_order; 235 116 int bit, end_bit; 236 - 237 117 238 118 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { 239 119 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); ··· 255 137 * @size: number of bytes to allocate from the pool 256 138 * 257 139 * Allocate the requested number of bytes from the specified pool. 258 - * Uses a first-fit algorithm. 140 + * Uses a first-fit algorithm. Can not be used in NMI handler on 141 + * architectures without NMI-safe cmpxchg implementation. 
259 142 */ 260 143 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) 261 144 { 262 - struct list_head *_chunk; 263 145 struct gen_pool_chunk *chunk; 264 - unsigned long addr, flags; 146 + unsigned long addr = 0; 265 147 int order = pool->min_alloc_order; 266 - int nbits, start_bit, end_bit; 148 + int nbits, start_bit = 0, end_bit, remain; 149 + 150 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 151 + BUG_ON(in_nmi()); 152 + #endif 267 153 268 154 if (size == 0) 269 155 return 0; 270 156 271 157 nbits = (size + (1UL << order) - 1) >> order; 272 - 273 - read_lock(&pool->lock); 274 - list_for_each(_chunk, &pool->chunks) { 275 - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 158 + rcu_read_lock(); 159 + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 160 + if (size > atomic_read(&chunk->avail)) 161 + continue; 276 162 277 163 end_bit = (chunk->end_addr - chunk->start_addr) >> order; 278 - 279 - spin_lock_irqsave(&chunk->lock, flags); 280 - start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0, 281 - nbits, 0); 282 - if (start_bit >= end_bit) { 283 - spin_unlock_irqrestore(&chunk->lock, flags); 164 + retry: 165 + start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 166 + start_bit, nbits, 0); 167 + if (start_bit >= end_bit) 284 168 continue; 169 + remain = bitmap_set_ll(chunk->bits, start_bit, nbits); 170 + if (remain) { 171 + remain = bitmap_clear_ll(chunk->bits, start_bit, 172 + nbits - remain); 173 + BUG_ON(remain); 174 + goto retry; 285 175 } 286 176 287 177 addr = chunk->start_addr + ((unsigned long)start_bit << order); 288 - 289 - bitmap_set(chunk->bits, start_bit, nbits); 290 - spin_unlock_irqrestore(&chunk->lock, flags); 291 - read_unlock(&pool->lock); 292 - return addr; 178 + size = nbits << order; 179 + atomic_sub(size, &chunk->avail); 180 + break; 293 181 } 294 - read_unlock(&pool->lock); 295 - return 0; 182 + rcu_read_unlock(); 183 + return addr; 296 184 } 297 185 EXPORT_SYMBOL(gen_pool_alloc); 298 186 
··· 308 184 * @addr: starting address of memory to free back to pool 309 185 * @size: size in bytes of memory to free 310 186 * 311 - * Free previously allocated special memory back to the specified pool. 187 + * Free previously allocated special memory back to the specified 188 + * pool. Can not be used in NMI handler on architectures without 189 + * NMI-safe cmpxchg implementation. 312 190 */ 313 191 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) 314 192 { 315 - struct list_head *_chunk; 316 193 struct gen_pool_chunk *chunk; 317 - unsigned long flags; 318 194 int order = pool->min_alloc_order; 319 - int bit, nbits; 195 + int start_bit, nbits, remain; 196 + 197 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 198 + BUG_ON(in_nmi()); 199 + #endif 320 200 321 201 nbits = (size + (1UL << order) - 1) >> order; 322 - 323 - read_lock(&pool->lock); 324 - list_for_each(_chunk, &pool->chunks) { 325 - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 326 - 202 + rcu_read_lock(); 203 + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 327 204 if (addr >= chunk->start_addr && addr < chunk->end_addr) { 328 205 BUG_ON(addr + size > chunk->end_addr); 329 - spin_lock_irqsave(&chunk->lock, flags); 330 - bit = (addr - chunk->start_addr) >> order; 331 - while (nbits--) 332 - __clear_bit(bit++, chunk->bits); 333 - spin_unlock_irqrestore(&chunk->lock, flags); 334 - break; 206 + start_bit = (addr - chunk->start_addr) >> order; 207 + remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); 208 + BUG_ON(remain); 209 + size = nbits << order; 210 + atomic_add(size, &chunk->avail); 211 + rcu_read_unlock(); 212 + return; 335 213 } 336 214 } 337 - BUG_ON(nbits > 0); 338 - read_unlock(&pool->lock); 215 + rcu_read_unlock(); 216 + BUG(); 339 217 } 340 218 EXPORT_SYMBOL(gen_pool_free); 219 + 220 + /** 221 + * gen_pool_for_each_chunk - call func for every chunk of generic memory pool 222 + * @pool: the generic memory pool 223 + * @func: func to call 
224 + * @data: additional data used by @func 225 + * 226 + * Call @func for every chunk of generic memory pool. The @func is 227 + * called with rcu_read_lock held. 228 + */ 229 + void gen_pool_for_each_chunk(struct gen_pool *pool, 230 + void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), 231 + void *data) 232 + { 233 + struct gen_pool_chunk *chunk; 234 + 235 + rcu_read_lock(); 236 + list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) 237 + func(pool, chunk, data); 238 + rcu_read_unlock(); 239 + } 240 + EXPORT_SYMBOL(gen_pool_for_each_chunk); 241 + 242 + /** 243 + * gen_pool_avail - get available free space of the pool 244 + * @pool: pool to get available free space 245 + * 246 + * Return available free space of the specified pool. 247 + */ 248 + size_t gen_pool_avail(struct gen_pool *pool) 249 + { 250 + struct gen_pool_chunk *chunk; 251 + size_t avail = 0; 252 + 253 + rcu_read_lock(); 254 + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 255 + avail += atomic_read(&chunk->avail); 256 + rcu_read_unlock(); 257 + return avail; 258 + } 259 + EXPORT_SYMBOL_GPL(gen_pool_avail); 260 + 261 + /** 262 + * gen_pool_size - get size in bytes of memory managed by the pool 263 + * @pool: pool to get size 264 + * 265 + * Return size in bytes of memory managed by the pool. 266 + */ 267 + size_t gen_pool_size(struct gen_pool *pool) 268 + { 269 + struct gen_pool_chunk *chunk; 270 + size_t size = 0; 271 + 272 + rcu_read_lock(); 273 + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 274 + size += chunk->end_addr - chunk->start_addr; 275 + rcu_read_unlock(); 276 + return size; 277 + } 278 + EXPORT_SYMBOL_GPL(gen_pool_size);

+129

lib/llist.c

··· 1 + /* 2 + * Lock-less NULL terminated single linked list 3 + * 4 + * The basic atomic operation of this list is cmpxchg on long. On 5 + * architectures that don't have NMI-safe cmpxchg implementation, the 6 + * list can NOT be used in NMI handler. So code uses the list in NMI 7 + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 8 + * 9 + * Copyright 2010,2011 Intel Corp. 10 + * Author: Huang Ying <ying.huang@intel.com> 11 + * 12 + * This program is free software; you can redistribute it and/or 13 + * modify it under the terms of the GNU General Public License version 14 + * 2 as published by the Free Software Foundation; 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program; if not, write to the Free Software 23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 + */ 25 + #include <linux/kernel.h> 26 + #include <linux/module.h> 27 + #include <linux/interrupt.h> 28 + #include <linux/llist.h> 29 + 30 + #include <asm/system.h> 31 + 32 + /** 33 + * llist_add - add a new entry 34 + * @new: new entry to be added 35 + * @head: the head for your lock-less list 36 + */ 37 + void llist_add(struct llist_node *new, struct llist_head *head) 38 + { 39 + struct llist_node *entry, *old_entry; 40 + 41 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 42 + BUG_ON(in_nmi()); 43 + #endif 44 + 45 + entry = head->first; 46 + do { 47 + old_entry = entry; 48 + new->next = entry; 49 + cpu_relax(); 50 + } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry); 51 + } 52 + EXPORT_SYMBOL_GPL(llist_add); 53 + 54 + /** 55 + * llist_add_batch - add several linked entries in batch 56 + * @new_first: first entry in batch to 
be added 57 + * @new_last: last entry in batch to be added 58 + * @head: the head for your lock-less list 59 + */ 60 + void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, 61 + struct llist_head *head) 62 + { 63 + struct llist_node *entry, *old_entry; 64 + 65 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 66 + BUG_ON(in_nmi()); 67 + #endif 68 + 69 + entry = head->first; 70 + do { 71 + old_entry = entry; 72 + new_last->next = entry; 73 + cpu_relax(); 74 + } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); 75 + } 76 + EXPORT_SYMBOL_GPL(llist_add_batch); 77 + 78 + /** 79 + * llist_del_first - delete the first entry of lock-less list 80 + * @head: the head for your lock-less list 81 + * 82 + * If list is empty, return NULL, otherwise, return the first entry 83 + * deleted, this is the newest added one. 84 + * 85 + * Only one llist_del_first user can be used simultaneously with 86 + * multiple llist_add users without lock. Because otherwise 87 + * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, 88 + * llist_add) sequence in another user may change @head->first->next, 89 + * but keep @head->first. If multiple consumers are needed, please 90 + * use llist_del_all or use lock between consumers. 
91 + */ 92 + struct llist_node *llist_del_first(struct llist_head *head) 93 + { 94 + struct llist_node *entry, *old_entry, *next; 95 + 96 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 97 + BUG_ON(in_nmi()); 98 + #endif 99 + 100 + entry = head->first; 101 + do { 102 + if (entry == NULL) 103 + return NULL; 104 + old_entry = entry; 105 + next = entry->next; 106 + cpu_relax(); 107 + } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); 108 + 109 + return entry; 110 + } 111 + EXPORT_SYMBOL_GPL(llist_del_first); 112 + 113 + /** 114 + * llist_del_all - delete all entries from lock-less list 115 + * @head: the head of lock-less list to delete all entries 116 + * 117 + * If list is empty, return NULL, otherwise, delete all entries and 118 + * return the pointer to the first entry. The order of entries 119 + * deleted is from the newest to the oldest added one. 120 + */ 121 + struct llist_node *llist_del_all(struct llist_head *head) 122 + { 123 + #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 124 + BUG_ON(in_nmi()); 125 + #endif 126 + 127 + return xchg(&head->first, NULL); 128 + } 129 + EXPORT_SYMBOL_GPL(llist_del_all);

+92

mm/memory-failure.c

··· 53 53 #include <linux/hugetlb.h> 54 54 #include <linux/memory_hotplug.h> 55 55 #include <linux/mm_inline.h> 56 + #include <linux/kfifo.h> 56 57 #include "internal.h" 57 58 58 59 int sysctl_memory_failure_early_kill __read_mostly = 0; ··· 1178 1177 { 1179 1178 __memory_failure(pfn, trapno, 0); 1180 1179 } 1180 + 1181 + #define MEMORY_FAILURE_FIFO_ORDER 4 1182 + #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 1183 + 1184 + struct memory_failure_entry { 1185 + unsigned long pfn; 1186 + int trapno; 1187 + int flags; 1188 + }; 1189 + 1190 + struct memory_failure_cpu { 1191 + DECLARE_KFIFO(fifo, struct memory_failure_entry, 1192 + MEMORY_FAILURE_FIFO_SIZE); 1193 + spinlock_t lock; 1194 + struct work_struct work; 1195 + }; 1196 + 1197 + static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); 1198 + 1199 + /** 1200 + * memory_failure_queue - Schedule handling memory failure of a page. 1201 + * @pfn: Page Number of the corrupted page 1202 + * @trapno: Trap number reported in the signal to user space. 1203 + * @flags: Flags for memory failure handling 1204 + * 1205 + * This function is called by the low level hardware error handler 1206 + * when it detects hardware memory corruption of a page. It schedules 1207 + * the recovering of error page, including dropping pages, killing 1208 + * processes etc. 1209 + * 1210 + * The function is primarily of use for corruptions that 1211 + * happen outside the current execution context (e.g. when 1212 + * detected by a background scrubber) 1213 + * 1214 + * Can run in IRQ context. 
1215 + */ 1216 + void memory_failure_queue(unsigned long pfn, int trapno, int flags) 1217 + { 1218 + struct memory_failure_cpu *mf_cpu; 1219 + unsigned long proc_flags; 1220 + struct memory_failure_entry entry = { 1221 + .pfn = pfn, 1222 + .trapno = trapno, 1223 + .flags = flags, 1224 + }; 1225 + 1226 + mf_cpu = &get_cpu_var(memory_failure_cpu); 1227 + spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1228 + if (kfifo_put(&mf_cpu->fifo, &entry)) 1229 + schedule_work_on(smp_processor_id(), &mf_cpu->work); 1230 + else 1231 + pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", 1232 + pfn); 1233 + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1234 + put_cpu_var(memory_failure_cpu); 1235 + } 1236 + EXPORT_SYMBOL_GPL(memory_failure_queue); 1237 + 1238 + static void memory_failure_work_func(struct work_struct *work) 1239 + { 1240 + struct memory_failure_cpu *mf_cpu; 1241 + struct memory_failure_entry entry = { 0, }; 1242 + unsigned long proc_flags; 1243 + int gotten; 1244 + 1245 + mf_cpu = &__get_cpu_var(memory_failure_cpu); 1246 + for (;;) { 1247 + spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1248 + gotten = kfifo_get(&mf_cpu->fifo, &entry); 1249 + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1250 + if (!gotten) 1251 + break; 1252 + __memory_failure(entry.pfn, entry.trapno, entry.flags); 1253 + } 1254 + } 1255 + 1256 + static int __init memory_failure_init(void) 1257 + { 1258 + struct memory_failure_cpu *mf_cpu; 1259 + int cpu; 1260 + 1261 + for_each_possible_cpu(cpu) { 1262 + mf_cpu = &per_cpu(memory_failure_cpu, cpu); 1263 + spin_lock_init(&mf_cpu->lock); 1264 + INIT_KFIFO(mf_cpu->fifo); 1265 + INIT_WORK(&mf_cpu->work, memory_failure_work_func); 1266 + } 1267 + 1268 + return 0; 1269 + } 1270 + core_initcall(memory_failure_init); 1181 1271 1182 1272 /** 1183 1273 * unpoison_memory - Unpoison a previously poisoned page