Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

irq_work: Add generic hardirq context callbacks

Provide a mechanism that allows running code in IRQ context. It is
most useful for NMI code that needs to interact with the rest of the
system -- like waking up a task to drain buffers.

Perf currently has such a mechanism, so extract that and provide it as
a generic feature, independent of perf so that others may also
benefit.

The IRQ context callback is generated through self-IPIs where
possible, or on architectures like powerpc the decrementer (the
built-in timer facility) is set to generate an interrupt immediately.

Architectures that don't have anything like this have to make do with a
callback from the timer tick. These architectures can call
irq_work_run() at the tail of any IRQ handlers that might enqueue such
work (like the perf IRQ handler) to avoid undue latencies in
processing the work.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
[ various fixes ]
Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1287036094.7768.291.camel@yhuang-dev>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Peter Zijlstra and committed by
Ingo Molnar
e360adbe 8e5fc1a7

+311 -242
+1
arch/alpha/Kconfig
··· 9 9 select HAVE_IDE 10 10 select HAVE_OPROFILE 11 11 select HAVE_SYSCALL_WRAPPERS 12 + select HAVE_IRQ_WORK 12 13 select HAVE_PERF_EVENTS 13 14 select HAVE_DMA_ATTRS 14 15 help
-5
arch/alpha/include/asm/perf_event.h
··· 1 1 #ifndef __ASM_ALPHA_PERF_EVENT_H 2 2 #define __ASM_ALPHA_PERF_EVENT_H 3 3 4 - /* Alpha only supports software events through this interface. */ 5 - extern void set_perf_event_pending(void); 6 - 7 - #define PERF_EVENT_INDEX_OFFSET 0 8 - 9 4 #ifdef CONFIG_PERF_EVENTS 10 5 extern void init_hw_perf_events(void); 11 6 #else
+15 -15
arch/alpha/kernel/time.c
··· 41 41 #include <linux/init.h> 42 42 #include <linux/bcd.h> 43 43 #include <linux/profile.h> 44 - #include <linux/perf_event.h> 44 + #include <linux/irq_work.h> 45 45 46 46 #include <asm/uaccess.h> 47 47 #include <asm/io.h> ··· 83 83 84 84 unsigned long est_cycle_freq; 85 85 86 - #ifdef CONFIG_PERF_EVENTS 86 + #ifdef CONFIG_IRQ_WORK 87 87 88 - DEFINE_PER_CPU(u8, perf_event_pending); 88 + DEFINE_PER_CPU(u8, irq_work_pending); 89 89 90 - #define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1 91 - #define test_perf_event_pending() __get_cpu_var(perf_event_pending) 92 - #define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 90 + #define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 91 + #define test_irq_work_pending() __get_cpu_var(irq_work_pending) 92 + #define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 93 93 94 - void set_perf_event_pending(void) 94 + void set_irq_work_pending(void) 95 95 { 96 - set_perf_event_pending_flag(); 96 + set_irq_work_pending_flag(); 97 97 } 98 98 99 - #else /* CONFIG_PERF_EVENTS */ 99 + #else /* CONFIG_IRQ_WORK */ 100 100 101 - #define test_perf_event_pending() 0 102 - #define clear_perf_event_pending() 101 + #define test_irq_work_pending() 0 102 + #define clear_irq_work_pending() 103 103 104 - #endif /* CONFIG_PERF_EVENTS */ 104 + #endif /* CONFIG_IRQ_WORK */ 105 105 106 106 107 107 static inline __u32 rpcc(void) ··· 191 191 192 192 write_sequnlock(&xtime_lock); 193 193 194 - if (test_perf_event_pending()) { 195 - clear_perf_event_pending(); 196 - perf_event_do_pending(); 194 + if (test_irq_work_pending()) { 195 + clear_irq_work_pending(); 196 + irq_work_run(); 197 197 } 198 198 199 199 #ifndef CONFIG_SMP
+1
arch/arm/Kconfig
··· 23 23 select HAVE_KERNEL_GZIP 24 24 select HAVE_KERNEL_LZO 25 25 select HAVE_KERNEL_LZMA 26 + select HAVE_IRQ_WORK 26 27 select HAVE_PERF_EVENTS 27 28 select PERF_USE_VMALLOC 28 29 select HAVE_REGS_AND_STACK_ACCESS_API
-12
arch/arm/include/asm/perf_event.h
··· 12 12 #ifndef __ARM_PERF_EVENT_H__ 13 13 #define __ARM_PERF_EVENT_H__ 14 14 15 - /* 16 - * NOP: on *most* (read: all supported) ARM platforms, the performance 17 - * counter interrupts are regular interrupts and not an NMI. This 18 - * means that when we receive the interrupt we can call 19 - * perf_event_do_pending() that handles all of the work with 20 - * interrupts disabled. 21 - */ 22 - static inline void 23 - set_perf_event_pending(void) 24 - { 25 - } 26 - 27 15 /* ARM performance counters start from 1 (in the cp15 accesses) so use the 28 16 * same indexes here for consistency. */ 29 17 #define PERF_EVENT_INDEX_OFFSET 1
+4 -4
arch/arm/kernel/perf_event.c
··· 1092 1092 * platforms that can have the PMU interrupts raised as an NMI, this 1093 1093 * will not work. 1094 1094 */ 1095 - perf_event_do_pending(); 1095 + irq_work_run(); 1096 1096 1097 1097 return IRQ_HANDLED; 1098 1098 } ··· 2068 2068 * platforms that can have the PMU interrupts raised as an NMI, this 2069 2069 * will not work. 2070 2070 */ 2071 - perf_event_do_pending(); 2071 + irq_work_run(); 2072 2072 2073 2073 return IRQ_HANDLED; 2074 2074 } ··· 2436 2436 armpmu->disable(hwc, idx); 2437 2437 } 2438 2438 2439 - perf_event_do_pending(); 2439 + irq_work_run(); 2440 2440 2441 2441 /* 2442 2442 * Re-enable the PMU. ··· 2763 2763 armpmu->disable(hwc, idx); 2764 2764 } 2765 2765 2766 - perf_event_do_pending(); 2766 + irq_work_run(); 2767 2767 2768 2768 /* 2769 2769 * Re-enable the PMU.
+1
arch/frv/Kconfig
··· 7 7 default y 8 8 select HAVE_IDE 9 9 select HAVE_ARCH_TRACEHOOK 10 + select HAVE_IRQ_WORK 10 11 select HAVE_PERF_EVENTS 11 12 12 13 config ZONE_DMA
+1 -1
arch/frv/lib/Makefile
··· 5 5 lib-y := \ 6 6 __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ 7 7 checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \ 8 - outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o 8 + outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o
-19
arch/frv/lib/perf_event.c
··· 1 - /* Performance event handling 2 - * 3 - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. 4 - * Written by David Howells (dhowells@redhat.com) 5 - * 6 - * This program is free software; you can redistribute it and/or 7 - * modify it under the terms of the GNU General Public Licence 8 - * as published by the Free Software Foundation; either version 9 - * 2 of the Licence, or (at your option) any later version. 10 - */ 11 - 12 - #include <linux/perf_event.h> 13 - 14 - /* 15 - * mark the performance event as pending 16 - */ 17 - void set_perf_event_pending(void) 18 - { 19 - }
+1
arch/parisc/Kconfig
··· 16 16 select RTC_DRV_GENERIC 17 17 select INIT_ALL_POSSIBLE 18 18 select BUG 19 + select HAVE_IRQ_WORK 19 20 select HAVE_PERF_EVENTS 20 21 select GENERIC_ATOMIC64 if !64BIT 21 22 help
+1 -2
arch/parisc/include/asm/perf_event.h
··· 1 1 #ifndef __ASM_PARISC_PERF_EVENT_H 2 2 #define __ASM_PARISC_PERF_EVENT_H 3 3 4 - /* parisc only supports software events through this interface. */ 5 - static inline void set_perf_event_pending(void) { } 4 + /* Empty, just to avoid compiling error */ 6 5 7 6 #endif /* __ASM_PARISC_PERF_EVENT_H */
+1
arch/powerpc/Kconfig
··· 138 138 select HAVE_OPROFILE 139 139 select HAVE_SYSCALL_WRAPPERS if PPC64 140 140 select GENERIC_ATOMIC64 if PPC32 141 + select HAVE_IRQ_WORK 141 142 select HAVE_PERF_EVENTS 142 143 select HAVE_REGS_AND_STACK_ACCESS_API 143 144 select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
+1 -1
arch/powerpc/include/asm/paca.h
··· 129 129 u8 soft_enabled; /* irq soft-enable flag */ 130 130 u8 hard_enabled; /* set if irqs are enabled in MSR */ 131 131 u8 io_sync; /* writel() needs spin_unlock sync */ 132 - u8 perf_event_pending; /* PM interrupt while soft-disabled */ 132 + u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ 133 133 134 134 /* Stuff for accurate time accounting */ 135 135 u64 user_time; /* accumulated usermode TB ticks */
+21 -21
arch/powerpc/kernel/time.c
··· 53 53 #include <linux/posix-timers.h> 54 54 #include <linux/irq.h> 55 55 #include <linux/delay.h> 56 - #include <linux/perf_event.h> 56 + #include <linux/irq_work.h> 57 57 #include <asm/trace.h> 58 58 59 59 #include <asm/io.h> ··· 493 493 } 494 494 #endif /* CONFIG_PPC_ISERIES */ 495 495 496 - #ifdef CONFIG_PERF_EVENTS 496 + #ifdef CONFIG_IRQ_WORK 497 497 498 498 /* 499 499 * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... 500 500 */ 501 501 #ifdef CONFIG_PPC64 502 - static inline unsigned long test_perf_event_pending(void) 502 + static inline unsigned long test_irq_work_pending(void) 503 503 { 504 504 unsigned long x; 505 505 506 506 asm volatile("lbz %0,%1(13)" 507 507 : "=r" (x) 508 - : "i" (offsetof(struct paca_struct, perf_event_pending))); 508 + : "i" (offsetof(struct paca_struct, irq_work_pending))); 509 509 return x; 510 510 } 511 511 512 - static inline void set_perf_event_pending_flag(void) 512 + static inline void set_irq_work_pending_flag(void) 513 513 { 514 514 asm volatile("stb %0,%1(13)" : : 515 515 "r" (1), 516 - "i" (offsetof(struct paca_struct, perf_event_pending))); 516 + "i" (offsetof(struct paca_struct, irq_work_pending))); 517 517 } 518 518 519 - static inline void clear_perf_event_pending(void) 519 + static inline void clear_irq_work_pending(void) 520 520 { 521 521 asm volatile("stb %0,%1(13)" : : 522 522 "r" (0), 523 - "i" (offsetof(struct paca_struct, perf_event_pending))); 523 + "i" (offsetof(struct paca_struct, irq_work_pending))); 524 524 } 525 525 526 526 #else /* 32-bit */ 527 527 528 - DEFINE_PER_CPU(u8, perf_event_pending); 528 + DEFINE_PER_CPU(u8, irq_work_pending); 529 529 530 - #define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1 531 - #define test_perf_event_pending() __get_cpu_var(perf_event_pending) 532 - #define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 530 + #define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 531 + #define 
test_irq_work_pending() __get_cpu_var(irq_work_pending) 532 + #define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 533 533 534 534 #endif /* 32 vs 64 bit */ 535 535 536 - void set_perf_event_pending(void) 536 + void set_irq_work_pending(void) 537 537 { 538 538 preempt_disable(); 539 - set_perf_event_pending_flag(); 539 + set_irq_work_pending_flag(); 540 540 set_dec(1); 541 541 preempt_enable(); 542 542 } 543 543 544 - #else /* CONFIG_PERF_EVENTS */ 544 + #else /* CONFIG_IRQ_WORK */ 545 545 546 - #define test_perf_event_pending() 0 547 - #define clear_perf_event_pending() 546 + #define test_irq_work_pending() 0 547 + #define clear_irq_work_pending() 548 548 549 - #endif /* CONFIG_PERF_EVENTS */ 549 + #endif /* CONFIG_IRQ_WORK */ 550 550 551 551 /* 552 552 * For iSeries shared processors, we have to let the hypervisor ··· 587 587 588 588 calculate_steal_time(); 589 589 590 - if (test_perf_event_pending()) { 591 - clear_perf_event_pending(); 592 - perf_event_do_pending(); 590 + if (test_irq_work_pending()) { 591 + clear_irq_work_pending(); 592 + irq_work_run(); 593 593 } 594 594 595 595 #ifdef CONFIG_PPC_ISERIES
+1
arch/s390/Kconfig
··· 95 95 select HAVE_KVM if 64BIT 96 96 select HAVE_ARCH_TRACEHOOK 97 97 select INIT_ALL_POSSIBLE 98 + select HAVE_IRQ_WORK 98 99 select HAVE_PERF_EVENTS 99 100 select HAVE_KERNEL_GZIP 100 101 select HAVE_KERNEL_BZIP2
+1 -2
arch/s390/include/asm/perf_event.h
··· 4 4 * Copyright 2009 Martin Schwidefsky, IBM Corporation. 5 5 */ 6 6 7 - static inline void set_perf_event_pending(void) {} 8 - static inline void clear_perf_event_pending(void) {} 7 + /* Empty, just to avoid compiling error */ 9 8 10 9 #define PERF_EVENT_INDEX_OFFSET 0
+1
arch/sh/Kconfig
··· 16 16 select HAVE_ARCH_TRACEHOOK 17 17 select HAVE_DMA_API_DEBUG 18 18 select HAVE_DMA_ATTRS 19 + select HAVE_IRQ_WORK 19 20 select HAVE_PERF_EVENTS 20 21 select PERF_USE_VMALLOC 21 22 select HAVE_KERNEL_GZIP
-7
arch/sh/include/asm/perf_event.h
··· 26 26 extern int reserve_pmc_hardware(void); 27 27 extern void release_pmc_hardware(void); 28 28 29 - static inline void set_perf_event_pending(void) 30 - { 31 - /* Nothing to see here, move along. */ 32 - } 33 - 34 - #define PERF_EVENT_INDEX_OFFSET 0 35 - 36 29 #endif /* __ASM_SH_PERF_EVENT_H */
+2
arch/sparc/Kconfig
··· 26 26 select ARCH_WANT_OPTIONAL_GPIOLIB 27 27 select RTC_CLASS 28 28 select RTC_DRV_M48T59 29 + select HAVE_IRQ_WORK 29 30 select HAVE_PERF_EVENTS 30 31 select PERF_USE_VMALLOC 31 32 select HAVE_DMA_ATTRS ··· 55 54 select RTC_DRV_BQ4802 56 55 select RTC_DRV_SUN4V 57 56 select RTC_DRV_STARFIRE 57 + select HAVE_IRQ_WORK 58 58 select HAVE_PERF_EVENTS 59 59 select PERF_USE_VMALLOC 60 60
-4
arch/sparc/include/asm/perf_event.h
··· 1 1 #ifndef __ASM_SPARC_PERF_EVENT_H 2 2 #define __ASM_SPARC_PERF_EVENT_H 3 3 4 - extern void set_perf_event_pending(void); 5 - 6 - #define PERF_EVENT_INDEX_OFFSET 0 7 - 8 4 #ifdef CONFIG_PERF_EVENTS 9 5 #include <asm/ptrace.h> 10 6
+4 -4
arch/sparc/kernel/pcr.c
··· 7 7 #include <linux/init.h> 8 8 #include <linux/irq.h> 9 9 10 - #include <linux/perf_event.h> 10 + #include <linux/irq_work.h> 11 11 #include <linux/ftrace.h> 12 12 13 13 #include <asm/pil.h> ··· 43 43 44 44 old_regs = set_irq_regs(regs); 45 45 irq_enter(); 46 - #ifdef CONFIG_PERF_EVENTS 47 - perf_event_do_pending(); 46 + #ifdef CONFIG_IRQ_WORK 47 + irq_work_run(); 48 48 #endif 49 49 irq_exit(); 50 50 set_irq_regs(old_regs); 51 51 } 52 52 53 - void set_perf_event_pending(void) 53 + void arch_irq_work_raise(void) 54 54 { 55 55 set_softint(1 << PIL_DEFERRED_PCR_WORK); 56 56 }
+1
arch/x86/Kconfig
··· 25 25 select HAVE_IDE 26 26 select HAVE_OPROFILE 27 27 select HAVE_PERF_EVENTS if (!M386 && !M486) 28 + select HAVE_IRQ_WORK 28 29 select HAVE_IOREMAP_PROT 29 30 select HAVE_KPROBES 30 31 select ARCH_WANT_OPTIONAL_GPIOLIB
+2 -2
arch/x86/include/asm/entry_arch.h
··· 49 49 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 50 50 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 51 51 52 - #ifdef CONFIG_PERF_EVENTS 53 - BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 52 + #ifdef CONFIG_IRQ_WORK 53 + BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) 54 54 #endif 55 55 56 56 #ifdef CONFIG_X86_THERMAL_VECTOR
+1 -1
arch/x86/include/asm/hardirq.h
··· 14 14 #endif 15 15 unsigned int x86_platform_ipis; /* arch dependent */ 16 16 unsigned int apic_perf_irqs; 17 - unsigned int apic_pending_irqs; 17 + unsigned int apic_irq_work_irqs; 18 18 #ifdef CONFIG_SMP 19 19 unsigned int irq_resched_count; 20 20 unsigned int irq_call_count;
+1 -1
arch/x86/include/asm/hw_irq.h
··· 29 29 extern void apic_timer_interrupt(void); 30 30 extern void x86_platform_ipi(void); 31 31 extern void error_interrupt(void); 32 - extern void perf_pending_interrupt(void); 32 + extern void irq_work_interrupt(void); 33 33 34 34 extern void spurious_interrupt(void); 35 35 extern void thermal_interrupt(void);
+2 -2
arch/x86/include/asm/irq_vectors.h
··· 114 114 #define X86_PLATFORM_IPI_VECTOR 0xed 115 115 116 116 /* 117 - * Performance monitoring pending work vector: 117 + * IRQ work vector: 118 118 */ 119 - #define LOCAL_PENDING_VECTOR 0xec 119 + #define IRQ_WORK_VECTOR 0xec 120 120 121 121 #define UV_BAU_MESSAGE 0xea 122 122
+1
arch/x86/kernel/Makefile
··· 35 35 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 36 36 obj-y += time.o ioport.o ldt.o dumpstack.o 37 37 obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 38 + obj-$(CONFIG_IRQ_WORK) += irq_work.o 38 39 obj-$(CONFIG_X86_VISWS) += visws_quirks.o 39 40 obj-$(CONFIG_X86_32) += probe_roms_32.o 40 41 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
-19
arch/x86/kernel/cpu/perf_event.c
··· 1196 1196 return handled; 1197 1197 } 1198 1198 1199 - void smp_perf_pending_interrupt(struct pt_regs *regs) 1200 - { 1201 - irq_enter(); 1202 - ack_APIC_irq(); 1203 - inc_irq_stat(apic_pending_irqs); 1204 - perf_event_do_pending(); 1205 - irq_exit(); 1206 - } 1207 - 1208 - void set_perf_event_pending(void) 1209 - { 1210 - #ifdef CONFIG_X86_LOCAL_APIC 1211 - if (!x86_pmu.apic || !x86_pmu_initialized()) 1212 - return; 1213 - 1214 - apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1215 - #endif 1216 - } 1217 - 1218 1199 void perf_events_lapic_init(void) 1219 1200 { 1220 1201 if (!x86_pmu.apic || !x86_pmu_initialized())
+3 -3
arch/x86/kernel/entry_64.S
··· 1023 1023 apicinterrupt SPURIOUS_APIC_VECTOR \ 1024 1024 spurious_interrupt smp_spurious_interrupt 1025 1025 1026 - #ifdef CONFIG_PERF_EVENTS 1027 - apicinterrupt LOCAL_PENDING_VECTOR \ 1028 - perf_pending_interrupt smp_perf_pending_interrupt 1026 + #ifdef CONFIG_IRQ_WORK 1027 + apicinterrupt IRQ_WORK_VECTOR \ 1028 + irq_work_interrupt smp_irq_work_interrupt 1029 1029 #endif 1030 1030 1031 1031 /*
+4 -4
arch/x86/kernel/irq.c
··· 67 67 for_each_online_cpu(j) 68 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 69 69 seq_printf(p, " Performance monitoring interrupts\n"); 70 - seq_printf(p, "%*s: ", prec, "PND"); 70 + seq_printf(p, "%*s: ", prec, "IWI"); 71 71 for_each_online_cpu(j) 72 - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 73 - seq_printf(p, " Performance pending work\n"); 72 + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); 73 + seq_printf(p, " IRQ work interrupts\n"); 74 74 #endif 75 75 if (x86_platform_ipi_callback) { 76 76 seq_printf(p, "%*s: ", prec, "PLT"); ··· 185 185 sum += irq_stats(cpu)->apic_timer_irqs; 186 186 sum += irq_stats(cpu)->irq_spurious_count; 187 187 sum += irq_stats(cpu)->apic_perf_irqs; 188 - sum += irq_stats(cpu)->apic_pending_irqs; 188 + sum += irq_stats(cpu)->apic_irq_work_irqs; 189 189 #endif 190 190 if (x86_platform_ipi_callback) 191 191 sum += irq_stats(cpu)->x86_platform_ipis;
+30
arch/x86/kernel/irq_work.c
··· 1 + /* 2 + * x86 specific code for irq_work 3 + * 4 + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 5 + */ 6 + 7 + #include <linux/kernel.h> 8 + #include <linux/irq_work.h> 9 + #include <linux/hardirq.h> 10 + #include <asm/apic.h> 11 + 12 + void smp_irq_work_interrupt(struct pt_regs *regs) 13 + { 14 + irq_enter(); 15 + ack_APIC_irq(); 16 + inc_irq_stat(apic_irq_work_irqs); 17 + irq_work_run(); 18 + irq_exit(); 19 + } 20 + 21 + void arch_irq_work_raise(void) 22 + { 23 + #ifdef CONFIG_X86_LOCAL_APIC 24 + if (!cpu_has_apic) 25 + return; 26 + 27 + apic->send_IPI_self(IRQ_WORK_VECTOR); 28 + apic_wait_icr_idle(); 29 + #endif 30 + }
+3 -3
arch/x86/kernel/irqinit.c
··· 224 224 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 225 225 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 226 226 227 - /* Performance monitoring interrupts: */ 228 - # ifdef CONFIG_PERF_EVENTS 229 - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 227 + /* IRQ work interrupts: */ 228 + # ifdef CONFIG_IRQ_WORK 229 + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); 230 230 # endif 231 231 232 232 #endif
+20
include/linux/irq_work.h
··· 1 + #ifndef _LINUX_IRQ_WORK_H 2 + #define _LINUX_IRQ_WORK_H 3 + 4 + struct irq_work { 5 + struct irq_work *next; 6 + void (*func)(struct irq_work *); 7 + }; 8 + 9 + static inline 10 + void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *)) 11 + { 12 + entry->next = NULL; 13 + entry->func = func; 14 + } 15 + 16 + bool irq_work_queue(struct irq_work *entry); 17 + void irq_work_run(void); 18 + void irq_work_sync(struct irq_work *entry); 19 + 20 + #endif /* _LINUX_IRQ_WORK_H */
+2 -9
include/linux/perf_event.h
··· 486 486 #include <linux/workqueue.h> 487 487 #include <linux/ftrace.h> 488 488 #include <linux/cpu.h> 489 + #include <linux/irq_work.h> 489 490 #include <asm/atomic.h> 490 491 #include <asm/local.h> 491 492 ··· 673 672 void *data_pages[0]; 674 673 }; 675 674 676 - struct perf_pending_entry { 677 - struct perf_pending_entry *next; 678 - void (*func)(struct perf_pending_entry *); 679 - }; 680 - 681 675 struct perf_sample_data; 682 676 683 677 typedef void (*perf_overflow_handler_t)(struct perf_event *, int, ··· 780 784 int pending_wakeup; 781 785 int pending_kill; 782 786 int pending_disable; 783 - struct perf_pending_entry pending; 787 + struct irq_work pending; 784 788 785 789 atomic_t event_limit; 786 790 ··· 894 898 extern void perf_event_exit_task(struct task_struct *child); 895 899 extern void perf_event_free_task(struct task_struct *task); 896 900 extern void perf_event_delayed_put(struct task_struct *task); 897 - extern void set_perf_event_pending(void); 898 - extern void perf_event_do_pending(void); 899 901 extern void perf_event_print_debug(void); 900 902 extern void perf_pmu_disable(struct pmu *pmu); 901 903 extern void perf_pmu_enable(struct pmu *pmu); ··· 1072 1078 static inline void perf_event_exit_task(struct task_struct *child) { } 1073 1079 static inline void perf_event_free_task(struct task_struct *task) { } 1074 1080 static inline void perf_event_delayed_put(struct task_struct *task) { } 1075 - static inline void perf_event_do_pending(void) { } 1076 1081 static inline void perf_event_print_debug(void) { } 1077 1082 static inline int perf_event_task_disable(void) { return -EINVAL; } 1078 1083 static inline int perf_event_task_enable(void) { return -EINVAL; }
+8
init/Kconfig
··· 21 21 depends on !UML 22 22 default y 23 23 24 + config HAVE_IRQ_WORK 25 + bool 26 + 27 + config IRQ_WORK 28 + bool 29 + depends on HAVE_IRQ_WORK 30 + 24 31 menu "General setup" 25 32 26 33 config EXPERIMENTAL ··· 994 987 default y if (PROFILING || PERF_COUNTERS) 995 988 depends on HAVE_PERF_EVENTS 996 989 select ANON_INODES 990 + select IRQ_WORK 997 991 help 998 992 Enable kernel support for various performance events provided 999 993 by software and hardware.
+2
kernel/Makefile
··· 23 23 CFLAGS_REMOVE_cgroup-debug.o = -pg 24 24 CFLAGS_REMOVE_sched_clock.o = -pg 25 25 CFLAGS_REMOVE_perf_event.o = -pg 26 + CFLAGS_REMOVE_irq_work.o = -pg 26 27 endif 27 28 28 29 obj-$(CONFIG_FREEZER) += freezer.o ··· 101 100 obj-$(CONFIG_X86_DS) += trace/ 102 101 obj-$(CONFIG_RING_BUFFER) += trace/ 103 102 obj-$(CONFIG_SMP) += sched_cpupri.o 103 + obj-$(CONFIG_IRQ_WORK) += irq_work.o 104 104 obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105 105 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106 106 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+164
kernel/irq_work.c
··· 1 + /* 2 + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 3 + * 4 + * Provides a framework for enqueueing and running callbacks from hardirq 5 + * context. The enqueueing is NMI-safe. 6 + */ 7 + 8 + #include <linux/kernel.h> 9 + #include <linux/module.h> 10 + #include <linux/irq_work.h> 11 + #include <linux/hardirq.h> 12 + 13 + /* 14 + * An entry can be in one of four states: 15 + * 16 + * free NULL, 0 -> {claimed} : free to be used 17 + * claimed NULL, 3 -> {pending} : claimed to be enqueued 18 + * pending next, 3 -> {busy} : queued, pending callback 19 + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 20 + * 21 + * We use the lower two bits of the next pointer to keep PENDING and BUSY 22 + * flags. 23 + */ 24 + 25 + #define IRQ_WORK_PENDING 1UL 26 + #define IRQ_WORK_BUSY 2UL 27 + #define IRQ_WORK_FLAGS 3UL 28 + 29 + static inline bool irq_work_is_set(struct irq_work *entry, int flags) 30 + { 31 + return (unsigned long)entry->next & flags; 32 + } 33 + 34 + static inline struct irq_work *irq_work_next(struct irq_work *entry) 35 + { 36 + unsigned long next = (unsigned long)entry->next; 37 + next &= ~IRQ_WORK_FLAGS; 38 + return (struct irq_work *)next; 39 + } 40 + 41 + static inline struct irq_work *next_flags(struct irq_work *entry, int flags) 42 + { 43 + unsigned long next = (unsigned long)entry; 44 + next |= flags; 45 + return (struct irq_work *)next; 46 + } 47 + 48 + static DEFINE_PER_CPU(struct irq_work *, irq_work_list); 49 + 50 + /* 51 + * Claim the entry so that no one else will poke at it. 
52 + */ 53 + static bool irq_work_claim(struct irq_work *entry) 54 + { 55 + struct irq_work *next, *nflags; 56 + 57 + do { 58 + next = entry->next; 59 + if ((unsigned long)next & IRQ_WORK_PENDING) 60 + return false; 61 + nflags = next_flags(next, IRQ_WORK_FLAGS); 62 + } while (cmpxchg(&entry->next, next, nflags) != next); 63 + 64 + return true; 65 + } 66 + 67 + 68 + void __weak arch_irq_work_raise(void) 69 + { 70 + /* 71 + * Lame architectures will get the timer tick callback 72 + */ 73 + } 74 + 75 + /* 76 + * Queue the entry and raise the IPI if needed. 77 + */ 78 + static void __irq_work_queue(struct irq_work *entry) 79 + { 80 + struct irq_work **head, *next; 81 + 82 + head = &get_cpu_var(irq_work_list); 83 + 84 + do { 85 + next = *head; 86 + /* Can assign non-atomic because we keep the flags set. */ 87 + entry->next = next_flags(next, IRQ_WORK_FLAGS); 88 + } while (cmpxchg(head, next, entry) != next); 89 + 90 + /* The list was empty, raise self-interrupt to start processing. */ 91 + if (!irq_work_next(entry)) 92 + arch_irq_work_raise(); 93 + 94 + put_cpu_var(irq_work_list); 95 + } 96 + 97 + /* 98 + * Enqueue the irq_work @entry, returns true on success, failure when the 99 + * @entry was already enqueued by someone else. 100 + * 101 + * Can be re-enqueued while the callback is still in progress. 102 + */ 103 + bool irq_work_queue(struct irq_work *entry) 104 + { 105 + if (!irq_work_claim(entry)) { 106 + /* 107 + * Already enqueued, can't do! 108 + */ 109 + return false; 110 + } 111 + 112 + __irq_work_queue(entry); 113 + return true; 114 + } 115 + EXPORT_SYMBOL_GPL(irq_work_queue); 116 + 117 + /* 118 + * Run the irq_work entries on this cpu. Requires to be ran from hardirq 119 + * context with local IRQs disabled. 
120 + */ 121 + void irq_work_run(void) 122 + { 123 + struct irq_work *list, **head; 124 + 125 + head = &__get_cpu_var(irq_work_list); 126 + if (*head == NULL) 127 + return; 128 + 129 + BUG_ON(!in_irq()); 130 + BUG_ON(!irqs_disabled()); 131 + 132 + list = xchg(head, NULL); 133 + while (list != NULL) { 134 + struct irq_work *entry = list; 135 + 136 + list = irq_work_next(list); 137 + 138 + /* 139 + * Clear the PENDING bit, after this point the @entry 140 + * can be re-used. 141 + */ 142 + entry->next = next_flags(NULL, IRQ_WORK_BUSY); 143 + entry->func(entry); 144 + /* 145 + * Clear the BUSY bit and return to the free state if 146 + * no-one else claimed it meanwhile. 147 + */ 148 + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 149 + } 150 + } 151 + EXPORT_SYMBOL_GPL(irq_work_run); 152 + 153 + /* 154 + * Synchronize against the irq_work @entry, ensures the entry is not 155 + * currently in use. 156 + */ 157 + void irq_work_sync(struct irq_work *entry) 158 + { 159 + WARN_ON_ONCE(irqs_disabled()); 160 + 161 + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 162 + cpu_relax(); 163 + } 164 + EXPORT_SYMBOL_GPL(irq_work_sync);
+5 -99
kernel/perf_event.c
··· 2206 2206 kfree(event); 2207 2207 } 2208 2208 2209 - static void perf_pending_sync(struct perf_event *event); 2210 2209 static void perf_buffer_put(struct perf_buffer *buffer); 2211 2210 2212 2211 static void free_event(struct perf_event *event) 2213 2212 { 2214 - perf_pending_sync(event); 2213 + irq_work_sync(&event->pending); 2215 2214 2216 2215 if (!event->parent) { 2217 2216 atomic_dec(&nr_events); ··· 3161 3162 } 3162 3163 } 3163 3164 3164 - /* 3165 - * Pending wakeups 3166 - * 3167 - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. 3168 - * 3169 - * The NMI bit means we cannot possibly take locks. Therefore, maintain a 3170 - * single linked list and use cmpxchg() to add entries lockless. 3171 - */ 3172 - 3173 - static void perf_pending_event(struct perf_pending_entry *entry) 3165 + static void perf_pending_event(struct irq_work *entry) 3174 3166 { 3175 3167 struct perf_event *event = container_of(entry, 3176 3168 struct perf_event, pending); ··· 3175 3185 event->pending_wakeup = 0; 3176 3186 perf_event_wakeup(event); 3177 3187 } 3178 - } 3179 - 3180 - #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) 3181 - 3182 - static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { 3183 - PENDING_TAIL, 3184 - }; 3185 - 3186 - static void perf_pending_queue(struct perf_pending_entry *entry, 3187 - void (*func)(struct perf_pending_entry *)) 3188 - { 3189 - struct perf_pending_entry **head; 3190 - 3191 - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) 3192 - return; 3193 - 3194 - entry->func = func; 3195 - 3196 - head = &get_cpu_var(perf_pending_head); 3197 - 3198 - do { 3199 - entry->next = *head; 3200 - } while (cmpxchg(head, entry->next, entry) != entry->next); 3201 - 3202 - set_perf_event_pending(); 3203 - 3204 - put_cpu_var(perf_pending_head); 3205 - } 3206 - 3207 - static int __perf_pending_run(void) 3208 - { 3209 - struct perf_pending_entry *list; 3210 - int nr = 0; 3211 - 3212 - list = 
xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); 3213 - while (list != PENDING_TAIL) { 3214 - void (*func)(struct perf_pending_entry *); 3215 - struct perf_pending_entry *entry = list; 3216 - 3217 - list = list->next; 3218 - 3219 - func = entry->func; 3220 - entry->next = NULL; 3221 - /* 3222 - * Ensure we observe the unqueue before we issue the wakeup, 3223 - * so that we won't be waiting forever. 3224 - * -- see perf_not_pending(). 3225 - */ 3226 - smp_wmb(); 3227 - 3228 - func(entry); 3229 - nr++; 3230 - } 3231 - 3232 - return nr; 3233 - } 3234 - 3235 - static inline int perf_not_pending(struct perf_event *event) 3236 - { 3237 - /* 3238 - * If we flush on whatever cpu we run, there is a chance we don't 3239 - * need to wait. 3240 - */ 3241 - get_cpu(); 3242 - __perf_pending_run(); 3243 - put_cpu(); 3244 - 3245 - /* 3246 - * Ensure we see the proper queue state before going to sleep 3247 - * so that we do not miss the wakeup. -- see perf_pending_handle() 3248 - */ 3249 - smp_rmb(); 3250 - return event->pending.next == NULL; 3251 - } 3252 - 3253 - static void perf_pending_sync(struct perf_event *event) 3254 - { 3255 - wait_event(event->waitq, perf_not_pending(event)); 3256 - } 3257 - 3258 - void perf_event_do_pending(void) 3259 - { 3260 - __perf_pending_run(); 3261 3188 } 3262 3189 3263 3190 /* ··· 3226 3319 3227 3320 if (handle->nmi) { 3228 3321 handle->event->pending_wakeup = 1; 3229 - perf_pending_queue(&handle->event->pending, 3230 - perf_pending_event); 3322 + irq_work_queue(&handle->event->pending); 3231 3323 } else 3232 3324 perf_event_wakeup(handle->event); 3233 3325 } ··· 4262 4356 event->pending_kill = POLL_HUP; 4263 4357 if (nmi) { 4264 4358 event->pending_disable = 1; 4265 - perf_pending_queue(&event->pending, 4266 - perf_pending_event); 4359 + irq_work_queue(&event->pending); 4267 4360 } else 4268 4361 perf_event_disable(event); 4269 4362 } ··· 5279 5374 INIT_LIST_HEAD(&event->event_entry); 5280 5375 INIT_LIST_HEAD(&event->sibling_list); 5281 
5376 init_waitqueue_head(&event->waitq); 5377 + init_irq_work(&event->pending, perf_pending_event); 5282 5378 5283 5379 mutex_init(&event->mmap_mutex); 5284 5380
+5 -2
kernel/timer.c
··· 37 37 #include <linux/delay.h> 38 38 #include <linux/tick.h> 39 39 #include <linux/kallsyms.h> 40 - #include <linux/perf_event.h> 40 + #include <linux/irq_work.h> 41 41 #include <linux/sched.h> 42 42 #include <linux/slab.h> 43 43 ··· 1279 1279 run_local_timers(); 1280 1280 rcu_check_callbacks(cpu, user_tick); 1281 1281 printk_tick(); 1282 - perf_event_do_pending(); 1282 + #ifdef CONFIG_IRQ_WORK 1283 + if (in_irq()) 1284 + irq_work_run(); 1285 + #endif 1283 1286 scheduler_tick(); 1284 1287 run_posix_cpu_timers(p); 1285 1288 }