Merge tag 'smp-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

tjh.dev / kernel

fork atom

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork atom

Merge tag 'smp-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull SMP cross-CPU function-call updates from Ingo Molnar:

- Remove the extended CSD lock diagnostics and add a Kconfig option for the csdlock_debug= default

- Add a generic IPI-sending tracepoint, as currently there's no easy
way to instrument IPI origins: it's arch-dependent, and for some major
architectures it's not even consistently available.

* tag 'smp-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
trace,smp: Trace all smp_function_call*() invocations
trace: Add trace_ipi_send_cpu()
sched, smp: Trace smp callback causing an IPI
smp: reword smp call IPI comment
treewide: Trace IPIs sent via smp_send_reschedule()
irq_work: Trace self-IPIs sent via arch_irq_work_raise()
smp: Trace IPIs sent via arch_send_call_function_ipi_mask()
sched, smp: Trace IPIs sent via send_call_function_single_ipi()
trace: Add trace_ipi_send_cpumask()
kernel/smp: Make csdlock_debug= resettable
locking/csd_lock: Remove per-CPU data indirection from CSD lock debugging
locking/csd_lock: Remove added data from CSD lock debugging
locking/csd_lock: Add Kconfig option for csd_debug default

Linus Torvalds 3 years ago f20730ef 586b222d

+216 -280

33 changed files

expand all collapse all

Documentation

admin-guide

kernel-parameters.txt

arch

alpha

kernel

smp.c

arc

kernel

smp.c

arm

kernel

smp.c

mach-actions

platsmp.c

arm64

kernel

smp.c

csky

kernel

smp.c

hexagon

kernel

smp.c

ia64

kernel

smp.c

loongarch

kernel

smp.c

mips

include

asm

smp.h

openrisc

kernel

smp.c

parisc

kernel

smp.c

powerpc

kernel

smp.c

kvm

book3s_hv.c

platforms

powernv

subcore.c

riscv

kernel

smp.c

s390

kernel

smp.c

kernel

smp.c

sparc

kernel

smp_32.c

smp_64.c

x86

include

asm

smp.h

kvm

svm

svm.c

x86.c

xtensa

kernel

smp.c

include

linux

smp.h

trace

events

ipi.h

kernel

irq_work.c

sched

core.c

smp.h

smp.c

lib

Kconfig.debug

virt

kvm

kvm_main.c

+8 -9

Documentation/admin-guide/kernel-parameters.txt

reviewed

··· 912 912 cs89x0_media= [HW,NET] 913 913 Format: { rj45 | aui | bnc } 914 914 915 915 - csdlock_debug= [KNL] Enable debug add-ons of cross-CPU function call 916 916 - handling. When switched on, additional debug data is 917 917 - printed to the console in case a hanging CPU is 918 918 - detected, and that CPU is pinged again in order to try 919 919 - to resolve the hang situation. 920 920 - 0: disable csdlock debugging (default) 921 921 - 1: enable basic csdlock debugging (minor impact) 922 922 - ext: enable extended csdlock debugging (more impact, 923 923 - but more data) 915 915 + csdlock_debug= [KNL] Enable or disable debug add-ons of cross-CPU 916 916 + function call handling. When switched on, 917 917 + additional debug data is printed to the console 918 918 + in case a hanging CPU is detected, and that 919 919 + CPU is pinged again in order to try to resolve 920 920 + the hang situation. The default value of this 921 921 + option depends on the CSD_LOCK_WAIT_DEBUG_DEFAULT 922 922 + Kconfig option. 924 923 925 924 dasd= [HW,NET] 926 925 See header of drivers/s390/block/dasd_devmap.c.

+1 -1

arch/alpha/kernel/smp.c

reviewed

··· 562 562 } 563 563 564 564 void 565 565 - smp_send_reschedule(int cpu) 565 565 + arch_smp_send_reschedule(int cpu) 566 566 { 567 567 #ifdef DEBUG_IPI_MSG 568 568 if (cpu == hard_smp_processor_id())

+1 -1

arch/arc/kernel/smp.c

reviewed

··· 292 292 ipi_send_msg_one(cpu, msg); 293 293 } 294 294 295 295 - void smp_send_reschedule(int cpu) 295 295 + void arch_smp_send_reschedule(int cpu) 296 296 { 297 297 ipi_send_msg_one(cpu, IPI_RESCHEDULE); 298 298 }

+1 -2

arch/arm/kernel/smp.c

reviewed

··· 48 48 #include <asm/mach/arch.h> 49 49 #include <asm/mpu.h> 50 50 51 51 - #define CREATE_TRACE_POINTS 52 51 #include <trace/events/ipi.h> 53 52 54 53 /* ··· 748 749 ipi_setup(smp_processor_id()); 749 750 } 750 751 751 751 - void smp_send_reschedule(int cpu) 752 752 + void arch_smp_send_reschedule(int cpu) 752 753 { 753 754 smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); 754 755 }

arch/arm/mach-actions/platsmp.c

reviewed

··· 20 20 #include <asm/smp_plat.h> 21 21 #include <asm/smp_scu.h> 22 22 23 23 + #include <trace/events/ipi.h> 24 24 + 23 25 #define OWL_CPU1_ADDR 0x50 24 26 #define OWL_CPU1_FLAG 0x5c 25 27

+1 -2

arch/arm64/kernel/smp.c

reviewed

··· 51 51 #include <asm/ptrace.h> 52 52 #include <asm/virt.h> 53 53 54 54 - #define CREATE_TRACE_POINTS 55 54 #include <trace/events/ipi.h> 56 55 57 56 DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); ··· 978 979 ipi_setup(smp_processor_id()); 979 980 } 980 981 981 981 - void smp_send_reschedule(int cpu) 982 982 + void arch_smp_send_reschedule(int cpu) 982 983 { 983 984 smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); 984 985 }

+1 -1

arch/csky/kernel/smp.c

reviewed

··· 140 140 on_each_cpu(ipi_stop, NULL, 1); 141 141 } 142 142 143 143 - void smp_send_reschedule(int cpu) 143 143 + void arch_smp_send_reschedule(int cpu) 144 144 { 145 145 send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE); 146 146 }

+1 -1

arch/hexagon/kernel/smp.c

reviewed

··· 217 217 } 218 218 } 219 219 220 220 - void smp_send_reschedule(int cpu) 220 220 + void arch_smp_send_reschedule(int cpu) 221 221 { 222 222 send_ipi(cpumask_of(cpu), IPI_RESCHEDULE); 223 223 }

+2 -2

arch/ia64/kernel/smp.c

reviewed

··· 220 220 * Called with preemption disabled. 221 221 */ 222 222 void 223 223 - smp_send_reschedule (int cpu) 223 223 + arch_smp_send_reschedule (int cpu) 224 224 { 225 225 ia64_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); 226 226 } 227 227 - EXPORT_SYMBOL_GPL(smp_send_reschedule); 227 227 + EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); 228 228 229 229 /* 230 230 * Called with preemption disabled.

+2 -2

arch/loongarch/kernel/smp.c

reviewed

··· 155 155 * it goes straight through and wastes no time serializing 156 156 * anything. Worst case is that we lose a reschedule ... 157 157 */ 158 158 - void smp_send_reschedule(int cpu) 158 158 + void arch_smp_send_reschedule(int cpu) 159 159 { 160 160 loongson_send_ipi_single(cpu, SMP_RESCHEDULE); 161 161 } 162 162 - EXPORT_SYMBOL_GPL(smp_send_reschedule); 162 162 + EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); 163 163 164 164 irqreturn_t loongson_ipi_interrupt(int irq, void *dev) 165 165 {

+1 -1

arch/mips/include/asm/smp.h

reviewed

··· 66 66 * it goes straight through and wastes no time serializing 67 67 * anything. Worst case is that we lose a reschedule ... 68 68 */ 69 69 - static inline void smp_send_reschedule(int cpu) 69 69 + static inline void arch_smp_send_reschedule(int cpu) 70 70 { 71 71 extern const struct plat_smp_ops *mp_ops; /* private */ 72 72

+1 -1

arch/openrisc/kernel/smp.c

reviewed

··· 173 173 } 174 174 } 175 175 176 176 - void smp_send_reschedule(int cpu) 176 176 + void arch_smp_send_reschedule(int cpu) 177 177 { 178 178 smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); 179 179 }

+2 -2

arch/parisc/kernel/smp.c

reviewed

··· 246 246 inline void 247 247 smp_send_stop(void) { send_IPI_allbutself(IPI_CPU_STOP); } 248 248 249 249 - void 250 250 - smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); } 249 249 + void 250 250 + arch_smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); } 251 251 252 252 void 253 253 smp_send_all_nop(void)

+4 -2

arch/powerpc/kernel/smp.c

reviewed

··· 61 61 #include <asm/kup.h> 62 62 #include <asm/fadump.h> 63 63 64 64 + #include <trace/events/ipi.h> 65 65 + 64 66 #ifdef DEBUG 65 67 #include <asm/udbg.h> 66 68 #define DBG(fmt...) udbg_printf(fmt) ··· 366 364 #endif 367 365 } 368 366 369 369 - void smp_send_reschedule(int cpu) 367 367 + void arch_smp_send_reschedule(int cpu) 370 368 { 371 369 if (likely(smp_ops)) 372 370 do_message_pass(cpu, PPC_MSG_RESCHEDULE); 373 371 } 374 374 - EXPORT_SYMBOL_GPL(smp_send_reschedule); 372 372 + EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); 375 373 376 374 void arch_send_call_function_single_ipi(int cpu) 377 375 {

arch/powerpc/kvm/book3s_hv.c

reviewed

··· 43 43 #include <linux/compiler.h> 44 44 #include <linux/of.h> 45 45 #include <linux/irqdomain.h> 46 46 + #include <linux/smp.h> 46 47 47 48 #include <asm/ftrace.h> 48 49 #include <asm/reg.h> ··· 80 79 #include <asm/ultravisor.h> 81 80 #include <asm/dtl.h> 82 81 #include <asm/plpar_wrappers.h> 82 82 + 83 83 + #include <trace/events/ipi.h> 83 84 84 85 #include "book3s.h" 85 86 #include "book3s_hv.h"

arch/powerpc/platforms/powernv/subcore.c

reviewed

··· 20 20 #include <asm/opal.h> 21 21 #include <asm/smp.h> 22 22 23 23 + #include <trace/events/ipi.h> 24 24 + 23 25 #include "subcore.h" 24 26 #include "powernv.h" 25 27

+2 -2

arch/riscv/kernel/smp.c

reviewed

··· 333 333 } 334 334 #endif 335 335 336 336 - void smp_send_reschedule(int cpu) 336 336 + void arch_smp_send_reschedule(int cpu) 337 337 { 338 338 send_ipi_single(cpu, IPI_RESCHEDULE); 339 339 } 340 340 - EXPORT_SYMBOL_GPL(smp_send_reschedule); 340 340 + EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);

+1 -1

arch/s390/kernel/smp.c

reviewed

··· 553 553 * it goes straight through and wastes no time serializing 554 554 * anything. Worst case is that we lose a reschedule ... 555 555 */ 556 556 - void smp_send_reschedule(int cpu) 556 556 + void arch_smp_send_reschedule(int cpu) 557 557 { 558 558 pcpu_ec_call(pcpu_devices + cpu, ec_schedule); 559 559 }

+1 -1

arch/sh/kernel/smp.c

reviewed

··· 256 256 (bogosum / (5000/HZ)) % 100); 257 257 } 258 258 259 259 - void smp_send_reschedule(int cpu) 259 259 + void arch_smp_send_reschedule(int cpu) 260 260 { 261 261 mp_ops->send_ipi(cpu, SMP_MSG_RESCHEDULE); 262 262 }

+1 -1

arch/sparc/kernel/smp_32.c

reviewed

··· 120 120 121 121 struct linux_prom_registers smp_penguin_ctable = { 0 }; 122 122 123 123 - void smp_send_reschedule(int cpu) 123 123 + void arch_smp_send_reschedule(int cpu) 124 124 { 125 125 /* 126 126 * CPU model dependent way of implementing IPI generation targeting

+1 -1

arch/sparc/kernel/smp_64.c

reviewed

··· 1430 1430 return hv_err; 1431 1431 } 1432 1432 1433 1433 - void smp_send_reschedule(int cpu) 1433 1433 + void arch_smp_send_reschedule(int cpu) 1434 1434 { 1435 1435 if (cpu == smp_processor_id()) { 1436 1436 WARN_ON_ONCE(preemptible());

+1 -1

arch/x86/include/asm/smp.h

reviewed

··· 99 99 BUG(); 100 100 } 101 101 102 102 - static inline void smp_send_reschedule(int cpu) 102 102 + static inline void arch_smp_send_reschedule(int cpu) 103 103 { 104 104 smp_ops.smp_send_reschedule(cpu); 105 105 }

arch/x86/kvm/svm/svm.c

reviewed

··· 27 27 #include <linux/swap.h> 28 28 #include <linux/rwsem.h> 29 29 #include <linux/cc_platform.h> 30 30 + #include <linux/smp.h> 30 31 31 32 #include <asm/apic.h> 32 33 #include <asm/perf_event.h> ··· 42 41 #include <asm/fpu/api.h> 43 42 44 43 #include <asm/virtext.h> 44 44 + 45 45 + #include <trace/events/ipi.h> 46 46 + 45 47 #include "trace.h" 46 48 47 49 #include "svm.h"

arch/x86/kvm/x86.c

reviewed

··· 60 60 #include <linux/mem_encrypt.h> 61 61 #include <linux/entry-kvm.h> 62 62 #include <linux/suspend.h> 63 63 + #include <linux/smp.h> 63 64 65 65 + #include <trace/events/ipi.h> 64 66 #include <trace/events/kvm.h> 65 67 66 68 #include <asm/debugreg.h>

+1 -1

arch/xtensa/kernel/smp.c

reviewed

··· 391 391 send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC); 392 392 } 393 393 394 394 - void smp_send_reschedule(int cpu) 394 394 + void arch_smp_send_reschedule(int cpu) 395 395 { 396 396 send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE); 397 397 }

+9 -2

include/linux/smp.h

reviewed

··· 125 125 /* 126 126 * sends a 'reschedule' event to another CPU: 127 127 */ 128 128 - extern void smp_send_reschedule(int cpu); 129 129 - 128 128 + extern void arch_smp_send_reschedule(int cpu); 129 129 + /* 130 130 + * scheduler_ipi() is inline so can't be passed as callback reason, but the 131 131 + * callsite IP should be sufficient for root-causing IPIs sent from here. 132 132 + */ 133 133 + #define smp_send_reschedule(cpu) ({ \ 134 134 + trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ 135 135 + arch_smp_send_reschedule(cpu); \ 136 136 + }) 130 137 131 138 /* 132 139 * Prepare machine for booting other CPUs.

+44

include/trace/events/ipi.h

reviewed

··· 35 35 TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason) 36 36 ); 37 37 38 38 + TRACE_EVENT(ipi_send_cpu, 39 39 + 40 40 + TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback), 41 41 + 42 42 + TP_ARGS(cpu, callsite, callback), 43 43 + 44 44 + TP_STRUCT__entry( 45 45 + __field(unsigned int, cpu) 46 46 + __field(void *, callsite) 47 47 + __field(void *, callback) 48 48 + ), 49 49 + 50 50 + TP_fast_assign( 51 51 + __entry->cpu = cpu; 52 52 + __entry->callsite = (void *)callsite; 53 53 + __entry->callback = callback; 54 54 + ), 55 55 + 56 56 + TP_printk("cpu=%u callsite=%pS callback=%pS", 57 57 + __entry->cpu, __entry->callsite, __entry->callback) 58 58 + ); 59 59 + 60 60 + TRACE_EVENT(ipi_send_cpumask, 61 61 + 62 62 + TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback), 63 63 + 64 64 + TP_ARGS(cpumask, callsite, callback), 65 65 + 66 66 + TP_STRUCT__entry( 67 67 + __cpumask(cpumask) 68 68 + __field(void *, callsite) 69 69 + __field(void *, callback) 70 70 + ), 71 71 + 72 72 + TP_fast_assign( 73 73 + __assign_cpumask(cpumask, cpumask_bits(cpumask)); 74 74 + __entry->callsite = (void *)callsite; 75 75 + __entry->callback = callback; 76 76 + ), 77 77 + 78 78 + TP_printk("cpumask=%s callsite=%pS callback=%pS", 79 79 + __get_cpumask(cpumask), __entry->callsite, __entry->callback) 80 80 + ); 81 81 + 38 82 DECLARE_EVENT_CLASS(ipi_handler, 39 83 40 84 TP_PROTO(const char *reason),

+11 -1

kernel/irq_work.c

reviewed

··· 22 22 #include <asm/processor.h> 23 23 #include <linux/kasan.h> 24 24 25 25 + #include <trace/events/ipi.h> 26 26 + 25 27 static DEFINE_PER_CPU(struct llist_head, raised_list); 26 28 static DEFINE_PER_CPU(struct llist_head, lazy_list); 27 29 static DEFINE_PER_CPU(struct task_struct *, irq_workd); ··· 76 74 */ 77 75 } 78 76 77 77 + static __always_inline void irq_work_raise(struct irq_work *work) 78 78 + { 79 79 + if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) 80 80 + trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); 81 81 + 82 82 + arch_irq_work_raise(); 83 83 + } 84 84 + 79 85 /* Enqueue on current CPU, work must already be claimed and preempt disabled */ 80 86 static void __irq_work_queue_local(struct irq_work *work) 81 87 { ··· 109 99 110 100 /* If the work is "lazy", handle it from next tick if any */ 111 101 if (!lazy_work || tick_nohz_tick_stopped()) 112 112 - arch_irq_work_raise(); 102 102 + irq_work_raise(work); 113 103 } 114 104 115 105 /* Enqueue the irq work @work on the current CPU */

+16 -6

kernel/sched/core.c

reviewed

··· 80 80 #define CREATE_TRACE_POINTS 81 81 #include <linux/sched/rseq_api.h> 82 82 #include <trace/events/sched.h> 83 83 + #include <trace/events/ipi.h> 83 84 #undef CREATE_TRACE_POINTS 84 85 85 86 #include "sched.h" ··· 95 94 #include "../workqueue_internal.h" 96 95 #include "../../io_uring/io-wq.h" 97 96 #include "../smpboot.h" 97 97 + 98 98 + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); 99 99 + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); 98 100 99 101 /* 100 102 * Export tracepoints that act as a bare tracehook (ie: have no trace event ··· 3852 3848 rq_unlock_irqrestore(rq, &rf); 3853 3849 } 3854 3850 3855 3855 - void send_call_function_single_ipi(int cpu) 3851 3851 + /* 3852 3852 + * Prepare the scene for sending an IPI for a remote smp_call 3853 3853 + * 3854 3854 + * Returns true if the caller can proceed with sending the IPI. 3855 3855 + * Returns false otherwise. 3856 3856 + */ 3857 3857 + bool call_function_single_prep_ipi(int cpu) 3856 3858 { 3857 3857 - struct rq *rq = cpu_rq(cpu); 3858 3858 - 3859 3859 - if (!set_nr_if_polling(rq->idle)) 3860 3860 - arch_send_call_function_single_ipi(cpu); 3861 3861 - else 3859 3859 + if (set_nr_if_polling(cpu_rq(cpu)->idle)) { 3862 3860 trace_sched_wake_idle_without_ipi(cpu); 3861 3861 + return false; 3862 3862 + } 3863 3863 + 3864 3864 + return true; 3863 3865 } 3864 3866 3865 3867 /*

+1 -1

kernel/sched/smp.h

reviewed

··· 6 6 7 7 extern void sched_ttwu_pending(void *arg); 8 8 9 9 - extern void send_call_function_single_ipi(int cpu); 9 9 + extern bool call_function_single_prep_ipi(int cpu); 10 10 11 11 #ifdef CONFIG_SMP 12 12 extern void flush_smp_call_function_queue(void);

+76 -235

kernel/smp.c

reviewed

··· 26 26 #include <linux/sched/debug.h> 27 27 #include <linux/jump_label.h> 28 28 29 29 + #include <trace/events/ipi.h> 30 30 + 29 31 #include "smpboot.h" 30 32 #include "sched/smp.h" 31 33 32 34 #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) 33 35 34 34 - #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 35 35 - union cfd_seq_cnt { 36 36 - u64 val; 37 37 - struct { 38 38 - u64 src:16; 39 39 - u64 dst:16; 40 40 - #define CFD_SEQ_NOCPU 0xffff 41 41 - u64 type:4; 42 42 - #define CFD_SEQ_QUEUE 0 43 43 - #define CFD_SEQ_IPI 1 44 44 - #define CFD_SEQ_NOIPI 2 45 45 - #define CFD_SEQ_PING 3 46 46 - #define CFD_SEQ_PINGED 4 47 47 - #define CFD_SEQ_HANDLE 5 48 48 - #define CFD_SEQ_DEQUEUE 6 49 49 - #define CFD_SEQ_IDLE 7 50 50 - #define CFD_SEQ_GOTIPI 8 51 51 - #define CFD_SEQ_HDLEND 9 52 52 - u64 cnt:28; 53 53 - } u; 54 54 - }; 55 55 - 56 56 - static char *seq_type[] = { 57 57 - [CFD_SEQ_QUEUE] = "queue", 58 58 - [CFD_SEQ_IPI] = "ipi", 59 59 - [CFD_SEQ_NOIPI] = "noipi", 60 60 - [CFD_SEQ_PING] = "ping", 61 61 - [CFD_SEQ_PINGED] = "pinged", 62 62 - [CFD_SEQ_HANDLE] = "handle", 63 63 - [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)", 64 64 - [CFD_SEQ_IDLE] = "idle", 65 65 - [CFD_SEQ_GOTIPI] = "gotipi", 66 66 - [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)", 67 67 - }; 68 68 - 69 69 - struct cfd_seq_local { 70 70 - u64 ping; 71 71 - u64 pinged; 72 72 - u64 handle; 73 73 - u64 dequeue; 74 74 - u64 idle; 75 75 - u64 gotipi; 76 76 - u64 hdlend; 77 77 - }; 78 78 - #endif 79 79 - 80 80 - struct cfd_percpu { 81 81 - call_single_data_t csd; 82 82 - #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 83 83 - u64 seq_queue; 84 84 - u64 seq_ipi; 85 85 - u64 seq_noipi; 86 86 - #endif 87 87 - }; 88 88 - 89 36 struct call_function_data { 90 90 - struct cfd_percpu __percpu *pcpu; 37 37 + call_single_data_t __percpu *csd; 91 38 cpumask_var_t cpumask; 92 39 cpumask_var_t cpumask_ipi; 93 40 }; ··· 57 110 free_cpumask_var(cfd->cpumask); 58 111 return -ENOMEM; 59 112 } 60 60 - cfd->pcpu = 
alloc_percpu(struct cfd_percpu); 61 61 - if (!cfd->pcpu) { 113 113 + cfd->csd = alloc_percpu(call_single_data_t); 114 114 + if (!cfd->csd) { 62 115 free_cpumask_var(cfd->cpumask); 63 116 free_cpumask_var(cfd->cpumask_ipi); 64 117 return -ENOMEM; ··· 73 126 74 127 free_cpumask_var(cfd->cpumask); 75 128 free_cpumask_var(cfd->cpumask_ipi); 76 76 - free_percpu(cfd->pcpu); 129 129 + free_percpu(cfd->csd); 77 130 return 0; 78 131 } 79 132 ··· 103 156 smpcfd_prepare_cpu(smp_processor_id()); 104 157 } 105 158 159 159 + static __always_inline void 160 160 + send_call_function_single_ipi(int cpu) 161 161 + { 162 162 + if (call_function_single_prep_ipi(cpu)) { 163 163 + trace_ipi_send_cpu(cpu, _RET_IP_, 164 164 + generic_smp_call_function_single_interrupt); 165 165 + arch_send_call_function_single_ipi(cpu); 166 166 + } 167 167 + } 168 168 + 169 169 + static __always_inline void 170 170 + send_call_function_ipi_mask(struct cpumask *mask) 171 171 + { 172 172 + trace_ipi_send_cpumask(mask, _RET_IP_, 173 173 + generic_smp_call_function_single_interrupt); 174 174 + arch_send_call_function_ipi_mask(mask); 175 175 + } 176 176 + 106 177 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 107 178 108 108 - static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled); 109 109 - static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended); 179 179 + static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); 110 180 181 181 + /* 182 182 + * Parse the csdlock_debug= kernel boot parameter. 
183 183 + * 184 184 + * If you need to restore the old "ext" value that once provided 185 185 + * additional debugging information, reapply the following commits: 186 186 + * 187 187 + * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") 188 188 + * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") 189 189 + */ 111 190 static int __init csdlock_debug(char *str) 112 191 { 192 192 + int ret; 113 193 unsigned int val = 0; 114 194 115 115 - if (str && !strcmp(str, "ext")) { 116 116 - val = 1; 117 117 - static_branch_enable(&csdlock_debug_extended); 118 118 - } else 119 119 - get_option(&str, &val); 120 120 - 121 121 - if (val) 122 122 - static_branch_enable(&csdlock_debug_enabled); 195 195 + ret = get_option(&str, &val); 196 196 + if (ret) { 197 197 + if (val) 198 198 + static_branch_enable(&csdlock_debug_enabled); 199 199 + else 200 200 + static_branch_disable(&csdlock_debug_enabled); 201 201 + } 123 202 124 203 return 1; 125 204 } ··· 154 181 static DEFINE_PER_CPU(call_single_data_t *, cur_csd); 155 182 static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); 156 183 static DEFINE_PER_CPU(void *, cur_csd_info); 157 157 - static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); 158 184 159 185 static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. 
*/ 160 186 module_param(csd_lock_timeout, ulong, 0444); 161 187 162 188 static atomic_t csd_bug_count = ATOMIC_INIT(0); 163 163 - static u64 cfd_seq; 164 164 - 165 165 - #define CFD_SEQ(s, d, t, c) \ 166 166 - (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c } 167 167 - 168 168 - static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) 169 169 - { 170 170 - union cfd_seq_cnt new, old; 171 171 - 172 172 - new = CFD_SEQ(src, dst, type, 0); 173 173 - 174 174 - do { 175 175 - old.val = READ_ONCE(cfd_seq); 176 176 - new.u.cnt = old.u.cnt + 1; 177 177 - } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val); 178 178 - 179 179 - return old.val; 180 180 - } 181 181 - 182 182 - #define cfd_seq_store(var, src, dst, type) \ 183 183 - do { \ 184 184 - if (static_branch_unlikely(&csdlock_debug_extended)) \ 185 185 - var = cfd_seq_inc(src, dst, type); \ 186 186 - } while (0) 187 189 188 190 /* Record current CSD work for current CPU, NULL to erase. */ 189 191 static void __csd_lock_record(struct __call_single_data *csd) ··· 190 242 if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) 191 243 return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ 192 244 return -1; 193 193 - } 194 194 - 195 195 - static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst, 196 196 - unsigned int type, union cfd_seq_cnt *data, 197 197 - unsigned int *n_data, unsigned int now) 198 198 - { 199 199 - union cfd_seq_cnt new[2]; 200 200 - unsigned int i, j, k; 201 201 - 202 202 - new[0].val = val; 203 203 - new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1); 204 204 - 205 205 - for (i = 0; i < 2; i++) { 206 206 - if (new[i].u.cnt <= now) 207 207 - new[i].u.cnt |= 0x80000000U; 208 208 - for (j = 0; j < *n_data; j++) { 209 209 - if (new[i].u.cnt == data[j].u.cnt) { 210 210 - /* Direct read value trumps generated one. 
*/ 211 211 - if (i == 0) 212 212 - data[j].val = new[i].val; 213 213 - break; 214 214 - } 215 215 - if (new[i].u.cnt < data[j].u.cnt) { 216 216 - for (k = *n_data; k > j; k--) 217 217 - data[k].val = data[k - 1].val; 218 218 - data[j].val = new[i].val; 219 219 - (*n_data)++; 220 220 - break; 221 221 - } 222 222 - } 223 223 - if (j == *n_data) { 224 224 - data[j].val = new[i].val; 225 225 - (*n_data)++; 226 226 - } 227 227 - } 228 228 - } 229 229 - 230 230 - static const char *csd_lock_get_type(unsigned int type) 231 231 - { 232 232 - return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; 233 233 - } 234 234 - 235 235 - static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) 236 236 - { 237 237 - struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); 238 238 - unsigned int srccpu = csd->node.src; 239 239 - struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu); 240 240 - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); 241 241 - unsigned int now; 242 242 - union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)]; 243 243 - unsigned int n_data = 0, i; 244 244 - 245 245 - data[0].val = READ_ONCE(cfd_seq); 246 246 - now = data[0].u.cnt; 247 247 - 248 248 - cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now); 249 249 - cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now); 250 250 - cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now); 251 251 - 252 252 - cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now); 253 253 - cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now); 254 254 - 255 255 - cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now); 256 256 - cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now); 257 257 - cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, 
CFD_SEQ_HANDLE, data, &n_data, now); 258 258 - cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now); 259 259 - cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now); 260 260 - 261 261 - for (i = 0; i < n_data; i++) { 262 262 - pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n", 263 263 - data[i].u.cnt & ~0x80000000U, data[i].u.src, 264 264 - data[i].u.dst, csd_lock_get_type(data[i].u.type)); 265 265 - } 266 266 - pr_alert("\tcsd: cnt now: %07x\n", now); 267 245 } 268 246 269 247 /* ··· 242 368 *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); 243 369 } 244 370 if (cpu >= 0) { 245 245 - if (static_branch_unlikely(&csdlock_debug_extended)) 246 246 - csd_lock_print_extended(csd, cpu); 247 371 dump_cpu_task(cpu); 248 372 if (!cpu_cur_csd) { 249 373 pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); ··· 284 412 285 413 smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); 286 414 } 287 287 - 288 288 - static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) 289 289 - { 290 290 - unsigned int this_cpu = smp_processor_id(); 291 291 - struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local); 292 292 - struct call_function_data *cfd = this_cpu_ptr(&cfd_data); 293 293 - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); 294 294 - 295 295 - cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); 296 296 - if (llist_add(node, &per_cpu(call_single_queue, cpu))) { 297 297 - cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); 298 298 - cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING); 299 299 - send_call_function_single_ipi(cpu); 300 300 - cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED); 301 301 - } else { 302 302 - cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); 303 303 - } 304 304 - } 305 415 #else 306 306 - #define cfd_seq_store(var, src, dst, type) 307 307 - 308 
416 static void csd_lock_record(struct __call_single_data *csd) 309 417 { 310 418 } ··· 322 470 323 471 void __smp_call_single_queue(int cpu, struct llist_node *node) 324 472 { 325 325 - #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 326 326 - if (static_branch_unlikely(&csdlock_debug_extended)) { 327 327 - unsigned int type; 473 473 + /* 474 474 + * We have to check the type of the CSD before queueing it, because 475 475 + * once queued it can have its flags cleared by 476 476 + * flush_smp_call_function_queue() 477 477 + * even if we haven't sent the smp_call IPI yet (e.g. the stopper 478 478 + * executes migration_cpu_stop() on the remote CPU). 479 479 + */ 480 480 + if (trace_ipi_send_cpu_enabled()) { 481 481 + call_single_data_t *csd; 482 482 + smp_call_func_t func; 328 483 329 329 - type = CSD_TYPE(container_of(node, call_single_data_t, 330 330 - node.llist)); 331 331 - if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) { 332 332 - __smp_call_single_queue_debug(cpu, node); 333 333 - return; 334 334 - } 484 484 + csd = container_of(node, call_single_data_t, node.llist); 485 485 + func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? 486 486 + sched_ttwu_pending : csd->func; 487 487 + 488 488 + trace_ipi_send_cpu(cpu, _RET_IP_, func); 335 489 } 336 336 - #endif 337 490 338 491 /* 339 339 - * The list addition should be visible before sending the IPI 340 340 - * handler locks the list to pull the entry off it because of 341 341 - * normal cache coherency rules implied by spinlocks. 492 492 + * The list addition should be visible to the target CPU when it pops 493 493 + * the head of the list to pull the entry off it in the IPI handler 494 494 + * because of normal cache coherency rules implied by the underlying 495 495 + * llist ops. 
342 496 * 343 497 * If IPIs can go out of order to the cache coherency protocol 344 498 * in an architecture, sufficient synchronisation should be added ··· 399 541 */ 400 542 void generic_smp_call_function_single_interrupt(void) 401 543 { 402 402 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, 403 403 - smp_processor_id(), CFD_SEQ_GOTIPI); 404 544 __flush_smp_call_function_queue(true); 405 545 } 406 546 ··· 426 570 lockdep_assert_irqs_disabled(); 427 571 428 572 head = this_cpu_ptr(&call_single_queue); 429 429 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU, 430 430 - smp_processor_id(), CFD_SEQ_HANDLE); 431 573 entry = llist_del_all(head); 432 432 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue, 433 433 - /* Special meaning of source cpu: 0 == queue empty */ 434 434 - entry ? CFD_SEQ_NOCPU : 0, 435 435 - smp_processor_id(), CFD_SEQ_DEQUEUE); 436 574 entry = llist_reverse_order(entry); 437 575 438 576 /* There shouldn't be any pending callbacks on an offline CPU. */ ··· 485 635 } 486 636 } 487 637 488 488 - if (!entry) { 489 489 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, 490 490 - 0, smp_processor_id(), 491 491 - CFD_SEQ_HDLEND); 638 638 + if (!entry) 492 639 return; 493 493 - } 494 640 495 641 /* 496 642 * Second; run all !SYNC callbacks. 
··· 524 678 */ 525 679 if (entry) 526 680 sched_ttwu_pending(entry); 527 527 - 528 528 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU, 529 529 - smp_processor_id(), CFD_SEQ_HDLEND); 530 681 } 531 682 532 683 ··· 547 704 if (llist_empty(this_cpu_ptr(&call_single_queue))) 548 705 return; 549 706 550 550 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, 551 551 - smp_processor_id(), CFD_SEQ_IDLE); 552 707 local_irq_save(flags); 553 708 /* Get the already pending soft interrupts for RT enabled kernels */ 554 709 was_pending = local_softirq_pending(); ··· 728 887 int cpu, last_cpu, this_cpu = smp_processor_id(); 729 888 struct call_function_data *cfd; 730 889 bool wait = scf_flags & SCF_WAIT; 890 890 + int nr_cpus = 0, nr_queued = 0; 731 891 bool run_remote = false; 732 892 bool run_local = false; 733 733 - int nr_cpus = 0; 734 893 735 894 lockdep_assert_preemption_disabled(); 736 895 ··· 770 929 771 930 cpumask_clear(cfd->cpumask_ipi); 772 931 for_each_cpu(cpu, cfd->cpumask) { 773 773 - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); 774 774 - call_single_data_t *csd = &pcpu->csd; 932 932 + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); 775 933 776 776 - if (cond_func && !cond_func(cpu, info)) 934 934 + if (cond_func && !cond_func(cpu, info)) { 935 935 + __cpumask_clear_cpu(cpu, cfd->cpumask); 777 936 continue; 937 937 + } 778 938 779 939 csd_lock(csd); 780 940 if (wait) ··· 786 944 csd->node.src = smp_processor_id(); 787 945 csd->node.dst = cpu; 788 946 #endif 789 789 - cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); 790 947 if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { 791 948 __cpumask_set_cpu(cpu, cfd->cpumask_ipi); 792 949 nr_cpus++; 793 950 last_cpu = cpu; 794 794 - 795 795 - cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); 796 796 - } else { 797 797 - cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); 798 951 } 952 952 + nr_queued++; 799 953 } 800 954 801 
801 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING); 955 955 + /* 956 956 + * Trace each smp_function_call_*() as an IPI, actual IPIs 957 957 + * will be traced with func==generic_smp_call_function_single_interrupt(). 958 958 + */ 959 959 + if (nr_queued) 960 960 + trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func); 802 961 803 962 /* 804 963 * Choose the most efficient way to send an IPI. Note that the ··· 809 966 if (nr_cpus == 1) 810 967 send_call_function_single_ipi(last_cpu); 811 968 else if (likely(nr_cpus > 1)) 812 812 - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); 813 813 - 814 814 - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED); 969 969 + send_call_function_ipi_mask(cfd->cpumask_ipi); 815 970 } 816 971 817 972 if (run_local && (!cond_func || cond_func(this_cpu, info))) { ··· 824 983 for_each_cpu(cpu, cfd->cpumask) { 825 984 call_single_data_t *csd; 826 985 827 827 - csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; 986 986 + csd = per_cpu_ptr(cfd->csd, cpu); 828 987 csd_lock_wait(csd); 829 988 } 830 989 }

lib/Kconfig.debug

reviewed

··· 1490 1490 include the IPI handler function currently executing (if any) 1491 1491 and relevant stack traces. 1492 1492 1493 1493 + config CSD_LOCK_WAIT_DEBUG_DEFAULT 1494 1494 + bool "Default csd_lock_wait() debugging on at boot time" 1495 1495 + depends on CSD_LOCK_WAIT_DEBUG 1496 1496 + depends on 64BIT 1497 1497 + default n 1498 1498 + help 1499 1499 + This option causes the csdlock_debug= kernel boot parameter to 1500 1500 + default to 1 (basic debugging) instead of 0 (no debugging). 1501 1501 + 1493 1502 endmenu # lock debugging 1494 1503 1495 1504 config TRACE_IRQFLAGS

virt/kvm/kvm_main.c

reviewed

··· 62 62 #include "kvm_mm.h" 63 63 #include "vfio.h" 64 64 65 65 + #include <trace/events/ipi.h> 66 66 + 65 67 #define CREATE_TRACE_POINTS 66 68 #include <trace/events/kvm.h> 67 69 68 70 #include <linux/kvm_dirty_ring.h> 71 71 + 69 72 70 73 /* Worst case buffer size needed for holding an integer. */ 71 74 #define ITOA_MAX_LEN 12