Merge tag 'timers-core-2024-03-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

-6

Documentation/admin-guide/kernel-parameters.txt

··· 680 680 loops can be debugged more effectively on production 681 681 systems. 682 682 683 - clocksource.max_cswd_read_retries= [KNL] 684 - Number of clocksource_watchdog() retries due to 685 - external delays before the clock will be marked 686 - unstable. Defaults to two retries, that is, 687 - three attempts to read the clock under test. 688 - 689 683 clocksource.verify_n_cpus= [KNL] 690 684 Limit the number of CPUs checked for clocksources 691 685 marked with CLOCK_SOURCE_VERIFY_PERCPU that

+1

MAINTAINERS

··· 17503 17503 F: fs/timerfd.c 17504 17504 F: include/linux/time_namespace.h 17505 17505 F: include/linux/timer* 17506 + F: include/trace/events/timer* 17506 17507 F: kernel/time/*timer* 17507 17508 F: kernel/time/namespace.c 17508 17509

-1

arch/arm/include/asm/elf.h

··· 4 4 5 5 #include <asm/auxvec.h> 6 6 #include <asm/hwcap.h> 7 - #include <asm/vdso_datapage.h> 8 7 9 8 /* 10 9 * ELF register definitions..

-26

arch/arm/include/asm/vdso_datapage.h

··· 1 - /* SPDX-License-Identifier: GPL-2.0-only */ 2 - /* 3 - * Adapted from arm64 version. 4 - * 5 - * Copyright (C) 2012 ARM Limited 6 - */ 7 - #ifndef __ASM_VDSO_DATAPAGE_H 8 - #define __ASM_VDSO_DATAPAGE_H 9 - 10 - #ifdef __KERNEL__ 11 - 12 - #ifndef __ASSEMBLY__ 13 - 14 - #include <vdso/datapage.h> 15 - #include <asm/page.h> 16 - 17 - union vdso_data_store { 18 - struct vdso_data data[CS_BASES]; 19 - u8 page[PAGE_SIZE]; 20 - }; 21 - 22 - #endif /* !__ASSEMBLY__ */ 23 - 24 - #endif /* __KERNEL__ */ 25 - 26 - #endif /* __ASM_VDSO_DATAPAGE_H */

+3 -1

arch/arm/kernel/asm-offsets.c

··· 21 21 #include <asm/mpu.h> 22 22 #include <asm/procinfo.h> 23 23 #include <asm/suspend.h> 24 - #include <asm/vdso_datapage.h> 25 24 #include <asm/hardware/cache-l2x0.h> 26 25 #include <linux/kbuild.h> 27 26 #include <linux/arm-smccc.h> 27 + 28 + #include <vdso/datapage.h> 29 + 28 30 #include "signal.h" 29 31 30 32 /*

-4

arch/arm/kernel/vdso.c

··· 21 21 #include <asm/cacheflush.h> 22 22 #include <asm/page.h> 23 23 #include <asm/vdso.h> 24 - #include <asm/vdso_datapage.h> 25 24 #include <clocksource/arm_arch_timer.h> 26 25 #include <vdso/helpers.h> 27 26 #include <vdso/vsyscall.h> ··· 34 35 /* Total number of pages needed for the data and text portions of the VDSO. */ 35 36 unsigned int vdso_total_pages __ro_after_init; 36 37 37 - /* 38 - * The VDSO data page. 39 - */ 40 38 static union vdso_data_store vdso_data_store __page_aligned_data; 41 39 struct vdso_data *vdso_data = vdso_data_store.data; 42 40

+1 -4

arch/arm64/kernel/vdso.c

··· 69 69 /* 70 70 * The vDSO data page. 71 71 */ 72 - static union { 73 - struct vdso_data data[CS_BASES]; 74 - u8 page[PAGE_SIZE]; 75 - } vdso_data_store __page_aligned_data; 72 + static union vdso_data_store vdso_data_store __page_aligned_data; 76 73 struct vdso_data *vdso_data = vdso_data_store.data; 77 74 78 75 static int vdso_mremap(const struct vm_special_mapping *sm,

-5

arch/csky/include/asm/vdso.h

··· 5 5 6 6 #include <linux/types.h> 7 7 8 - #ifndef GENERIC_TIME_VSYSCALL 9 - struct vdso_data { 10 - }; 11 - #endif 12 - 13 8 /* 14 9 * The VDSO symbols are mapped into Linux so we can just use regular symbol 15 10 * addressing to get their offsets in userspace. The symbols are mapped at an

+2 -12

arch/csky/kernel/vdso.c

··· 8 8 #include <linux/slab.h> 9 9 10 10 #include <asm/page.h> 11 - #ifdef GENERIC_TIME_VSYSCALL 12 11 #include <vdso/datapage.h> 13 - #else 14 - #include <asm/vdso.h> 15 - #endif 16 12 17 13 extern char vdso_start[], vdso_end[]; 18 14 19 15 static unsigned int vdso_pages; 20 16 static struct page **vdso_pagelist; 21 17 22 - /* 23 - * The vDSO data page. 24 - */ 25 - static union { 26 - struct vdso_data data; 27 - u8 page[PAGE_SIZE]; 28 - } vdso_data_store __page_aligned_data; 29 - struct vdso_data *vdso_data = &vdso_data_store.data; 18 + static union vdso_data_store vdso_data_store __page_aligned_data; 19 + struct vdso_data *vdso_data = vdso_data_store.data; 30 20 31 21 static int __init vdso_init(void) 32 22 {

+2 -4

arch/loongarch/kernel/vdso.c

··· 21 21 #include <asm/vdso.h> 22 22 #include <vdso/helpers.h> 23 23 #include <vdso/vsyscall.h> 24 + #include <vdso/datapage.h> 24 25 #include <generated/vdso-offsets.h> 25 26 26 27 extern char vdso_start[], vdso_end[]; 27 28 28 29 /* Kernel-provided data used by the VDSO. */ 29 - static union { 30 - u8 page[PAGE_SIZE]; 31 - struct vdso_data data[CS_BASES]; 32 - } generic_vdso_data __page_aligned_data; 30 + static union vdso_data_store generic_vdso_data __page_aligned_data; 33 31 34 32 static union { 35 33 u8 page[LOONGARCH_VDSO_DATA_SIZE];

-5

arch/mips/include/asm/vdso.h

··· 50 50 extern struct mips_vdso_image vdso_image_n32; 51 51 #endif 52 52 53 - union mips_vdso_data { 54 - struct vdso_data data[CS_BASES]; 55 - u8 page[PAGE_SIZE]; 56 - }; 57 - 58 53 #endif /* __ASM_VDSO_H */

+1 -1

arch/mips/kernel/vdso.c

··· 24 24 #include <vdso/vsyscall.h> 25 25 26 26 /* Kernel-provided data used by the VDSO. */ 27 - static union mips_vdso_data mips_vdso_data __page_aligned_data; 27 + static union vdso_data_store mips_vdso_data __page_aligned_data; 28 28 struct vdso_data *vdso_data = mips_vdso_data.data; 29 29 30 30 /*

+2 -8

arch/riscv/kernel/vdso.c

··· 30 30 31 31 #define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) 32 32 33 - /* 34 - * The vDSO data page. 35 - */ 36 - static union { 37 - struct vdso_data data; 38 - u8 page[PAGE_SIZE]; 39 - } vdso_data_store __page_aligned_data; 40 - struct vdso_data *vdso_data = &vdso_data_store.data; 33 + static union vdso_data_store vdso_data_store __page_aligned_data; 34 + struct vdso_data *vdso_data = vdso_data_store.data; 41 35 42 36 struct __vdso_info { 43 37 const char *name;

-1

arch/s390/include/asm/vdso/data.h

··· 3 3 #define __S390_ASM_VDSO_DATA_H 4 4 5 5 #include <linux/types.h> 6 - #include <vdso/datapage.h> 7 6 8 7 struct arch_vdso_data { 9 8 __s64 tod_steering_delta;

+1 -4

arch/s390/kernel/vdso.c

··· 25 25 26 26 static struct vm_special_mapping vvar_mapping; 27 27 28 - static union { 29 - struct vdso_data data[CS_BASES]; 30 - u8 page[PAGE_SIZE]; 31 - } vdso_data_store __page_aligned_data; 28 + static union vdso_data_store vdso_data_store __page_aligned_data; 32 29 33 30 struct vdso_data *vdso_data = vdso_data_store.data; 34 31

+13 -1

include/linux/clocksource.h

··· 291 291 #define TIMER_ACPI_DECLARE(name, table_id, fn) \ 292 292 ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn) 293 293 294 - extern ulong max_cswd_read_retries; 294 + static inline unsigned int clocksource_get_max_watchdog_retry(void) 295 + { 296 + /* 297 + * When the system is in the boot phase or under heavy workload, there 298 + * can be random big latencies during the clocksource/watchdog 299 + * read, so allow retries to filter the noise latency. As the 300 + * latency's frequency and maximum value go up with the number of 301 + * CPUs, scale the number of retries with the number of online 302 + * CPUs. 303 + */ 304 + return (ilog2(num_online_cpus()) / 2) + 1; 305 + } 306 + 295 307 void clocksource_verify_percpu(struct clocksource *cs); 296 308 297 309 #endif /* _LINUX_CLOCKSOURCE_H */

+2

include/linux/cpuhotplug.h

··· 184 184 CPUHP_AP_ARM64_ISNDEP_STARTING, 185 185 CPUHP_AP_SMPCFD_DYING, 186 186 CPUHP_AP_HRTIMERS_DYING, 187 + CPUHP_AP_TICK_DYING, 187 188 CPUHP_AP_X86_TBOOT_DYING, 188 189 CPUHP_AP_ARM_CACHE_B15_RAC_DYING, 189 190 CPUHP_AP_ONLINE, ··· 232 231 CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 233 232 CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, 234 233 CPUHP_AP_PERF_CSKY_ONLINE, 234 + CPUHP_AP_TMIGR_ONLINE, 235 235 CPUHP_AP_WATCHDOG_ONLINE, 236 236 CPUHP_AP_WORKQUEUE_ONLINE, 237 237 CPUHP_AP_RANDOM_ONLINE,

+3 -116

include/linux/hrtimer.h

··· 18 18 #include <linux/list.h> 19 19 #include <linux/percpu-defs.h> 20 20 #include <linux/rbtree.h> 21 - #include <linux/seqlock.h> 22 21 #include <linux/timer.h> 23 - 24 - struct hrtimer_clock_base; 25 - struct hrtimer_cpu_base; 26 22 27 23 /* 28 24 * Mode arguments of xxx_hrtimer functions: ··· 93 97 struct hrtimer timer; 94 98 struct task_struct *task; 95 99 }; 96 - 97 - #ifdef CONFIG_64BIT 98 - # define __hrtimer_clock_base_align ____cacheline_aligned 99 - #else 100 - # define __hrtimer_clock_base_align 101 - #endif 102 - 103 - /** 104 - * struct hrtimer_clock_base - the timer base for a specific clock 105 - * @cpu_base: per cpu clock base 106 - * @index: clock type index for per_cpu support when moving a 107 - * timer to a base on another cpu. 108 - * @clockid: clock id for per_cpu support 109 - * @seq: seqcount around __run_hrtimer 110 - * @running: pointer to the currently running hrtimer 111 - * @active: red black tree root node for the active timers 112 - * @get_time: function to retrieve the current time of the clock 113 - * @offset: offset of this clock to the monotonic base 114 - */ 115 - struct hrtimer_clock_base { 116 - struct hrtimer_cpu_base *cpu_base; 117 - unsigned int index; 118 - clockid_t clockid; 119 - seqcount_raw_spinlock_t seq; 120 - struct hrtimer *running; 121 - struct timerqueue_head active; 122 - ktime_t (*get_time)(void); 123 - ktime_t offset; 124 - } __hrtimer_clock_base_align; 125 - 126 - enum hrtimer_base_type { 127 - HRTIMER_BASE_MONOTONIC, 128 - HRTIMER_BASE_REALTIME, 129 - HRTIMER_BASE_BOOTTIME, 130 - HRTIMER_BASE_TAI, 131 - HRTIMER_BASE_MONOTONIC_SOFT, 132 - HRTIMER_BASE_REALTIME_SOFT, 133 - HRTIMER_BASE_BOOTTIME_SOFT, 134 - HRTIMER_BASE_TAI_SOFT, 135 - HRTIMER_MAX_CLOCK_BASES, 136 - }; 137 - 138 - /** 139 - * struct hrtimer_cpu_base - the per cpu clock bases 140 - * @lock: lock protecting the base and associated clock bases 141 - * and timers 142 - * @cpu: cpu number 143 - * @active_bases: Bitfield to mark bases with active 
timers 144 - * @clock_was_set_seq: Sequence counter of clock was set events 145 - * @hres_active: State of high resolution mode 146 - * @in_hrtirq: hrtimer_interrupt() is currently executing 147 - * @hang_detected: The last hrtimer interrupt detected a hang 148 - * @softirq_activated: displays, if the softirq is raised - update of softirq 149 - * related settings is not required then. 150 - * @nr_events: Total number of hrtimer interrupt events 151 - * @nr_retries: Total number of hrtimer interrupt retries 152 - * @nr_hangs: Total number of hrtimer interrupt hangs 153 - * @max_hang_time: Maximum time spent in hrtimer_interrupt 154 - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are 155 - * expired 156 - * @online: CPU is online from an hrtimers point of view 157 - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer 158 - * callback to finish. 159 - * @expires_next: absolute time of the next event, is required for remote 160 - * hrtimer enqueue; it is the total first expiry time (hard 161 - * and soft hrtimer are taken into account) 162 - * @next_timer: Pointer to the first expiring timer 163 - * @softirq_expires_next: Time to check, if soft queues needs also to be expired 164 - * @softirq_next_timer: Pointer to the first expiring softirq based timer 165 - * @clock_base: array of clock bases for this cpu 166 - * 167 - * Note: next_timer is just an optimization for __remove_hrtimer(). 168 - * Do not dereference the pointer because it is not reliable on 169 - * cross cpu removals. 
170 - */ 171 - struct hrtimer_cpu_base { 172 - raw_spinlock_t lock; 173 - unsigned int cpu; 174 - unsigned int active_bases; 175 - unsigned int clock_was_set_seq; 176 - unsigned int hres_active : 1, 177 - in_hrtirq : 1, 178 - hang_detected : 1, 179 - softirq_activated : 1, 180 - online : 1; 181 - #ifdef CONFIG_HIGH_RES_TIMERS 182 - unsigned int nr_events; 183 - unsigned short nr_retries; 184 - unsigned short nr_hangs; 185 - unsigned int max_hang_time; 186 - #endif 187 - #ifdef CONFIG_PREEMPT_RT 188 - spinlock_t softirq_expiry_lock; 189 - atomic_t timer_waiters; 190 - #endif 191 - ktime_t expires_next; 192 - struct hrtimer *next_timer; 193 - ktime_t softirq_expires_next; 194 - struct hrtimer *softirq_next_timer; 195 - struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; 196 - } ____cacheline_aligned; 197 100 198 101 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) 199 102 { ··· 342 447 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); 343 448 344 449 /** 345 - * hrtimer_forward_now - forward the timer expiry so it expires after now 450 + * hrtimer_forward_now() - forward the timer expiry so it expires after now 346 451 * @timer: hrtimer to forward 347 452 * @interval: the interval to forward 348 453 * 349 - * Forward the timer expiry so it will expire after the current time 350 - * of the hrtimer clock base. Returns the number of overruns. 351 - * 352 - * Can be safely called from the callback function of @timer. If 353 - * called from other contexts @timer must neither be enqueued nor 354 - * running the callback and the caller needs to take care of 355 - * serialization. 356 - * 357 - * Note: This only updates the timer expiry value and does not requeue 358 - * the timer. 454 + * It is a variant of hrtimer_forward(). The timer will expire after the current 455 + * time of the hrtimer clock base. See hrtimer_forward() for details. 
359 456 */ 360 457 static inline u64 hrtimer_forward_now(struct hrtimer *timer, 361 458 ktime_t interval)

+104

include/linux/hrtimer_defs.h

··· 3 3 #define _LINUX_HRTIMER_DEFS_H 4 4 5 5 #include <linux/ktime.h> 6 + #include <linux/timerqueue.h> 7 + #include <linux/seqlock.h> 6 8 7 9 #ifdef CONFIG_HIGH_RES_TIMERS 8 10 ··· 25 23 # define KTIME_MONOTONIC_RES KTIME_LOW_RES 26 24 27 25 #endif 26 + 27 + #ifdef CONFIG_64BIT 28 + # define __hrtimer_clock_base_align ____cacheline_aligned 29 + #else 30 + # define __hrtimer_clock_base_align 31 + #endif 32 + 33 + /** 34 + * struct hrtimer_clock_base - the timer base for a specific clock 35 + * @cpu_base: per cpu clock base 36 + * @index: clock type index for per_cpu support when moving a 37 + * timer to a base on another cpu. 38 + * @clockid: clock id for per_cpu support 39 + * @seq: seqcount around __run_hrtimer 40 + * @running: pointer to the currently running hrtimer 41 + * @active: red black tree root node for the active timers 42 + * @get_time: function to retrieve the current time of the clock 43 + * @offset: offset of this clock to the monotonic base 44 + */ 45 + struct hrtimer_clock_base { 46 + struct hrtimer_cpu_base *cpu_base; 47 + unsigned int index; 48 + clockid_t clockid; 49 + seqcount_raw_spinlock_t seq; 50 + struct hrtimer *running; 51 + struct timerqueue_head active; 52 + ktime_t (*get_time)(void); 53 + ktime_t offset; 54 + } __hrtimer_clock_base_align; 55 + 56 + enum hrtimer_base_type { 57 + HRTIMER_BASE_MONOTONIC, 58 + HRTIMER_BASE_REALTIME, 59 + HRTIMER_BASE_BOOTTIME, 60 + HRTIMER_BASE_TAI, 61 + HRTIMER_BASE_MONOTONIC_SOFT, 62 + HRTIMER_BASE_REALTIME_SOFT, 63 + HRTIMER_BASE_BOOTTIME_SOFT, 64 + HRTIMER_BASE_TAI_SOFT, 65 + HRTIMER_MAX_CLOCK_BASES, 66 + }; 67 + 68 + /** 69 + * struct hrtimer_cpu_base - the per cpu clock bases 70 + * @lock: lock protecting the base and associated clock bases 71 + * and timers 72 + * @cpu: cpu number 73 + * @active_bases: Bitfield to mark bases with active timers 74 + * @clock_was_set_seq: Sequence counter of clock was set events 75 + * @hres_active: State of high resolution mode 76 + * @in_hrtirq: 
hrtimer_interrupt() is currently executing 77 + * @hang_detected: The last hrtimer interrupt detected a hang 78 + * @softirq_activated: displays, if the softirq is raised - update of softirq 79 + * related settings is not required then. 80 + * @nr_events: Total number of hrtimer interrupt events 81 + * @nr_retries: Total number of hrtimer interrupt retries 82 + * @nr_hangs: Total number of hrtimer interrupt hangs 83 + * @max_hang_time: Maximum time spent in hrtimer_interrupt 84 + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are 85 + * expired 86 + * @online: CPU is online from an hrtimers point of view 87 + * @timer_waiters: A hrtimer_cancel() invocation waits for the timer 88 + * callback to finish. 89 + * @expires_next: absolute time of the next event, is required for remote 90 + * hrtimer enqueue; it is the total first expiry time (hard 91 + * and soft hrtimer are taken into account) 92 + * @next_timer: Pointer to the first expiring timer 93 + * @softirq_expires_next: Time to check, if soft queues needs also to be expired 94 + * @softirq_next_timer: Pointer to the first expiring softirq based timer 95 + * @clock_base: array of clock bases for this cpu 96 + * 97 + * Note: next_timer is just an optimization for __remove_hrtimer(). 98 + * Do not dereference the pointer because it is not reliable on 99 + * cross cpu removals. 
100 + */ 101 + struct hrtimer_cpu_base { 102 + raw_spinlock_t lock; 103 + unsigned int cpu; 104 + unsigned int active_bases; 105 + unsigned int clock_was_set_seq; 106 + unsigned int hres_active : 1, 107 + in_hrtirq : 1, 108 + hang_detected : 1, 109 + softirq_activated : 1, 110 + online : 1; 111 + #ifdef CONFIG_HIGH_RES_TIMERS 112 + unsigned int nr_events; 113 + unsigned short nr_retries; 114 + unsigned short nr_hangs; 115 + unsigned int max_hang_time; 116 + #endif 117 + #ifdef CONFIG_PREEMPT_RT 118 + spinlock_t softirq_expiry_lock; 119 + atomic_t timer_waiters; 120 + #endif 121 + ktime_t expires_next; 122 + struct hrtimer *next_timer; 123 + ktime_t softirq_expires_next; 124 + struct hrtimer *softirq_next_timer; 125 + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; 126 + } ____cacheline_aligned; 127 + 28 128 29 129 #endif

+9 -6

include/linux/jiffies.h

··· 102 102 } 103 103 #endif 104 104 105 - /* 106 - * These inlines deal with timer wrapping correctly. You are 107 - * strongly encouraged to use them: 108 - * 1. Because people otherwise forget 109 - * 2. Because if the timer wrap changes in future you won't have to 110 - * alter your driver code. 105 + /** 106 + * DOC: General information about time_* inlines 107 + * 108 + * These inlines deal with timer wrapping correctly. You are strongly encouraged 109 + * to use them: 110 + * 111 + * #. Because people otherwise forget 112 + * #. Because if the timer wrap changes in future you won't have to alter your 113 + * driver code. 111 114 */ 112 115 113 116 /**

+8 -8

include/linux/tick.h

··· 19 19 extern void tick_suspend_local(void); 20 20 /* Should be core only, but XEN resume magic and ARM BL switcher require it */ 21 21 extern void tick_resume_local(void); 22 - extern void tick_handover_do_timer(void); 23 22 extern void tick_cleanup_dead_cpu(int cpu); 24 23 #else /* CONFIG_GENERIC_CLOCKEVENTS */ 25 24 static inline void tick_init(void) { } 26 25 static inline void tick_suspend_local(void) { } 27 26 static inline void tick_resume_local(void) { } 28 - static inline void tick_handover_do_timer(void) { } 29 27 static inline void tick_cleanup_dead_cpu(int cpu) { } 30 28 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ 29 + 30 + #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) 31 + extern int tick_cpu_dying(unsigned int cpu); 32 + extern void tick_assert_timekeeping_handover(void); 33 + #else 34 + #define tick_cpu_dying NULL 35 + static inline void tick_assert_timekeeping_handover(void) { } 36 + #endif 31 37 32 38 #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND) 33 39 extern void tick_freeze(void); ··· 74 68 #else 75 69 static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } 76 70 #endif /* BROADCAST */ 77 - 78 - #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) 79 - extern void tick_offline_cpu(unsigned int cpu); 80 - #else 81 - static inline void tick_offline_cpu(unsigned int cpu) { } 82 - #endif 83 71 84 72 #ifdef CONFIG_GENERIC_CLOCKEVENTS 85 73 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);

+6 -10

include/linux/timer.h

··· 36 36 * workqueue locking issues. It's not meant for executing random crap 37 37 * with interrupts disabled. Abuse is monitored! 38 38 * 39 - * @TIMER_PINNED: A pinned timer will not be affected by any timer 40 - * placement heuristics (like, NOHZ) and will always expire on the CPU 41 - * on which the timer was enqueued. 42 - * 43 - * Note: Because enqueuing of timers can migrate the timer from one 44 - * CPU to another, pinned timers are not guaranteed to stay on the 45 - * initialy selected CPU. They move to the CPU on which the enqueue 46 - * function is invoked via mod_timer() or add_timer(). If the timer 47 - * should be placed on a particular CPU, then add_timer_on() has to be 48 - * used. 39 + * @TIMER_PINNED: A pinned timer will always expire on the CPU on which the 40 + * timer was enqueued. When a particular CPU is required, add_timer_on() 41 + * has to be used. Enqueue via mod_timer() and add_timer() is always done 42 + * on the local CPU. 49 43 */ 50 44 #define TIMER_CPUMASK 0x0003FFFF 51 45 #define TIMER_MIGRATING 0x00040000 ··· 159 165 #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) 160 166 161 167 extern void add_timer(struct timer_list *timer); 168 + extern void add_timer_local(struct timer_list *timer); 169 + extern void add_timer_global(struct timer_list *timer); 162 170 163 171 extern int try_to_del_timer_sync(struct timer_list *timer); 164 172 extern int timer_delete_sync(struct timer_list *timer);

+298

include/trace/events/timer_migration.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + 3 + #undef TRACE_SYSTEM 4 + #define TRACE_SYSTEM timer_migration 5 + 6 + #if !defined(_TRACE_TIMER_MIGRATION_H) || defined(TRACE_HEADER_MULTI_READ) 7 + #define _TRACE_TIMER_MIGRATION_H 8 + 9 + #include <linux/tracepoint.h> 10 + 11 + /* Group events */ 12 + TRACE_EVENT(tmigr_group_set, 13 + 14 + TP_PROTO(struct tmigr_group *group), 15 + 16 + TP_ARGS(group), 17 + 18 + TP_STRUCT__entry( 19 + __field( void *, group ) 20 + __field( unsigned int, lvl ) 21 + __field( unsigned int, numa_node ) 22 + ), 23 + 24 + TP_fast_assign( 25 + __entry->group = group; 26 + __entry->lvl = group->level; 27 + __entry->numa_node = group->numa_node; 28 + ), 29 + 30 + TP_printk("group=%p lvl=%d numa=%d", 31 + __entry->group, __entry->lvl, __entry->numa_node) 32 + ); 33 + 34 + TRACE_EVENT(tmigr_connect_child_parent, 35 + 36 + TP_PROTO(struct tmigr_group *child), 37 + 38 + TP_ARGS(child), 39 + 40 + TP_STRUCT__entry( 41 + __field( void *, child ) 42 + __field( void *, parent ) 43 + __field( unsigned int, lvl ) 44 + __field( unsigned int, numa_node ) 45 + __field( unsigned int, num_children ) 46 + __field( u32, childmask ) 47 + ), 48 + 49 + TP_fast_assign( 50 + __entry->child = child; 51 + __entry->parent = child->parent; 52 + __entry->lvl = child->parent->level; 53 + __entry->numa_node = child->parent->numa_node; 54 + __entry->num_children = child->parent->num_children; 55 + __entry->childmask = child->childmask; 56 + ), 57 + 58 + TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", 59 + __entry->child, __entry->childmask, __entry->parent, 60 + __entry->lvl, __entry->numa_node, __entry->num_children) 61 + ); 62 + 63 + TRACE_EVENT(tmigr_connect_cpu_parent, 64 + 65 + TP_PROTO(struct tmigr_cpu *tmc), 66 + 67 + TP_ARGS(tmc), 68 + 69 + TP_STRUCT__entry( 70 + __field( void *, parent ) 71 + __field( unsigned int, cpu ) 72 + __field( unsigned int, lvl ) 73 + __field( unsigned int, numa_node ) 74 + __field( unsigned int, 
num_children ) 75 + __field( u32, childmask ) 76 + ), 77 + 78 + TP_fast_assign( 79 + __entry->parent = tmc->tmgroup; 80 + __entry->cpu = tmc->cpuevt.cpu; 81 + __entry->lvl = tmc->tmgroup->level; 82 + __entry->numa_node = tmc->tmgroup->numa_node; 83 + __entry->num_children = tmc->tmgroup->num_children; 84 + __entry->childmask = tmc->childmask; 85 + ), 86 + 87 + TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", 88 + __entry->cpu, __entry->childmask, __entry->parent, 89 + __entry->lvl, __entry->numa_node, __entry->num_children) 90 + ); 91 + 92 + DECLARE_EVENT_CLASS(tmigr_group_and_cpu, 93 + 94 + TP_PROTO(struct tmigr_group *group, union tmigr_state state, u32 childmask), 95 + 96 + TP_ARGS(group, state, childmask), 97 + 98 + TP_STRUCT__entry( 99 + __field( void *, group ) 100 + __field( void *, parent ) 101 + __field( unsigned int, lvl ) 102 + __field( unsigned int, numa_node ) 103 + __field( u32, childmask ) 104 + __field( u8, active ) 105 + __field( u8, migrator ) 106 + ), 107 + 108 + TP_fast_assign( 109 + __entry->group = group; 110 + __entry->parent = group->parent; 111 + __entry->lvl = group->level; 112 + __entry->numa_node = group->numa_node; 113 + __entry->childmask = childmask; 114 + __entry->active = state.active; 115 + __entry->migrator = state.migrator; 116 + ), 117 + 118 + TP_printk("group=%p lvl=%d numa=%d active=%0x migrator=%0x " 119 + "parent=%p childmask=%0x", 120 + __entry->group, __entry->lvl, __entry->numa_node, 121 + __entry->active, __entry->migrator, 122 + __entry->parent, __entry->childmask) 123 + ); 124 + 125 + DEFINE_EVENT(tmigr_group_and_cpu, tmigr_group_set_cpu_inactive, 126 + 127 + TP_PROTO(struct tmigr_group *group, union tmigr_state state, u32 childmask), 128 + 129 + TP_ARGS(group, state, childmask) 130 + ); 131 + 132 + DEFINE_EVENT(tmigr_group_and_cpu, tmigr_group_set_cpu_active, 133 + 134 + TP_PROTO(struct tmigr_group *group, union tmigr_state state, u32 childmask), 135 + 136 + TP_ARGS(group, state, childmask) 
137 + ); 138 + 139 + /* CPU events*/ 140 + DECLARE_EVENT_CLASS(tmigr_cpugroup, 141 + 142 + TP_PROTO(struct tmigr_cpu *tmc), 143 + 144 + TP_ARGS(tmc), 145 + 146 + TP_STRUCT__entry( 147 + __field( u64, wakeup ) 148 + __field( void *, parent ) 149 + __field( unsigned int, cpu ) 150 + 151 + ), 152 + 153 + TP_fast_assign( 154 + __entry->wakeup = tmc->wakeup; 155 + __entry->parent = tmc->tmgroup; 156 + __entry->cpu = tmc->cpuevt.cpu; 157 + ), 158 + 159 + TP_printk("cpu=%d parent=%p wakeup=%llu", __entry->cpu, __entry->parent, __entry->wakeup) 160 + ); 161 + 162 + DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_new_timer, 163 + 164 + TP_PROTO(struct tmigr_cpu *tmc), 165 + 166 + TP_ARGS(tmc) 167 + ); 168 + 169 + DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active, 170 + 171 + TP_PROTO(struct tmigr_cpu *tmc), 172 + 173 + TP_ARGS(tmc) 174 + ); 175 + 176 + DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online, 177 + 178 + TP_PROTO(struct tmigr_cpu *tmc), 179 + 180 + TP_ARGS(tmc) 181 + ); 182 + 183 + DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline, 184 + 185 + TP_PROTO(struct tmigr_cpu *tmc), 186 + 187 + TP_ARGS(tmc) 188 + ); 189 + 190 + DEFINE_EVENT(tmigr_cpugroup, tmigr_handle_remote_cpu, 191 + 192 + TP_PROTO(struct tmigr_cpu *tmc), 193 + 194 + TP_ARGS(tmc) 195 + ); 196 + 197 + DECLARE_EVENT_CLASS(tmigr_idle, 198 + 199 + TP_PROTO(struct tmigr_cpu *tmc, u64 nextevt), 200 + 201 + TP_ARGS(tmc, nextevt), 202 + 203 + TP_STRUCT__entry( 204 + __field( u64, nextevt) 205 + __field( u64, wakeup) 206 + __field( void *, parent) 207 + __field( unsigned int, cpu) 208 + ), 209 + 210 + TP_fast_assign( 211 + __entry->nextevt = nextevt; 212 + __entry->wakeup = tmc->wakeup; 213 + __entry->parent = tmc->tmgroup; 214 + __entry->cpu = tmc->cpuevt.cpu; 215 + ), 216 + 217 + TP_printk("cpu=%d parent=%p nextevt=%llu wakeup=%llu", 218 + __entry->cpu, __entry->parent, __entry->nextevt, __entry->wakeup) 219 + ); 220 + 221 + DEFINE_EVENT(tmigr_idle, tmigr_cpu_idle, 222 + 223 + TP_PROTO(struct tmigr_cpu *tmc, u64 nextevt), 224 + 
225 + TP_ARGS(tmc, nextevt) 226 + ); 227 + 228 + DEFINE_EVENT(tmigr_idle, tmigr_cpu_new_timer_idle, 229 + 230 + TP_PROTO(struct tmigr_cpu *tmc, u64 nextevt), 231 + 232 + TP_ARGS(tmc, nextevt) 233 + ); 234 + 235 + TRACE_EVENT(tmigr_update_events, 236 + 237 + TP_PROTO(struct tmigr_group *child, struct tmigr_group *group, 238 + union tmigr_state childstate, union tmigr_state groupstate, 239 + u64 nextevt), 240 + 241 + TP_ARGS(child, group, childstate, groupstate, nextevt), 242 + 243 + TP_STRUCT__entry( 244 + __field( void *, child ) 245 + __field( void *, group ) 246 + __field( u64, nextevt ) 247 + __field( u64, group_next_expiry ) 248 + __field( u64, child_evt_expiry ) 249 + __field( unsigned int, group_lvl ) 250 + __field( unsigned int, child_evtcpu ) 251 + __field( u8, child_active ) 252 + __field( u8, group_active ) 253 + ), 254 + 255 + TP_fast_assign( 256 + __entry->child = child; 257 + __entry->group = group; 258 + __entry->nextevt = nextevt; 259 + __entry->group_next_expiry = group->next_expiry; 260 + __entry->child_evt_expiry = child ? child->groupevt.nextevt.expires : 0; 261 + __entry->group_lvl = group->level; 262 + __entry->child_evtcpu = child ? 
child->groupevt.cpu : 0; 263 + __entry->child_active = childstate.active; 264 + __entry->group_active = groupstate.active; 265 + ), 266 + 267 + TP_printk("child=%p group=%p group_lvl=%d child_active=%0x group_active=%0x " 268 + "nextevt=%llu next_expiry=%llu child_evt_expiry=%llu child_evtcpu=%d", 269 + __entry->child, __entry->group, __entry->group_lvl, __entry->child_active, 270 + __entry->group_active, 271 + __entry->nextevt, __entry->group_next_expiry, __entry->child_evt_expiry, 272 + __entry->child_evtcpu) 273 + ); 274 + 275 + TRACE_EVENT(tmigr_handle_remote, 276 + 277 + TP_PROTO(struct tmigr_group *group), 278 + 279 + TP_ARGS(group), 280 + 281 + TP_STRUCT__entry( 282 + __field( void * , group ) 283 + __field( unsigned int , lvl ) 284 + ), 285 + 286 + TP_fast_assign( 287 + __entry->group = group; 288 + __entry->lvl = group->level; 289 + ), 290 + 291 + TP_printk("group=%p lvl=%d", 292 + __entry->group, __entry->lvl) 293 + ); 294 + 295 + #endif /* _TRACE_TIMER_MIGRATION_H */ 296 + 297 + /* This part must be outside protection */ 298 + #include <trace/define_trace.h>

+14

include/vdso/datapage.h

··· 19 19 #include <vdso/time32.h> 20 20 #include <vdso/time64.h> 21 21 22 + #ifdef CONFIG_ARM64 23 + #include <asm/page-def.h> 24 + #else 25 + #include <asm/page.h> 26 + #endif 27 + 22 28 #ifdef CONFIG_ARCH_HAS_VDSO_DATA 23 29 #include <asm/vdso/data.h> 24 30 #else ··· 126 120 */ 127 121 extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); 128 122 extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); 123 + 124 + /** 125 + * union vdso_data_store - Generic vDSO data page 126 + */ 127 + union vdso_data_store { 128 + struct vdso_data data[CS_BASES]; 129 + u8 page[PAGE_SIZE]; 130 + }; 129 131 130 132 /* 131 133 * The generic vDSO implementation requires that gettimeofday.h

+4 -4

include/vdso/helpers.h

··· 30 30 static __always_inline void vdso_write_begin(struct vdso_data *vd) 31 31 { 32 32 /* 33 - * WRITE_ONCE it is required otherwise the compiler can validly tear 33 + * WRITE_ONCE() is required otherwise the compiler can validly tear 34 34 * updates to vd[x].seq and it is possible that the value seen by the 35 - * reader it is inconsistent. 35 + * reader is inconsistent. 36 36 */ 37 37 WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); 38 38 WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); ··· 43 43 { 44 44 smp_wmb(); 45 45 /* 46 - * WRITE_ONCE it is required otherwise the compiler can validly tear 46 + * WRITE_ONCE() is required otherwise the compiler can validly tear 47 47 * updates to vd[x].seq and it is possible that the value seen by the 48 - * reader it is inconsistent. 48 + * reader is inconsistent. 49 49 */ 50 50 WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); 51 51 WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1);

+6 -5

kernel/cpu.c

··· 1323 1323 */ 1324 1324 cpuhp_invoke_callback_range_nofail(false, cpu, st, target); 1325 1325 1326 - /* Give up timekeeping duties */ 1327 - tick_handover_do_timer(); 1328 - /* Remove CPU from timer broadcasting */ 1329 - tick_offline_cpu(cpu); 1330 1326 /* Park the stopper thread */ 1331 1327 stop_machine_park(cpu); 1332 1328 return 0; ··· 1398 1402 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 1399 1403 1400 1404 BUG_ON(st->state != CPUHP_AP_OFFLINE); 1405 + tick_assert_timekeeping_handover(); 1401 1406 rcutree_report_cpu_dead(); 1402 1407 st->state = CPUHP_AP_IDLE_DEAD; 1403 1408 /* ··· 2201 2204 .startup.single = NULL, 2202 2205 .teardown.single = hrtimers_cpu_dying, 2203 2206 }, 2204 - 2207 + [CPUHP_AP_TICK_DYING] = { 2208 + .name = "tick:dying", 2209 + .startup.single = NULL, 2210 + .teardown.single = tick_cpu_dying, 2211 + }, 2205 2212 /* Entry state on starting. Interrupts enabled from here on. Transient 2206 2213 * state for synchronization */ 2207 2214 [CPUHP_AP_ONLINE] = {

-1

kernel/sched/idle.c

··· 291 291 local_irq_disable(); 292 292 293 293 if (cpu_is_offline(cpu)) { 294 - tick_nohz_idle_stop_tick(); 295 294 cpuhp_report_idle_dead(); 296 295 arch_cpu_idle_dead(); 297 296 }

+3

kernel/time/Makefile

··· 17 17 obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 18 18 obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o 19 19 obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o 20 + ifeq ($(CONFIG_SMP),y) 21 + obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o 22 + endif 20 23 obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o 21 24 obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 22 25 obj-$(CONFIG_TEST_UDELAY) += test_udelay.o

+1 -1

kernel/time/clockevents.c

··· 659 659 #endif 660 660 661 661 #ifdef CONFIG_SYSFS 662 - static struct bus_type clockevents_subsys = { 662 + static const struct bus_type clockevents_subsys = { 663 663 .name = "clockevents", 664 664 .dev_name = "clockevent", 665 665 };

+7 -6

kernel/time/clocksource-wdtest.c

··· 104 104 static int wdtest_func(void *arg) 105 105 { 106 106 unsigned long j1, j2; 107 + int i, max_retries; 107 108 char *s; 108 - int i; 109 109 110 110 schedule_timeout_uninterruptible(holdoff * HZ); 111 111 ··· 139 139 WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); 140 140 141 141 /* Verify tsc-like stability with various numbers of errors injected. */ 142 - for (i = 0; i <= max_cswd_read_retries + 1; i++) { 143 - if (i <= 1 && i < max_cswd_read_retries) 142 + max_retries = clocksource_get_max_watchdog_retry(); 143 + for (i = 0; i <= max_retries + 1; i++) { 144 + if (i <= 1 && i < max_retries) 144 145 s = ""; 145 - else if (i <= max_cswd_read_retries) 146 + else if (i <= max_retries) 146 147 s = ", expect message"; 147 148 else 148 149 s = ", expect clock skew"; 149 - pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s); 150 + pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); 150 151 WRITE_ONCE(wdtest_ktime_read_ndelays, i); 151 152 schedule_timeout_uninterruptible(2 * HZ); 152 153 WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); 153 - WARN_ON_ONCE((i <= max_cswd_read_retries) != 154 + WARN_ON_ONCE((i <= max_retries) != 154 155 !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); 155 156 wdtest_ktime_clocksource_reset(); 156 157 }

+5 -7

kernel/time/clocksource.c

··· 210 210 spin_unlock_irqrestore(&watchdog_lock, flags); 211 211 } 212 212 213 - ulong max_cswd_read_retries = 2; 214 - module_param(max_cswd_read_retries, ulong, 0644); 215 - EXPORT_SYMBOL_GPL(max_cswd_read_retries); 216 213 static int verify_n_cpus = 8; 217 214 module_param(verify_n_cpus, int, 0644); 218 215 ··· 221 224 222 225 static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) 223 226 { 224 - unsigned int nretries; 227 + unsigned int nretries, max_retries; 225 228 u64 wd_end, wd_end2, wd_delta; 226 229 int64_t wd_delay, wd_seq_delay; 227 230 228 - for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { 231 + max_retries = clocksource_get_max_watchdog_retry(); 232 + for (nretries = 0; nretries <= max_retries; nretries++) { 229 233 local_irq_disable(); 230 234 *wdnow = watchdog->read(watchdog); 231 235 *csnow = cs->read(cs); ··· 238 240 wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, 239 241 watchdog->shift); 240 242 if (wd_delay <= WATCHDOG_MAX_SKEW) { 241 - if (nretries > 1 || nretries >= max_cswd_read_retries) { 243 + if (nretries > 1 || nretries >= max_retries) { 242 244 pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", 243 245 smp_processor_id(), watchdog->name, nretries); 244 246 } ··· 1466 1468 }; 1467 1469 ATTRIBUTE_GROUPS(clocksource); 1468 1470 1469 - static struct bus_type clocksource_subsys = { 1471 + static const struct bus_type clocksource_subsys = { 1470 1472 .name = "clocksource", 1471 1473 .dev_name = "clocksource", 1472 1474 };

+13 -12

kernel/time/hrtimer.c

··· 38 38 #include <linux/sched/deadline.h> 39 39 #include <linux/sched/nohz.h> 40 40 #include <linux/sched/debug.h> 41 + #include <linux/sched/isolation.h> 41 42 #include <linux/timer.h> 42 43 #include <linux/freezer.h> 43 44 #include <linux/compat.h> ··· 747 746 base->hres_active = 1; 748 747 hrtimer_resolution = HIGH_RES_NSEC; 749 748 750 - tick_setup_sched_timer(); 749 + tick_setup_sched_timer(true); 751 750 /* "Retrigger" the interrupt to get things going */ 752 751 retrigger_next_event(NULL); 753 752 } ··· 1022 1021 } 1023 1022 1024 1023 /** 1025 - * hrtimer_forward - forward the timer expiry 1024 + * hrtimer_forward() - forward the timer expiry 1026 1025 * @timer: hrtimer to forward 1027 1026 * @now: forward past this time 1028 1027 * @interval: the interval to forward 1029 1028 * 1030 1029 * Forward the timer expiry so it will expire in the future. 1031 - * Returns the number of overruns. 1032 1030 * 1033 - * Can be safely called from the callback function of @timer. If 1034 - * called from other contexts @timer must neither be enqueued nor 1035 - * running the callback and the caller needs to take care of 1036 - * serialization. 1031 + * .. note:: 1032 + * This only updates the timer expiry value and does not requeue the timer. 1037 1033 * 1038 - * Note: This only updates the timer expiry value and does not requeue 1039 - * the timer. 1034 + * There is also a variant of the function hrtimer_forward_now(). 1035 + * 1036 + * Context: Can be safely called from the callback function of @timer. If called 1037 + * from other contexts @timer must neither be enqueued nor running the 1038 + * callback and the caller needs to take care of serialization. 1039 + * 1040 + * Return: The number of overruns is returned. 
1040 1041 */ 1041 1042 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) 1042 1043 { ··· 2226 2223 2227 2224 int hrtimers_cpu_dying(unsigned int dying_cpu) 2228 2225 { 2226 + int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); 2229 2227 struct hrtimer_cpu_base *old_base, *new_base; 2230 - int i, ncpu = cpumask_first(cpu_active_mask); 2231 - 2232 - tick_cancel_sched_timer(dying_cpu); 2233 2228 2234 2229 old_base = this_cpu_ptr(&hrtimer_bases); 2235 2230 new_base = &per_cpu(hrtimer_bases, ncpu);

+24 -11

kernel/time/tick-common.c

··· 111 111 112 112 tick_periodic(cpu); 113 113 114 - #if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) 115 114 /* 116 115 * The cpu might have transitioned to HIGHRES or NOHZ mode via 117 116 * update_process_times() -> run_local_timers() -> 118 117 * hrtimer_run_queues(). 119 118 */ 120 - if (dev->event_handler != tick_handle_periodic) 119 + if (IS_ENABLED(CONFIG_TICK_ONESHOT) && dev->event_handler != tick_handle_periodic) 121 120 return; 122 - #endif 123 121 124 122 if (!clockevent_state_oneshot(dev)) 125 123 return; ··· 396 398 EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); 397 399 398 400 #ifdef CONFIG_HOTPLUG_CPU 399 - /* 400 - * Transfer the do_timer job away from a dying cpu. 401 - * 402 - * Called with interrupts disabled. No locking required. If 403 - * tick_do_timer_cpu is owned by this cpu, nothing can change it. 404 - */ 405 - void tick_handover_do_timer(void) 401 + void tick_assert_timekeeping_handover(void) 406 402 { 407 - if (tick_do_timer_cpu == smp_processor_id()) 403 + WARN_ON_ONCE(tick_do_timer_cpu == smp_processor_id()); 404 + } 405 + /* 406 + * Stop the tick and transfer the timekeeping job away from a dying cpu. 407 + */ 408 + int tick_cpu_dying(unsigned int dying_cpu) 409 + { 410 + /* 411 + * If the current CPU is the timekeeper, it's the only one that 412 + * can safely hand over its duty. Also all online CPUs are in 413 + * stop machine, guaranteed not to be idle, therefore it's safe 414 + * to pick any online successor. 415 + */ 416 + if (tick_do_timer_cpu == dying_cpu) 408 417 tick_do_timer_cpu = cpumask_first(cpu_online_mask); 418 + 419 + /* Make sure the CPU won't try to retake the timekeeping duty */ 420 + tick_sched_timer_dying(dying_cpu); 421 + 422 + /* Remove CPU from timer broadcasting */ 423 + tick_offline_cpu(dying_cpu); 424 + 425 + return 0; 409 426 } 410 427 411 428 /*

+16

kernel/time/tick-internal.h

··· 8 8 #include "timekeeping.h" 9 9 #include "tick-sched.h" 10 10 11 + struct timer_events { 12 + u64 local; 13 + u64 global; 14 + }; 15 + 11 16 #ifdef CONFIG_GENERIC_CLOCKEVENTS 12 17 13 18 # define TICK_DO_TIMER_NONE -1 ··· 142 137 #endif /* !(BROADCAST && ONESHOT) */ 143 138 144 139 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) 140 + extern void tick_offline_cpu(unsigned int cpu); 145 141 extern void tick_broadcast_offline(unsigned int cpu); 146 142 #else 143 + static inline void tick_offline_cpu(unsigned int cpu) { } 147 144 static inline void tick_broadcast_offline(unsigned int cpu) { } 148 145 #endif 149 146 ··· 159 152 #ifdef CONFIG_NO_HZ_COMMON 160 153 extern unsigned long tick_nohz_active; 161 154 extern void timers_update_nohz(void); 155 + extern u64 get_jiffies_update(unsigned long *basej); 162 156 # ifdef CONFIG_SMP 163 157 extern struct static_key_false timers_migration_enabled; 158 + extern void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, 159 + struct timer_events *tevt, 160 + unsigned int cpu); 161 + extern void timer_lock_remote_bases(unsigned int cpu); 162 + extern void timer_unlock_remote_bases(unsigned int cpu); 163 + extern bool timer_base_is_idle(void); 164 + extern void timer_expire_remote(unsigned int cpu); 164 165 # endif 165 166 #else /* CONFIG_NO_HZ_COMMON */ 166 167 static inline void timers_update_nohz(void) { } ··· 178 163 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 179 164 180 165 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); 166 + u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle); 181 167 void timer_clear_idle(void); 182 168 183 169 #define CLOCK_SET_WALL \

+195 -166

kernel/time/tick-sched.c

··· 43 43 return &per_cpu(tick_cpu_sched, cpu); 44 44 } 45 45 46 - #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) 47 46 /* 48 47 * The time when the last jiffy update happened. Write access must hold 49 48 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a ··· 180 181 return period; 181 182 } 182 183 184 + static inline int tick_sched_flag_test(struct tick_sched *ts, 185 + unsigned long flag) 186 + { 187 + return !!(ts->flags & flag); 188 + } 189 + 190 + static inline void tick_sched_flag_set(struct tick_sched *ts, 191 + unsigned long flag) 192 + { 193 + lockdep_assert_irqs_disabled(); 194 + ts->flags |= flag; 195 + } 196 + 197 + static inline void tick_sched_flag_clear(struct tick_sched *ts, 198 + unsigned long flag) 199 + { 200 + lockdep_assert_irqs_disabled(); 201 + ts->flags &= ~flag; 202 + } 203 + 183 204 #define MAX_STALLED_JIFFIES 5 184 205 185 206 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) 186 207 { 187 208 int cpu = smp_processor_id(); 188 209 189 - #ifdef CONFIG_NO_HZ_COMMON 190 210 /* 191 211 * Check if the do_timer duty was dropped. We don't care about 192 212 * concurrency: This happens only when the CPU in charge went ··· 216 198 * If nohz_full is enabled, this should not happen because the 217 199 * 'tick_do_timer_cpu' CPU never relinquishes. 
218 200 */ 219 - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { 201 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && 202 + unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { 220 203 #ifdef CONFIG_NO_HZ_FULL 221 204 WARN_ON_ONCE(tick_nohz_full_running); 222 205 #endif 223 206 tick_do_timer_cpu = cpu; 224 207 } 225 - #endif 226 208 227 209 /* Check if jiffies need an update */ 228 210 if (tick_do_timer_cpu == cpu) ··· 243 225 } 244 226 } 245 227 246 - if (ts->inidle) 228 + if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) 247 229 ts->got_idle_tick = 1; 248 230 } 249 231 250 232 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 251 233 { 252 - #ifdef CONFIG_NO_HZ_COMMON 253 234 /* 254 235 * When we are idle and the tick is stopped, we have to touch 255 236 * the watchdog as we might not schedule for a really long ··· 257 240 * idle" jiffy stamp so the idle accounting adjustment we do 258 241 * when we go busy again does not account too many ticks. 259 242 */ 260 - if (ts->tick_stopped) { 243 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && 244 + tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 261 245 touch_softlockup_watchdog_sched(); 262 246 if (is_idle_task(current)) 263 247 ts->idle_jiffies++; ··· 269 251 */ 270 252 ts->next_tick = 0; 271 253 } 272 - #endif 254 + 273 255 update_process_times(user_mode(regs)); 274 256 profile_tick(CPU_PROFILING); 275 257 } 276 - #endif 258 + 259 + /* 260 + * We rearm the timer until we get disabled by the idle code. 261 + * Called with interrupts disabled. 
262 + */ 263 + static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) 264 + { 265 + struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); 266 + struct pt_regs *regs = get_irq_regs(); 267 + ktime_t now = ktime_get(); 268 + 269 + tick_sched_do_timer(ts, now); 270 + 271 + /* 272 + * Do not call when we are not in IRQ context and have 273 + * no valid 'regs' pointer 274 + */ 275 + if (regs) 276 + tick_sched_handle(ts, regs); 277 + else 278 + ts->next_tick = 0; 279 + 280 + /* 281 + * In dynticks mode, tick reprogram is deferred: 282 + * - to the idle task if in dynticks-idle 283 + * - to IRQ exit if in full-dynticks. 284 + */ 285 + if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) 286 + return HRTIMER_NORESTART; 287 + 288 + hrtimer_forward(timer, now, TICK_NSEC); 289 + 290 + return HRTIMER_RESTART; 291 + } 292 + 293 + static void tick_sched_timer_cancel(struct tick_sched *ts) 294 + { 295 + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) 296 + hrtimer_cancel(&ts->sched_timer); 297 + else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) 298 + tick_program_event(KTIME_MAX, 1); 299 + } 277 300 278 301 #ifdef CONFIG_NO_HZ_FULL 279 302 cpumask_var_t tick_nohz_full_mask; ··· 588 529 589 530 ts = this_cpu_ptr(&tick_cpu_sched); 590 531 591 - if (ts->tick_stopped) { 532 + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 592 533 if (atomic_read(&current->tick_dep_mask) || 593 534 atomic_read(&current->signal->tick_dep_mask)) 594 535 tick_nohz_full_kick(); ··· 660 601 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", 661 602 cpumask_pr_args(tick_nohz_full_mask)); 662 603 } 663 - #endif 604 + #endif /* #ifdef CONFIG_NO_HZ_FULL */ 664 605 665 606 /* 666 607 * NOHZ - aka dynamic tick functionality ··· 685 626 { 686 627 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 687 628 688 - return ts->tick_stopped; 629 + return tick_sched_flag_test(ts, TS_FLAG_STOPPED); 689 630 } 690 631 691 632 bool tick_nohz_tick_stopped_cpu(int cpu) 692 633 { 693 634 
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); 694 635 695 - return ts->tick_stopped; 636 + return tick_sched_flag_test(ts, TS_FLAG_STOPPED); 696 637 } 697 638 698 639 /** ··· 722 663 { 723 664 ktime_t delta; 724 665 725 - if (WARN_ON_ONCE(!ts->idle_active)) 666 + if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) 726 667 return; 727 668 728 669 delta = ktime_sub(now, ts->idle_entrytime); ··· 734 675 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 735 676 736 677 ts->idle_entrytime = now; 737 - ts->idle_active = 0; 678 + tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); 738 679 write_seqcount_end(&ts->idle_sleeptime_seq); 739 680 740 681 sched_clock_idle_wakeup_event(); ··· 744 685 { 745 686 write_seqcount_begin(&ts->idle_sleeptime_seq); 746 687 ts->idle_entrytime = ktime_get(); 747 - ts->idle_active = 1; 688 + tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); 748 689 write_seqcount_end(&ts->idle_sleeptime_seq); 749 690 750 691 sched_clock_idle_sleep_event(); ··· 766 707 do { 767 708 seq = read_seqcount_begin(&ts->idle_sleeptime_seq); 768 709 769 - if (ts->idle_active && compute_delta) { 710 + if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { 770 711 ktime_t delta = ktime_sub(now, ts->idle_entrytime); 771 712 772 713 idle = ktime_add(*sleeptime, delta); ··· 839 780 /* Forward the time to expire in the future */ 840 781 hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); 841 782 842 - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 783 + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { 843 784 hrtimer_start_expires(&ts->sched_timer, 844 785 HRTIMER_MODE_ABS_PINNED_HARD); 845 786 } else { ··· 858 799 return local_softirq_pending() & BIT(TIMER_SOFTIRQ); 859 800 } 860 801 861 - static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) 802 + /* 803 + * Read jiffies and the time when jiffies were updated last 804 + */ 805 + u64 get_jiffies_update(unsigned long *basej) 862 806 { 863 - u64 basemono, next_tick, delta, 
expires; 864 807 unsigned long basejiff; 865 808 unsigned int seq; 809 + u64 basemono; 866 810 867 - /* Read jiffies and the time when jiffies were updated last */ 868 811 do { 869 812 seq = read_seqcount_begin(&jiffies_seq); 870 813 basemono = last_jiffies_update; 871 814 basejiff = jiffies; 872 815 } while (read_seqcount_retry(&jiffies_seq, seq)); 816 + *basej = basejiff; 817 + return basemono; 818 + } 819 + 820 + /** 821 + * tick_nohz_next_event() - return the clock monotonic based next event 822 + * @ts: pointer to tick_sched struct 823 + * @cpu: CPU number 824 + * 825 + * Return: 826 + * *%0 - When the next event is a maximum of TICK_NSEC in the future 827 + * and the tick is not stopped yet 828 + * *%next_event - Next event based on clock monotonic 829 + */ 830 + static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) 831 + { 832 + u64 basemono, next_tick, delta, expires; 833 + unsigned long basejiff; 834 + 835 + basemono = get_jiffies_update(&basejiff); 873 836 ts->last_jiffies = basejiff; 874 837 ts->timer_expires_base = basemono; 875 838 ··· 931 850 delta = next_tick - basemono; 932 851 if (delta <= (u64)TICK_NSEC) { 933 852 /* 934 - * Tell the timer code that the base is not idle, i.e. undo 935 - * the effect of get_next_timer_interrupt(): 936 - */ 937 - timer_clear_idle(); 938 - /* 939 853 * We've not stopped the tick yet, and there's a timer in the 940 854 * next period, so no point in stopping it either, bail. 
941 855 */ 942 - if (!ts->tick_stopped) { 856 + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 943 857 ts->timer_expires = 0; 944 858 goto out; 945 859 } ··· 947 871 */ 948 872 delta = timekeeping_max_deferment(); 949 873 if (cpu != tick_do_timer_cpu && 950 - (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) 874 + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || 875 + !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) 951 876 delta = KTIME_MAX; 952 877 953 878 /* Calculate the next expiry time */ ··· 966 889 static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) 967 890 { 968 891 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 892 + unsigned long basejiff = ts->last_jiffies; 969 893 u64 basemono = ts->timer_expires_base; 970 - u64 expires = ts->timer_expires; 894 + bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); 895 + u64 expires; 971 896 972 897 /* Make sure we won't be trying to stop it twice in a row. */ 973 898 ts->timer_expires_base = 0; 899 + 900 + /* 901 + * Now the tick should be stopped definitely - so the timer base needs 902 + * to be marked idle as well to not miss a newly queued timer. 903 + */ 904 + expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); 905 + if (expires > ts->timer_expires) { 906 + /* 907 + * This path could only happen when the first timer was removed 908 + * between calculating the possible sleep length and now (when 909 + * high resolution mode is not active, timer could also be a 910 + * hrtimer). 911 + * 912 + * We have to stick to the original calculated expiry value to 913 + * not stop the tick for too long with a shallow C-state (which 914 + * was programmed by cpuidle because of an early next expiration 915 + * value). 916 + */ 917 + expires = ts->timer_expires; 918 + } 919 + 920 + /* If the timer base is not idle, retain the not yet stopped tick. 
*/ 921 + if (!timer_idle) 922 + return; 974 923 975 924 /* 976 925 * If this CPU is the one which updates jiffies, then give up ··· 1008 905 */ 1009 906 if (cpu == tick_do_timer_cpu) { 1010 907 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 1011 - ts->do_timer_last = 1; 908 + tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); 1012 909 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 1013 - ts->do_timer_last = 0; 910 + tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); 1014 911 } 1015 912 1016 913 /* Skip reprogram of event if it's not changed */ 1017 - if (ts->tick_stopped && (expires == ts->next_tick)) { 914 + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { 1018 915 /* Sanity check: make sure clockevent is actually programmed */ 1019 916 if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) 1020 917 return; ··· 1032 929 * call we save the current tick time, so we can restart the 1033 930 * scheduler tick in tick_nohz_restart_sched_tick(). 1034 931 */ 1035 - if (!ts->tick_stopped) { 932 + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 1036 933 calc_load_nohz_start(); 1037 934 quiet_vmstat(); 1038 935 1039 936 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 1040 - ts->tick_stopped = 1; 937 + tick_sched_flag_set(ts, TS_FLAG_STOPPED); 1041 938 trace_tick_stop(1, TICK_DEP_MASK_NONE); 1042 939 } 1043 940 ··· 1048 945 * the tick timer. 
1049 946 */ 1050 947 if (unlikely(expires == KTIME_MAX)) { 1051 - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 1052 - hrtimer_cancel(&ts->sched_timer); 1053 - else 1054 - tick_program_event(KTIME_MAX, 1); 948 + tick_sched_timer_cancel(ts); 1055 949 return; 1056 950 } 1057 951 1058 - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 952 + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { 1059 953 hrtimer_start(&ts->sched_timer, expires, 1060 954 HRTIMER_MODE_ABS_PINNED_HARD); 1061 955 } else { ··· 1067 967 } 1068 968 1069 969 #ifdef CONFIG_NO_HZ_FULL 1070 - static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) 970 + static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) 1071 971 { 1072 972 if (tick_nohz_next_event(ts, cpu)) 1073 973 tick_nohz_stop_tick(ts, cpu); ··· 1091 991 touch_softlockup_watchdog_sched(); 1092 992 1093 993 /* Cancel the scheduled timer and restore the tick: */ 1094 - ts->tick_stopped = 0; 994 + tick_sched_flag_clear(ts, TS_FLAG_STOPPED); 1095 995 tick_nohz_restart(ts, now); 1096 996 } 1097 997 ··· 1102 1002 int cpu = smp_processor_id(); 1103 1003 1104 1004 if (can_stop_full_tick(cpu, ts)) 1105 - tick_nohz_stop_sched_tick(ts, cpu); 1106 - else if (ts->tick_stopped) 1005 + tick_nohz_full_stop_tick(ts, cpu); 1006 + else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) 1107 1007 tick_nohz_restart_sched_tick(ts, now); 1108 1008 #endif 1109 1009 } ··· 1113 1013 if (!tick_nohz_full_cpu(smp_processor_id())) 1114 1014 return; 1115 1015 1116 - if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 1016 + if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) 1117 1017 return; 1118 1018 1119 1019 __tick_nohz_full_update_tick(ts, ktime_get()); ··· 1160 1060 1161 1061 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 1162 1062 { 1163 - /* 1164 - * If this CPU is offline and it is the one which updates 1165 - * jiffies, then give up the assignment and let it be taken by 1166 - * the CPU which runs the tick timer next. 
If we don't drop 1167 - * this here, the jiffies might be stale and do_timer() never 1168 - * gets invoked. 1169 - */ 1170 - if (unlikely(!cpu_online(cpu))) { 1171 - if (cpu == tick_do_timer_cpu) 1172 - tick_do_timer_cpu = TICK_DO_TIMER_NONE; 1173 - /* 1174 - * Make sure the CPU doesn't get fooled by obsolete tick 1175 - * deadline if it comes back online later. 1176 - */ 1177 - ts->next_tick = 0; 1178 - return false; 1179 - } 1063 + WARN_ON_ONCE(cpu_is_offline(cpu)); 1180 1064 1181 - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 1065 + if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) 1182 1066 return false; 1183 1067 1184 1068 if (need_resched()) ··· 1212 1128 ts->idle_calls++; 1213 1129 1214 1130 if (expires > 0LL) { 1215 - int was_stopped = ts->tick_stopped; 1131 + int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); 1216 1132 1217 1133 tick_nohz_stop_tick(ts, cpu); 1218 1134 1219 1135 ts->idle_sleeps++; 1220 1136 ts->idle_expires = expires; 1221 1137 1222 - if (!was_stopped && ts->tick_stopped) { 1138 + if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 1223 1139 ts->idle_jiffies = ts->last_jiffies; 1224 1140 nohz_balance_enter_idle(cpu); 1225 1141 } ··· 1231 1147 void tick_nohz_idle_retain_tick(void) 1232 1148 { 1233 1149 tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); 1234 - /* 1235 - * Undo the effect of get_next_timer_interrupt() called from 1236 - * tick_nohz_next_event(). 
1237 - */ 1238 - timer_clear_idle(); 1239 1150 } 1240 1151 1241 1152 /** ··· 1250 1171 1251 1172 WARN_ON_ONCE(ts->timer_expires_base); 1252 1173 1253 - ts->inidle = 1; 1174 + tick_sched_flag_set(ts, TS_FLAG_INIDLE); 1254 1175 tick_nohz_start_idle(ts); 1255 1176 1256 1177 local_irq_enable(); ··· 1279 1200 { 1280 1201 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1281 1202 1282 - if (ts->inidle) 1203 + if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) 1283 1204 tick_nohz_start_idle(ts); 1284 1205 else 1285 1206 tick_nohz_full_update_tick(ts); ··· 1333 1254 ktime_t now = ts->idle_entrytime; 1334 1255 ktime_t next_event; 1335 1256 1336 - WARN_ON_ONCE(!ts->inidle); 1257 + WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); 1337 1258 1338 1259 *delta_next = ktime_sub(dev->next_event, now); 1339 1260 ··· 1405 1326 { 1406 1327 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1407 1328 1408 - if (ts->tick_stopped) { 1329 + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { 1409 1330 ktime_t now = ktime_get(); 1410 1331 tick_nohz_restart_sched_tick(ts, now); 1411 1332 tick_nohz_account_idle_time(ts, now); ··· 1446 1367 1447 1368 local_irq_disable(); 1448 1369 1449 - WARN_ON_ONCE(!ts->inidle); 1370 + WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); 1450 1371 WARN_ON_ONCE(ts->timer_expires_base); 1451 1372 1452 - ts->inidle = 0; 1453 - idle_active = ts->idle_active; 1454 - tick_stopped = ts->tick_stopped; 1373 + tick_sched_flag_clear(ts, TS_FLAG_INIDLE); 1374 + idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); 1375 + tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); 1455 1376 1456 1377 if (idle_active || tick_stopped) 1457 1378 now = ktime_get(); ··· 1470 1391 * at the clockevent level. hrtimer can't be used instead, because its 1471 1392 * infrastructure actually relies on the tick itself as a backend in 1472 1393 * low-resolution mode (see hrtimer_run_queues()). 
1473 - * 1474 - * This low-resolution handler still makes use of some hrtimer APIs meanwhile 1475 - * for convenience with expiration calculation and forwarding. 1476 1394 */ 1477 1395 static void tick_nohz_lowres_handler(struct clock_event_device *dev) 1478 1396 { 1479 1397 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1480 - struct pt_regs *regs = get_irq_regs(); 1481 - ktime_t now = ktime_get(); 1482 1398 1483 1399 dev->next_event = KTIME_MAX; 1484 1400 1485 - tick_sched_do_timer(ts, now); 1486 - tick_sched_handle(ts, regs); 1487 - 1488 - /* 1489 - * In dynticks mode, tick reprogram is deferred: 1490 - * - to the idle task if in dynticks-idle 1491 - * - to IRQ exit if in full-dynticks. 1492 - */ 1493 - if (likely(!ts->tick_stopped)) { 1494 - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); 1401 + if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) 1495 1402 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 1496 - } 1497 - 1498 1403 } 1499 1404 1500 - static inline void tick_nohz_activate(struct tick_sched *ts, int mode) 1405 + static inline void tick_nohz_activate(struct tick_sched *ts) 1501 1406 { 1502 1407 if (!tick_nohz_enabled) 1503 1408 return; 1504 - ts->nohz_mode = mode; 1409 + tick_sched_flag_set(ts, TS_FLAG_NOHZ); 1505 1410 /* One update is enough */ 1506 1411 if (!test_and_set_bit(0, &tick_nohz_active)) 1507 1412 timers_update_nohz(); ··· 1496 1433 */ 1497 1434 static void tick_nohz_switch_to_nohz(void) 1498 1435 { 1499 - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1500 - ktime_t next; 1501 - 1502 1436 if (!tick_nohz_enabled) 1503 1437 return; 1504 1438 ··· 1504 1444 1505 1445 /* 1506 1446 * Recycle the hrtimer in 'ts', so we can share the 1507 - * hrtimer_forward_now() function with the highres code. 1447 + * highres code. 
1508 1448 */ 1509 - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); 1510 - /* Get the next period */ 1511 - next = tick_init_jiffy_update(); 1512 - 1513 - hrtimer_set_expires(&ts->sched_timer, next); 1514 - hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); 1515 - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 1516 - tick_nohz_activate(ts, NOHZ_MODE_LOWRES); 1449 + tick_setup_sched_timer(false); 1517 1450 } 1518 1451 1519 1452 static inline void tick_nohz_irq_enter(void) ··· 1514 1461 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1515 1462 ktime_t now; 1516 1463 1517 - if (!ts->idle_active && !ts->tick_stopped) 1464 + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) 1518 1465 return; 1519 1466 now = ktime_get(); 1520 - if (ts->idle_active) 1467 + if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) 1521 1468 tick_nohz_stop_idle(ts, now); 1522 1469 /* 1523 1470 * If all CPUs are idle we may need to update a stale jiffies value. ··· 1526 1473 * rare case (typically stop machine). So we must make sure we have a 1527 1474 * last resort. 1528 1475 */ 1529 - if (ts->tick_stopped) 1476 + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) 1530 1477 tick_nohz_update_jiffies(now); 1531 1478 } 1532 1479 ··· 1534 1481 1535 1482 static inline void tick_nohz_switch_to_nohz(void) { } 1536 1483 static inline void tick_nohz_irq_enter(void) { } 1537 - static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } 1484 + static inline void tick_nohz_activate(struct tick_sched *ts) { } 1538 1485 1539 1486 #endif /* CONFIG_NO_HZ_COMMON */ 1540 1487 ··· 1545 1492 { 1546 1493 tick_check_oneshot_broadcast_this_cpu(); 1547 1494 tick_nohz_irq_enter(); 1548 - } 1549 - 1550 - /* 1551 - * High resolution timer specific code 1552 - */ 1553 - #ifdef CONFIG_HIGH_RES_TIMERS 1554 - /* 1555 - * We rearm the timer until we get disabled by the idle code. 1556 - * Called with interrupts disabled. 
1557 - */ 1558 - static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer) 1559 - { 1560 - struct tick_sched *ts = 1561 - container_of(timer, struct tick_sched, sched_timer); 1562 - struct pt_regs *regs = get_irq_regs(); 1563 - ktime_t now = ktime_get(); 1564 - 1565 - tick_sched_do_timer(ts, now); 1566 - 1567 - /* 1568 - * Do not call when we are not in IRQ context and have 1569 - * no valid 'regs' pointer 1570 - */ 1571 - if (regs) 1572 - tick_sched_handle(ts, regs); 1573 - else 1574 - ts->next_tick = 0; 1575 - 1576 - /* 1577 - * In dynticks mode, tick reprogram is deferred: 1578 - * - to the idle task if in dynticks-idle 1579 - * - to IRQ exit if in full-dynticks. 1580 - */ 1581 - if (unlikely(ts->tick_stopped)) 1582 - return HRTIMER_NORESTART; 1583 - 1584 - hrtimer_forward(timer, now, TICK_NSEC); 1585 - 1586 - return HRTIMER_RESTART; 1587 1495 } 1588 1496 1589 1497 static int sched_skew_tick; ··· 1559 1545 1560 1546 /** 1561 1547 * tick_setup_sched_timer - setup the tick emulation timer 1548 + * @hrtimer: whether to use the hrtimer or not 1562 1549 */ 1563 - void tick_setup_sched_timer(void) 1550 + void tick_setup_sched_timer(bool hrtimer) 1564 1551 { 1565 1552 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1566 - ktime_t now = ktime_get(); 1567 1553 1568 1554 /* Emulate tick processing via per-CPU hrtimers: */ 1569 1555 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); 1570 - ts->sched_timer.function = tick_nohz_highres_handler; 1556 + 1557 + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { 1558 + tick_sched_flag_set(ts, TS_FLAG_HIGHRES); 1559 + ts->sched_timer.function = tick_nohz_handler; 1560 + } 1571 1561 1572 1562 /* Get the next period (per-CPU) */ 1573 1563 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); ··· 1584 1566 hrtimer_add_expires_ns(&ts->sched_timer, offset); 1585 1567 } 1586 1568 1587 - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); 1588 - hrtimer_start_expires(&ts->sched_timer, 
HRTIMER_MODE_ABS_PINNED_HARD); 1589 - tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); 1569 + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); 1570 + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) 1571 + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); 1572 + else 1573 + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 1574 + tick_nohz_activate(ts); 1590 1575 } 1591 - #endif /* HIGH_RES_TIMERS */ 1592 1576 1593 - #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS 1594 - void tick_cancel_sched_timer(int cpu) 1577 + /* 1578 + * Shut down the tick and make sure the CPU won't try to retake the timekeeping 1579 + * duty before disabling IRQs in idle for the last time. 1580 + */ 1581 + void tick_sched_timer_dying(int cpu) 1595 1582 { 1583 + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 1596 1584 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1585 + struct clock_event_device *dev = td->evtdev; 1597 1586 ktime_t idle_sleeptime, iowait_sleeptime; 1598 1587 unsigned long idle_calls, idle_sleeps; 1599 1588 1600 - # ifdef CONFIG_HIGH_RES_TIMERS 1601 - if (ts->sched_timer.base) 1602 - hrtimer_cancel(&ts->sched_timer); 1603 - # endif 1589 + /* This must happen before hrtimers are migrated! */ 1590 + tick_sched_timer_cancel(ts); 1591 + 1592 + /* 1593 + * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED, 1594 + * make sure not to call low-res tick handler. 
1595 + */ 1596 + if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) 1597 + dev->event_handler = clockevents_handle_noop; 1604 1598 1605 1599 idle_sleeptime = ts->idle_sleeptime; 1606 1600 iowait_sleeptime = ts->iowait_sleeptime; ··· 1624 1594 ts->idle_calls = idle_calls; 1625 1595 ts->idle_sleeps = idle_sleeps; 1626 1596 } 1627 - #endif 1628 1597 1629 1598 /* 1630 1599 * Async notification about clocksource changes ··· 1661 1632 if (!test_and_clear_bit(0, &ts->check_clocks)) 1662 1633 return 0; 1663 1634 1664 - if (ts->nohz_mode != NOHZ_MODE_INACTIVE) 1635 + if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) 1665 1636 return 0; 1666 1637 1667 1638 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())

+22 -20

kernel/time/tick-sched.h

··· 14 14 enum tick_device_mode mode; 15 15 }; 16 16 17 - enum tick_nohz_mode { 18 - NOHZ_MODE_INACTIVE, 19 - NOHZ_MODE_LOWRES, 20 - NOHZ_MODE_HIGHRES, 21 - }; 17 + /* The CPU is in the tick idle mode */ 18 + #define TS_FLAG_INIDLE BIT(0) 19 + /* The idle tick has been stopped */ 20 + #define TS_FLAG_STOPPED BIT(1) 21 + /* 22 + * Indicator that the CPU is actively in the tick idle mode; 23 + * it is reset during irq handling phases. 24 + */ 25 + #define TS_FLAG_IDLE_ACTIVE BIT(2) 26 + /* CPU was the last one doing do_timer before going idle */ 27 + #define TS_FLAG_DO_TIMER_LAST BIT(3) 28 + /* NO_HZ is enabled */ 29 + #define TS_FLAG_NOHZ BIT(4) 30 + /* High resolution tick mode */ 31 + #define TS_FLAG_HIGHRES BIT(5) 22 32 23 33 /** 24 34 * struct tick_sched - sched tick emulation and no idle tick control/stats 25 35 * 26 - * @inidle: Indicator that the CPU is in the tick idle mode 27 - * @tick_stopped: Indicator that the idle tick has been stopped 28 - * @idle_active: Indicator that the CPU is actively in the tick idle mode; 29 - * it is reset during irq handling phases. 
30 - * @do_timer_last: CPU was the last one doing do_timer before going idle 36 + * @flags: State flags gathering the TS_FLAG_* features 31 37 * @got_idle_tick: Tick timer function has run with @inidle set 32 38 * @stalled_jiffies: Number of stalled jiffies detected across ticks 33 39 * @last_tick_jiffies: Value of jiffies seen on last tick ··· 63 57 */ 64 58 struct tick_sched { 65 59 /* Common flags */ 66 - unsigned int inidle : 1; 67 - unsigned int tick_stopped : 1; 68 - unsigned int idle_active : 1; 69 - unsigned int do_timer_last : 1; 70 - unsigned int got_idle_tick : 1; 60 + unsigned long flags; 71 61 72 62 /* Tick handling: jiffies stall check */ 73 63 unsigned int stalled_jiffies; ··· 75 73 ktime_t next_tick; 76 74 unsigned long idle_jiffies; 77 75 ktime_t idle_waketime; 76 + unsigned int got_idle_tick; 78 77 79 78 /* Idle entry */ 80 79 seqcount_t idle_sleeptime_seq; 81 80 ktime_t idle_entrytime; 82 81 83 82 /* Tick stop */ 84 - enum tick_nohz_mode nohz_mode; 85 83 unsigned long last_jiffies; 86 84 u64 timer_expires_base; 87 85 u64 timer_expires; ··· 104 102 105 103 extern struct tick_sched *tick_get_tick_sched(int cpu); 106 104 107 - extern void tick_setup_sched_timer(void); 108 - #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS 109 - extern void tick_cancel_sched_timer(int cpu); 105 + extern void tick_setup_sched_timer(bool hrtimer); 106 + #if defined CONFIG_TICK_ONESHOT 107 + extern void tick_sched_timer_dying(int cpu); 110 108 #else 111 - static inline void tick_cancel_sched_timer(int cpu) { } 109 + static inline void tick_sched_timer_dying(int cpu) { } 112 110 #endif 113 111 114 112 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST

+12 -12

kernel/time/timekeeping.c

··· 1180 1180 } 1181 1181 1182 1182 /* 1183 - * cycle_between - true if test occurs chronologically between before and after 1183 + * timestamp_in_interval - true if ts is chronologically in [start, end] 1184 + * 1185 + * True if ts occurs chronologically at or after start, and before or at end. 1184 1186 */ 1185 - static bool cycle_between(u64 before, u64 test, u64 after) 1187 + static bool timestamp_in_interval(u64 start, u64 end, u64 ts) 1186 1188 { 1187 - if (test > before && test < after) 1189 + if (ts >= start && ts <= end) 1188 1190 return true; 1189 - if (test < before && before > after) 1191 + if (start > end && (ts >= start || ts <= end)) 1190 1192 return true; 1191 1193 return false; 1192 1194 } ··· 1249 1247 */ 1250 1248 now = tk_clock_read(&tk->tkr_mono); 1251 1249 interval_start = tk->tkr_mono.cycle_last; 1252 - if (!cycle_between(interval_start, cycles, now)) { 1250 + if (!timestamp_in_interval(interval_start, now, cycles)) { 1253 1251 clock_was_set_seq = tk->clock_was_set_seq; 1254 1252 cs_was_changed_seq = tk->cs_was_changed_seq; 1255 1253 cycles = interval_start; ··· 1262 1260 tk_core.timekeeper.offs_real); 1263 1261 base_raw = tk->tkr_raw.base; 1264 1262 1265 - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, 1266 - system_counterval.cycles); 1267 - nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, 1268 - system_counterval.cycles); 1263 + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); 1264 + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); 1269 1265 } while (read_seqcount_retry(&tk_core.seq, seq)); 1270 1266 1271 1267 xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); ··· 1278 1278 bool discontinuity; 1279 1279 1280 1280 /* 1281 - * Check that the counter value occurs after the provided 1281 + * Check that the counter value is not before the provided 1282 1282 * history reference and that the history doesn't cross a 1283 1283 * clocksource change 1284 1284 */ 1285 1285 if (!history_begin || 1286 - 
!cycle_between(history_begin->cycles, 1287 - system_counterval.cycles, cycles) || 1286 + !timestamp_in_interval(history_begin->cycles, 1287 + cycles, system_counterval.cycles) || 1288 1288 history_begin->cs_was_changed_seq != cs_was_changed_seq) 1289 1289 return -EINVAL; 1290 1290 partial_history_cycles = cycles - system_counterval.cycles;

+490 -101

kernel/time/timer.c

··· 53 53 #include <asm/io.h> 54 54 55 55 #include "tick-internal.h" 56 + #include "timer_migration.h" 56 57 57 58 #define CREATE_TRACE_POINTS 58 59 #include <trace/events/timer.h> ··· 188 187 #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) 189 188 190 189 #ifdef CONFIG_NO_HZ_COMMON 191 - # define NR_BASES 2 192 - # define BASE_STD 0 193 - # define BASE_DEF 1 190 + /* 191 + * If multiple bases need to be locked, use the base ordering for lock 192 + * nesting, i.e. lowest number first. 193 + */ 194 + # define NR_BASES 3 195 + # define BASE_LOCAL 0 196 + # define BASE_GLOBAL 1 197 + # define BASE_DEF 2 194 198 #else 195 199 # define NR_BASES 1 196 - # define BASE_STD 0 200 + # define BASE_LOCAL 0 201 + # define BASE_GLOBAL 0 197 202 # define BASE_DEF 0 198 203 #endif 199 204 205 + /** 206 + * struct timer_base - Per CPU timer base (number of base depends on config) 207 + * @lock: Lock protecting the timer_base 208 + * @running_timer: When expiring timers, the lock is dropped. To make 209 + * sure not to race against deleting/modifying a 210 + * currently running timer, the pointer is set to the 211 + * timer, which expires at the moment. If no timer is 212 + * running, the pointer is NULL. 213 + * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around 214 + * timer expiry callback execution and when trying to 215 + * delete a running timer and it wasn't successful in 216 + * the first glance. It prevents priority inversion 217 + * when callback was preempted on a remote CPU and a 218 + * caller tries to delete the running timer. It also 219 + * prevents a livelock, when the task which tries to 220 + * delete a timer preempted the softirq thread which 221 + * is running the timer callback function. 222 + * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter 223 + * waiting for the end of the timer callback function 224 + * execution. 
225 + * @clk: clock of the timer base; is updated before enqueue 226 + * of a timer; during expiry, it is 1 offset ahead of 227 + * jiffies to avoid endless requeuing to current 228 + * jiffies 229 + * @next_expiry: expiry value of the first timer; it is updated when 230 + * finding the next timer and during enqueue; the 231 + * value is not valid, when next_expiry_recalc is set 232 + * @cpu: Number of CPU the timer base belongs to 233 + * @next_expiry_recalc: States, whether a recalculation of next_expiry is 234 + * required. Value is set true, when a timer was 235 + * deleted. 236 + * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ 237 + * code. This state is only used in standard 238 + * base. Deferrable timers, which are enqueued remotely 239 + * never wake up an idle CPU. So no matter of supporting it 240 + * for this base. 241 + * @timers_pending: Is set, when a timer is pending in the base. It is only 242 + * reliable when next_expiry_recalc is not set. 243 + * @pending_map: bitmap of the timer wheel; each bit reflects a 244 + * bucket of the wheel. When a bit is set, at least a 245 + * single timer is enqueued in the related bucket. 246 + * @vectors: Array of lists; Each array member reflects a bucket 247 + * of the timer wheel. The list contains all timers 248 + * which are enqueued into a specific bucket. 249 + */ 200 250 struct timer_base { 201 251 raw_spinlock_t lock; 202 252 struct timer_list *running_timer; ··· 635 583 636 584 /* 637 585 * We might have to IPI the remote CPU if the base is idle and the 638 - * timer is not deferrable. If the other CPU is on the way to idle 639 - * then it can't set base->is_idle as we hold the base lock: 586 + * timer is pinned. If it is a non pinned timer, it is only queued 587 + * on the remote CPU, when timer was running during queueing. Then 588 + * everything is handled by remote CPU anyway. 
If the other CPU is 589 + * on the way to idle then it can't set base->is_idle as we hold 590 + * the base lock: 640 591 */ 641 - if (base->is_idle) 592 + if (base->is_idle) { 593 + WARN_ON_ONCE(!(timer->flags & TIMER_PINNED)); 642 594 wake_up_nohz_cpu(base->cpu); 595 + } 643 596 } 644 597 645 598 /* ··· 956 899 957 900 static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) 958 901 { 959 - struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); 902 + int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; 903 + struct timer_base *base; 904 + 905 + base = per_cpu_ptr(&timer_bases[index], cpu); 960 906 961 907 /* 962 908 * If the timer is deferrable and NO_HZ_COMMON is set then we need ··· 972 912 973 913 static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) 974 914 { 975 - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 915 + int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; 916 + struct timer_base *base; 917 + 918 + base = this_cpu_ptr(&timer_bases[index]); 976 919 977 920 /* 978 921 * If the timer is deferrable and NO_HZ_COMMON is set then we need ··· 989 926 static inline struct timer_base *get_timer_base(u32 tflags) 990 927 { 991 928 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); 992 - } 993 - 994 - static inline struct timer_base * 995 - get_target_base(struct timer_base *base, unsigned tflags) 996 - { 997 - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 998 - if (static_branch_likely(&timers_migration_enabled) && 999 - !(tflags & TIMER_PINNED)) 1000 - return get_timer_cpu_base(tflags, get_nohz_timer_target()); 1001 - #endif 1002 - return get_timer_this_cpu_base(tflags); 1003 929 } 1004 930 1005 931 static inline void __forward_timer_base(struct timer_base *base, ··· 1145 1093 if (!ret && (options & MOD_TIMER_PENDING_ONLY)) 1146 1094 goto out_unlock; 1147 1095 1148 - new_base = get_target_base(base, timer->flags); 1096 + new_base = 
get_timer_this_cpu_base(timer->flags); 1149 1097 1150 1098 if (base != new_base) { 1151 1099 /* ··· 1298 1246 EXPORT_SYMBOL(add_timer); 1299 1247 1300 1248 /** 1249 + * add_timer_local() - Start a timer on the local CPU 1250 + * @timer: The timer to be started 1251 + * 1252 + * Same as add_timer() except that the timer flag TIMER_PINNED is set. 1253 + * 1254 + * See add_timer() for further details. 1255 + */ 1256 + void add_timer_local(struct timer_list *timer) 1257 + { 1258 + if (WARN_ON_ONCE(timer_pending(timer))) 1259 + return; 1260 + timer->flags |= TIMER_PINNED; 1261 + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); 1262 + } 1263 + EXPORT_SYMBOL(add_timer_local); 1264 + 1265 + /** 1266 + * add_timer_global() - Start a timer without TIMER_PINNED flag set 1267 + * @timer: The timer to be started 1268 + * 1269 + * Same as add_timer() except that the timer flag TIMER_PINNED is unset. 1270 + * 1271 + * See add_timer() for further details. 1272 + */ 1273 + void add_timer_global(struct timer_list *timer) 1274 + { 1275 + if (WARN_ON_ONCE(timer_pending(timer))) 1276 + return; 1277 + timer->flags &= ~TIMER_PINNED; 1278 + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); 1279 + } 1280 + EXPORT_SYMBOL(add_timer_global); 1281 + 1282 + /** 1301 1283 * add_timer_on - Start a timer on a particular CPU 1302 1284 * @timer: The timer to be started 1303 1285 * @cpu: The CPU to start it on 1304 1286 * 1305 - * Same as add_timer() except that it starts the timer on the given CPU. 1287 + * Same as add_timer() except that it starts the timer on the given CPU and 1288 + * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in 1289 + * the next round, add_timer_global() should be used instead as it unsets 1290 + * the TIMER_PINNED flag. 1306 1291 * 1307 1292 * See add_timer() for further details. 
1308 1293 */ ··· 1352 1263 1353 1264 if (WARN_ON_ONCE(timer_pending(timer))) 1354 1265 return; 1266 + 1267 + /* Make sure timer flags have TIMER_PINNED flag set */ 1268 + timer->flags |= TIMER_PINNED; 1355 1269 1356 1270 new_base = get_timer_cpu_base(timer->flags, cpu); 1357 1271 ··· 2003 1911 return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; 2004 1912 } 2005 1913 2006 - /** 2007 - * get_next_timer_interrupt - return the time (clock mono) of the next timer 2008 - * @basej: base time jiffies 2009 - * @basem: base time clock monotonic 2010 - * 2011 - * Returns the tick aligned clock monotonic time of the next pending 2012 - * timer or KTIME_MAX if no timer is pending. 2013 - */ 2014 - u64 get_next_timer_interrupt(unsigned long basej, u64 basem) 1914 + static unsigned long next_timer_interrupt(struct timer_base *base, 1915 + unsigned long basej) 2015 1916 { 2016 - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 2017 - unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA; 2018 - u64 expires = KTIME_MAX; 2019 - bool was_idle; 2020 - 2021 - /* 2022 - * Pretend that there is no timer pending if the cpu is offline. 2023 - * Possible pending timers will be migrated later to an active cpu. 2024 - */ 2025 - if (cpu_is_offline(smp_processor_id())) 2026 - return expires; 2027 - 2028 - raw_spin_lock(&base->lock); 2029 1917 if (base->next_expiry_recalc) 2030 1918 next_expiry_recalc(base); 1919 + 1920 + /* 1921 + * Move next_expiry for the empty base into the future to prevent an 1922 + * unnecessary raise of the timer softirq when the next_expiry value 1923 + * will be reached even if there is no timer pending. 1924 + * 1925 + * This update is also required to make timer_base::next_expiry values 1926 + * easy comparable to find out which base holds the first pending timer. 
1927 + */ 1928 + if (!base->timers_pending) 1929 + base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; 1930 + 1931 + return base->next_expiry; 1932 + } 1933 + 1934 + static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, 1935 + struct timer_base *base_local, 1936 + struct timer_base *base_global, 1937 + struct timer_events *tevt) 1938 + { 1939 + unsigned long nextevt, nextevt_local, nextevt_global; 1940 + bool local_first; 1941 + 1942 + nextevt_local = next_timer_interrupt(base_local, basej); 1943 + nextevt_global = next_timer_interrupt(base_global, basej); 1944 + 1945 + local_first = time_before_eq(nextevt_local, nextevt_global); 1946 + 1947 + nextevt = local_first ? nextevt_local : nextevt_global; 1948 + 1949 + /* 1950 + * If the @nextevt is at max. one tick away, use @nextevt and store 1951 + * it in the local expiry value. The next global event is irrelevant in 1952 + * this case and can be left as KTIME_MAX. 1953 + */ 1954 + if (time_before_eq(nextevt, basej + 1)) { 1955 + /* If we missed a tick already, force 0 delta */ 1956 + if (time_before(nextevt, basej)) 1957 + nextevt = basej; 1958 + tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; 1959 + 1960 + /* 1961 + * This is required for the remote check only but it doesn't 1962 + * hurt, when it is done for both call sites: 1963 + * 1964 + * * The remote callers will only take care of the global timers 1965 + * as local timers will be handled by CPU itself. When not 1966 + * updating tevt->global with the already missed first global 1967 + * timer, it is possible that it will be missed completely. 1968 + * 1969 + * * The local callers will ignore the tevt->global anyway, when 1970 + * nextevt is max. one tick away. 1971 + */ 1972 + if (!local_first) 1973 + tevt->global = tevt->local; 1974 + return nextevt; 1975 + } 1976 + 1977 + /* 1978 + * Update tevt.* values: 1979 + * 1980 + * If the local queue expires first, then the global event can be 1981 + * ignored. 
If the global queue is empty, nothing to do either. 1982 + */ 1983 + if (!local_first && base_global->timers_pending) 1984 + tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; 1985 + 1986 + if (base_local->timers_pending) 1987 + tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; 1988 + 1989 + return nextevt; 1990 + } 1991 + 1992 + # ifdef CONFIG_SMP 1993 + /** 1994 + * fetch_next_timer_interrupt_remote() - Store next timers into @tevt 1995 + * @basej: base time jiffies 1996 + * @basem: base time clock monotonic 1997 + * @tevt: Pointer to the storage for the expiry values 1998 + * @cpu: Remote CPU 1999 + * 2000 + * Stores the next pending local and global timer expiry values in the 2001 + * struct pointed to by @tevt. If a queue is empty the corresponding 2002 + * field is set to KTIME_MAX. If local event expires before global 2003 + * event, global event is set to KTIME_MAX as well. 2004 + * 2005 + * Caller needs to make sure timer base locks are held (use 2006 + * timer_lock_remote_bases() for this purpose). 2007 + */ 2008 + void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, 2009 + struct timer_events *tevt, 2010 + unsigned int cpu) 2011 + { 2012 + struct timer_base *base_local, *base_global; 2013 + 2014 + /* Preset local / global events */ 2015 + tevt->local = tevt->global = KTIME_MAX; 2016 + 2017 + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2018 + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2019 + 2020 + lockdep_assert_held(&base_local->lock); 2021 + lockdep_assert_held(&base_global->lock); 2022 + 2023 + fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); 2024 + } 2025 + 2026 + /** 2027 + * timer_unlock_remote_bases - unlock timer bases of cpu 2028 + * @cpu: Remote CPU 2029 + * 2030 + * Unlocks the remote timer bases. 
2031 + */ 2032 + void timer_unlock_remote_bases(unsigned int cpu) 2033 + __releases(timer_bases[BASE_LOCAL]->lock) 2034 + __releases(timer_bases[BASE_GLOBAL]->lock) 2035 + { 2036 + struct timer_base *base_local, *base_global; 2037 + 2038 + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2039 + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2040 + 2041 + raw_spin_unlock(&base_global->lock); 2042 + raw_spin_unlock(&base_local->lock); 2043 + } 2044 + 2045 + /** 2046 + * timer_lock_remote_bases - lock timer bases of cpu 2047 + * @cpu: Remote CPU 2048 + * 2049 + * Locks the remote timer bases. 2050 + */ 2051 + void timer_lock_remote_bases(unsigned int cpu) 2052 + __acquires(timer_bases[BASE_LOCAL]->lock) 2053 + __acquires(timer_bases[BASE_GLOBAL]->lock) 2054 + { 2055 + struct timer_base *base_local, *base_global; 2056 + 2057 + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2058 + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2059 + 2060 + lockdep_assert_irqs_disabled(); 2061 + 2062 + raw_spin_lock(&base_local->lock); 2063 + raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); 2064 + } 2065 + 2066 + /** 2067 + * timer_base_is_idle() - Return whether timer base is set idle 2068 + * 2069 + * Returns value of local timer base is_idle value. 2070 + */ 2071 + bool timer_base_is_idle(void) 2072 + { 2073 + return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); 2074 + } 2075 + 2076 + static void __run_timer_base(struct timer_base *base); 2077 + 2078 + /** 2079 + * timer_expire_remote() - expire global timers of cpu 2080 + * @cpu: Remote CPU 2081 + * 2082 + * Expire timers of global base of remote CPU. 
2083 + */ 2084 + void timer_expire_remote(unsigned int cpu) 2085 + { 2086 + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2087 + 2088 + __run_timer_base(base); 2089 + } 2090 + 2091 + static void timer_use_tmigr(unsigned long basej, u64 basem, 2092 + unsigned long *nextevt, bool *tick_stop_path, 2093 + bool timer_base_idle, struct timer_events *tevt) 2094 + { 2095 + u64 next_tmigr; 2096 + 2097 + if (timer_base_idle) 2098 + next_tmigr = tmigr_cpu_new_timer(tevt->global); 2099 + else if (tick_stop_path) 2100 + next_tmigr = tmigr_cpu_deactivate(tevt->global); 2101 + else 2102 + next_tmigr = tmigr_quick_check(tevt->global); 2103 + 2104 + /* 2105 + * If the CPU is the last going idle in timer migration hierarchy, make 2106 + * sure the CPU will wake up in time to handle remote timers. 2107 + * next_tmigr == KTIME_MAX if other CPUs are still active. 2108 + */ 2109 + if (next_tmigr < tevt->local) { 2110 + u64 tmp; 2111 + 2112 + /* If we missed a tick already, force 0 delta */ 2113 + if (next_tmigr < basem) 2114 + next_tmigr = basem; 2115 + 2116 + tmp = div_u64(next_tmigr - basem, TICK_NSEC); 2117 + 2118 + *nextevt = basej + (unsigned long)tmp; 2119 + tevt->local = next_tmigr; 2120 + } 2121 + } 2122 + # else 2123 + static void timer_use_tmigr(unsigned long basej, u64 basem, 2124 + unsigned long *nextevt, bool *tick_stop_path, 2125 + bool timer_base_idle, struct timer_events *tevt) 2126 + { 2127 + /* 2128 + * Make sure first event is written into tevt->local to not miss a 2129 + * timer on !SMP systems. 
2130 + */ 2131 + tevt->local = min_t(u64, tevt->local, tevt->global); 2132 + } 2133 + # endif /* CONFIG_SMP */ 2134 + 2135 + static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, 2136 + bool *idle) 2137 + { 2138 + struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; 2139 + struct timer_base *base_local, *base_global; 2140 + unsigned long nextevt; 2141 + bool idle_is_possible; 2142 + 2143 + /* 2144 + * When the CPU is offline, the tick is cancelled and nothing is supposed 2145 + * to try to stop it. 2146 + */ 2147 + if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { 2148 + if (idle) 2149 + *idle = true; 2150 + return tevt.local; 2151 + } 2152 + 2153 + base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); 2154 + base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); 2155 + 2156 + raw_spin_lock(&base_local->lock); 2157 + raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); 2158 + 2159 + nextevt = fetch_next_timer_interrupt(basej, basem, base_local, 2160 + base_global, &tevt); 2161 + 2162 + /* 2163 + * If the next event is only one jiffie ahead there is no need to call 2164 + * timer migration hierarchy related functions. The value for the next 2165 + * global timer in @tevt struct equals then KTIME_MAX. This is also 2166 + * true, when the timer base is idle. 2167 + * 2168 + * The proper timer migration hierarchy function depends on the callsite 2169 + * and whether timer base is idle or not. @nextevt will be updated when 2170 + * this CPU needs to handle the first timer migration hierarchy 2171 + * event. See timer_use_tmigr() for detailed information. 2172 + */ 2173 + idle_is_possible = time_after(nextevt, basej + 1); 2174 + if (idle_is_possible) 2175 + timer_use_tmigr(basej, basem, &nextevt, idle, 2176 + base_local->is_idle, &tevt); 2031 2177 2032 2178 /* 2033 2179 * We have a fresh next event. Check whether we can forward the 2034 2180 * base. 
2035 2181 */ 2036 - __forward_timer_base(base, basej); 2037 - 2038 - if (base->timers_pending) { 2039 - nextevt = base->next_expiry; 2040 - 2041 - /* If we missed a tick already, force 0 delta */ 2042 - if (time_before(nextevt, basej)) 2043 - nextevt = basej; 2044 - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; 2045 - } else { 2046 - /* 2047 - * Move next_expiry for the empty base into the future to 2048 - * prevent a unnecessary raise of the timer softirq when the 2049 - * next_expiry value will be reached even if there is no timer 2050 - * pending. 2051 - */ 2052 - base->next_expiry = nextevt; 2053 - } 2182 + __forward_timer_base(base_local, basej); 2183 + __forward_timer_base(base_global, basej); 2054 2184 2055 2185 /* 2056 - * Base is idle if the next event is more than a tick away. 2057 - * 2058 - * If the base is marked idle then any timer add operation must forward 2059 - * the base clk itself to keep granularity small. This idle logic is 2060 - * only maintained for the BASE_STD base, deferrable timers may still 2061 - * see large granularity skew (by design). 2186 + * Set base->is_idle only when caller is timer_base_try_to_set_idle() 2062 2187 */ 2063 - was_idle = base->is_idle; 2064 - base->is_idle = time_after(nextevt, basej + 1); 2065 - if (was_idle != base->is_idle) 2066 - trace_timer_base_idle(base->is_idle, base->cpu); 2188 + if (idle) { 2189 + /* 2190 + * Bases are idle if the next event is more than a tick 2191 + * away. Caution: @nextevt could have changed by enqueueing a 2192 + * global timer into timer migration hierarchy. Therefore a new 2193 + * check is required here. 2194 + * 2195 + * If the base is marked idle then any timer add operation must 2196 + * forward the base clk itself to keep granularity small. This 2197 + * idle logic is only maintained for the BASE_LOCAL and 2198 + * BASE_GLOBAL base, deferrable timers may still see large 2199 + * granularity skew (by design). 
2200 + */ 2201 + if (!base_local->is_idle && time_after(nextevt, basej + 1)) { 2202 + base_local->is_idle = true; 2203 + trace_timer_base_idle(true, base_local->cpu); 2204 + } 2205 + *idle = base_local->is_idle; 2067 2206 2068 - raw_spin_unlock(&base->lock); 2207 + /* 2208 + * When timer base is not set idle, undo the effect of 2209 + * tmigr_cpu_deactivate() to prevent inconsistent states - active 2210 + * timer base but inactive timer migration hierarchy. 2211 + * 2212 + * When timer base was already marked idle, nothing will be 2213 + * changed here. 2214 + */ 2215 + if (!base_local->is_idle && idle_is_possible) 2216 + tmigr_cpu_activate(); 2217 + } 2069 2218 2070 - return cmp_next_hrtimer_event(basem, expires); 2219 + raw_spin_unlock(&base_global->lock); 2220 + raw_spin_unlock(&base_local->lock); 2221 + 2222 + return cmp_next_hrtimer_event(basem, tevt.local); 2223 + } 2224 + 2225 + /** 2226 + * get_next_timer_interrupt() - return the time (clock mono) of the next timer 2227 + * @basej: base time jiffies 2228 + * @basem: base time clock monotonic 2229 + * 2230 + * Returns the tick aligned clock monotonic time of the next pending timer or 2231 + * KTIME_MAX if no timer is pending. If timer of global base was queued into 2232 + * timer migration hierarchy, first global timer is not taken into account. If 2233 + * it was the last CPU of timer migration hierarchy going idle, first global 2234 + * event is taken into account.
2235 + */ 2236 + u64 get_next_timer_interrupt(unsigned long basej, u64 basem) 2237 + { 2238 + return __get_next_timer_interrupt(basej, basem, NULL); 2239 + } 2240 + 2241 + /** 2242 + * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases 2243 + * @basej: base time jiffies 2244 + * @basem: base time clock monotonic 2245 + * @idle: pointer to store the value of timer_base->is_idle on return; 2246 + * *idle contains the information whether tick was already stopped 2247 + * 2248 + * Returns the tick aligned clock monotonic time of the next pending timer or 2249 + * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is 2250 + * returned as well. 2251 + */ 2252 + u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) 2253 + { 2254 + if (*idle) 2255 + return KTIME_MAX; 2256 + 2257 + return __get_next_timer_interrupt(basej, basem, idle); 2071 2258 } 2072 2259 2073 2260 /** ··· 2356 1985 */ 2357 1986 void timer_clear_idle(void) 2358 1987 { 2359 - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 2360 - 2361 1988 /* 2362 - * We do this unlocked. The worst outcome is a remote enqueue sending 2363 - * a pointless IPI, but taking the lock would just make the window for 2364 - * sending the IPI a few instructions smaller for the cost of taking 2365 - * the lock in the exit from idle path. 1989 + * We do this unlocked. The worst outcome is a remote pinned timer 1990 + * enqueue sending a pointless IPI, but taking the lock would just 1991 + * make the window for sending the IPI a few instructions smaller 1992 + * for the cost of taking the lock in the exit from idle 1993 + * path. Required for BASE_LOCAL only. 
2366 1994 */ 2367 - if (base->is_idle) { 2368 - base->is_idle = false; 2369 - trace_timer_base_idle(false, smp_processor_id()); 2370 - } 1995 + __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); 1996 + trace_timer_base_idle(false, smp_processor_id()); 1997 + 1998 + /* Activate without holding the timer_base->lock */ 1999 + tmigr_cpu_activate(); 2371 2000 } 2372 2001 #endif 2373 2002 ··· 2380 2009 struct hlist_head heads[LVL_DEPTH]; 2381 2010 int levels; 2382 2011 2383 - if (time_before(jiffies, base->next_expiry)) 2384 - return; 2012 + lockdep_assert_held(&base->lock); 2385 2013 2386 - timer_base_lock_expiry(base); 2387 - raw_spin_lock_irq(&base->lock); 2014 + if (base->running_timer) 2015 + return; 2388 2016 2389 2017 while (time_after_eq(jiffies, base->clk) && 2390 2018 time_after_eq(jiffies, base->next_expiry)) { ··· 2407 2037 while (levels--) 2408 2038 expire_timers(base, heads + levels); 2409 2039 } 2040 + } 2041 + 2042 + static void __run_timer_base(struct timer_base *base) 2043 + { 2044 + if (time_before(jiffies, base->next_expiry)) 2045 + return; 2046 + 2047 + timer_base_lock_expiry(base); 2048 + raw_spin_lock_irq(&base->lock); 2049 + __run_timers(base); 2410 2050 raw_spin_unlock_irq(&base->lock); 2411 2051 timer_base_unlock_expiry(base); 2052 + } 2053 + 2054 + static void run_timer_base(int index) 2055 + { 2056 + struct timer_base *base = this_cpu_ptr(&timer_bases[index]); 2057 + 2058 + __run_timer_base(base); 2412 2059 } 2413 2060 2414 2061 /* ··· 2433 2046 */ 2434 2047 static __latent_entropy void run_timer_softirq(struct softirq_action *h) 2435 2048 { 2436 - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 2049 + run_timer_base(BASE_LOCAL); 2050 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { 2051 + run_timer_base(BASE_GLOBAL); 2052 + run_timer_base(BASE_DEF); 2437 2053 2438 - __run_timers(base); 2439 - if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) 2440 - __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); 2054 + if (is_timers_nohz_active()) 2055 
+ tmigr_handle_remote(); 2056 + } 2441 2057 } 2442 2058 2443 2059 /* ··· 2448 2058 */ 2449 2059 static void run_local_timers(void) 2450 2060 { 2451 - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 2061 + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); 2452 2062 2453 2063 hrtimer_run_queues(); 2454 - /* Raise the softirq only if required. */ 2455 - if (time_before(jiffies, base->next_expiry)) { 2456 - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) 2064 + 2065 + for (int i = 0; i < NR_BASES; i++, base++) { 2066 + /* Raise the softirq only if required. */ 2067 + if (time_after_eq(jiffies, base->next_expiry) || 2068 + (i == BASE_DEF && tmigr_requires_handle_remote())) { 2069 + raise_softirq(TIMER_SOFTIRQ); 2457 2070 return; 2458 - /* CPU is awake, so check the deferrable base. */ 2459 - base++; 2460 - if (time_before(jiffies, base->next_expiry)) 2461 - return; 2071 + } 2462 2072 } 2463 - raise_softirq(TIMER_SOFTIRQ); 2464 2073 } 2465 2074 2466 2075 /*

+7 -3

kernel/time/timer_list.c

··· 147 147 # define P_ns(x) \ 148 148 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ 149 149 (unsigned long long)(ktime_to_ns(ts->x))) 150 + # define P_flag(x, f) \ 151 + SEQ_printf(m, " .%-15s: %d\n", #x, !!(ts->flags & (f))) 152 + 150 153 { 151 154 struct tick_sched *ts = tick_get_tick_sched(cpu); 152 - P(nohz_mode); 155 + P_flag(nohz, TS_FLAG_NOHZ); 156 + P_flag(highres, TS_FLAG_HIGHRES); 153 157 P_ns(last_tick); 154 - P(tick_stopped); 158 + P_flag(tick_stopped, TS_FLAG_STOPPED); 155 159 P(idle_jiffies); 156 160 P(idle_calls); 157 161 P(idle_sleeps); ··· 260 256 261 257 static inline void timer_list_header(struct seq_file *m, u64 now) 262 258 { 263 - SEQ_printf(m, "Timer List Version: v0.9\n"); 259 + SEQ_printf(m, "Timer List Version: v0.10\n"); 264 260 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 265 261 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 266 262 SEQ_printf(m, "\n");

+1793

kernel/time/timer_migration.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Infrastructure for migratable timers 4 + * 5 + * Copyright(C) 2022 linutronix GmbH 6 + */ 7 + #include <linux/cpuhotplug.h> 8 + #include <linux/slab.h> 9 + #include <linux/smp.h> 10 + #include <linux/spinlock.h> 11 + #include <linux/timerqueue.h> 12 + #include <trace/events/ipi.h> 13 + 14 + #include "timer_migration.h" 15 + #include "tick-internal.h" 16 + 17 + #define CREATE_TRACE_POINTS 18 + #include <trace/events/timer_migration.h> 19 + 20 + /* 21 + * The timer migration mechanism is built on a hierarchy of groups. The 22 + * lowest level group contains CPUs, the next level groups of CPU groups 23 + * and so forth. The CPU groups are kept per node so for the normal case 24 + * lock contention won't happen across nodes. Depending on the number of 25 + * CPUs per node even the next level might be kept as groups of CPU groups 26 + * per node and only the levels above cross the node topology. 27 + * 28 + * Example topology for a two node system with 24 CPUs each. 29 + * 30 + * LVL 2 [GRP2:0] 31 + * GRP1:0 = GRP1:M 32 + * 33 + * LVL 1 [GRP1:0] [GRP1:1] 34 + * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5 35 + * 36 + * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5] 37 + * CPUS 0-7 8-15 16-23 24-31 32-39 40-47 38 + * 39 + * The groups hold a timer queue of events sorted by expiry time. These 40 + * queues are updated when CPUs go in idle. When they come out of idle 41 + * ignore flag of events is set. 42 + * 43 + * Each group has a designated migrator CPU/group as long as a CPU/group is 44 + * active in the group. This designated role is necessary to avoid that all 45 + * active CPUs in a group try to migrate expired timers from other CPUs, 46 + * which would result in massive lock bouncing. 
47 + * 48 + * When a CPU is awake, it checks in its own timer tick the group 49 + * hierarchy up to the point where it is assigned the migrator role or if 50 + * no CPU is active, it also checks the groups where no migrator is set 51 + * (TMIGR_NONE). 52 + * 53 + * If it finds expired timers in one of the group queues it pulls them over 54 + * from the idle CPU and runs the timer function. After that it updates the 55 + * group and the parent groups if required. 56 + * 57 + * CPUs which go idle arm their CPU local timer hardware for the next local 58 + * (pinned) timer event. If the next migratable timer expires after the 59 + * next local timer or the CPU has no migratable timer pending then the 60 + * CPU does not queue an event in the LVL0 group. If the next migratable 61 + * timer expires before the next local timer then the CPU queues that timer 62 + * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0 63 + * group. 64 + * 65 + * When CPU comes out of idle and when a group has at least a single active 66 + * child, the ignore flag of the tmigr_event is set. This indicates, that 67 + * the event is ignored even if it is still enqueued in the parent groups 68 + * timer queue. It will be removed when touching the timer queue the next 69 + * time. This spares locking in active path as the lock protects (after 70 + * setup) only event information. For more information about locking, 71 + * please read the section "Locking rules". 72 + * 73 + * If the CPU is the migrator of the group then it delegates that role to 74 + * the next active CPU in the group or sets migrator to TMIGR_NONE when 75 + * there is no active CPU in the group. This delegation needs to be 76 + * propagated up the hierarchy so hand over from other leaves can happen at 77 + * all hierarchy levels w/o doing a search. 78 + * 79 + * When the last CPU in the system goes idle, then it drops all migrator 80 + * duties up to the top level of the hierarchy (LVL2 in the example).
It 81 + * then has to make sure, that it arms its own local hardware timer for 82 + * the earliest event in the system. 83 + * 84 + * 85 + * Lifetime rules: 86 + * --------------- 87 + * 88 + * The groups are built up at init time or when CPUs come online. They are 89 + * not destroyed when a group becomes empty due to offlining. The group 90 + * just won't participate in the hierarchy management anymore. Destroying 91 + * groups would result in interesting race conditions which would just make 92 + * the whole mechanism slow and complex. 93 + * 94 + * 95 + * Locking rules: 96 + * -------------- 97 + * 98 + * For setting up new groups and handling events it's required to lock both 99 + * child and parent group. The lock ordering is always bottom up. This also 100 + * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and 101 + * active CPU/group information atomic_try_cmpxchg() is used instead and only 102 + * the per CPU tmigr_cpu->lock is held. 103 + * 104 + * During the setup of groups tmigr_level_list is required. It is protected by 105 + * @tmigr_mutex. 106 + * 107 + * When @timer_base->lock as well as tmigr related locks are required, the lock 108 + * ordering is: first @timer_base->lock, afterwards tmigr related locks. 109 + * 110 + * 111 + * Protection of the tmigr group state information: 112 + * ------------------------------------------------ 113 + * 114 + * The state information with the list of active children and migrator needs to 115 + * be protected by a sequence counter. It prevents a race when updates in child 116 + * groups are propagated in changed order. The state update is performed 117 + * lockless and group wise.
The following scenario describes what happens 118 + * without updating the sequence counter: 119 + * 120 + * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well 121 + * as GRP0:1 will not change during the scenario): 122 + * 123 + * LVL 1 [GRP1:0] 124 + * migrator = GRP0:1 125 + * active = GRP0:0, GRP0:1 126 + * / \ 127 + * LVL 0 [GRP0:0] [GRP0:1] 128 + * migrator = CPU0 migrator = CPU2 129 + * active = CPU0 active = CPU2 130 + * / \ / \ 131 + * CPUs 0 1 2 3 132 + * active idle active idle 133 + * 134 + * 135 + * 1. CPU0 goes idle. As the update is performed group wise, in the first step 136 + * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to 137 + * walk the hierarchy. 138 + * 139 + * LVL 1 [GRP1:0] 140 + * migrator = GRP0:1 141 + * active = GRP0:0, GRP0:1 142 + * / \ 143 + * LVL 0 [GRP0:0] [GRP0:1] 144 + * --> migrator = TMIGR_NONE migrator = CPU2 145 + * --> active = active = CPU2 146 + * / \ / \ 147 + * CPUs 0 1 2 3 148 + * --> idle idle active idle 149 + * 150 + * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of 151 + * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also 152 + * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the 153 + * hierarchy to perform the needed update from their point of view. The 154 + * currently visible state looks the following: 155 + * 156 + * LVL 1 [GRP1:0] 157 + * migrator = GRP0:1 158 + * active = GRP0:0, GRP0:1 159 + * / \ 160 + * LVL 0 [GRP0:0] [GRP0:1] 161 + * --> migrator = CPU1 migrator = CPU2 162 + * --> active = CPU1 active = CPU2 163 + * / \ / \ 164 + * CPUs 0 1 2 3 165 + * idle --> active active idle 166 + * 167 + * 3. Here is the race condition: CPU1 managed to propagate its changes (from 168 + * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. 
The 169 + * active members of GRP1:0 remain unchanged after the update since it is 170 + * still valid from CPU1 current point of view: 171 + * 172 + * LVL 1 [GRP1:0] 173 + * --> migrator = GRP0:1 174 + * --> active = GRP0:0, GRP0:1 175 + * / \ 176 + * LVL 0 [GRP0:0] [GRP0:1] 177 + * migrator = CPU1 migrator = CPU2 178 + * active = CPU1 active = CPU2 179 + * / \ / \ 180 + * CPUs 0 1 2 3 181 + * idle active active idle 182 + * 183 + * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0. 184 + * 185 + * LVL 1 [GRP1:0] 186 + * --> migrator = GRP0:1 187 + * --> active = GRP0:1 188 + * / \ 189 + * LVL 0 [GRP0:0] [GRP0:1] 190 + * migrator = CPU1 migrator = CPU2 191 + * active = CPU1 active = CPU2 192 + * / \ / \ 193 + * CPUs 0 1 2 3 194 + * idle active active idle 195 + * 196 + * 197 + * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is 198 + * active and is correctly listed as active in GRP0:0. However GRP1:0 does not 199 + * have GRP0:0 listed as active, which is wrong. The sequence counter has been 200 + * added to avoid inconsistent states during updates. The state is updated 201 + * atomically only if all members, including the sequence counter, match the 202 + * expected value (compare-and-exchange). 203 + * 204 + * Looking back at the previous example with the addition of the sequence 205 + * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed 206 + * the sequence number during the update in step 3 so the expected old value (as 207 + * seen by CPU0 before starting the walk) does not match. 208 + * 209 + * Prevent race between new event and last CPU going inactive 210 + * ---------------------------------------------------------- 211 + * 212 + * When the last CPU is going idle and there is a concurrent update of a new 213 + * first global timer of an idle CPU, the group and child states have to be read 214 + * while holding the lock in tmigr_update_events(). 
The following scenario shows 215 + * what happens, when this is not done. 216 + * 217 + * 1. Only CPU2 is active: 218 + * 219 + * LVL 1 [GRP1:0] 220 + * migrator = GRP0:1 221 + * active = GRP0:1 222 + * next_expiry = KTIME_MAX 223 + * / \ 224 + * LVL 0 [GRP0:0] [GRP0:1] 225 + * migrator = TMIGR_NONE migrator = CPU2 226 + * active = active = CPU2 227 + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 228 + * / \ / \ 229 + * CPUs 0 1 2 3 230 + * idle idle active idle 231 + * 232 + * 2. Now CPU 2 goes idle (and has no global timer, that has to be handled) and 233 + * propagates that to GRP0:1: 234 + * 235 + * LVL 1 [GRP1:0] 236 + * migrator = GRP0:1 237 + * active = GRP0:1 238 + * next_expiry = KTIME_MAX 239 + * / \ 240 + * LVL 0 [GRP0:0] [GRP0:1] 241 + * migrator = TMIGR_NONE --> migrator = TMIGR_NONE 242 + * active = --> active = 243 + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 244 + * / \ / \ 245 + * CPUs 0 1 2 3 246 + * idle idle --> idle idle 247 + * 248 + * 3. Now the idle state is propagated up to GRP1:0. As this is now the last 249 + * child going idle in top level group, the expiry of the next group event 250 + * has to be handed back to make sure no event is lost. As there is no event 251 + * enqueued, KTIME_MAX is handed back to CPU2. 252 + * 253 + * LVL 1 [GRP1:0] 254 + * --> migrator = TMIGR_NONE 255 + * --> active = 256 + * next_expiry = KTIME_MAX 257 + * / \ 258 + * LVL 0 [GRP0:0] [GRP0:1] 259 + * migrator = TMIGR_NONE migrator = TMIGR_NONE 260 + * active = active = 261 + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 262 + * / \ / \ 263 + * CPUs 0 1 2 3 264 + * idle idle --> idle idle 265 + * 266 + * 4. CPU 0 has a new timer queued from idle and it expires at TIMER0. 
CPU0 267 + * propagates that to GRP0:0: 268 + * 269 + * LVL 1 [GRP1:0] 270 + * migrator = TMIGR_NONE 271 + * active = 272 + * next_expiry = KTIME_MAX 273 + * / \ 274 + * LVL 0 [GRP0:0] [GRP0:1] 275 + * migrator = TMIGR_NONE migrator = TMIGR_NONE 276 + * active = active = 277 + * --> next_expiry = TIMER0 next_expiry = KTIME_MAX 278 + * / \ / \ 279 + * CPUs 0 1 2 3 280 + * idle idle idle idle 281 + * 282 + * 5. GRP0:0 is not active, so the new timer has to be propagated to 283 + * GRP1:0. Therefore the GRP1:0 state has to be read. When the stale value 284 + * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is 285 + * handed back to CPU0, as it seems that there is still an active child in 286 + * top level group. 287 + * 288 + * LVL 1 [GRP1:0] 289 + * migrator = TMIGR_NONE 290 + * active = 291 + * --> next_expiry = TIMER0 292 + * / \ 293 + * LVL 0 [GRP0:0] [GRP0:1] 294 + * migrator = TMIGR_NONE migrator = TMIGR_NONE 295 + * active = active = 296 + * next_expiry = TIMER0 next_expiry = KTIME_MAX 297 + * / \ / \ 298 + * CPUs 0 1 2 3 299 + * idle idle idle idle 300 + * 301 + * This is prevented by reading the state when holding the lock (when a new 302 + * timer has to be propagated from idle path):: 303 + * 304 + * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up()) 305 + * -------------------------- --------------------------- 306 + * // step 3: 307 + * cmpxchg(&GRP1:0->state); 308 + * tmigr_update_events() { 309 + * spin_lock(&GRP1:0->lock); 310 + * // ... update events ... 311 + * // hand back first expiry when GRP1:0 is idle 312 + * spin_unlock(&GRP1:0->lock); 313 + * // ^^^ release state modification 314 + * } 315 + * tmigr_update_events() { 316 + * spin_lock(&GRP1:0->lock) 317 + * // ^^^ acquire state modification 318 + * group_state = atomic_read(&GRP1:0->state) 319 + * // .... update events ...
320 + * // hand back first expiry when GRP1:0 is idle 321 + * spin_unlock(&GRP1:0->lock) <3> 322 + * // ^^^ makes state visible for other 323 + * // callers of tmigr_new_timer_up() 324 + * } 325 + * 326 + * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported 327 + * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent 328 + * update of the group state from active path is no problem, as the upcoming CPU 329 + * will take care of the group events. 330 + * 331 + * Required event and timerqueue update after a remote expiry: 332 + * ----------------------------------------------------------- 333 + * 334 + * After expiring timers of a remote CPU, a walk through the hierarchy and 335 + * update of events and timerqueues is required. It is obviously needed if there 336 + * is a 'new' global timer but also if there is no new global timer but the 337 + * remote CPU is still idle. 338 + * 339 + * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same 340 + * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is 341 + * also idle and has no global timer pending. CPU2 is the only active CPU and 342 + * thus also the migrator: 343 + * 344 + * LVL 1 [GRP1:0] 345 + * migrator = GRP0:1 346 + * active = GRP0:1 347 + * --> timerqueue = evt-GRP0:0 348 + * / \ 349 + * LVL 0 [GRP0:0] [GRP0:1] 350 + * migrator = TMIGR_NONE migrator = CPU2 351 + * active = active = CPU2 352 + * groupevt.ignore = false groupevt.ignore = true 353 + * groupevt.cpu = CPU0 groupevt.cpu = 354 + * timerqueue = evt-CPU0, timerqueue = 355 + * evt-CPU1 356 + * / \ / \ 357 + * CPUs 0 1 2 3 358 + * idle idle active idle 359 + * 360 + * 2. CPU2 starts to expire remote timers. It starts with LVL0 group 361 + * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with 362 + * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. 
It 363 + * looks at tmigr_event::cpu struct member and expires the pending timer(s) 364 + * of CPU0. 365 + * 366 + * LVL 1 [GRP1:0] 367 + * migrator = GRP0:1 368 + * active = GRP0:1 369 + * --> timerqueue = 370 + * / \ 371 + * LVL 0 [GRP0:0] [GRP0:1] 372 + * migrator = TMIGR_NONE migrator = CPU2 373 + * active = active = CPU2 374 + * groupevt.ignore = false groupevt.ignore = true 375 + * --> groupevt.cpu = CPU0 groupevt.cpu = 376 + * timerqueue = evt-CPU0, timerqueue = 377 + * evt-CPU1 378 + * / \ / \ 379 + * CPUs 0 1 2 3 380 + * idle idle active idle 381 + * 382 + * 3. Some work has to be done after expiring the timers of CPU0. If we stop 383 + * here, then CPU1's pending global timer(s) will not expire in time and the 384 + * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just 385 + * been processed. So it is required to walk the hierarchy from CPU0's point 386 + * of view and update it accordingly. CPU0's event will be removed from the 387 + * timerqueue because it has no pending timer. If CPU0 would have a timer 388 + * pending then it has to expire after CPU1's first timer because all timers 389 + * from this period were just expired. Either way CPU1's event will be first 390 + * in GRP0:0's timerqueue and therefore set in the CPU field of the group 391 + * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not 392 + * active: 393 + * 394 + * LVL 1 [GRP1:0] 395 + * migrator = GRP0:1 396 + * active = GRP0:1 397 + * --> timerqueue = evt-GRP0:0 398 + * / \ 399 + * LVL 0 [GRP0:0] [GRP0:1] 400 + * migrator = TMIGR_NONE migrator = CPU2 401 + * active = active = CPU2 402 + * groupevt.ignore = false groupevt.ignore = true 403 + * --> groupevt.cpu = CPU1 groupevt.cpu = 404 + * --> timerqueue = evt-CPU1 timerqueue = 405 + * / \ / \ 406 + * CPUs 0 1 2 3 407 + * idle idle active idle 408 + * 409 + * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the 410 + * timer(s) of CPU1. 
411 + * 412 + * The hierarchy walk in step 3 can be skipped if the migrator notices that a 413 + * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care 414 + * of the group as migrator and any needed updates within the hierarchy. 415 + */ 416 + 417 + static DEFINE_MUTEX(tmigr_mutex); 418 + static struct list_head *tmigr_level_list __read_mostly; 419 + 420 + static unsigned int tmigr_hierarchy_levels __read_mostly; 421 + static unsigned int tmigr_crossnode_level __read_mostly; 422 + 423 + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); 424 + 425 + #define TMIGR_NONE 0xFF 426 + #define BIT_CNT 8 427 + 428 + static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) 429 + { 430 + return !(tmc->tmgroup && tmc->online); 431 + } 432 + 433 + /* 434 + * Returns true, when @childmask corresponds to the group migrator or when the 435 + * group is not active - so no migrator is set. 436 + */ 437 + static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) 438 + { 439 + union tmigr_state s; 440 + 441 + s.state = atomic_read(&group->migr_state); 442 + 443 + if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 444 + return true; 445 + 446 + return false; 447 + } 448 + 449 + static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) 450 + { 451 + bool lonely, migrator = false; 452 + unsigned long active; 453 + union tmigr_state s; 454 + 455 + s.state = atomic_read(&group->migr_state); 456 + 457 + if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 458 + migrator = true; 459 + 460 + active = s.active; 461 + lonely = bitmap_weight(&active, BIT_CNT) <= 1; 462 + 463 + return (migrator && lonely); 464 + } 465 + 466 + static bool tmigr_check_lonely(struct tmigr_group *group) 467 + { 468 + unsigned long active; 469 + union tmigr_state s; 470 + 471 + s.state = atomic_read(&group->migr_state); 472 + 473 + active = s.active; 474 + 475 + return bitmap_weight(&active, BIT_CNT) <= 1; 476 + } 477 + 478 
+ typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); 479 + 480 + static void __walk_groups(up_f up, void *data, 481 + struct tmigr_cpu *tmc) 482 + { 483 + struct tmigr_group *child = NULL, *group = tmc->tmgroup; 484 + 485 + do { 486 + WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); 487 + 488 + if (up(group, child, data)) 489 + break; 490 + 491 + child = group; 492 + group = group->parent; 493 + } while (group); 494 + } 495 + 496 + static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) 497 + { 498 + lockdep_assert_held(&tmc->lock); 499 + 500 + __walk_groups(up, data, tmc); 501 + } 502 + 503 + /** 504 + * struct tmigr_walk - data required for walking the hierarchy 505 + * @nextexp: Next CPU event expiry information which is handed into 506 + * the timer migration code by the timer code 507 + * (get_next_timer_interrupt()) 508 + * @firstexp: Contains the first event expiry information when last 509 + * active CPU of hierarchy is on the way to idle to make 510 + * sure CPU will be back in time. 
511 + * @evt: Pointer to tmigr_event which needs to be queued (of idle 512 + * child group) 513 + * @childmask: childmask of child group 514 + * @remote: Is set, when the new timer path is executed in 515 + * tmigr_handle_remote_cpu() 516 + */ 517 + struct tmigr_walk { 518 + u64 nextexp; 519 + u64 firstexp; 520 + struct tmigr_event *evt; 521 + u8 childmask; 522 + bool remote; 523 + }; 524 + 525 + /** 526 + * struct tmigr_remote_data - data required for remote expiry hierarchy walk 527 + * @basej: timer base in jiffies 528 + * @now: timer base monotonic 529 + * @firstexp: returns expiry of the first timer in the idle timer 530 + * migration hierarchy to make sure the timer is handled in 531 + * time; it is stored in the per CPU tmigr_cpu struct of 532 + * CPU which expires remote timers 533 + * @childmask: childmask of child group 534 + * @check: is set if there is the need to handle remote timers; 535 + * required in tmigr_requires_handle_remote() only 536 + * @tmc_active: this flag indicates, whether the CPU which triggers 537 + * the hierarchy walk is !idle in the timer migration 538 + * hierarchy. When the CPU is idle and the whole hierarchy is 539 + * idle, only the first event of the top level has to be 540 + * considered. 541 + */ 542 + struct tmigr_remote_data { 543 + unsigned long basej; 544 + u64 now; 545 + u64 firstexp; 546 + u8 childmask; 547 + bool check; 548 + bool tmc_active; 549 + }; 550 + 551 + /* 552 + * Returns the next event of the timerqueue @group->events 553 + * 554 + * Removes timers with ignore flag and update next_expiry of the group. Values 555 + * of the group event are updated in tmigr_update_events() only. 
556 + */ 557 + static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) 558 + { 559 + struct timerqueue_node *node = NULL; 560 + struct tmigr_event *evt = NULL; 561 + 562 + lockdep_assert_held(&group->lock); 563 + 564 + WRITE_ONCE(group->next_expiry, KTIME_MAX); 565 + 566 + while ((node = timerqueue_getnext(&group->events))) { 567 + evt = container_of(node, struct tmigr_event, nextevt); 568 + 569 + if (!evt->ignore) { 570 + WRITE_ONCE(group->next_expiry, evt->nextevt.expires); 571 + return evt; 572 + } 573 + 574 + /* 575 + * Remove next timers with ignore flag, because the group lock 576 + * is held anyway 577 + */ 578 + if (!timerqueue_del(&group->events, node)) 579 + break; 580 + } 581 + 582 + return NULL; 583 + } 584 + 585 + /* 586 + * Return the next event (with the expiry equal or before @now) 587 + * 588 + * Event, which is returned, is also removed from the queue. 589 + */ 590 + static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, 591 + u64 now) 592 + { 593 + struct tmigr_event *evt = tmigr_next_groupevt(group); 594 + 595 + if (!evt || now < evt->nextevt.expires) 596 + return NULL; 597 + 598 + /* 599 + * The event is ready to expire. Remove it and update next group event. 
600 + */ 601 + timerqueue_del(&group->events, &evt->nextevt); 602 + tmigr_next_groupevt(group); 603 + 604 + return evt; 605 + } 606 + 607 + static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) 608 + { 609 + struct tmigr_event *evt; 610 + 611 + evt = tmigr_next_groupevt(group); 612 + 613 + if (!evt) 614 + return KTIME_MAX; 615 + else 616 + return evt->nextevt.expires; 617 + } 618 + 619 + static bool tmigr_active_up(struct tmigr_group *group, 620 + struct tmigr_group *child, 621 + void *ptr) 622 + { 623 + union tmigr_state curstate, newstate; 624 + struct tmigr_walk *data = ptr; 625 + bool walk_done; 626 + u8 childmask; 627 + 628 + childmask = data->childmask; 629 + /* 630 + * No memory barrier is required here in contrast to 631 + * tmigr_inactive_up(), as the group state change does not depend on the 632 + * child state. 633 + */ 634 + curstate.state = atomic_read(&group->migr_state); 635 + 636 + do { 637 + newstate = curstate; 638 + walk_done = true; 639 + 640 + if (newstate.migrator == TMIGR_NONE) { 641 + newstate.migrator = childmask; 642 + 643 + /* Changes need to be propagated */ 644 + walk_done = false; 645 + } 646 + 647 + newstate.active |= childmask; 648 + newstate.seq++; 649 + 650 + } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); 651 + 652 + if ((walk_done == false) && group->parent) 653 + data->childmask = group->childmask; 654 + 655 + /* 656 + * The group is active (again). The group event might be still queued 657 + * into the parent group's timerqueue but can now be handled by the 658 + * migrator of this group. Therefore the ignore flag for the group event 659 + * is updated to reflect this. 660 + * 661 + * The update of the ignore flag in the active path is done lockless. In 662 + * worst case the migrator of the parent group observes the change too 663 + * late and expires remotely all events belonging to this group. The 664 + * lock is held while updating the ignore flag in idle path. 
So this 665 + * state change will not be lost. 666 + */ 667 + group->groupevt.ignore = true; 668 + 669 + trace_tmigr_group_set_cpu_active(group, newstate, childmask); 670 + 671 + return walk_done; 672 + } 673 + 674 + static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) 675 + { 676 + struct tmigr_walk data; 677 + 678 + data.childmask = tmc->childmask; 679 + 680 + trace_tmigr_cpu_active(tmc); 681 + 682 + tmc->cpuevt.ignore = true; 683 + WRITE_ONCE(tmc->wakeup, KTIME_MAX); 684 + 685 + walk_groups(&tmigr_active_up, &data, tmc); 686 + } 687 + 688 + /** 689 + * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy 690 + * 691 + * Call site timer_clear_idle() is called with interrupts disabled. 692 + */ 693 + void tmigr_cpu_activate(void) 694 + { 695 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 696 + 697 + if (tmigr_is_not_available(tmc)) 698 + return; 699 + 700 + if (WARN_ON_ONCE(!tmc->idle)) 701 + return; 702 + 703 + raw_spin_lock(&tmc->lock); 704 + tmc->idle = false; 705 + __tmigr_cpu_activate(tmc); 706 + raw_spin_unlock(&tmc->lock); 707 + } 708 + 709 + /* 710 + * Returns true, if there is nothing to be propagated to the next level 711 + * 712 + * @data->firstexp is set to expiry of first global event of the (top level of 713 + * the) hierarchy, but only when hierarchy is completely idle. 714 + * 715 + * The child and group states need to be read under the lock, to prevent a race 716 + * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See 717 + * also section "Prevent race between new event and last CPU going inactive" in 718 + * the documentation at the top. 719 + * 720 + * This is the only place where the group event expiry value is set. 
721 + */ 722 + static 723 + bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, 724 + struct tmigr_walk *data) 725 + { 726 + struct tmigr_event *evt, *first_childevt; 727 + union tmigr_state childstate, groupstate; 728 + bool remote = data->remote; 729 + bool walk_done = false; 730 + u64 nextexp; 731 + 732 + if (child) { 733 + raw_spin_lock(&child->lock); 734 + raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); 735 + 736 + childstate.state = atomic_read(&child->migr_state); 737 + groupstate.state = atomic_read(&group->migr_state); 738 + 739 + if (childstate.active) { 740 + walk_done = true; 741 + goto unlock; 742 + } 743 + 744 + first_childevt = tmigr_next_groupevt(child); 745 + nextexp = child->next_expiry; 746 + evt = &child->groupevt; 747 + 748 + evt->ignore = (nextexp == KTIME_MAX) ? true : false; 749 + } else { 750 + nextexp = data->nextexp; 751 + 752 + first_childevt = evt = data->evt; 753 + 754 + /* 755 + * Walking the hierarchy is required in any case when a 756 + * remote expiry was done before. This ensures to not lose 757 + * already queued events in non active groups (see section 758 + * "Required event and timerqueue update after a remote 759 + * expiry" in the documentation at the top). 760 + * 761 + * The two call sites which are executed without a remote expiry 762 + * before, are not prevented from propagating changes through 763 + * the hierarchy by the return: 764 + * - When entering this path by tmigr_new_timer(), @evt->ignore 765 + * is never set. 766 + * - tmigr_inactive_up() takes care of the propagation by 767 + * itself and ignores the return value. But an immediate 768 + * return is required because nothing has to be done in this 769 + * level as the event could be ignored. 
770 + */ 771 + if (evt->ignore && !remote) 772 + return true; 773 + 774 + raw_spin_lock(&group->lock); 775 + 776 + childstate.state = 0; 777 + groupstate.state = atomic_read(&group->migr_state); 778 + } 779 + 780 + /* 781 + * If the child event is already queued in the group, remove it from the 782 + * queue when the expiry time changed only or when it could be ignored. 783 + */ 784 + if (timerqueue_node_queued(&evt->nextevt)) { 785 + if ((evt->nextevt.expires == nextexp) && !evt->ignore) 786 + goto check_toplvl; 787 + 788 + if (!timerqueue_del(&group->events, &evt->nextevt)) 789 + WRITE_ONCE(group->next_expiry, KTIME_MAX); 790 + } 791 + 792 + if (evt->ignore) { 793 + /* 794 + * When the next child event could be ignored (nextexp is 795 + * KTIME_MAX) and there was no remote timer handling before or 796 + * the group is already active, there is no need to walk the 797 + * hierarchy even if there is a parent group. 798 + * 799 + * The other way round: even if the event could be ignored, but 800 + * if a remote timer handling was executed before and the group 801 + * is not active, walking the hierarchy is required to not miss 802 + * an enqueued timer in the non active group. The enqueued timer 803 + * of the group needs to be propagated to a higher level to 804 + * ensure it is handled. 805 + */ 806 + if (!remote || groupstate.active) 807 + walk_done = true; 808 + } else { 809 + evt->nextevt.expires = nextexp; 810 + evt->cpu = first_childevt->cpu; 811 + 812 + if (timerqueue_add(&group->events, &evt->nextevt)) 813 + WRITE_ONCE(group->next_expiry, nextexp); 814 + } 815 + 816 + check_toplvl: 817 + if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { 818 + walk_done = true; 819 + 820 + /* 821 + * Nothing to do when update was done during remote timer 822 + * handling. First timer in top level group which needs to be 823 + * handled when top level group is not active, is calculated 824 + * directly in tmigr_handle_remote_up(). 
825 + */ 826 + if (remote) 827 + goto unlock; 828 + 829 + /* 830 + * The top level group is idle and it has to be ensured the 831 + * global timers are handled in time. (This could be optimized 832 + * by keeping track of the last global scheduled event and only 833 + * arming it on the CPU if the new event is earlier. Not sure if 834 + * it's worth the complexity.) 835 + */ 836 + data->firstexp = tmigr_next_groupevt_expires(group); 837 + } 838 + 839 + trace_tmigr_update_events(child, group, childstate, groupstate, 840 + nextexp); 841 + 842 + unlock: 843 + raw_spin_unlock(&group->lock); 844 + 845 + if (child) 846 + raw_spin_unlock(&child->lock); 847 + 848 + return walk_done; 849 + } 850 + 851 + static bool tmigr_new_timer_up(struct tmigr_group *group, 852 + struct tmigr_group *child, 853 + void *ptr) 854 + { 855 + struct tmigr_walk *data = ptr; 856 + 857 + return tmigr_update_events(group, child, data); 858 + } 859 + 860 + /* 861 + * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is 862 + * returned, if an active CPU will handle all the timer migration hierarchy 863 + * timers. 
864 + */ 865 + static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) 866 + { 867 + struct tmigr_walk data = { .nextexp = nextexp, 868 + .firstexp = KTIME_MAX, 869 + .evt = &tmc->cpuevt }; 870 + 871 + lockdep_assert_held(&tmc->lock); 872 + 873 + if (tmc->remote) 874 + return KTIME_MAX; 875 + 876 + trace_tmigr_cpu_new_timer(tmc); 877 + 878 + tmc->cpuevt.ignore = false; 879 + data.remote = false; 880 + 881 + walk_groups(&tmigr_new_timer_up, &data, tmc); 882 + 883 + /* If there is a new first global event, make sure it is handled */ 884 + return data.firstexp; 885 + } 886 + 887 + static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, 888 + unsigned long jif) 889 + { 890 + struct timer_events tevt; 891 + struct tmigr_walk data; 892 + struct tmigr_cpu *tmc; 893 + 894 + tmc = per_cpu_ptr(&tmigr_cpu, cpu); 895 + 896 + raw_spin_lock_irq(&tmc->lock); 897 + 898 + /* 899 + * If the remote CPU is offline then the timers have been migrated to 900 + * another CPU. 901 + * 902 + * If tmigr_cpu::remote is set, at the moment another CPU already 903 + * expires the timers of the remote CPU. 904 + * 905 + * If tmigr_event::ignore is set, then the CPU returns from idle and 906 + * takes care of its timers. 907 + * 908 + * If the next event expires in the future, then the event has been 909 + * updated and there are no timers to expire right now. The CPU which 910 + * updated the event takes care when hierarchy is completely 911 + * idle. Otherwise the migrator does it as the event is enqueued. 
912 + */ 913 + if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || 914 + now < tmc->cpuevt.nextevt.expires) { 915 + raw_spin_unlock_irq(&tmc->lock); 916 + return; 917 + } 918 + 919 + trace_tmigr_handle_remote_cpu(tmc); 920 + 921 + tmc->remote = true; 922 + WRITE_ONCE(tmc->wakeup, KTIME_MAX); 923 + 924 + /* Drop the lock to allow the remote CPU to exit idle */ 925 + raw_spin_unlock_irq(&tmc->lock); 926 + 927 + if (cpu != smp_processor_id()) 928 + timer_expire_remote(cpu); 929 + 930 + /* 931 + * Lock ordering needs to be preserved - timer_base locks before tmigr 932 + * related locks (see section "Locking rules" in the documentation at 933 + * the top). During fetching the next timer interrupt, also tmc->lock 934 + * needs to be held. Otherwise there is a possible race window against 935 + * the CPU itself when it comes out of idle, updates the first timer in 936 + * the hierarchy and goes back to idle. 937 + * 938 + * timer base locks are dropped as fast as possible: After checking 939 + * whether the remote CPU went offline in the meantime and after 940 + * fetching the next remote timer interrupt. Dropping the locks as fast 941 + * as possible keeps the locking region small and prevents holding 942 + * several (unnecessary) locks during walking the hierarchy for updating 943 + * the timerqueue and group events. 944 + */ 945 + local_irq_disable(); 946 + timer_lock_remote_bases(cpu); 947 + raw_spin_lock(&tmc->lock); 948 + 949 + /* 950 + * When the CPU went offline in the meantime, no hierarchy walk has to 951 + * be done for updating the queued events, because the walk was 952 + * already done during marking the CPU offline in the hierarchy. 953 + * 954 + * When the CPU is no longer idle, the CPU takes care of the timers and 955 + * also of the timers in the hierarchy. 
956 + * 957 + * (See also section "Required event and timerqueue update after a 958 + * remote expiry" in the documentation at the top) 959 + */ 960 + if (!tmc->online || !tmc->idle) { 961 + timer_unlock_remote_bases(cpu); 962 + goto unlock; 963 + } 964 + 965 + /* next event of CPU */ 966 + fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); 967 + timer_unlock_remote_bases(cpu); 968 + 969 + data.nextexp = tevt.global; 970 + data.firstexp = KTIME_MAX; 971 + data.evt = &tmc->cpuevt; 972 + data.remote = true; 973 + 974 + /* 975 + * The update is done even when there is no 'new' global timer pending 976 + * on the remote CPU (see section "Required event and timerqueue update 977 + * after a remote expiry" in the documentation at the top) 978 + */ 979 + walk_groups(&tmigr_new_timer_up, &data, tmc); 980 + 981 + unlock: 982 + tmc->remote = false; 983 + raw_spin_unlock_irq(&tmc->lock); 984 + } 985 + 986 + static bool tmigr_handle_remote_up(struct tmigr_group *group, 987 + struct tmigr_group *child, 988 + void *ptr) 989 + { 990 + struct tmigr_remote_data *data = ptr; 991 + struct tmigr_event *evt; 992 + unsigned long jif; 993 + u8 childmask; 994 + u64 now; 995 + 996 + jif = data->basej; 997 + now = data->now; 998 + 999 + childmask = data->childmask; 1000 + 1001 + trace_tmigr_handle_remote(group); 1002 + again: 1003 + /* 1004 + * Handle the group only if @childmask is the migrator or if the 1005 + * group has no migrator. Otherwise the group is active and is 1006 + * handled by its own migrator. 
1007 + */ 1008 + if (!tmigr_check_migrator(group, childmask)) 1009 + return true; 1010 + 1011 + raw_spin_lock_irq(&group->lock); 1012 + 1013 + evt = tmigr_next_expired_groupevt(group, now); 1014 + 1015 + if (evt) { 1016 + unsigned int remote_cpu = evt->cpu; 1017 + 1018 + raw_spin_unlock_irq(&group->lock); 1019 + 1020 + tmigr_handle_remote_cpu(remote_cpu, now, jif); 1021 + 1022 + /* check if there is another event, that needs to be handled */ 1023 + goto again; 1024 + } 1025 + 1026 + /* 1027 + * Update of childmask for the next level and keep track of the expiry 1028 + * of the first event that needs to be handled (group->next_expiry was 1029 + * updated by tmigr_next_expired_groupevt(), next was set by 1030 + * tmigr_handle_remote_cpu()). 1031 + */ 1032 + data->childmask = group->childmask; 1033 + data->firstexp = group->next_expiry; 1034 + 1035 + raw_spin_unlock_irq(&group->lock); 1036 + 1037 + return false; 1038 + } 1039 + 1040 + /** 1041 + * tmigr_handle_remote() - Handle global timers of remote idle CPUs 1042 + * 1043 + * Called from the timer soft interrupt with interrupts enabled. 1044 + */ 1045 + void tmigr_handle_remote(void) 1046 + { 1047 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1048 + struct tmigr_remote_data data; 1049 + 1050 + if (tmigr_is_not_available(tmc)) 1051 + return; 1052 + 1053 + data.childmask = tmc->childmask; 1054 + data.firstexp = KTIME_MAX; 1055 + 1056 + /* 1057 + * NOTE: This is a doubled check because the migrator test will be done 1058 + * in tmigr_handle_remote_up() anyway. Keep this check to speed up the 1059 + * return when nothing has to be done. 1060 + */ 1061 + if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) 1062 + return; 1063 + 1064 + data.now = get_jiffies_update(&data.basej); 1065 + 1066 + /* 1067 + * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to 1068 + * KTIME_MAX. 
Even if tmc->lock is not held during the whole remote 1069 + * handling, tmc->wakeup is fine to be stale as it is called in 1070 + * interrupt context and tick_nohz_next_event() is executed in interrupt 1071 + * exit path only after processing the last pending interrupt. 1072 + */ 1073 + 1074 + __walk_groups(&tmigr_handle_remote_up, &data, tmc); 1075 + 1076 + raw_spin_lock_irq(&tmc->lock); 1077 + WRITE_ONCE(tmc->wakeup, data.firstexp); 1078 + raw_spin_unlock_irq(&tmc->lock); 1079 + } 1080 + 1081 + static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, 1082 + struct tmigr_group *child, 1083 + void *ptr) 1084 + { 1085 + struct tmigr_remote_data *data = ptr; 1086 + u8 childmask; 1087 + 1088 + childmask = data->childmask; 1089 + 1090 + /* 1091 + * Handle the group only if the child is the migrator or if the group 1092 + * has no migrator. Otherwise the group is active and is handled by its 1093 + * own migrator. 1094 + */ 1095 + if (!tmigr_check_migrator(group, childmask)) 1096 + return true; 1097 + 1098 + /* 1099 + * When there is a parent group and the CPU which triggered the 1100 + * hierarchy walk is not active, proceed the walk to reach the top level 1101 + * group before reading the next_expiry value. 1102 + */ 1103 + if (group->parent && !data->tmc_active) 1104 + goto out; 1105 + 1106 + /* 1107 + * The lock is required on 32bit architectures to read the variable 1108 + * consistently with a concurrent writer. On 64bit the lock is not 1109 + * required because the read operation is not split and so it is always 1110 + * consistent. 
1111 + */ 1112 + if (IS_ENABLED(CONFIG_64BIT)) { 1113 + data->firstexp = READ_ONCE(group->next_expiry); 1114 + if (data->now >= data->firstexp) { 1115 + data->check = true; 1116 + return true; 1117 + } 1118 + } else { 1119 + raw_spin_lock(&group->lock); 1120 + data->firstexp = group->next_expiry; 1121 + if (data->now >= group->next_expiry) { 1122 + data->check = true; 1123 + raw_spin_unlock(&group->lock); 1124 + return true; 1125 + } 1126 + raw_spin_unlock(&group->lock); 1127 + } 1128 + 1129 + out: 1130 + /* Update of childmask for the next level */ 1131 + data->childmask = group->childmask; 1132 + return false; 1133 + } 1134 + 1135 + /** 1136 + * tmigr_requires_handle_remote() - Check the need of remote timer handling 1137 + * 1138 + * Must be called with interrupts disabled. 1139 + */ 1140 + bool tmigr_requires_handle_remote(void) 1141 + { 1142 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1143 + struct tmigr_remote_data data; 1144 + unsigned long jif; 1145 + bool ret = false; 1146 + 1147 + if (tmigr_is_not_available(tmc)) 1148 + return ret; 1149 + 1150 + data.now = get_jiffies_update(&jif); 1151 + data.childmask = tmc->childmask; 1152 + data.firstexp = KTIME_MAX; 1153 + data.tmc_active = !tmc->idle; 1154 + data.check = false; 1155 + 1156 + /* 1157 + * If the CPU is active, walk the hierarchy to check whether a remote 1158 + * expiry is required. 1159 + * 1160 + * Check is done lockless as interrupts are disabled and @tmc->idle is 1161 + * set only by the local CPU. 1162 + */ 1163 + if (!tmc->idle) { 1164 + __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); 1165 + 1166 + return data.check; 1167 + } 1168 + 1169 + /* 1170 + * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock 1171 + * is required on 32bit architectures to read the variable consistently 1172 + * with a concurrent writer. On 64bit the lock is not required because 1173 + * the read operation is not split and so it is always consistent. 
1174 + */ 1175 + if (IS_ENABLED(CONFIG_64BIT)) { 1176 + if (data.now >= READ_ONCE(tmc->wakeup)) 1177 + return true; 1178 + } else { 1179 + raw_spin_lock(&tmc->lock); 1180 + if (data.now >= tmc->wakeup) 1181 + ret = true; 1182 + raw_spin_unlock(&tmc->lock); 1183 + } 1184 + 1185 + return ret; 1186 + } 1187 + 1188 + /** 1189 + * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) 1190 + * @nextexp: Next expiry of global timer (or KTIME_MAX if not) 1191 + * 1192 + * The CPU is already deactivated in the timer migration 1193 + * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() 1194 + * and thereby the timer idle path is executed once more. @tmc->wakeup 1195 + * holds the first timer, when the timer migration hierarchy is 1196 + * completely idle. 1197 + * 1198 + * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if 1199 + * nothing needs to be done. 1200 + */ 1201 + u64 tmigr_cpu_new_timer(u64 nextexp) 1202 + { 1203 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1204 + u64 ret; 1205 + 1206 + if (tmigr_is_not_available(tmc)) 1207 + return nextexp; 1208 + 1209 + raw_spin_lock(&tmc->lock); 1210 + 1211 + ret = READ_ONCE(tmc->wakeup); 1212 + if (nextexp != KTIME_MAX) { 1213 + if (nextexp != tmc->cpuevt.nextevt.expires || 1214 + tmc->cpuevt.ignore) { 1215 + ret = tmigr_new_timer(tmc, nextexp); 1216 + } 1217 + } 1218 + /* 1219 + * Make sure the reevaluation of timers in idle path will not miss an 1220 + * event. 
1221 + */ 1222 + WRITE_ONCE(tmc->wakeup, ret); 1223 + 1224 + trace_tmigr_cpu_new_timer_idle(tmc, nextexp); 1225 + raw_spin_unlock(&tmc->lock); 1226 + return ret; 1227 + } 1228 + 1229 + static bool tmigr_inactive_up(struct tmigr_group *group, 1230 + struct tmigr_group *child, 1231 + void *ptr) 1232 + { 1233 + union tmigr_state curstate, newstate, childstate; 1234 + struct tmigr_walk *data = ptr; 1235 + bool walk_done; 1236 + u8 childmask; 1237 + 1238 + childmask = data->childmask; 1239 + childstate.state = 0; 1240 + 1241 + /* 1242 + * The memory barrier is paired with the cmpxchg() in tmigr_active_up() 1243 + * to make sure the updates of child and group states are ordered. The 1244 + * ordering is mandatory, as the group state change depends on the child 1245 + * state. 1246 + */ 1247 + curstate.state = atomic_read_acquire(&group->migr_state); 1248 + 1249 + for (;;) { 1250 + if (child) 1251 + childstate.state = atomic_read(&child->migr_state); 1252 + 1253 + newstate = curstate; 1254 + walk_done = true; 1255 + 1256 + /* Reset active bit when the child is no longer active */ 1257 + if (!childstate.active) 1258 + newstate.active &= ~childmask; 1259 + 1260 + if (newstate.migrator == childmask) { 1261 + /* 1262 + * Find a new migrator for the group, because the child 1263 + * group is idle! 
1264 + */ 1265 + if (!childstate.active) { 1266 + unsigned long new_migr_bit, active = newstate.active; 1267 + 1268 + new_migr_bit = find_first_bit(&active, BIT_CNT); 1269 + 1270 + if (new_migr_bit != BIT_CNT) { 1271 + newstate.migrator = BIT(new_migr_bit); 1272 + } else { 1273 + newstate.migrator = TMIGR_NONE; 1274 + 1275 + /* Changes need to be propagated */ 1276 + walk_done = false; 1277 + } 1278 + } 1279 + } 1280 + 1281 + newstate.seq++; 1282 + 1283 + WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); 1284 + 1285 + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, 1286 + newstate.state)) 1287 + break; 1288 + 1289 + /* 1290 + * The memory barrier is paired with the cmpxchg() in 1291 + * tmigr_active_up() to make sure the updates of child and group 1292 + * states are ordered. It is required only when the above 1293 + * try_cmpxchg() fails. 1294 + */ 1295 + smp_mb__after_atomic(); 1296 + } 1297 + 1298 + data->remote = false; 1299 + 1300 + /* Event Handling */ 1301 + tmigr_update_events(group, child, data); 1302 + 1303 + if (group->parent && (walk_done == false)) 1304 + data->childmask = group->childmask; 1305 + 1306 + /* 1307 + * data->firstexp was set by tmigr_update_events() and contains the 1308 + * expiry of the first global event which needs to be handled. 
It 1309 + * differs from KTIME_MAX if: 1310 + * - group is the top level group and 1311 + * - group is idle (which means CPU was the last active CPU in the 1312 + * hierarchy) and 1313 + * - there is a pending event in the hierarchy 1314 + */ 1315 + WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); 1316 + 1317 + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); 1318 + 1319 + return walk_done; 1320 + } 1321 + 1322 + static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) 1323 + { 1324 + struct tmigr_walk data = { .nextexp = nextexp, 1325 + .firstexp = KTIME_MAX, 1326 + .evt = &tmc->cpuevt, 1327 + .childmask = tmc->childmask }; 1328 + 1329 + /* 1330 + * If nextexp is KTIME_MAX, the CPU event will be ignored because the 1331 + * local timer expires before the global timer, no global timer is set 1332 + * or CPU goes offline. 1333 + */ 1334 + if (nextexp != KTIME_MAX) 1335 + tmc->cpuevt.ignore = false; 1336 + 1337 + walk_groups(&tmigr_inactive_up, &data, tmc); 1338 + return data.firstexp; 1339 + } 1340 + 1341 + /** 1342 + * tmigr_cpu_deactivate() - Put current CPU into inactive state 1343 + * @nextexp: The next global timer expiry of the current CPU 1344 + * 1345 + * Must be called with interrupts disabled. 1346 + * 1347 + * Return: the next event expiry of the current CPU or the next event expiry 1348 + * from the hierarchy if this CPU is the top level migrator or the hierarchy is 1349 + * completely idle. 1350 + */ 1351 + u64 tmigr_cpu_deactivate(u64 nextexp) 1352 + { 1353 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1354 + u64 ret; 1355 + 1356 + if (tmigr_is_not_available(tmc)) 1357 + return nextexp; 1358 + 1359 + raw_spin_lock(&tmc->lock); 1360 + 1361 + ret = __tmigr_cpu_deactivate(tmc, nextexp); 1362 + 1363 + tmc->idle = true; 1364 + 1365 + /* 1366 + * Make sure the reevaluation of timers in idle path will not miss an 1367 + * event. 
1368 + */ 1369 + WRITE_ONCE(tmc->wakeup, ret); 1370 + 1371 + trace_tmigr_cpu_idle(tmc, nextexp); 1372 + raw_spin_unlock(&tmc->lock); 1373 + return ret; 1374 + } 1375 + 1376 + /** 1377 + * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to 1378 + * go idle 1379 + * @nextevt: The next global timer expiry of the current CPU 1380 + * 1381 + * Return: 1382 + * * KTIME_MAX - when it is probable that nothing has to be done (not 1383 + * the only one in the level 0 group; and if it is the 1384 + * only one in level 0 group, but there are more than a 1385 + * single group active on the way to top level) 1386 + * * nextevt - when CPU is offline and has to handle timer on his own 1387 + * or when on the way to top in every group only a single 1388 + * child is active but @nextevt is before the lowest 1389 + * next_expiry encountered while walking up to top level. 1390 + * * next_expiry - value of lowest expiry encountered while walking groups 1391 + * if only a single child is active on each and @nextevt 1392 + * is after this lowest expiry. 1393 + */ 1394 + u64 tmigr_quick_check(u64 nextevt) 1395 + { 1396 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1397 + struct tmigr_group *group = tmc->tmgroup; 1398 + 1399 + if (tmigr_is_not_available(tmc)) 1400 + return nextevt; 1401 + 1402 + if (WARN_ON_ONCE(tmc->idle)) 1403 + return nextevt; 1404 + 1405 + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) 1406 + return KTIME_MAX; 1407 + 1408 + do { 1409 + if (!tmigr_check_lonely(group)) { 1410 + return KTIME_MAX; 1411 + } else { 1412 + /* 1413 + * Since current CPU is active, events may not be sorted 1414 + * from bottom to the top because the CPU's event is ignored 1415 + * up to the top and its sibling's events not propagated upwards. 1416 + * Thus keep track of the lowest observed expiry. 
1417 + */ 1418 + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); 1419 + if (!group->parent) 1420 + return nextevt; 1421 + } 1422 + group = group->parent; 1423 + } while (group); 1424 + 1425 + return KTIME_MAX; 1426 + } 1427 + 1428 + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, 1429 + int node) 1430 + { 1431 + union tmigr_state s; 1432 + 1433 + raw_spin_lock_init(&group->lock); 1434 + 1435 + group->level = lvl; 1436 + group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; 1437 + 1438 + group->num_children = 0; 1439 + 1440 + s.migrator = TMIGR_NONE; 1441 + s.active = 0; 1442 + s.seq = 0; 1443 + atomic_set(&group->migr_state, s.state); 1444 + 1445 + timerqueue_init_head(&group->events); 1446 + timerqueue_init(&group->groupevt.nextevt); 1447 + group->groupevt.nextevt.expires = KTIME_MAX; 1448 + WRITE_ONCE(group->next_expiry, KTIME_MAX); 1449 + group->groupevt.ignore = true; 1450 + } 1451 + 1452 + static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, 1453 + unsigned int lvl) 1454 + { 1455 + struct tmigr_group *tmp, *group = NULL; 1456 + 1457 + lockdep_assert_held(&tmigr_mutex); 1458 + 1459 + /* Try to attach to an existing group first */ 1460 + list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { 1461 + /* 1462 + * If @lvl is below the cross NUMA node level, check whether 1463 + * this group belongs to the same NUMA node. 1464 + */ 1465 + if (lvl < tmigr_crossnode_level && tmp->numa_node != node) 1466 + continue; 1467 + 1468 + /* Capacity left? */ 1469 + if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) 1470 + continue; 1471 + 1472 + /* 1473 + * TODO: A possible further improvement: Make sure that all CPU 1474 + * siblings end up in the same group of the lowest level of the 1475 + * hierarchy. Rely on the topology sibling mask would be a 1476 + * reasonable solution. 
1477 + */ 1478 + 1479 + group = tmp; 1480 + break; 1481 + } 1482 + 1483 + if (group) 1484 + return group; 1485 + 1486 + /* Allocate and set up a new group */ 1487 + group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); 1488 + if (!group) 1489 + return ERR_PTR(-ENOMEM); 1490 + 1491 + tmigr_init_group(group, lvl, node); 1492 + 1493 + /* Setup successful. Add it to the hierarchy */ 1494 + list_add(&group->list, &tmigr_level_list[lvl]); 1495 + trace_tmigr_group_set(group); 1496 + return group; 1497 + } 1498 + 1499 + static void tmigr_connect_child_parent(struct tmigr_group *child, 1500 + struct tmigr_group *parent) 1501 + { 1502 + union tmigr_state childstate; 1503 + 1504 + raw_spin_lock_irq(&child->lock); 1505 + raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); 1506 + 1507 + child->parent = parent; 1508 + child->childmask = BIT(parent->num_children++); 1509 + 1510 + raw_spin_unlock(&parent->lock); 1511 + raw_spin_unlock_irq(&child->lock); 1512 + 1513 + trace_tmigr_connect_child_parent(child); 1514 + 1515 + /* 1516 + * To prevent inconsistent states, active children need to be active in 1517 + * the new parent as well. Inactive children are already marked inactive 1518 + * in the parent group: 1519 + * 1520 + * * When new groups were created by tmigr_setup_groups() starting from 1521 + * the lowest level (and not higher then one level below the current 1522 + * top level), then they are not active. They will be set active when 1523 + * the new online CPU comes active. 1524 + * 1525 + * * But if a new group above the current top level is required, it is 1526 + * mandatory to propagate the active state of the already existing 1527 + * child to the new parent. So tmigr_connect_child_parent() is 1528 + * executed with the formerly top level group (child) and the newly 1529 + * created group (parent). 
1530 + */ 1531 + childstate.state = atomic_read(&child->migr_state); 1532 + if (childstate.migrator != TMIGR_NONE) { 1533 + struct tmigr_walk data; 1534 + 1535 + data.childmask = child->childmask; 1536 + 1537 + /* 1538 + * There is only one new level per time. When connecting the 1539 + * child and the parent and set the child active when the parent 1540 + * is inactive, the parent needs to be the uppermost 1541 + * level. Otherwise there went something wrong! 1542 + */ 1543 + WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); 1544 + } 1545 + } 1546 + 1547 + static int tmigr_setup_groups(unsigned int cpu, unsigned int node) 1548 + { 1549 + struct tmigr_group *group, *child, **stack; 1550 + int top = 0, err = 0, i = 0; 1551 + struct list_head *lvllist; 1552 + 1553 + stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); 1554 + if (!stack) 1555 + return -ENOMEM; 1556 + 1557 + do { 1558 + group = tmigr_get_group(cpu, node, i); 1559 + if (IS_ERR(group)) { 1560 + err = PTR_ERR(group); 1561 + break; 1562 + } 1563 + 1564 + top = i; 1565 + stack[i++] = group; 1566 + 1567 + /* 1568 + * When booting only less CPUs of a system than CPUs are 1569 + * available, not all calculated hierarchy levels are required. 1570 + * 1571 + * The loop is aborted as soon as the highest level, which might 1572 + * be different from tmigr_hierarchy_levels, contains only a 1573 + * single group. 
1574 + */ 1575 + if (group->parent || i == tmigr_hierarchy_levels || 1576 + (list_empty(&tmigr_level_list[i]) && 1577 + list_is_singular(&tmigr_level_list[i - 1]))) 1578 + break; 1579 + 1580 + } while (i < tmigr_hierarchy_levels); 1581 + 1582 + do { 1583 + group = stack[--i]; 1584 + 1585 + if (err < 0) { 1586 + list_del(&group->list); 1587 + kfree(group); 1588 + continue; 1589 + } 1590 + 1591 + WARN_ON_ONCE(i != group->level); 1592 + 1593 + /* 1594 + * Update tmc -> group / child -> group connection 1595 + */ 1596 + if (i == 0) { 1597 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1598 + 1599 + raw_spin_lock_irq(&group->lock); 1600 + 1601 + tmc->tmgroup = group; 1602 + tmc->childmask = BIT(group->num_children++); 1603 + 1604 + raw_spin_unlock_irq(&group->lock); 1605 + 1606 + trace_tmigr_connect_cpu_parent(tmc); 1607 + 1608 + /* There are no children that need to be connected */ 1609 + continue; 1610 + } else { 1611 + child = stack[i - 1]; 1612 + tmigr_connect_child_parent(child, group); 1613 + } 1614 + 1615 + /* check if uppermost level was newly created */ 1616 + if (top != i) 1617 + continue; 1618 + 1619 + WARN_ON_ONCE(top == 0); 1620 + 1621 + lvllist = &tmigr_level_list[top]; 1622 + if (group->num_children == 1 && list_is_singular(lvllist)) { 1623 + lvllist = &tmigr_level_list[top - 1]; 1624 + list_for_each_entry(child, lvllist, list) { 1625 + if (child->parent) 1626 + continue; 1627 + 1628 + tmigr_connect_child_parent(child, group); 1629 + } 1630 + } 1631 + } while (i > 0); 1632 + 1633 + kfree(stack); 1634 + 1635 + return err; 1636 + } 1637 + 1638 + static int tmigr_add_cpu(unsigned int cpu) 1639 + { 1640 + int node = cpu_to_node(cpu); 1641 + int ret; 1642 + 1643 + mutex_lock(&tmigr_mutex); 1644 + ret = tmigr_setup_groups(cpu, node); 1645 + mutex_unlock(&tmigr_mutex); 1646 + 1647 + return ret; 1648 + } 1649 + 1650 + static int tmigr_cpu_online(unsigned int cpu) 1651 + { 1652 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1653 + int ret; 1654 + 1655 + 
/* First online attempt? Initialize CPU data */ 1656 + if (!tmc->tmgroup) { 1657 + raw_spin_lock_init(&tmc->lock); 1658 + 1659 + ret = tmigr_add_cpu(cpu); 1660 + if (ret < 0) 1661 + return ret; 1662 + 1663 + if (tmc->childmask == 0) 1664 + return -EINVAL; 1665 + 1666 + timerqueue_init(&tmc->cpuevt.nextevt); 1667 + tmc->cpuevt.nextevt.expires = KTIME_MAX; 1668 + tmc->cpuevt.ignore = true; 1669 + tmc->cpuevt.cpu = cpu; 1670 + 1671 + tmc->remote = false; 1672 + WRITE_ONCE(tmc->wakeup, KTIME_MAX); 1673 + } 1674 + raw_spin_lock_irq(&tmc->lock); 1675 + trace_tmigr_cpu_online(tmc); 1676 + tmc->idle = timer_base_is_idle(); 1677 + if (!tmc->idle) 1678 + __tmigr_cpu_activate(tmc); 1679 + tmc->online = true; 1680 + raw_spin_unlock_irq(&tmc->lock); 1681 + return 0; 1682 + } 1683 + 1684 + /* 1685 + * tmigr_trigger_active() - trigger a CPU to become active again 1686 + * 1687 + * This function is executed on a CPU which is part of cpu_online_mask, when the 1688 + * last active CPU in the hierarchy is offlining. With this, it is ensured that 1689 + * the other CPU is active and takes over the migrator duty. 
1690 + */ 1691 + static long tmigr_trigger_active(void *unused) 1692 + { 1693 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1694 + 1695 + WARN_ON_ONCE(!tmc->online || tmc->idle); 1696 + 1697 + return 0; 1698 + } 1699 + 1700 + static int tmigr_cpu_offline(unsigned int cpu) 1701 + { 1702 + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1703 + int migrator; 1704 + u64 firstexp; 1705 + 1706 + raw_spin_lock_irq(&tmc->lock); 1707 + tmc->online = false; 1708 + WRITE_ONCE(tmc->wakeup, KTIME_MAX); 1709 + 1710 + /* 1711 + * CPU has to handle the local events on his own, when on the way to 1712 + * offline; Therefore nextevt value is set to KTIME_MAX 1713 + */ 1714 + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); 1715 + trace_tmigr_cpu_offline(tmc); 1716 + raw_spin_unlock_irq(&tmc->lock); 1717 + 1718 + if (firstexp != KTIME_MAX) { 1719 + migrator = cpumask_any_but(cpu_online_mask, cpu); 1720 + work_on_cpu(migrator, tmigr_trigger_active, NULL); 1721 + } 1722 + 1723 + return 0; 1724 + } 1725 + 1726 + static int __init tmigr_init(void) 1727 + { 1728 + unsigned int cpulvl, nodelvl, cpus_per_node, i; 1729 + unsigned int nnodes = num_possible_nodes(); 1730 + unsigned int ncpus = num_possible_cpus(); 1731 + int ret = -ENOMEM; 1732 + 1733 + BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); 1734 + 1735 + /* Nothing to do if running on UP */ 1736 + if (ncpus == 1) 1737 + return 0; 1738 + 1739 + /* 1740 + * Calculate the required hierarchy levels. Unfortunately there is no 1741 + * reliable information available, unless all possible CPUs have been 1742 + * brought up and all NUMA nodes are populated. 1743 + * 1744 + * Estimate the number of levels with the number of possible nodes and 1745 + * the number of possible CPUs. Assume CPUs are spread evenly across 1746 + * nodes. We cannot rely on cpumask_of_node() because it only works for 1747 + * online CPUs. 
1748 + */ 1749 + cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); 1750 + 1751 + /* Calc the hierarchy levels required to hold the CPUs of a node */ 1752 + cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), 1753 + ilog2(TMIGR_CHILDREN_PER_GROUP)); 1754 + 1755 + /* Calculate the extra levels to connect all nodes */ 1756 + nodelvl = DIV_ROUND_UP(order_base_2(nnodes), 1757 + ilog2(TMIGR_CHILDREN_PER_GROUP)); 1758 + 1759 + tmigr_hierarchy_levels = cpulvl + nodelvl; 1760 + 1761 + /* 1762 + * If a NUMA node spawns more than one CPU level group then the next 1763 + * level(s) of the hierarchy contains groups which handle all CPU groups 1764 + * of the same NUMA node. The level above goes across NUMA nodes. Store 1765 + * this information for the setup code to decide in which level node 1766 + * matching is no longer required. 1767 + */ 1768 + tmigr_crossnode_level = cpulvl; 1769 + 1770 + tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); 1771 + if (!tmigr_level_list) 1772 + goto err; 1773 + 1774 + for (i = 0; i < tmigr_hierarchy_levels; i++) 1775 + INIT_LIST_HEAD(&tmigr_level_list[i]); 1776 + 1777 + pr_info("Timer migration: %d hierarchy levels; %d children per group;" 1778 + " %d crossnode level\n", 1779 + tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, 1780 + tmigr_crossnode_level); 1781 + 1782 + ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", 1783 + tmigr_cpu_online, tmigr_cpu_offline); 1784 + if (ret) 1785 + goto err; 1786 + 1787 + return 0; 1788 + 1789 + err: 1790 + pr_err("Timer migration setup failed\n"); 1791 + return ret; 1792 + } 1793 + late_initcall(tmigr_init);

+140

kernel/time/timer_migration.h

/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _KERNEL_TIME_MIGRATION_H
#define _KERNEL_TIME_MIGRATION_H

/*
 * Data structures and entry points of the timer migration hierarchy. The
 * concept itself is documented at the beginning of timer_migration.c.
 */

/* Per group capacity. Must be a power of 2! */
#define TMIGR_CHILDREN_PER_GROUP 8

/**
 * struct tmigr_event - a timer event associated to a CPU
 * @nextevt:	The node to enqueue an event in the parent group queue
 * @cpu:	The CPU to which this event belongs
 * @ignore:	Hint whether the event could be ignored; it is set when
 *		CPU or group is active
 */
struct tmigr_event {
	struct timerqueue_node	nextevt;
	unsigned int		cpu;
	bool			ignore;
};

/**
 * struct tmigr_group - timer migration hierarchy group
 * @lock:		Lock protecting the event information and group hierarchy
 *			information during setup
 * @parent:		Pointer to the parent group
 * @groupevt:		Next event of the group which is only used when the
 *			group is !active. The group event is then queued into
 *			the parent timer queue.
 *			Ignore bit of @groupevt is set when the group is active.
 * @next_expiry:	Base monotonic expiry time of the next event of the
 *			group; It is used for the racy lockless check whether a
 *			remote expiry is required; it is always reliable
 * @events:		Timer queue for child events queued in the group
 * @migr_state:		State of the group (see union tmigr_state)
 * @level:		Hierarchy level of the group; Required during setup
 * @numa_node:		Required for setup only to make sure CPU and low level
 *			group information is NUMA local. It is set to NUMA node
 *			as long as the group level is per NUMA node (level <
 *			tmigr_crossnode_level); otherwise it is set to
 *			NUMA_NO_NODE
 * @num_children:	Counter of group children to make sure the group is only
 *			filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
 *			only
 * @childmask:		childmask of the group in the parent group; is set
 *			during setup and will never change; can be read
 *			lockless
 * @list:		List head that is added to the per level
 *			tmigr_level_list; is required during setup when a
 *			new group needs to be connected to the existing
 *			hierarchy groups
 */
struct tmigr_group {
	raw_spinlock_t		lock;
	struct tmigr_group	*parent;
	struct tmigr_event	groupevt;
	u64			next_expiry;
	struct timerqueue_head	events;
	atomic_t		migr_state;
	unsigned int		level;
	int			numa_node;
	unsigned int		num_children;
	u8			childmask;
	struct list_head	list;
};

/**
 * struct tmigr_cpu - timer migration per CPU group
 * @lock:	Lock protecting the tmigr_cpu group information
 * @online:	Indicates whether the CPU is online; In deactivate path
 *		it is required to know whether the migrator in the top
 *		level group is to be set offline, while a timer is
 *		pending. Then another online CPU needs to be notified to
 *		take over the migrator role. Furthermore the information
 *		is required in CPU hotplug path as the CPU is able to go
 *		idle before the timer migration hierarchy hotplug AP is
 *		reached. During this phase, the CPU has to handle the
 *		global timers on its own and must not act as a migrator.
 * @idle:	Indicates whether the CPU is idle in the timer migration
 *		hierarchy
 * @remote:	Is set when timers of the CPU are expired remotely
 * @tmgroup:	Pointer to the parent group
 * @childmask:	childmask of tmigr_cpu in the parent group
 * @wakeup:	Stores the first timer when the timer migration
 *		hierarchy is completely idle and remote expiry was done;
 *		is returned to timer code in the idle path and is only
 *		used in idle path.
 * @cpuevt:	CPU event which could be enqueued into the parent group
 */
struct tmigr_cpu {
	raw_spinlock_t		lock;
	bool			online;
	bool			idle;
	bool			remote;
	struct tmigr_group	*tmgroup;
	u8			childmask;
	u64			wakeup;
	struct tmigr_event	cpuevt;
};

/**
 * union tmigr_state - state of tmigr_group
 * @state:	Combined version of the state - only used for atomic
 *		read/cmpxchg function
 * @struct:	Split version of the state - only use the struct members to
 *		update information to stay independent of endianness
 */
union tmigr_state {
	u32 state;
	/**
	 * struct - split state of tmigr_group
	 * @active:	Contains each childmask bit of the active children
	 * @migrator:	Contains childmask of the child which is migrator
	 * @seq:	Sequence counter needs to be increased when an update
	 *		to the tmigr_state is done. It prevents a race when
	 *		updates in the child groups are propagated in changed
	 *		order. Detailed information about the scenario is
	 *		given in the documentation at the beginning of
	 *		timer_migration.c.
	 */
	struct {
		u8	active;
		u8	migrator;
		u16	seq;
	} __packed;
};

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void tmigr_handle_remote(void);
extern bool tmigr_requires_handle_remote(void);
extern void tmigr_cpu_activate(void);
extern u64 tmigr_cpu_deactivate(u64 nextevt);
extern u64 tmigr_cpu_new_timer(u64 nextevt);
extern u64 tmigr_quick_check(u64 nextevt);
#else
/*
 * No-op stubs for builds without SMP or without NO_HZ_COMMON. Only the three
 * functions below have a stub; the other entry points declared above have no
 * fallback in this header.
 */
static inline void tmigr_handle_remote(void) { }
static inline bool tmigr_requires_handle_remote(void) { return false; }
static inline void tmigr_cpu_activate(void) { }
#endif

#endif /* _KERNEL_TIME_MIGRATION_H */

+1 -1

kernel/workqueue.c

··· 2564 2564 add_timer_on(timer, cpu); 2565 2565 } else { 2566 2566 if (likely(cpu == WORK_CPU_UNBOUND)) 2567 - add_timer(timer); 2567 + add_timer_global(timer); 2568 2568 else 2569 2569 add_timer_on(timer, cpu); 2570 2570 }

+1 -1

tools/testing/selftests/rcutorture/bin/torture.sh

··· 567 567 torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" 568 568 torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make 569 569 570 - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1 tsc=watchdog" 570 + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" 571 571 torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make 572 572 573 573 # In case our work is already done...