Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner:

- Fix a memory ordering issue in posix-timers

Posix-timer lookup is lockless and reevaluates the timer validity
under the timer lock, but the update which validates the timer is not
protected by the timer lock. That allows the store to be reordered
against the initialization stores, so that the lookup side can
observe a partially initialized timer. That's mostly a theoretical
problem, but incorrect nevertheless.

- Fix a long standing inconsistency of the coarse time getters

The coarse time getters read the base time of the current update
cycle without reading the actual hardware clock. NTP frequency
adjustment can set the base time backwards. The fine grained
interfaces compensate this by reading the clock and applying the new
conversion factor, but the coarse grained time getters use the base
time directly. That allows the user to observe time going backwards.

Cure it by always forwarding base time, when NTP changes the
frequency with an immediate step.

- Rework of posix-timer hashing

The posix-timer hash is not scalable and due to the CRIU timer
restore mechanism prone to massive contention on the global hash
bucket lock.

Replace the global hash lock with a fine grained per bucket locking
scheme to address that.

- Rework the proc/$PID/timers interface.

/proc/$PID/timers is provided for CRIU to be able to restore a timer.
The printout happens with sighand lock held and interrupts disabled.
That's not required as this can be done with RCU protection as well.

- Provide a sane mechanism for CRIU to restore a timer ID

CRIU restores timers by creating and deleting them until the kernel
internal per process ID counter reaches the requested ID. That's
horribly slow for sparse timer IDs.

Provide a prctl() which allows CRIU to restore a timer with a given
ID. When enabled the ID pointer is used as input pointer to read the
requested ID from user space. When disabled, the normal allocation
scheme (next ID) is active as before. This is backwards compatible
for both kernel and user space.

- Make hrtimer_update_function() less expensive.

The sanity checks are valuable, but expensive for high frequency
usage in io_uring. Make the debug checks conditional and enable them
only when lockdep is enabled.

- Small updates, cleanups and improvements

* tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
selftests/timers: Improve skew_consistency by testing with other clockids
timekeeping: Fix possible inconsistencies in _COARSE clockids
posix-timers: Drop redundant memset() invocation
selftests/timers/posix-timers: Add a test for exact allocation mode
posix-timers: Provide a mechanism to allocate a given timer ID
posix-timers: Dont iterate /proc/$PID/timers with sighand:: Siglock held
posix-timers: Make per process list RCU safe
posix-timers: Avoid false cacheline sharing
posix-timers: Switch to jhash32()
posix-timers: Improve hash table performance
posix-timers: Make signal_struct:: Next_posix_timer_id an atomic_t
posix-timers: Make lock_timer() use guard()
posix-timers: Rework timer removal
posix-timers: Simplify lock/unlock_timer()
posix-timers: Use guards in a few places
posix-timers: Remove SLAB_PANIC from kmem cache
posix-timers: Remove a few paranoid warnings
posix-timers: Cleanup includes
posix-timers: Add cond_resched() to posix_timer_add() search loop
posix-timers: Initialise timer before adding it to the hash table
...

+528 -383
+20 -28
fs/proc/base.c
··· 2494 2494 2495 2495 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) 2496 2496 struct timers_private { 2497 - struct pid *pid; 2498 - struct task_struct *task; 2499 - struct sighand_struct *sighand; 2500 - struct pid_namespace *ns; 2501 - unsigned long flags; 2497 + struct pid *pid; 2498 + struct task_struct *task; 2499 + struct pid_namespace *ns; 2502 2500 }; 2503 2501 2504 2502 static void *timers_start(struct seq_file *m, loff_t *pos) ··· 2507 2509 if (!tp->task) 2508 2510 return ERR_PTR(-ESRCH); 2509 2511 2510 - tp->sighand = lock_task_sighand(tp->task, &tp->flags); 2511 - if (!tp->sighand) 2512 - return ERR_PTR(-ESRCH); 2513 - 2514 - return seq_hlist_start(&tp->task->signal->posix_timers, *pos); 2512 + rcu_read_lock(); 2513 + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); 2515 2514 } 2516 2515 2517 2516 static void *timers_next(struct seq_file *m, void *v, loff_t *pos) 2518 2517 { 2519 2518 struct timers_private *tp = m->private; 2520 - return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); 2519 + 2520 + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); 2521 2521 } 2522 2522 2523 2523 static void timers_stop(struct seq_file *m, void *v) 2524 2524 { 2525 2525 struct timers_private *tp = m->private; 2526 2526 2527 - if (tp->sighand) { 2528 - unlock_task_sighand(tp->task, &tp->flags); 2529 - tp->sighand = NULL; 2530 - } 2531 - 2532 2527 if (tp->task) { 2533 2528 put_task_struct(tp->task); 2534 2529 tp->task = NULL; 2530 + rcu_read_unlock(); 2535 2531 } 2536 2532 } 2537 2533 2538 2534 static int show_timer(struct seq_file *m, void *v) 2539 2535 { 2540 - struct k_itimer *timer; 2541 - struct timers_private *tp = m->private; 2542 - int notify; 2543 2536 static const char * const nstr[] = { 2544 - [SIGEV_SIGNAL] = "signal", 2545 - [SIGEV_NONE] = "none", 2546 - [SIGEV_THREAD] = "thread", 2537 + [SIGEV_SIGNAL] = "signal", 2538 + [SIGEV_NONE] = "none", 2539 + [SIGEV_THREAD] = "thread", 2547 2540 }; 
2548 2541 2549 - timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); 2550 - notify = timer->it_sigev_notify; 2542 + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); 2543 + struct timers_private *tp = m->private; 2544 + int notify = timer->it_sigev_notify; 2545 + 2546 + guard(spinlock_irq)(&timer->it_lock); 2547 + if (!posixtimer_valid(timer)) 2548 + return 0; 2551 2549 2552 2550 seq_printf(m, "ID: %d\n", timer->it_id); 2553 - seq_printf(m, "signal: %d/%px\n", 2554 - timer->sigq.info.si_signo, 2551 + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, 2555 2552 timer->sigq.info.si_value.sival_ptr); 2556 - seq_printf(m, "notify: %s/%s.%d\n", 2557 - nstr[notify & ~SIGEV_THREAD_ID], 2553 + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], 2558 2554 (notify & SIGEV_THREAD_ID) ? "tid" : "pid", 2559 2555 pid_nr_ns(timer->it_pid, tp->ns)); 2560 2556 seq_printf(m, "ClockID: %d\n", timer->it_clock);
+15 -9
include/linux/cleanup.h
··· 308 308 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ 309 309 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond 310 310 311 - #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ 312 - __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ 313 - DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ 311 + #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ 314 312 static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ 315 - { return (void *)(__force unsigned long)*_T; } 313 + { return (void *)(__force unsigned long)*(_exp); } 314 + 315 + #define DEFINE_CLASS_IS_GUARD(_name) \ 316 + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ 317 + __DEFINE_GUARD_LOCK_PTR(_name, _T) 318 + 319 + #define DEFINE_CLASS_IS_COND_GUARD(_name) \ 320 + __DEFINE_CLASS_IS_CONDITIONAL(_name, true); \ 321 + __DEFINE_GUARD_LOCK_PTR(_name, _T) 322 + 323 + #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ 324 + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ 325 + DEFINE_CLASS_IS_GUARD(_name) 316 326 317 327 #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ 318 328 __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ ··· 402 392 if (_T->lock) { _unlock; } \ 403 393 } \ 404 394 \ 405 - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ 406 - { \ 407 - return (void *)(__force unsigned long)_T->lock; \ 408 - } 409 - 395 + __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) 410 396 411 397 #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ 412 398 static inline class_##_name##_t class_##_name##_constructor(_type *l) \
+2 -1
include/linux/hrtimer.h
··· 333 333 static inline void hrtimer_update_function(struct hrtimer *timer, 334 334 enum hrtimer_restart (*function)(struct hrtimer *)) 335 335 { 336 + #ifdef CONFIG_PROVE_LOCKING 336 337 guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); 337 338 338 339 if (WARN_ON_ONCE(hrtimer_is_queued(timer))) ··· 341 340 342 341 if (WARN_ON_ONCE(!function)) 343 342 return; 344 - 343 + #endif 345 344 timer->function = function; 346 345 } 347 346
+21 -9
include/linux/posix-timers.h
··· 114 114 void posixtimer_send_sigqueue(struct k_itimer *tmr); 115 115 bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); 116 116 void posixtimer_free_timer(struct k_itimer *timer); 117 + long posixtimer_create_prctl(unsigned long ctrl); 117 118 118 119 /* Init task static initializer */ 119 120 #define INIT_CPU_TIMERBASE(b) { \ ··· 141 140 static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, 142 141 struct sigqueue *timer_sigq) { return false; } 143 142 static inline void posixtimer_free_timer(struct k_itimer *timer) { } 143 + static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } 144 144 #endif 145 145 146 146 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK ··· 179 177 * @rcu: RCU head for freeing the timer. 180 178 */ 181 179 struct k_itimer { 182 - struct hlist_node list; 183 - struct hlist_node ignored_list; 180 + /* 1st cacheline contains read-mostly fields */ 184 181 struct hlist_node t_hash; 185 - spinlock_t it_lock; 186 - const struct k_clock *kclock; 187 - clockid_t it_clock; 182 + struct hlist_node list; 188 183 timer_t it_id; 184 + clockid_t it_clock; 185 + int it_sigev_notify; 186 + enum pid_type it_pid_type; 187 + struct signal_struct *it_signal; 188 + const struct k_clock *kclock; 189 + 190 + /* 2nd cacheline and above contain fields which are modified regularly */ 191 + spinlock_t it_lock; 189 192 int it_status; 190 193 bool it_sig_periodic; 191 194 s64 it_overrun; 192 195 s64 it_overrun_last; 193 196 unsigned int it_signal_seq; 194 197 unsigned int it_sigqueue_seq; 195 - int it_sigev_notify; 196 - enum pid_type it_pid_type; 197 198 ktime_t it_interval; 198 - struct signal_struct *it_signal; 199 + struct hlist_node ignored_list; 199 200 union { 200 201 struct pid *it_pid; 201 202 struct task_struct *it_process; ··· 215 210 } alarm; 216 211 } it; 217 212 struct rcu_head rcu; 218 - }; 213 + } ____cacheline_aligned_in_smp; 219 214 220 215 void run_posix_cpu_timers(void); 
221 216 void posix_cpu_timers_exit(struct task_struct *task); ··· 244 239 struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); 245 240 246 241 posixtimer_putref(tmr); 242 + } 243 + 244 + static inline bool posixtimer_valid(const struct k_itimer *timer) 245 + { 246 + unsigned long val = (unsigned long)timer->it_signal; 247 + 248 + return !(val & 0x1UL); 247 249 } 248 250 #else /* CONFIG_POSIX_TIMERS */ 249 251 static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
+2 -1
include/linux/sched/signal.h
··· 136 136 #ifdef CONFIG_POSIX_TIMERS 137 137 138 138 /* POSIX.1b Interval Timers */ 139 - unsigned int next_posix_timer_id; 139 + unsigned int timer_create_restore_ids:1; 140 + atomic_t next_posix_timer_id; 140 141 struct hlist_head posix_timers; 141 142 struct hlist_head ignored_posix_timers; 142 143
+11
include/uapi/linux/prctl.h
··· 353 353 */ 354 354 #define PR_LOCK_SHADOW_STACK_STATUS 76 355 355 356 + /* 357 + * Controls the mode of timer_create() for CRIU restore operations. 358 + * Enabling this allows CRIU to restore timers with explicit IDs. 359 + * 360 + * Don't use for normal operations as the result might be undefined. 361 + */ 362 + #define PR_TIMER_CREATE_RESTORE_IDS 77 363 + # define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 364 + # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 365 + # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 366 + 356 367 #endif /* _LINUX_PRCTL_H */
+1 -1
kernel/signal.c
··· 2092 2092 * from a non-periodic timer, then just drop the reference 2093 2093 * count. Otherwise queue it on the ignored list. 2094 2094 */ 2095 - if (tmr->it_signal && tmr->it_sig_periodic) 2095 + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) 2096 2096 hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); 2097 2097 else 2098 2098 posixtimer_putref(tmr);
+5
kernel/sys.c
··· 2815 2815 return -EINVAL; 2816 2816 error = arch_lock_shadow_stack_status(me, arg2); 2817 2817 break; 2818 + case PR_TIMER_CREATE_RESTORE_IDS: 2819 + if (arg3 || arg4 || arg5) 2820 + return -EINVAL; 2821 + error = posixtimer_create_prctl(arg2); 2822 + break; 2818 2823 default: 2819 2824 trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); 2820 2825 error = -EINVAL;
+1 -1
kernel/time/clocksource.c
··· 1510 1510 { 1511 1511 mutex_lock(&clocksource_mutex); 1512 1512 if (str) 1513 - strscpy(override_name, str, sizeof(override_name)); 1513 + strscpy(override_name, str); 1514 1514 mutex_unlock(&clocksource_mutex); 1515 1515 return 1; 1516 1516 }
+12 -17
kernel/time/hrtimer.c
··· 117 117 .csd = CSD_INIT(retrigger_next_event, NULL) 118 118 }; 119 119 120 - static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { 121 - /* Make sure we catch unsupported clockids */ 122 - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, 123 - 124 - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 125 - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 126 - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 127 - [CLOCK_TAI] = HRTIMER_BASE_TAI, 128 - }; 129 - 130 120 static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) 131 121 { 132 122 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) ··· 1577 1587 1578 1588 static inline int hrtimer_clockid_to_base(clockid_t clock_id) 1579 1589 { 1580 - if (likely(clock_id < MAX_CLOCKS)) { 1581 - int base = hrtimer_clock_to_base_table[clock_id]; 1582 - 1583 - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) 1584 - return base; 1590 + switch (clock_id) { 1591 + case CLOCK_REALTIME: 1592 + return HRTIMER_BASE_REALTIME; 1593 + case CLOCK_MONOTONIC: 1594 + return HRTIMER_BASE_MONOTONIC; 1595 + case CLOCK_BOOTTIME: 1596 + return HRTIMER_BASE_BOOTTIME; 1597 + case CLOCK_TAI: 1598 + return HRTIMER_BASE_TAI; 1599 + default: 1600 + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); 1601 + return HRTIMER_BASE_MONOTONIC; 1585 1602 } 1586 - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); 1587 - return HRTIMER_BASE_MONOTONIC; 1588 1603 } 1589 1604 1590 1605 static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
+1 -23
kernel/time/posix-clock.c
··· 90 90 return err; 91 91 } 92 92 93 - #ifdef CONFIG_COMPAT 94 - static long posix_clock_compat_ioctl(struct file *fp, 95 - unsigned int cmd, unsigned long arg) 96 - { 97 - struct posix_clock_context *pccontext = fp->private_data; 98 - struct posix_clock *clk = get_posix_clock(fp); 99 - int err = -ENOTTY; 100 - 101 - if (!clk) 102 - return -ENODEV; 103 - 104 - if (clk->ops.ioctl) 105 - err = clk->ops.ioctl(pccontext, cmd, arg); 106 - 107 - put_posix_clock(clk); 108 - 109 - return err; 110 - } 111 - #endif 112 - 113 93 static int posix_clock_open(struct inode *inode, struct file *fp) 114 94 { 115 95 int err; ··· 151 171 .read = posix_clock_read, 152 172 .poll = posix_clock_poll, 153 173 .unlocked_ioctl = posix_clock_ioctl, 174 + .compat_ioctl = posix_clock_ioctl, 154 175 .open = posix_clock_open, 155 176 .release = posix_clock_release, 156 - #ifdef CONFIG_COMPAT 157 - .compat_ioctl = posix_clock_compat_ioctl, 158 - #endif 159 177 }; 160 178 161 179 int posix_clock_register(struct posix_clock *clk, struct device *dev)
+293 -264
kernel/time/posix-timers.c
··· 9 9 * 10 10 * These are all the functions necessary to implement POSIX clocks & timers 11 11 */ 12 - #include <linux/mm.h> 13 - #include <linux/interrupt.h> 14 - #include <linux/slab.h> 15 - #include <linux/time.h> 16 - #include <linux/mutex.h> 17 - #include <linux/sched/task.h> 18 - 19 - #include <linux/uaccess.h> 20 - #include <linux/list.h> 21 - #include <linux/init.h> 12 + #include <linux/compat.h> 22 13 #include <linux/compiler.h> 23 - #include <linux/hash.h> 14 + #include <linux/init.h> 15 + #include <linux/jhash.h> 16 + #include <linux/interrupt.h> 17 + #include <linux/list.h> 18 + #include <linux/memblock.h> 19 + #include <linux/nospec.h> 24 20 #include <linux/posix-clock.h> 25 21 #include <linux/posix-timers.h> 22 + #include <linux/prctl.h> 23 + #include <linux/sched/task.h> 24 + #include <linux/slab.h> 26 25 #include <linux/syscalls.h> 27 - #include <linux/wait.h> 28 - #include <linux/workqueue.h> 29 - #include <linux/export.h> 30 - #include <linux/hashtable.h> 31 - #include <linux/compat.h> 32 - #include <linux/nospec.h> 26 + #include <linux/time.h> 33 27 #include <linux/time_namespace.h> 28 + #include <linux/uaccess.h> 34 29 35 30 #include "timekeeping.h" 36 31 #include "posix-timers.h" ··· 41 46 * This allows checkpoint/restore to reconstruct the exact timer IDs for 42 47 * a process. 
43 48 */ 44 - static DEFINE_HASHTABLE(posix_timers_hashtable, 9); 45 - static DEFINE_SPINLOCK(hash_lock); 49 + struct timer_hash_bucket { 50 + spinlock_t lock; 51 + struct hlist_head head; 52 + }; 53 + 54 + static struct { 55 + struct timer_hash_bucket *buckets; 56 + unsigned long mask; 57 + } __timer_data __ro_after_init __aligned(2*sizeof(long)); 58 + 59 + #define timer_buckets (__timer_data.buckets) 60 + #define timer_hashmask (__timer_data.mask) 46 61 47 62 static const struct k_clock * const posix_clocks[]; 48 63 static const struct k_clock *clockid_to_kclock(const clockid_t id); 49 64 static const struct k_clock clock_realtime, clock_monotonic; 65 + 66 + #define TIMER_ANY_ID INT_MIN 50 67 51 68 /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ 52 69 #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ··· 66 59 #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 67 60 #endif 68 61 69 - static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); 62 + static struct k_itimer *__lock_timer(timer_t timer_id); 70 63 71 - #define lock_timer(tid, flags) \ 72 - ({ struct k_itimer *__timr; \ 73 - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ 74 - __timr; \ 64 + #define lock_timer(tid) \ 65 + ({ struct k_itimer *__timr; \ 66 + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ 67 + __timr; \ 75 68 }) 76 69 77 - static int hash(struct signal_struct *sig, unsigned int nr) 70 + static inline void unlock_timer(struct k_itimer *timr) 78 71 { 79 - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); 72 + if (likely((timr))) 73 + spin_unlock_irq(&timr->it_lock); 80 74 } 81 75 82 - static struct k_itimer *__posix_timers_find(struct hlist_head *head, 83 - struct signal_struct *sig, 84 - timer_t id) 76 + #define scoped_timer_get_or_fail(_id) \ 77 + scoped_cond_guard(lock_timer, return -EINVAL, _id) 78 + 79 + #define scoped_timer (scope) 80 + 81 + DEFINE_CLASS(lock_timer, struct 
k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); 82 + DEFINE_CLASS_IS_COND_GUARD(lock_timer); 83 + 84 + static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) 85 85 { 86 + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; 87 + } 88 + 89 + static struct k_itimer *posix_timer_by_id(timer_t id) 90 + { 91 + struct signal_struct *sig = current->signal; 92 + struct timer_hash_bucket *bucket = hash_bucket(sig, id); 86 93 struct k_itimer *timer; 87 94 88 - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { 95 + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { 89 96 /* timer->it_signal can be set concurrently */ 90 97 if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) 91 98 return timer; ··· 107 86 return NULL; 108 87 } 109 88 110 - static struct k_itimer *posix_timer_by_id(timer_t id) 89 + static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) 111 90 { 112 - struct signal_struct *sig = current->signal; 113 - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; 114 - 115 - return __posix_timers_find(head, sig, id); 116 - } 117 - 118 - static int posix_timer_add(struct k_itimer *timer) 119 - { 120 - struct signal_struct *sig = current->signal; 121 - struct hlist_head *head; 122 - unsigned int cnt, id; 91 + unsigned long val = (unsigned long)timer->it_signal; 123 92 124 93 /* 125 - * FIXME: Replace this by a per signal struct xarray once there is 126 - * a plan to handle the resulting CRIU regression gracefully. 94 + * Mask out bit 0, which acts as invalid marker to prevent 95 + * posix_timer_by_id() detecting it as valid. 127 96 */ 128 - for (cnt = 0; cnt <= INT_MAX; cnt++) { 129 - spin_lock(&hash_lock); 130 - id = sig->next_posix_timer_id; 97 + return (struct signal_struct *)(val & ~1UL); 98 + } 131 99 132 - /* Write the next ID back. 
Clamp it to the positive space */ 133 - sig->next_posix_timer_id = (id + 1) & INT_MAX; 100 + static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, 101 + timer_t id) 102 + { 103 + struct hlist_head *head = &bucket->head; 104 + struct k_itimer *timer; 134 105 135 - head = &posix_timers_hashtable[hash(sig, id)]; 136 - if (!__posix_timers_find(head, sig, id)) { 137 - hlist_add_head_rcu(&timer->t_hash, head); 138 - spin_unlock(&hash_lock); 139 - return id; 106 + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { 107 + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) 108 + return true; 109 + } 110 + return false; 111 + } 112 + 113 + static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) 114 + { 115 + struct timer_hash_bucket *bucket = hash_bucket(sig, id); 116 + 117 + scoped_guard (spinlock, &bucket->lock) { 118 + /* 119 + * Validate under the lock as this could have raced against 120 + * another thread ending up with the same ID, which is 121 + * highly unlikely, but possible. 122 + */ 123 + if (!posix_timer_hashed(bucket, sig, id)) { 124 + /* 125 + * Set the timer ID and the signal pointer to make 126 + * it identifiable in the hash table. The signal 127 + * pointer has bit 0 set to indicate that it is not 128 + * yet fully initialized. posix_timer_hashed() 129 + * masks this bit out, but the syscall lookup fails 130 + * to match due to it being set. This guarantees 131 + * that there can't be duplicate timer IDs handed 132 + * out. 
133 + */ 134 + timer->it_id = (timer_t)id; 135 + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); 136 + hlist_add_head_rcu(&timer->t_hash, &bucket->head); 137 + return true; 140 138 } 141 - spin_unlock(&hash_lock); 139 + } 140 + return false; 141 + } 142 + 143 + static int posix_timer_add(struct k_itimer *timer, int req_id) 144 + { 145 + struct signal_struct *sig = current->signal; 146 + 147 + if (unlikely(req_id != TIMER_ANY_ID)) { 148 + if (!posix_timer_add_at(timer, sig, req_id)) 149 + return -EBUSY; 150 + 151 + /* 152 + * Move the ID counter past the requested ID, so that after 153 + * switching back to normal mode the IDs are outside of the 154 + * exact allocated region. That avoids ID collisions on the 155 + * next regular timer_create() invocations. 156 + */ 157 + atomic_set(&sig->next_posix_timer_id, req_id + 1); 158 + return req_id; 159 + } 160 + 161 + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { 162 + /* Get the next timer ID and clamp it to positive space */ 163 + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; 164 + 165 + if (posix_timer_add_at(timer, sig, id)) 166 + return id; 167 + cond_resched(); 142 168 } 143 169 /* POSIX return code when no timer ID could be allocated */ 144 170 return -EAGAIN; 145 - } 146 - 147 - static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 148 - { 149 - spin_unlock_irqrestore(&timr->it_lock, flags); 150 171 } 151 172 152 173 static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) ··· 285 222 286 223 static __init int init_posix_timers(void) 287 224 { 288 - posix_timers_cache = kmem_cache_create("posix_timers_cache", 289 - sizeof(struct k_itimer), 0, 290 - SLAB_PANIC | SLAB_ACCOUNT, NULL); 225 + posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 226 + __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); 291 227 return 0; 292 228 } 293 229 __initcall(init_posix_timers); ··· 321 259 * 
since the signal was queued. In either case, don't rearm and 322 260 * drop the signal. 323 261 */ 324 - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) 262 + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) 325 263 return false; 326 264 327 265 if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) ··· 366 304 { 367 305 lockdep_assert_held(&timr->it_lock); 368 306 307 + if (!posixtimer_valid(timr)) 308 + return; 309 + 369 310 timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; 370 311 posixtimer_send_sigqueue(timr); 371 312 } ··· 387 322 guard(spinlock_irqsave)(&timr->it_lock); 388 323 posix_timer_queue_signal(timr); 389 324 return HRTIMER_NORESTART; 325 + } 326 + 327 + long posixtimer_create_prctl(unsigned long ctrl) 328 + { 329 + switch (ctrl) { 330 + case PR_TIMER_CREATE_RESTORE_IDS_OFF: 331 + current->signal->timer_create_restore_ids = 0; 332 + return 0; 333 + case PR_TIMER_CREATE_RESTORE_IDS_ON: 334 + current->signal->timer_create_restore_ids = 1; 335 + return 0; 336 + case PR_TIMER_CREATE_RESTORE_IDS_GET: 337 + return current->signal->timer_create_restore_ids; 338 + } 339 + return -EINVAL; 390 340 } 391 341 392 342 static struct pid *good_sigevent(sigevent_t * event) ··· 430 350 431 351 static struct k_itimer *alloc_posix_timer(void) 432 352 { 433 - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); 353 + struct k_itimer *tmr; 434 354 355 + if (unlikely(!posix_timers_cache)) 356 + return NULL; 357 + 358 + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); 435 359 if (!tmr) 436 360 return tmr; 437 361 ··· 457 373 458 374 static void posix_timer_unhash_and_free(struct k_itimer *tmr) 459 375 { 460 - spin_lock(&hash_lock); 461 - hlist_del_rcu(&tmr->t_hash); 462 - spin_unlock(&hash_lock); 376 + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); 377 + 378 + 
scoped_guard (spinlock, &bucket->lock) 379 + hlist_del_rcu(&tmr->t_hash); 463 380 posixtimer_putref(tmr); 464 381 } 465 382 ··· 475 390 timer_t __user *created_timer_id) 476 391 { 477 392 const struct k_clock *kc = clockid_to_kclock(which_clock); 393 + timer_t req_id = TIMER_ANY_ID; 478 394 struct k_itimer *new_timer; 479 395 int error, new_timer_id; 480 396 ··· 490 404 491 405 spin_lock_init(&new_timer->it_lock); 492 406 407 + /* Special case for CRIU to restore timers with a given timer ID. */ 408 + if (unlikely(current->signal->timer_create_restore_ids)) { 409 + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) 410 + return -EFAULT; 411 + /* Valid IDs are 0..INT_MAX */ 412 + if ((unsigned int)req_id > INT_MAX) 413 + return -EINVAL; 414 + } 415 + 493 416 /* 494 417 * Add the timer to the hash table. The timer is not yet valid 495 - * because new_timer::it_signal is still NULL. The timer id is also 496 - * not yet visible to user space. 418 + * after insertion, but has a unique ID allocated. 
497 419 */ 498 - new_timer_id = posix_timer_add(new_timer); 420 + new_timer_id = posix_timer_add(new_timer, req_id); 499 421 if (new_timer_id < 0) { 500 422 posixtimer_free_timer(new_timer); 501 423 return new_timer_id; 502 424 } 503 425 504 - new_timer->it_id = (timer_t) new_timer_id; 505 426 new_timer->it_clock = which_clock; 506 427 new_timer->kclock = kc; 507 428 new_timer->it_overrun = -1LL; 508 429 509 430 if (event) { 510 - rcu_read_lock(); 511 - new_timer->it_pid = get_pid(good_sigevent(event)); 512 - rcu_read_unlock(); 431 + scoped_guard (rcu) 432 + new_timer->it_pid = get_pid(good_sigevent(event)); 513 433 if (!new_timer->it_pid) { 514 434 error = -EINVAL; 515 435 goto out; ··· 526 434 } else { 527 435 new_timer->it_sigev_notify = SIGEV_SIGNAL; 528 436 new_timer->sigq.info.si_signo = SIGALRM; 529 - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); 530 437 new_timer->sigq.info.si_value.sival_int = new_timer->it_id; 531 438 new_timer->it_pid = get_pid(task_tgid(current)); 532 439 } ··· 544 453 } 545 454 /* 546 455 * After succesful copy out, the timer ID is visible to user space 547 - * now but not yet valid because new_timer::signal is still NULL. 456 + * now but not yet valid because new_timer::signal low order bit is 1. 548 457 * 549 458 * Complete the initialization with the clock specific create 550 459 * callback. ··· 553 462 if (error) 554 463 goto out; 555 464 556 - spin_lock_irq(&current->sighand->siglock); 557 - /* This makes the timer valid in the hash table */ 558 - WRITE_ONCE(new_timer->it_signal, current->signal); 559 - hlist_add_head(&new_timer->list, &current->signal->posix_timers); 560 - spin_unlock_irq(&current->sighand->siglock); 561 465 /* 562 - * After unlocking sighand::siglock @new_timer is subject to 563 - * concurrent removal and cannot be touched anymore 466 + * timer::it_lock ensures that __lock_timer() observes a fully 467 + * initialized timer when it observes a valid timer::it_signal. 
468 + * 469 + * sighand::siglock is required to protect signal::posix_timers. 470 + */ 471 + scoped_guard (spinlock_irq, &new_timer->it_lock) { 472 + guard(spinlock)(&current->sighand->siglock); 473 + /* 474 + * new_timer::it_signal contains the signal pointer with 475 + * bit 0 set, which makes it invalid for syscall operations. 476 + * Store the unmodified signal pointer to make it valid. 477 + */ 478 + WRITE_ONCE(new_timer->it_signal, current->signal); 479 + hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers); 480 + } 481 + /* 482 + * After unlocking @new_timer is subject to concurrent removal and 483 + * cannot be touched anymore 564 484 */ 565 485 return 0; 566 486 out: ··· 609 507 } 610 508 #endif 611 509 612 - static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) 510 + static struct k_itimer *__lock_timer(timer_t timer_id) 613 511 { 614 512 struct k_itimer *timr; 615 513 ··· 624 522 * The hash lookup and the timers are RCU protected. 625 523 * 626 524 * Timers are added to the hash in invalid state where 627 - * timr::it_signal == NULL. timer::it_signal is only set after the 628 - * rest of the initialization succeeded. 525 + * timr::it_signal is marked invalid. timer::it_signal is only set 526 + * after the rest of the initialization succeeded. 629 527 * 630 528 * Timer destruction happens in steps: 631 - * 1) Set timr::it_signal to NULL with timr::it_lock held 529 + * 1) Set timr::it_signal marked invalid with timr::it_lock held 632 530 * 2) Release timr::it_lock 633 531 * 3) Remove from the hash under hash_lock 634 532 * 4) Put the reference count. ··· 645 543 * 646 544 * The lookup validates locklessly that timr::it_signal == 647 545 * current::it_signal and timr::it_id == @timer_id. timr::it_id 648 - * can't change, but timr::it_signal becomes NULL during 649 - * destruction. 546 + * can't change, but timr::it_signal can become invalid during 547 + * destruction, which makes the locked check fail. 
650 548 */ 651 - rcu_read_lock(); 549 + guard(rcu)(); 652 550 timr = posix_timer_by_id(timer_id); 653 551 if (timr) { 654 - spin_lock_irqsave(&timr->it_lock, *flags); 552 + spin_lock_irq(&timr->it_lock); 655 553 /* 656 554 * Validate under timr::it_lock that timr::it_signal is 657 555 * still valid. Pairs with #1 above. 658 556 */ 659 - if (timr->it_signal == current->signal) { 660 - rcu_read_unlock(); 557 + if (timr->it_signal == current->signal) 661 558 return timr; 662 - } 663 - spin_unlock_irqrestore(&timr->it_lock, *flags); 559 + spin_unlock_irq(&timr->it_lock); 664 560 } 665 - rcu_read_unlock(); 666 - 667 561 return NULL; 668 562 } 669 563 ··· 750 652 751 653 static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) 752 654 { 753 - const struct k_clock *kc; 754 - struct k_itimer *timr; 755 - unsigned long flags; 756 - int ret = 0; 757 - 758 - timr = lock_timer(timer_id, &flags); 759 - if (!timr) 760 - return -EINVAL; 761 - 762 655 memset(setting, 0, sizeof(*setting)); 763 - kc = timr->kclock; 764 - if (WARN_ON_ONCE(!kc || !kc->timer_get)) 765 - ret = -EINVAL; 766 - else 767 - kc->timer_get(timr, setting); 768 - 769 - unlock_timer(timr, flags); 770 - return ret; 656 + scoped_timer_get_or_fail(timer_id) 657 + scoped_timer->kclock->timer_get(scoped_timer, setting); 658 + return 0; 771 659 } 772 660 773 661 /* Get the time remaining on a POSIX.1b interval timer. 
*/ ··· 807 723 */ 808 724 SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) 809 725 { 810 - struct k_itimer *timr; 811 - unsigned long flags; 812 - int overrun; 813 - 814 - timr = lock_timer(timer_id, &flags); 815 - if (!timr) 816 - return -EINVAL; 817 - 818 - overrun = timer_overrun_to_int(timr); 819 - unlock_timer(timr, flags); 820 - 821 - return overrun; 726 + scoped_timer_get_or_fail(timer_id) 727 + return timer_overrun_to_int(scoped_timer); 822 728 } 823 729 824 730 static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, ··· 865 791 * when the task which tries to delete or disarm the timer has preempted 866 792 * the task which runs the expiry in task work context. 867 793 */ 868 - static struct k_itimer *timer_wait_running(struct k_itimer *timer, 869 - unsigned long *flags) 794 + static void timer_wait_running(struct k_itimer *timer) 870 795 { 871 - const struct k_clock *kc = READ_ONCE(timer->kclock); 872 - timer_t timer_id = READ_ONCE(timer->it_id); 873 - 874 - /* Prevent kfree(timer) after dropping the lock */ 875 - rcu_read_lock(); 876 - unlock_timer(timer, *flags); 877 - 878 796 /* 879 797 * kc->timer_wait_running() might drop RCU lock. So @timer 880 798 * cannot be touched anymore after the function returns! 881 799 */ 882 - if (!WARN_ON_ONCE(!kc->timer_wait_running)) 883 - kc->timer_wait_running(timer); 884 - 885 - rcu_read_unlock(); 886 - /* Relock the timer. It might be not longer hashed. 
*/ 887 - return lock_timer(timer_id, flags); 800 + timer->kclock->timer_wait_running(timer); 888 801 } 889 802 890 803 /* ··· 926 865 return 0; 927 866 } 928 867 929 - static int do_timer_settime(timer_t timer_id, int tmr_flags, 930 - struct itimerspec64 *new_spec64, 868 + static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, 931 869 struct itimerspec64 *old_spec64) 932 870 { 933 - const struct k_clock *kc; 934 - struct k_itimer *timr; 935 - unsigned long flags; 936 - int error; 937 - 938 871 if (!timespec64_valid(&new_spec64->it_interval) || 939 872 !timespec64_valid(&new_spec64->it_value)) 940 873 return -EINVAL; ··· 936 881 if (old_spec64) 937 882 memset(old_spec64, 0, sizeof(*old_spec64)); 938 883 939 - timr = lock_timer(timer_id, &flags); 940 - retry: 941 - if (!timr) 942 - return -EINVAL; 884 + for (; ; old_spec64 = NULL) { 885 + struct k_itimer *timr; 943 886 944 - if (old_spec64) 945 - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); 887 + scoped_timer_get_or_fail(timer_id) { 888 + timr = scoped_timer; 946 889 947 - /* Prevent signal delivery and rearming. */ 948 - timr->it_signal_seq++; 890 + if (old_spec64) 891 + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); 949 892 950 - kc = timr->kclock; 951 - if (WARN_ON_ONCE(!kc || !kc->timer_set)) 952 - error = -EINVAL; 953 - else 954 - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); 893 + /* Prevent signal delivery and rearming. */ 894 + timr->it_signal_seq++; 955 895 956 - if (error == TIMER_RETRY) { 957 - // We already got the old time... 
958 - old_spec64 = NULL; 959 - /* Unlocks and relocks the timer if it still exists */ 960 - timr = timer_wait_running(timr, &flags); 961 - goto retry; 896 + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); 897 + if (ret != TIMER_RETRY) 898 + return ret; 899 + 900 + /* Protect the timer from being freed when leaving the lock scope */ 901 + rcu_read_lock(); 902 + } 903 + timer_wait_running(timr); 904 + rcu_read_unlock(); 962 905 } 963 - unlock_timer(timr, flags); 964 - 965 - return error; 966 906 } 967 907 968 908 /* Set a POSIX.1b interval timer */ ··· 1028 978 } 1029 979 } 1030 980 1031 - static inline int timer_delete_hook(struct k_itimer *timer) 981 + static void posix_timer_delete(struct k_itimer *timer) 1032 982 { 1033 - const struct k_clock *kc = timer->kclock; 1034 - 1035 - /* Prevent signal delivery and rearming. */ 983 + /* 984 + * Invalidate the timer, remove it from the linked list and remove 985 + * it from the ignored list if pending. 986 + * 987 + * The invalidation must be written with siglock held so that the 988 + * signal code observes the invalidated timer::it_signal in 989 + * do_sigaction(), which prevents it from moving a pending signal 990 + * of a deleted timer to the ignore list. 991 + * 992 + * The invalidation also prevents signal queueing, signal delivery 993 + * and therefore rearming from the signal delivery path. 994 + * 995 + * A concurrent lookup can still find the timer in the hash, but it 996 + * will check timer::it_signal with timer::it_lock held and observe 997 + * bit 0 set, which invalidates it. That also prevents the timer ID 998 + * from being handed out before this timer is completely gone. 
999 + */ 1036 1000 timer->it_signal_seq++; 1037 1001 1038 - if (WARN_ON_ONCE(!kc || !kc->timer_del)) 1039 - return -EINVAL; 1040 - return kc->timer_del(timer); 1002 + scoped_guard (spinlock, &current->sighand->siglock) { 1003 + unsigned long sig = (unsigned long)timer->it_signal | 1UL; 1004 + 1005 + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); 1006 + hlist_del_rcu(&timer->list); 1007 + posix_timer_cleanup_ignored(timer); 1008 + } 1009 + 1010 + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { 1011 + guard(rcu)(); 1012 + spin_unlock_irq(&timer->it_lock); 1013 + timer_wait_running(timer); 1014 + spin_lock_irq(&timer->it_lock); 1015 + } 1041 1016 } 1042 1017 1043 1018 /* Delete a POSIX.1b interval timer. */ 1044 1019 SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) 1045 1020 { 1046 1021 struct k_itimer *timer; 1047 - unsigned long flags; 1048 1022 1049 - timer = lock_timer(timer_id, &flags); 1050 - 1051 - retry_delete: 1052 - if (!timer) 1053 - return -EINVAL; 1054 - 1055 - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { 1056 - /* Unlocks and relocks the timer if it still exists */ 1057 - timer = timer_wait_running(timer, &flags); 1058 - goto retry_delete; 1023 + scoped_timer_get_or_fail(timer_id) { 1024 + timer = scoped_timer; 1025 + posix_timer_delete(timer); 1059 1026 } 1060 - 1061 - spin_lock(&current->sighand->siglock); 1062 - hlist_del(&timer->list); 1063 - posix_timer_cleanup_ignored(timer); 1064 - /* 1065 - * A concurrent lookup could check timer::it_signal lockless. It 1066 - * will reevaluate with timer::it_lock held and observe the NULL. 1067 - * 1068 - * It must be written with siglock held so that the signal code 1069 - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), 1070 - * which prevents it from moving a pending signal of a deleted 1071 - * timer to the ignore list. 
1072 - */ 1073 - WRITE_ONCE(timer->it_signal, NULL); 1074 - spin_unlock(&current->sighand->siglock); 1075 - 1076 - unlock_timer(timer, flags); 1027 + /* Remove it from the hash, which frees up the timer ID */ 1077 1028 posix_timer_unhash_and_free(timer); 1078 1029 return 0; 1079 - } 1080 - 1081 - /* 1082 - * Delete a timer if it is armed, remove it from the hash and schedule it 1083 - * for RCU freeing. 1084 - */ 1085 - static void itimer_delete(struct k_itimer *timer) 1086 - { 1087 - unsigned long flags; 1088 - 1089 - /* 1090 - * irqsave is required to make timer_wait_running() work. 1091 - */ 1092 - spin_lock_irqsave(&timer->it_lock, flags); 1093 - 1094 - retry_delete: 1095 - /* 1096 - * Even if the timer is not longer accessible from other tasks 1097 - * it still might be armed and queued in the underlying timer 1098 - * mechanism. Worse, that timer mechanism might run the expiry 1099 - * function concurrently. 1100 - */ 1101 - if (timer_delete_hook(timer) == TIMER_RETRY) { 1102 - /* 1103 - * Timer is expired concurrently, prevent livelocks 1104 - * and pointless spinning on RT. 1105 - * 1106 - * timer_wait_running() drops timer::it_lock, which opens 1107 - * the possibility for another task to delete the timer. 1108 - * 1109 - * That's not possible here because this is invoked from 1110 - * do_exit() only for the last thread of the thread group. 1111 - * So no other task can access and delete that timer. 1112 - */ 1113 - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) 1114 - return; 1115 - 1116 - goto retry_delete; 1117 - } 1118 - hlist_del(&timer->list); 1119 - 1120 - posix_timer_cleanup_ignored(timer); 1121 - 1122 - /* 1123 - * Setting timer::it_signal to NULL is technically not required 1124 - * here as nothing can access the timer anymore legitimately via 1125 - * the hash table. Set it to NULL nevertheless so that all deletion 1126 - * paths are consistent. 
1127 - */ 1128 - WRITE_ONCE(timer->it_signal, NULL); 1129 - 1130 - spin_unlock_irqrestore(&timer->it_lock, flags); 1131 - posix_timer_unhash_and_free(timer); 1132 1030 } 1133 1031 1134 1032 /* ··· 1087 1089 void exit_itimers(struct task_struct *tsk) 1088 1090 { 1089 1091 struct hlist_head timers; 1092 + struct hlist_node *next; 1093 + struct k_itimer *timer; 1094 + 1095 + /* Clear restore mode for exec() */ 1096 + tsk->signal->timer_create_restore_ids = 0; 1090 1097 1091 1098 if (hlist_empty(&tsk->signal->posix_timers)) 1092 1099 return; 1093 1100 1094 1101 /* Protect against concurrent read via /proc/$PID/timers */ 1095 - spin_lock_irq(&tsk->sighand->siglock); 1096 - hlist_move_list(&tsk->signal->posix_timers, &timers); 1097 - spin_unlock_irq(&tsk->sighand->siglock); 1102 + scoped_guard (spinlock_irq, &tsk->sighand->siglock) 1103 + hlist_move_list(&tsk->signal->posix_timers, &timers); 1098 1104 1099 1105 /* The timers are not longer accessible via tsk::signal */ 1100 - while (!hlist_empty(&timers)) 1101 - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); 1106 + hlist_for_each_entry_safe(timer, next, &timers, list) { 1107 + scoped_guard (spinlock_irq, &timer->it_lock) 1108 + posix_timer_delete(timer); 1109 + posix_timer_unhash_and_free(timer); 1110 + cond_resched(); 1111 + } 1102 1112 1103 1113 /* 1104 1114 * There should be no timers on the ignored list. 
itimer_delete() has ··· 1551 1545 1552 1546 return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; 1553 1547 } 1548 + 1549 + static int __init posixtimer_init(void) 1550 + { 1551 + unsigned long i, size; 1552 + unsigned int shift; 1553 + 1554 + if (IS_ENABLED(CONFIG_BASE_SMALL)) 1555 + size = 512; 1556 + else 1557 + size = roundup_pow_of_two(512 * num_possible_cpus()); 1558 + 1559 + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), 1560 + size, 0, 0, &shift, NULL, size, size); 1561 + size = 1UL << shift; 1562 + timer_hashmask = size - 1; 1563 + 1564 + for (i = 0; i < size; i++) { 1565 + spin_lock_init(&timer_buckets[i].lock); 1566 + INIT_HLIST_HEAD(&timer_buckets[i].head); 1567 + } 1568 + return 0; 1569 + } 1570 + core_initcall(posixtimer_init);
+69 -25
kernel/time/timekeeping.c
··· 682 682 } 683 683 684 684 /** 685 - * timekeeping_forward_now - update clock to the current time 685 + * timekeeping_forward - update clock to given cycle now value 686 686 * @tk: Pointer to the timekeeper to update 687 + * @cycle_now: Current clocksource read value 687 688 * 688 689 * Forward the current clock to update its state since the last call to 689 690 * update_wall_time(). This is useful before significant clock changes, 690 691 * as it avoids having to deal with this time offset explicitly. 691 692 */ 692 - static void timekeeping_forward_now(struct timekeeper *tk) 693 + static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) 693 694 { 694 - u64 cycle_now, delta; 695 + u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, 696 + tk->tkr_mono.clock->max_raw_delta); 695 697 696 - cycle_now = tk_clock_read(&tk->tkr_mono); 697 - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, 698 - tk->tkr_mono.clock->max_raw_delta); 699 698 tk->tkr_mono.cycle_last = cycle_now; 700 699 tk->tkr_raw.cycle_last = cycle_now; 701 700 ··· 707 708 tk_normalize_xtime(tk); 708 709 delta -= incr; 709 710 } 711 + } 712 + 713 + /** 714 + * timekeeping_forward_now - update clock to the current time 715 + * @tk: Pointer to the timekeeper to update 716 + * 717 + * Forward the current clock to update its state since the last call to 718 + * update_wall_time(). This is useful before significant clock changes, 719 + * as it avoids having to deal with this time offset explicitly. 
720 + */ 721 + static void timekeeping_forward_now(struct timekeeper *tk) 722 + { 723 + u64 cycle_now = tk_clock_read(&tk->tkr_mono); 724 + 725 + timekeeping_forward(tk, cycle_now); 710 726 } 711 727 712 728 /** ··· 2165 2151 return offset; 2166 2152 } 2167 2153 2154 + static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset, 2155 + enum timekeeping_adv_mode mode, 2156 + unsigned int *clock_set) 2157 + { 2158 + int shift = 0, maxshift; 2159 + 2160 + /* 2161 + * TK_ADV_FREQ indicates that adjtimex(2) directly set the 2162 + * frequency or the tick length. 2163 + * 2164 + * Accumulate the offset, so that the new multiplier starts from 2165 + * now. This is required as otherwise for offsets, which are 2166 + * smaller than tk::cycle_interval, timekeeping_adjust() could set 2167 + * xtime_nsec backwards, which subsequently causes time going 2168 + * backwards in the coarse time getters. But even for the case 2169 + * where offset is greater than tk::cycle_interval the periodic 2170 + * accumulation does not have much value. 2171 + * 2172 + * Also reset tk::ntp_error as it does not make sense to keep the 2173 + * old accumulated error around in this case. 2174 + */ 2175 + if (mode == TK_ADV_FREQ) { 2176 + timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset); 2177 + tk->ntp_error = 0; 2178 + return 0; 2179 + } 2180 + 2181 + /* 2182 + * With NO_HZ we may have to accumulate many cycle_intervals 2183 + * (think "ticks") worth of time at once. To do this efficiently, 2184 + * we calculate the largest doubling multiple of cycle_intervals 2185 + * that is smaller than the offset. We then accumulate that 2186 + * chunk in one go, and then try to consume the next smaller 2187 + * doubled multiple. 
2188 + */ 2189 + shift = ilog2(offset) - ilog2(tk->cycle_interval); 2190 + shift = max(0, shift); 2191 + /* Bound shift to one less than what overflows tick_length */ 2192 + maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1; 2193 + shift = min(shift, maxshift); 2194 + while (offset >= tk->cycle_interval) { 2195 + offset = logarithmic_accumulation(tk, offset, shift, clock_set); 2196 + if (offset < tk->cycle_interval << shift) 2197 + shift--; 2198 + } 2199 + return offset; 2200 + } 2201 + 2168 2202 /* 2169 2203 * timekeeping_advance - Updates the timekeeper to the current time and 2170 2204 * current NTP tick length ··· 2222 2160 struct timekeeper *tk = &tk_core.shadow_timekeeper; 2223 2161 struct timekeeper *real_tk = &tk_core.timekeeper; 2224 2162 unsigned int clock_set = 0; 2225 - int shift = 0, maxshift; 2226 2163 u64 offset; 2227 2164 2228 2165 guard(raw_spinlock_irqsave)(&tk_core.lock); ··· 2238 2177 if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) 2239 2178 return false; 2240 2179 2241 - /* 2242 - * With NO_HZ we may have to accumulate many cycle_intervals 2243 - * (think "ticks") worth of time at once. To do this efficiently, 2244 - * we calculate the largest doubling multiple of cycle_intervals 2245 - * that is smaller than the offset. We then accumulate that 2246 - * chunk in one go, and then try to consume the next smaller 2247 - * doubled multiple. 
2248 - */ 2249 - shift = ilog2(offset) - ilog2(tk->cycle_interval); 2250 - shift = max(0, shift); 2251 - /* Bound shift to one less than what overflows tick_length */ 2252 - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 2253 - shift = min(shift, maxshift); 2254 - while (offset >= tk->cycle_interval) { 2255 - offset = logarithmic_accumulation(tk, offset, shift, &clock_set); 2256 - if (offset < tk->cycle_interval<<shift) 2257 - shift--; 2258 - } 2180 + offset = timekeeping_accumulate(tk, offset, mode, &clock_set); 2259 2181 2260 2182 /* Adjust the multiplier to correct NTP error */ 2261 2183 timekeeping_adjust(tk, offset);
+2 -2
kernel/time/timer_list.c
··· 46 46 print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, 47 47 int idx, u64 now) 48 48 { 49 - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); 49 + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); 50 50 SEQ_printf(m, ", S:%02x", timer->state); 51 51 SEQ_printf(m, "\n"); 52 52 SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", ··· 98 98 static void 99 99 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 100 100 { 101 - SEQ_printf(m, " .base: %pK\n", base); 101 + SEQ_printf(m, " .base: %p\n", base); 102 102 SEQ_printf(m, " .index: %d\n", base->index); 103 103 104 104 SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
+72 -1
tools/testing/selftests/timers/posix_timers.c
··· 7 7 * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com> 8 8 */ 9 9 #define _GNU_SOURCE 10 + #include <sys/prctl.h> 10 11 #include <sys/time.h> 11 12 #include <sys/types.h> 12 13 #include <stdio.h> ··· 600 599 "check_overrun %s\n", name); 601 600 } 602 601 602 + #include <sys/syscall.h> 603 + 604 + static int do_timer_create(int *id) 605 + { 606 + return syscall(__NR_timer_create, CLOCK_MONOTONIC, NULL, id); 607 + } 608 + 609 + static int do_timer_delete(int id) 610 + { 611 + return syscall(__NR_timer_delete, id); 612 + } 613 + 614 + #ifndef PR_TIMER_CREATE_RESTORE_IDS 615 + # define PR_TIMER_CREATE_RESTORE_IDS 77 616 + # define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 617 + # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 618 + # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 619 + #endif 620 + 621 + static void check_timer_create_exact(void) 622 + { 623 + int id; 624 + 625 + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0)) { 626 + switch (errno) { 627 + case EINVAL: 628 + ksft_test_result_skip("check timer create exact, not supported\n"); 629 + return; 630 + default: 631 + ksft_test_result_skip("check timer create exact, errno = %d\n", errno); 632 + return; 633 + } 634 + } 635 + 636 + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 1) 637 + fatal_error(NULL, "prctl(GET) failed\n"); 638 + 639 + id = 8; 640 + if (do_timer_create(&id) < 0) 641 + fatal_error(NULL, "timer_create()"); 642 + 643 + if (do_timer_delete(id)) 644 + fatal_error(NULL, "timer_delete()"); 645 + 646 + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0)) 647 + fatal_error(NULL, "prctl(OFF)"); 648 + 649 + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 0) 650 + fatal_error(NULL, "prctl(GET) failed\n"); 651 + 652 + if (id != 8) { 653 + ksft_test_result_fail("check timer create exact %d != 8\n", id); 654 + return; 655 + } 656 + 657 + /* Validate that it went back to 
normal mode and allocates ID 9 */ 658 + if (do_timer_create(&id) < 0) 659 + fatal_error(NULL, "timer_create()"); 660 + 661 + if (do_timer_delete(id)) 662 + fatal_error(NULL, "timer_delete()"); 663 + 664 + if (id == 9) 665 + ksft_test_result_pass("check timer create exact\n"); 666 + else 667 + ksft_test_result_fail("check timer create exact. Disabling failed.\n"); 668 + } 669 + 603 670 int main(int argc, char **argv) 604 671 { 605 672 ksft_print_header(); 606 - ksft_set_plan(18); 673 + ksft_set_plan(19); 607 674 608 675 ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); 609 676 ksft_print_msg("based timers if other threads run on the CPU...\n"); 677 + 678 + check_timer_create_exact(); 610 679 611 680 check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); 612 681 check_itimer(ITIMER_PROF, "ITIMER_PROF");
+1 -1
tools/testing/selftests/timers/skew_consistency.c
··· 47 47 48 48 pid = fork(); 49 49 if (!pid) 50 - return system("./inconsistency-check -c 1 -t 600"); 50 + return system("./inconsistency-check -t 60"); 51 51 52 52 ppm = 500; 53 53 ret = 0;