Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

timerfd: Manage cancelable timers in timerfd

Peter is concerned about the extra scan of CLOCK_REALTIME_COS in the
timer interrupt. Yes, I did not think about it, because the solution
was so elegant. I didn't like the extra list in timerfd when it was
proposed some time ago, but with an RCU-based list the list walk is
less horrible than the original global lock, which was held over the
list iteration.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>

+117 -106
+78 -35
fs/timerfd.c
··· 22 22 #include <linux/anon_inodes.h> 23 23 #include <linux/timerfd.h> 24 24 #include <linux/syscalls.h> 25 + #include <linux/rcupdate.h> 25 26 26 27 struct timerfd_ctx { 27 28 struct hrtimer tmr; ··· 32 31 u64 ticks; 33 32 int expired; 34 33 int clockid; 34 + struct rcu_head rcu; 35 + struct list_head clist; 35 36 bool might_cancel; 36 37 }; 38 + 39 + static LIST_HEAD(cancel_list); 40 + static DEFINE_SPINLOCK(cancel_lock); 37 41 38 42 /* 39 43 * This gets called when the timer event triggers. We set the "expired" ··· 59 53 return HRTIMER_NORESTART; 60 54 } 61 55 56 + /* 57 + * Called when the clock was set to cancel the timers in the cancel 58 + * list. 59 + */ 60 + void timerfd_clock_was_set(void) 61 + { 62 + ktime_t moffs = ktime_get_monotonic_offset(); 63 + struct timerfd_ctx *ctx; 64 + unsigned long flags; 65 + 66 + rcu_read_lock(); 67 + list_for_each_entry_rcu(ctx, &cancel_list, clist) { 68 + if (!ctx->might_cancel) 69 + continue; 70 + spin_lock_irqsave(&ctx->wqh.lock, flags); 71 + if (ctx->moffs.tv64 != moffs.tv64) { 72 + ctx->moffs.tv64 = KTIME_MAX; 73 + wake_up_locked(&ctx->wqh); 74 + } 75 + spin_unlock_irqrestore(&ctx->wqh.lock, flags); 76 + } 77 + rcu_read_unlock(); 78 + } 79 + 80 + static void timerfd_remove_cancel(struct timerfd_ctx *ctx) 81 + { 82 + if (ctx->might_cancel) { 83 + ctx->might_cancel = false; 84 + spin_lock(&cancel_lock); 85 + list_del_rcu(&ctx->clist); 86 + spin_unlock(&cancel_lock); 87 + } 88 + } 89 + 90 + static bool timerfd_canceled(struct timerfd_ctx *ctx) 91 + { 92 + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) 93 + return false; 94 + ctx->moffs = ktime_get_monotonic_offset(); 95 + return true; 96 + } 97 + 98 + static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) 99 + { 100 + if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && 101 + (flags & TFD_TIMER_CANCEL_ON_SET)) { 102 + if (!ctx->might_cancel) { 103 + ctx->might_cancel = true; 104 + spin_lock(&cancel_lock); 105 + 
list_add_rcu(&ctx->clist, &cancel_list); 106 + spin_unlock(&cancel_lock); 107 + } 108 + } else if (ctx->might_cancel) { 109 + timerfd_remove_cancel(ctx); 110 + } 111 + } 112 + 62 113 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) 63 114 { 64 115 ktime_t remaining; 65 116 66 117 remaining = hrtimer_expires_remaining(&ctx->tmr); 67 118 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; 68 - } 69 - 70 - static bool timerfd_canceled(struct timerfd_ctx *ctx) 71 - { 72 - ktime_t moffs; 73 - 74 - if (!ctx->might_cancel) 75 - return false; 76 - 77 - moffs = ktime_get_monotonic_offset(); 78 - 79 - if (moffs.tv64 == ctx->moffs.tv64) 80 - return false; 81 - 82 - ctx->moffs = moffs; 83 - return true; 84 119 } 85 120 86 121 static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ··· 133 86 134 87 htmode = (flags & TFD_TIMER_ABSTIME) ? 135 88 HRTIMER_MODE_ABS: HRTIMER_MODE_REL; 136 - 137 - ctx->might_cancel = false; 138 - if (htmode == HRTIMER_MODE_ABS && ctx->clockid == CLOCK_REALTIME && 139 - (flags & TFD_TIMER_CANCELON_SET)) { 140 - clockid = CLOCK_REALTIME_COS; 141 - ctx->might_cancel = true; 142 - } 143 89 144 90 texp = timespec_to_ktime(ktmr->it_value); 145 91 ctx->expired = 0; ··· 153 113 { 154 114 struct timerfd_ctx *ctx = file->private_data; 155 115 116 + timerfd_remove_cancel(ctx); 156 117 hrtimer_cancel(&ctx->tmr); 157 - kfree(ctx); 118 + kfree_rcu(ctx, rcu); 158 119 return 0; 159 120 } 160 121 ··· 190 149 else 191 150 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); 192 151 152 + /* 153 + * If clock has changed, we do not care about the 154 + * ticks and we do not rearm the timer. Userspace must 155 + * reevaluate anyway. 156 + */ 157 + if (timerfd_canceled(ctx)) { 158 + ctx->ticks = 0; 159 + ctx->expired = 0; 160 + res = -ECANCELED; 161 + } 162 + 193 163 if (ctx->ticks) { 194 164 ticks = ctx->ticks; 195 - 196 - /* 197 - * If clock has changed, we do not care about the 198 - * ticks and we do not rearm the timer. 
Userspace must 199 - * reevaluate anyway. 200 - */ 201 - if (timerfd_canceled(ctx)) { 202 - ticks = 0; 203 - ctx->expired = 0; 204 - res = -ECANCELED; 205 - } 206 165 207 166 if (ctx->expired && ctx->tintv.tv64) { 208 167 /* ··· 298 257 if (IS_ERR(file)) 299 258 return PTR_ERR(file); 300 259 ctx = file->private_data; 260 + 261 + timerfd_setup_cancel(ctx, flags); 301 262 302 263 /* 303 264 * We need to stop the existing timer before reprogramming
+5 -1
include/linux/hrtimer.h
··· 155 155 HRTIMER_BASE_REALTIME, 156 156 HRTIMER_BASE_MONOTONIC, 157 157 HRTIMER_BASE_BOOTTIME, 158 - HRTIMER_BASE_REALTIME_COS, 159 158 HRTIMER_MAX_CLOCK_BASES, 160 159 }; 161 160 ··· 305 306 #endif 306 307 307 308 extern void clock_was_set(void); 309 + #ifdef CONFIG_TIMERFD 310 + extern void timerfd_clock_was_set(void); 311 + #else 312 + static inline void timerfd_clock_was_set(void) { } 313 + #endif 308 314 extern void hrtimers_resume(void); 309 315 310 316 extern ktime_t ktime_get(void);
-6
include/linux/time.h
··· 302 302 * The IDs of various hardware clocks: 303 303 */ 304 304 #define CLOCK_SGI_CYCLE 10 305 - 306 - #ifdef __KERNEL__ 307 - /* This clock is not exposed to user space */ 308 - #define CLOCK_REALTIME_COS 15 309 - #endif 310 - 311 305 #define MAX_CLOCKS 16 312 306 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) 313 307 #define CLOCKS_MONO CLOCK_MONOTONIC
+2 -2
include/linux/timerfd.h
··· 19 19 * shared O_* flags. 20 20 */ 21 21 #define TFD_TIMER_ABSTIME (1 << 0) 22 - #define TFD_TIMER_CANCELON_SET (1 << 1) 22 + #define TFD_TIMER_CANCEL_ON_SET (1 << 1) 23 23 #define TFD_CLOEXEC O_CLOEXEC 24 24 #define TFD_NONBLOCK O_NONBLOCK 25 25 ··· 27 27 /* Flags for timerfd_create. */ 28 28 #define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS 29 29 /* Flags for timerfd_settime. */ 30 - #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCELON_SET) 30 + #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) 31 31 32 32 #endif /* _LINUX_TIMERFD_H */
+32 -62
kernel/hrtimer.c
··· 78 78 .get_time = &ktime_get_boottime, 79 79 .resolution = KTIME_LOW_RES, 80 80 }, 81 - { 82 - .index = CLOCK_REALTIME_COS, 83 - .get_time = &ktime_get_real, 84 - .resolution = KTIME_LOW_RES, 85 - }, 86 81 } 87 82 }; 88 83 ··· 85 90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 86 91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 87 92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 88 - [CLOCK_REALTIME_COS] = HRTIMER_BASE_REALTIME_COS, 89 93 }; 90 94 91 95 static inline int hrtimer_clockid_to_base(clockid_t clock_id) ··· 110 116 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 111 117 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 112 118 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 113 - base->clock_base[HRTIMER_BASE_REALTIME_COS].softirq_time = xtim; 114 119 } 115 120 116 121 /* ··· 479 486 trace_hrtimer_cancel(timer); 480 487 } 481 488 482 - static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base); 483 - 484 489 /* High resolution timer related functions */ 485 490 #ifdef CONFIG_HIGH_RES_TIMERS 486 491 ··· 654 663 return 0; 655 664 } 656 665 657 - static void retrigger_next_event(void *arg); 666 + /* 667 + * Retrigger next event is called after clock was set 668 + * 669 + * Called with interrupts disabled via on_each_cpu() 670 + */ 671 + static void retrigger_next_event(void *arg) 672 + { 673 + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 674 + struct timespec realtime_offset, xtim, wtm, sleep; 675 + 676 + if (!hrtimer_hres_active()) 677 + return; 678 + 679 + /* Optimized out for !HIGH_RES */ 680 + get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); 681 + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 682 + 683 + /* Adjust CLOCK_REALTIME offset */ 684 + raw_spin_lock(&base->lock); 685 + base->clock_base[HRTIMER_BASE_REALTIME].offset = 686 + timespec_to_ktime(realtime_offset); 687 + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = 688 + timespec_to_ktime(sleep); 
689 + 690 + hrtimer_force_reprogram(base, 0); 691 + raw_spin_unlock(&base->lock); 692 + } 658 693 659 694 /* 660 695 * Switch to high resolution mode ··· 728 711 return 0; 729 712 } 730 713 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 714 + static inline void retrigger_next_event(void *arg) { } 731 715 732 716 #endif /* CONFIG_HIGH_RES_TIMERS */ 733 - 734 - /* 735 - * Retrigger next event is called after clock was set 736 - * 737 - * Called with interrupts disabled via on_each_cpu() 738 - */ 739 - static void retrigger_next_event(void *arg) 740 - { 741 - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 742 - struct timespec realtime_offset, xtim, wtm, sleep; 743 - 744 - if (!hrtimer_hres_active()) { 745 - raw_spin_lock(&base->lock); 746 - hrtimer_expire_cancelable(base); 747 - raw_spin_unlock(&base->lock); 748 - return; 749 - } 750 - 751 - /* Optimized out for !HIGH_RES */ 752 - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); 753 - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 754 - 755 - /* Adjust CLOCK_REALTIME offset */ 756 - raw_spin_lock(&base->lock); 757 - base->clock_base[HRTIMER_BASE_REALTIME].offset = 758 - timespec_to_ktime(realtime_offset); 759 - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = 760 - timespec_to_ktime(sleep); 761 - base->clock_base[HRTIMER_BASE_REALTIME_COS].offset = 762 - timespec_to_ktime(realtime_offset); 763 - 764 - hrtimer_expire_cancelable(base); 765 - 766 - hrtimer_force_reprogram(base, 0); 767 - raw_spin_unlock(&base->lock); 768 - } 769 717 770 718 /* 771 719 * Clock realtime was set ··· 745 763 */ 746 764 void clock_was_set(void) 747 765 { 766 + #ifdef CONFIG_HIGHRES_TIMERS 748 767 /* Retrigger the CPU local events everywhere */ 749 768 on_each_cpu(retrigger_next_event, NULL, 1); 769 + #endif 770 + timerfd_clock_was_set(); 750 771 } 751 772 752 773 /* ··· 762 777 KERN_INFO "hrtimers_resume() called with IRQs enabled!"); 763 778 764 779 
retrigger_next_event(NULL); 780 + timerfd_clock_was_set(); 765 781 } 766 782 767 783 static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) ··· 1224 1238 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); 1225 1239 1226 1240 timer->state &= ~HRTIMER_STATE_CALLBACK; 1227 - } 1228 - 1229 - static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base) 1230 - { 1231 - struct timerqueue_node *node; 1232 - struct hrtimer_clock_base *base; 1233 - ktime_t now = ktime_get_real(); 1234 - 1235 - base = &cpu_base->clock_base[HRTIMER_BASE_REALTIME_COS]; 1236 - 1237 - while ((node = timerqueue_getnext(&base->active))) { 1238 - struct hrtimer *timer; 1239 - 1240 - timer = container_of(node, struct hrtimer, node); 1241 - __run_hrtimer(timer, &now); 1242 - } 1243 1241 } 1244 1242 1245 1243 #ifdef CONFIG_HIGH_RES_TIMERS