Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer subsystem updates from Ingo Molnar:

- Various preparatory cleanups & enhancements of the timer-wheel code,
in preparation for the WIP 'pull timers at expiry' timer migration
model series (which will replace the current 'push timers at enqueue'
migration model), by Anna-Maria Behnsen:

- Update comments and clean up confusing variable names

- Add debug check to warn about time travel

- Improve/expand timer-wheel tracepoints

- Optimize away unnecessary IPIs for deferrable timers

- Restructure & clean up next_expiry_recalc()

- Clean up forward_timer_base()

- Introduce __forward_timer_base() and use it to simplify and
micro-optimize get_next_timer_interrupt()

- Restructure get_next_timer_interrupt()'s idle logic for better
readability and to enable a minor optimization

- Fix the nextevt calculation when no timers are pending

- Fix the sysfs_get_uname() prototype declaration

* tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
timers: Fix nextevt calculation when no timers are pending
timers: Rework idle logic
timers: Use already existing function for forwarding timer base
timers: Split out forward timer base functionality
timers: Clarify check in forward_timer_base()
timers: Move store of next event into __next_timer_interrupt()
timers: Do not IPI for deferrable timers
tracing/timers: Add tracepoint for tracking timer base is_idle flag
tracing/timers: Enhance timer_start tracepoint
tick-sched: Warn when next tick seems to be in the past
tick/sched: Cleanup confusing variables
tick-sched: Fix function names in comments
time: Make sysfs_get_uname() function visible in header

+112 -72
+30 -10
include/trace/events/timer.h
··· 46 46 47 47 /** 48 48 * timer_start - called when the timer is started 49 - * @timer: pointer to struct timer_list 50 - * @expires: the timers expiry time 51 - * @flags: the timers flags 49 + * @timer: pointer to struct timer_list 50 + * @bucket_expiry: the bucket expiry time 52 51 */ 53 52 TRACE_EVENT(timer_start, 54 53 55 54 TP_PROTO(struct timer_list *timer, 56 - unsigned long expires, 57 - unsigned int flags), 55 + unsigned long bucket_expiry), 58 56 59 - TP_ARGS(timer, expires, flags), 57 + TP_ARGS(timer, bucket_expiry), 60 58 61 59 TP_STRUCT__entry( 62 60 __field( void *, timer ) 63 61 __field( void *, function ) 64 62 __field( unsigned long, expires ) 63 + __field( unsigned long, bucket_expiry ) 65 64 __field( unsigned long, now ) 66 65 __field( unsigned int, flags ) 67 66 ), ··· 68 69 TP_fast_assign( 69 70 __entry->timer = timer; 70 71 __entry->function = timer->function; 71 - __entry->expires = expires; 72 + __entry->expires = timer->expires; 73 + __entry->bucket_expiry = bucket_expiry; 72 74 __entry->now = jiffies; 73 - __entry->flags = flags; 75 + __entry->flags = timer->flags; 74 76 ), 75 77 76 - TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", 78 + TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", 77 79 __entry->timer, __entry->function, __entry->expires, 78 80 (long)__entry->expires - __entry->now, 79 - __entry->flags & TIMER_CPUMASK, 81 + __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK, 80 82 __entry->flags >> TIMER_ARRAYSHIFT, 81 83 decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) 82 84 ); ··· 140 140 TP_PROTO(struct timer_list *timer), 141 141 142 142 TP_ARGS(timer) 143 + ); 144 + 145 + TRACE_EVENT(timer_base_idle, 146 + 147 + TP_PROTO(bool is_idle, unsigned int cpu), 148 + 149 + TP_ARGS(is_idle, cpu), 150 + 151 + TP_STRUCT__entry( 152 + __field( bool, is_idle ) 153 + __field( unsigned int, cpu ) 154 + ), 155 + 156 + TP_fast_assign( 157 + 
__entry->is_idle = is_idle; 158 + __entry->cpu = cpu; 159 + ), 160 + 161 + TP_printk("is_idle=%d cpu=%d", 162 + __entry->is_idle, __entry->cpu) 143 163 ); 144 164 145 165 #define decode_clockid(type) \
+2 -1
kernel/time/tick-internal.h
··· 56 56 ktime_t expires, bool force); 57 57 extern void clockevents_handle_noop(struct clock_event_device *dev); 58 58 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); 59 - extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 60 59 61 60 /* Broadcasting support */ 62 61 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST ··· 196 197 #else 197 198 #define JIFFIES_SHIFT 8 198 199 #endif 200 + 201 + extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+14 -11
kernel/time/tick-sched.c
··· 839 839 ts->next_timer = next_tick; 840 840 } 841 841 842 + /* Make sure next_tick is never before basemono! */ 843 + if (WARN_ON_ONCE(basemono > next_tick)) 844 + next_tick = basemono; 845 + 842 846 /* 843 847 * If the tick is due in the next period, keep it ticking or 844 848 * force prod the timer. ··· 891 887 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 892 888 u64 basemono = ts->timer_expires_base; 893 889 u64 expires = ts->timer_expires; 894 - ktime_t tick = expires; 895 890 896 891 /* Make sure we won't be trying to stop it twice in a row. */ 897 892 ts->timer_expires_base = 0; ··· 913 910 /* Skip reprogram of event if it's not changed */ 914 911 if (ts->tick_stopped && (expires == ts->next_tick)) { 915 912 /* Sanity check: make sure clockevent is actually programmed */ 916 - if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) 913 + if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) 917 914 return; 918 915 919 916 WARN_ON_ONCE(1); ··· 923 920 } 924 921 925 922 /* 926 - * nohz_stop_sched_tick() can be called several times before 927 - * nohz_restart_sched_tick() is called. This happens when 928 - * interrupts arrive which do not cause a reschedule. In the 929 - * first call we save the current tick time, so we can restart 930 - * the scheduler tick in nohz_restart_sched_tick(). 923 + * tick_nohz_stop_tick() can be called several times before 924 + * tick_nohz_restart_sched_tick() is called. This happens when 925 + * interrupts arrive which do not cause a reschedule. In the first 926 + * call we save the current tick time, so we can restart the 927 + * scheduler tick in tick_nohz_restart_sched_tick(). 
931 928 */ 932 929 if (!ts->tick_stopped) { 933 930 calc_load_nohz_start(); ··· 938 935 trace_tick_stop(1, TICK_DEP_MASK_NONE); 939 936 } 940 937 941 - ts->next_tick = tick; 938 + ts->next_tick = expires; 942 939 943 940 /* 944 941 * If the expiration time == KTIME_MAX, then we simply stop ··· 953 950 } 954 951 955 952 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 956 - hrtimer_start(&ts->sched_timer, tick, 953 + hrtimer_start(&ts->sched_timer, expires, 957 954 HRTIMER_MODE_ABS_PINNED_HARD); 958 955 } else { 959 - hrtimer_set_expires(&ts->sched_timer, tick); 960 - tick_program_event(tick, 1); 956 + hrtimer_set_expires(&ts->sched_timer, expires); 957 + tick_program_event(expires, 1); 961 958 } 962 959 } 963 960
+66 -50
kernel/time/timer.c
··· 571 571 static void 572 572 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) 573 573 { 574 - if (!is_timers_nohz_active()) 575 - return; 576 - 577 574 /* 578 - * TODO: This wants some optimizing similar to the code below, but we 579 - * will do that when we switch from push to pull for deferrable timers. 575 + * Deferrable timers do not prevent the CPU from entering dynticks and 576 + * are not taken into account on the idle/nohz_full path. An IPI when a 577 + * new deferrable timer is enqueued will wake up the remote CPU but 578 + * nothing will be done with the deferrable timer base. Therefore skip 579 + * the remote IPI for deferrable timers completely. 580 580 */ 581 - if (timer->flags & TIMER_DEFERRABLE) { 582 - if (tick_nohz_full_cpu(base->cpu)) 583 - wake_up_nohz_cpu(base->cpu); 581 + if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) 584 582 return; 585 - } 586 583 587 584 /* 588 585 * We might have to IPI the remote CPU if the base is idle and the ··· 603 606 __set_bit(idx, base->pending_map); 604 607 timer_set_idx(timer, idx); 605 608 606 - trace_timer_start(timer, timer->expires, timer->flags); 609 + trace_timer_start(timer, bucket_expiry); 607 610 608 611 /* 609 612 * Check whether this is the new first expiring timer. The ··· 939 942 return get_timer_this_cpu_base(tflags); 940 943 } 941 944 942 - static inline void forward_timer_base(struct timer_base *base) 945 + static inline void __forward_timer_base(struct timer_base *base, 946 + unsigned long basej) 943 947 { 944 - unsigned long jnow = READ_ONCE(jiffies); 945 - 946 948 /* 947 - * No need to forward if we are close enough below jiffies. 948 - * Also while executing timers, base->clk is 1 offset ahead 949 - * of jiffies to avoid endless requeuing to current jiffies. 949 + * Check whether we can forward the base. We can only do that when 950 + * @basej is past base->clk otherwise we might rewind base->clk. 
950 951 */ 951 - if ((long)(jnow - base->clk) < 1) 952 + if (time_before_eq(basej, base->clk)) 952 953 return; 953 954 954 955 /* 955 956 * If the next expiry value is > jiffies, then we fast forward to 956 957 * jiffies otherwise we forward to the next expiry value. 957 958 */ 958 - if (time_after(base->next_expiry, jnow)) { 959 - base->clk = jnow; 959 + if (time_after(base->next_expiry, basej)) { 960 + base->clk = basej; 960 961 } else { 961 962 if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) 962 963 return; 963 964 base->clk = base->next_expiry; 964 965 } 966 + 965 967 } 966 968 969 + static inline void forward_timer_base(struct timer_base *base) 970 + { 971 + __forward_timer_base(base, READ_ONCE(jiffies)); 972 + } 967 973 968 974 /* 969 975 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means ··· 1803 1803 /* 1804 1804 * Search the first expiring timer in the various clock levels. Caller must 1805 1805 * hold base->lock. 1806 + * 1807 + * Store next expiry time in base->next_expiry. 1806 1808 */ 1807 - static unsigned long __next_timer_interrupt(struct timer_base *base) 1809 + static void next_expiry_recalc(struct timer_base *base) 1808 1810 { 1809 1811 unsigned long clk, next, adj; 1810 1812 unsigned lvl, offset = 0; ··· 1872 1870 clk += adj; 1873 1871 } 1874 1872 1873 + base->next_expiry = next; 1875 1874 base->next_expiry_recalc = false; 1876 1875 base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); 1877 - 1878 - return next; 1879 1876 } 1880 1877 1881 1878 #ifdef CONFIG_NO_HZ_COMMON ··· 1922 1921 u64 get_next_timer_interrupt(unsigned long basej, u64 basem) 1923 1922 { 1924 1923 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 1924 + unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA; 1925 1925 u64 expires = KTIME_MAX; 1926 - unsigned long nextevt; 1926 + bool was_idle; 1927 1927 1928 1928 /* 1929 1929 * Pretend that there is no timer pending if the cpu is offline. 
··· 1935 1933 1936 1934 raw_spin_lock(&base->lock); 1937 1935 if (base->next_expiry_recalc) 1938 - base->next_expiry = __next_timer_interrupt(base); 1939 - nextevt = base->next_expiry; 1936 + next_expiry_recalc(base); 1940 1937 1941 1938 /* 1942 1939 * We have a fresh next event. Check whether we can forward the 1943 - * base. We can only do that when @basej is past base->clk 1944 - * otherwise we might rewind base->clk. 1940 + * base. 1945 1941 */ 1946 - if (time_after(basej, base->clk)) { 1947 - if (time_after(nextevt, basej)) 1948 - base->clk = basej; 1949 - else if (time_after(nextevt, base->clk)) 1950 - base->clk = nextevt; 1942 + __forward_timer_base(base, basej); 1943 + 1944 + if (base->timers_pending) { 1945 + nextevt = base->next_expiry; 1946 + 1947 + /* If we missed a tick already, force 0 delta */ 1948 + if (time_before(nextevt, basej)) 1949 + nextevt = basej; 1950 + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; 1951 + } else { 1952 + /* 1953 + * Move next_expiry for the empty base into the future to 1954 + * prevent a unnecessary raise of the timer softirq when the 1955 + * next_expiry value will be reached even if there is no timer 1956 + * pending. 1957 + */ 1958 + base->next_expiry = nextevt; 1951 1959 } 1952 1960 1953 - if (time_before_eq(nextevt, basej)) { 1954 - expires = basem; 1955 - base->is_idle = false; 1956 - } else { 1957 - if (base->timers_pending) 1958 - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; 1959 - /* 1960 - * If we expect to sleep more than a tick, mark the base idle. 1961 - * Also the tick is stopped so any added timer must forward 1962 - * the base clk itself to keep granularity small. This idle 1963 - * logic is only maintained for the BASE_STD base, deferrable 1964 - * timers may still see large granularity skew (by design). 1965 - */ 1966 - if ((expires - basem) > TICK_NSEC) 1967 - base->is_idle = true; 1968 - } 1961 + /* 1962 + * Base is idle if the next event is more than a tick away. 
1963 + * 1964 + * If the base is marked idle then any timer add operation must forward 1965 + * the base clk itself to keep granularity small. This idle logic is 1966 + * only maintained for the BASE_STD base, deferrable timers may still 1967 + * see large granularity skew (by design). 1968 + */ 1969 + was_idle = base->is_idle; 1970 + base->is_idle = time_after(nextevt, basej + 1); 1971 + if (was_idle != base->is_idle) 1972 + trace_timer_base_idle(base->is_idle, base->cpu); 1973 + 1969 1974 raw_spin_unlock(&base->lock); 1970 1975 1971 1976 return cmp_next_hrtimer_event(basem, expires); ··· 1993 1984 * sending the IPI a few instructions smaller for the cost of taking 1994 1985 * the lock in the exit from idle path. 1995 1986 */ 1996 - base->is_idle = false; 1987 + if (base->is_idle) { 1988 + base->is_idle = false; 1989 + trace_timer_base_idle(false, smp_processor_id()); 1990 + } 1997 1991 } 1998 1992 #endif 1999 1993 ··· 2027 2015 */ 2028 2016 WARN_ON_ONCE(!levels && !base->next_expiry_recalc 2029 2017 && base->timers_pending); 2018 + /* 2019 + * While executing timers, base->clk is set 1 offset ahead of 2020 + * jiffies to avoid endless requeuing to current jiffies. 2021 + */ 2030 2022 base->clk++; 2031 - base->next_expiry = __next_timer_interrupt(base); 2023 + next_expiry_recalc(base); 2032 2024 2033 2025 while (levels--) 2034 2026 expire_timers(base, heads + levels);