Merge tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer fixes from Borislav Petkov:

- Reset hrtimers correctly when a CPU hotplug state traversal stops
  halfway and leaves hrtimers not (re-)initialized properly

- Annotate accesses to a timer group's ignore flag to prevent KCSAN
from raising data_race warnings

- Make sure timer group initialization is visible to timer tree walkers
and avoid a hypothetical race

- Fix another race between CPU hotplug and idle entry/exit where timers
on a fully idle system are getting ignored

- Fix a case where an ignored signal was still being handled even
  though it should not be

* tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
hrtimers: Handle CPU state correctly on hotplug
timers/migration: Annotate accesses to ignore flag
timers/migration: Enforce group initialization visibility to tree walkers
timers/migration: Fix another race between hotplug and idle entry/exit
signal/posixtimers: Handle ignore/blocked sequences correctly

Changed files: +95 -20

include/linux/hrtimer.h (+1)

···
 extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
+int hrtimers_cpu_starting(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
 #else

kernel/cpu.c (+1 -1)

···
         },
         [CPUHP_AP_HRTIMERS_DYING] = {
                 .name                   = "hrtimers:dying",
-                .startup.single         = NULL,
+                .startup.single         = hrtimers_cpu_starting,
                 .teardown.single        = hrtimers_cpu_dying,
         },
         [CPUHP_AP_TICK_DYING] = {
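
The hunk above fills in the previously NULL startup callback of the
CPUHP_AP_HRTIMERS_DYING state, so hrtimers_cpu_starting() runs on the CPU as
it comes back online. As a rough illustration of the same paired
startup/teardown mechanism (not part of this pull; the example_* names and
the dynamic CPUHP_AP_ONLINE_DYN slot are assumptions for illustration only),
a driver-style registration looks like this:

#include <linux/cpuhotplug.h>
#include <linux/module.h>
#include <linux/printk.h>

/* Called for each CPU coming online (and for already-online CPUs at setup) */
static int example_cpu_online(unsigned int cpu)
{
        pr_info("example: CPU %u online, (re)initializing per-CPU state\n", cpu);
        return 0;
}

/* Called for each CPU going offline */
static int example_cpu_offline(unsigned int cpu)
{
        pr_info("example: CPU %u offline, tearing down per-CPU state\n", cpu);
        return 0;
}

static enum cpuhp_state example_state;

static int __init example_init(void)
{
        int ret;

        /* CPUHP_AP_ONLINE_DYN allocates a free slot in the online (AP) range */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
                                example_cpu_online, example_cpu_offline);
        if (ret < 0)
                return ret;
        example_state = ret;
        return 0;
}

static void __exit example_exit(void)
{
        cpuhp_remove_state(example_state);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Core subsystems such as hrtimers use fixed entries in the static
cpuhp_hp_states[] table instead, because their callbacks must run at a
well-defined point in the hotplug sequence.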

kernel/signal.c (+28 -9)

···
 
                 if (!list_empty(&q->list)) {
                         /*
-                         * If task group is exiting with the signal already pending,
-                         * wait for __exit_signal() to do its job. Otherwise if
-                         * ignored, it's not supposed to be queued. Try to survive.
+                         * The signal was ignored and blocked. The timer
+                         * expiry queued it because blocked signals are
+                         * queued independent of the ignored state.
+                         *
+                         * The unblocking set SIGPENDING, but the signal
+                         * was not yet dequeued from the pending list.
+                         * So prepare_signal() sees unblocked and ignored,
+                         * which ends up here. Leave it queued like a
+                         * regular signal.
+                         *
+                         * The same happens when the task group is exiting
+                         * and the signal is already queued.
+                         * prepare_signal() treats SIGNAL_GROUP_EXIT as
+                         * ignored independent of its queued state. This
+                         * gets cleaned up in __exit_signal().
                          */
-                        WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
                         goto out;
                 }
···
                 goto out;
         }
 
-        /* This should never happen and leaks a reference count */
-        if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
-                hlist_del_init(&tmr->ignored_list);
-
         if (unlikely(!list_empty(&q->list))) {
                 /* This holds a reference count already */
                 result = TRACE_SIGNAL_ALREADY_PENDING;
                 goto out;
         }
 
-        posixtimer_sigqueue_getref(q);
+        /*
+         * If the signal is on the ignore list, it got blocked after it was
+         * ignored earlier. But nothing lifted the ignore. Move it back to
+         * the pending list to be consistent with the regular signal
+         * handling. This already holds a reference count.
+         *
+         * If it's not on the ignore list acquire a reference count.
+         */
+        if (likely(hlist_unhashed(&tmr->ignored_list)))
+                posixtimer_sigqueue_getref(q);
+        else
+                hlist_del_init(&tmr->ignored_list);
+
         posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
         result = TRACE_SIGNAL_DELIVERED;
 out:
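
The ordering described in the new comments can be sketched from userspace
with a POSIX interval timer whose signal is first ignored and then blocked
before it expires. The program below is only an illustration of that
sequence (assumed flow, error handling omitted; link with -lrt on older
glibc), not a reproducer taken from the patch; with correct handling the
queued timer signal is delivered once a real handler is installed and the
signal is unblocked:

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t got_signal;

static void handler(int sig)
{
        (void)sig;
        got_signal = 1;
}

int main(void)
{
        struct sigevent sev = { 0 };
        struct itimerspec its = { 0 };
        sigset_t set;
        timer_t timerid;

        /* Step 1: the timer signal is ignored */
        signal(SIGRTMIN, SIG_IGN);

        /* Step 2: ... and blocked, so an expiry queues it despite SIG_IGN */
        sigemptyset(&set);
        sigaddset(&set, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &set, NULL);

        /* Step 3: arm a one-shot timer and let it expire while blocked */
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGRTMIN;
        timer_create(CLOCK_MONOTONIC, &sev, &timerid);
        its.it_value.tv_nsec = 1000 * 1000;     /* 1 ms */
        timer_settime(timerid, 0, &its, NULL);
        usleep(10 * 1000);                      /* expiry queues the blocked signal */

        /* Step 4: install a handler, then unblock: the queued signal fires */
        signal(SIGRTMIN, handler);
        sigprocmask(SIG_UNBLOCK, &set, NULL);

        printf("timer signal %s\n", got_signal ? "delivered" : "lost");
        timer_delete(timerid);
        return 0;
}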

kernel/time/hrtimer.c (+10 -1)

···
         }
 
         cpu_base->cpu = cpu;
+        hrtimer_cpu_base_init_expiry_lock(cpu_base);
+        return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+        /* Clear out any left over state from a CPU down operation */
         cpu_base->active_bases = 0;
         cpu_base->hres_active = 0;
         cpu_base->hang_detected = 0;
···
         cpu_base->expires_next = KTIME_MAX;
         cpu_base->softirq_expires_next = KTIME_MAX;
         cpu_base->online = 1;
-        hrtimer_cpu_base_init_expiry_lock(cpu_base);
         return 0;
 }
···
 void __init hrtimers_init(void)
 {
         hrtimers_prepare_cpu(smp_processor_id());
+        hrtimers_cpu_starting(smp_processor_id());
         open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }

kernel/time/timer_migration.c (+55 -9)

···
                         break;
 
                 child = group;
-                group = group->parent;
+                /*
+                 * Pairs with the store release on group connection
+                 * to make sure group initialization is visible.
+                 */
+                group = READ_ONCE(group->parent);
                 data->childmask = child->groupmask;
+                WARN_ON_ONCE(!data->childmask);
         } while (group);
 }
···
         while ((node = timerqueue_getnext(&group->events))) {
                 evt = container_of(node, struct tmigr_event, nextevt);
 
-                if (!evt->ignore) {
+                if (!READ_ONCE(evt->ignore)) {
                         WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
                         return evt;
                 }
···
          * lock is held while updating the ignore flag in idle path. So this
          * state change will not be lost.
          */
-        group->groupevt.ignore = true;
+        WRITE_ONCE(group->groupevt.ignore, true);
 
         return walk_done;
 }
···
         union tmigr_state childstate, groupstate;
         bool remote = data->remote;
         bool walk_done = false;
+        bool ignore;
         u64 nextexp;
 
         if (child) {
···
                 nextexp = child->next_expiry;
                 evt = &child->groupevt;
 
-                evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+                /*
+                 * This can race with concurrent idle exit (activate).
+                 * If the current writer wins, a useless remote expiration may
+                 * be scheduled. If the activate wins, the event is properly
+                 * ignored.
+                 */
+                ignore = (nextexp == KTIME_MAX) ? true : false;
+                WRITE_ONCE(evt->ignore, ignore);
         } else {
                 nextexp = data->nextexp;
 
                 first_childevt = evt = data->evt;
+                ignore = evt->ignore;
 
                 /*
                  * Walking the hierarchy is required in any case when a
···
          * first event information of the group is updated properly and
          * also handled properly, so skip this fast return path.
          */
-        if (evt->ignore && !remote && group->parent)
+        if (ignore && !remote && group->parent)
                 return true;
 
         raw_spin_lock(&group->lock);
···
          * queue when the expiry time changed only or when it could be ignored.
          */
         if (timerqueue_node_queued(&evt->nextevt)) {
-                if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+                if ((evt->nextevt.expires == nextexp) && !ignore) {
                         /* Make sure not to miss a new CPU event with the same expiry */
                         evt->cpu = first_childevt->cpu;
                         goto check_toplvl;
···
                         WRITE_ONCE(group->next_expiry, KTIME_MAX);
         }
 
-        if (evt->ignore) {
+        if (ignore) {
                 /*
                  * When the next child event could be ignored (nextexp is
                  * KTIME_MAX) and there was no remote timer handling before or
···
         s.seq = 0;
         atomic_set(&group->migr_state, s.state);
 
+        /*
+         * If this is a new top-level, prepare its groupmask in advance.
+         * This avoids accidents where yet another new top-level is
+         * created in the future and made visible before the current groupmask.
+         */
+        if (list_empty(&tmigr_level_list[lvl])) {
+                group->groupmask = BIT(0);
+                /*
+                 * The previous top level has prepared its groupmask already,
+                 * simply account it as the first child.
+                 */
+                if (lvl > 0)
+                        group->num_children = 1;
+        }
+
         timerqueue_init_head(&group->events);
         timerqueue_init(&group->groupevt.nextevt);
         group->groupevt.nextevt.expires = KTIME_MAX;
···
         raw_spin_lock_irq(&child->lock);
         raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-        child->parent = parent;
-        child->groupmask = BIT(parent->num_children++);
+        if (activate) {
+                /*
+                 * @child is the old top and @parent the new one. In this
+                 * case groupmask is pre-initialized and @child already
+                 * accounted, along with its new sibling corresponding to the
+                 * CPU going up.
+                 */
+                WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+        } else {
+                /* Adding @child for the CPU going up to @parent. */
+                child->groupmask = BIT(parent->num_children++);
+        }
+
+        /*
+         * Make sure parent initialization is visible before publishing it to a
+         * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+         * address dependency that pairs with the READ_ONCE() in __walk_groups().
+         */
+        smp_store_release(&child->parent, parent);
 
         raw_spin_unlock(&parent->lock);
         raw_spin_unlock_irq(&child->lock);
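
The last hunk publishes the fully initialized parent group with
smp_store_release(), pairing with the READ_ONCE(group->parent) in
__walk_groups() so lockless walkers can never see a half-initialized group.
A rough userspace analogy of this publication pattern (assumed names, C11
atomics instead of the kernel primitives, acquire standing in for the
address dependency that READ_ONCE() relies on) looks like this:

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

struct group {
        unsigned int groupmask;                 /* initialized before publication */
};

static struct group new_top;                    /* the object being published */
static _Atomic(struct group *) parent_ptr;      /* analogous to child->parent */

static int writer(void *arg)
{
        (void)arg;
        new_top.groupmask = 1u << 0;            /* full initialization first */
        /* analogous to smp_store_release(&child->parent, parent) */
        atomic_store_explicit(&parent_ptr, &new_top, memory_order_release);
        return 0;
}

static int reader(void *arg)
{
        struct group *g;

        (void)arg;
        /* analogous to READ_ONCE(group->parent) in __walk_groups() */
        while (!(g = atomic_load_explicit(&parent_ptr, memory_order_acquire)))
                ;
        /* the release/acquire pair guarantees the initialized groupmask is seen */
        printf("groupmask=%#x\n", g->groupmask);
        return 0;
}

int main(void)
{
        thrd_t w, r;

        thrd_create(&r, reader, NULL);
        thrd_create(&w, writer, NULL);
        thrd_join(r, NULL);
        thrd_join(w, NULL);
        return 0;
}

The READ_ONCE()/WRITE_ONCE() annotations on the ignore flag in the earlier
hunks serve a different purpose: they mark an intentionally lockless, racy
access so KCSAN does not warn about it and the compiler cannot tear or fuse
the access; they do not add ordering by themselves.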