Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

posix-cpu-timers: Implement the missing timer_wait_running callback

For some unknown reason the introduction of the timer_wait_running callback
failed to fix up posix CPU timers, which went unnoticed for almost four years.
Marco reported recently that the WARN_ON() in timer_wait_running()
triggers with a posix CPU timer test case.

Posix CPU timers have two execution models for expiring timers depending on
CONFIG_POSIX_CPU_TIMERS_TASK_WORK:

1) If not enabled, the expiry happens in hard interrupt context so
spin waiting on the remote CPU is reasonably time bound.

Implement an empty stub function for that case.

2) If enabled, the expiry happens in task work before returning to user
space or guest mode. The expired timers are marked as firing and moved
from the timer queue to a local list head with sighand lock held. Once
the timers are moved, sighand lock is dropped and the expiry happens in
fully preemptible context. That means the expiring task can be scheduled
out, migrated, interrupted etc. So spin waiting on it is more than
suboptimal.

The timer wheel has a timer_wait_running() mechanism for RT, which uses
a per CPU timer-base expiry lock which is held by the expiry code and the
task waiting for the timer function to complete blocks on that lock.

This does not work in the same way for posix CPU timers as there is no
timer base and expiry for process wide timers can run on any task
belonging to that process, but the concept of waiting on an expiry lock
can be used too in a slightly different way:

- Add a mutex to struct posix_cputimers_work. This struct is per task
and used to schedule the expiry task work from the timer interrupt.

- Add a task_struct pointer to struct cpu_timer which is used to store
the task which runs the expiry. That's filled in when the task
moves the expired timers to the local expiry list. That's not
affecting the size of the k_itimer union as there are bigger union
members already

- Let the task take the expiry mutex around the expiry function

- Let the waiter acquire a task reference with rcu_read_lock() held and
block on the expiry mutex

This avoids spin-waiting on a task which might not even be on a CPU and
works nicely for RT too.

Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT")
Reported-by: Marco Elver <elver@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marco Elver <elver@google.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/87zg764ojw.ffs@tglx

+82 -20
+11 -6
include/linux/posix-timers.h
··· 4 4 5 5 #include <linux/spinlock.h> 6 6 #include <linux/list.h> 7 + #include <linux/mutex.h> 7 8 #include <linux/alarmtimer.h> 8 9 #include <linux/timerqueue.h> 9 10 ··· 63 62 * cpu_timer - Posix CPU timer representation for k_itimer 64 63 * @node: timerqueue node to queue in the task/sig 65 64 * @head: timerqueue head on which this timer is queued 66 - * @task: Pointer to target task 65 + * @pid: Pointer to target task PID 67 66 * @elist: List head for the expiry list 68 67 * @firing: Timer is currently firing 68 + * @handling: Pointer to the task which handles expiry 69 69 */ 70 70 struct cpu_timer { 71 - struct timerqueue_node node; 72 - struct timerqueue_head *head; 73 - struct pid *pid; 74 - struct list_head elist; 75 - int firing; 71 + struct timerqueue_node node; 72 + struct timerqueue_head *head; 73 + struct pid *pid; 74 + struct list_head elist; 75 + int firing; 76 + struct task_struct __rcu *handling; 76 77 }; 77 78 78 79 static inline bool cpu_timer_enqueue(struct timerqueue_head *head, ··· 138 135 /** 139 136 * posix_cputimers_work - Container for task work based posix CPU timer expiry 140 137 * @work: The task work to be scheduled 138 + * @mutex: Mutex held around expiry in context of this task work 141 139 * @scheduled: @work has been scheduled already, no further processing 142 140 */ 143 141 struct posix_cputimers_work { 144 142 struct callback_head work; 143 + struct mutex mutex; 145 144 unsigned int scheduled; 146 145 }; 147 146
+67 -14
kernel/time/posix-cpu-timers.c
··· 846 846 return expires; 847 847 848 848 ctmr->firing = 1; 849 + /* See posix_cpu_timer_wait_running() */ 850 + rcu_assign_pointer(ctmr->handling, current); 849 851 cpu_timer_dequeue(ctmr); 850 852 list_add_tail(&ctmr->elist, firing); 851 853 } ··· 1163 1161 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK 1164 1162 static void posix_cpu_timers_work(struct callback_head *work) 1165 1163 { 1164 + struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work); 1165 + 1166 + mutex_lock(&cw->mutex); 1166 1167 handle_posix_cpu_timers(current); 1168 + mutex_unlock(&cw->mutex); 1169 + } 1170 + 1171 + /* 1172 + * Invoked from the posix-timer core when a cancel operation failed because 1173 + * the timer is marked firing. The caller holds rcu_read_lock(), which 1174 + * protects the timer and the task which is expiring it from being freed. 1175 + */ 1176 + static void posix_cpu_timer_wait_running(struct k_itimer *timr) 1177 + { 1178 + struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling); 1179 + 1180 + /* Has the handling task completed expiry already? */ 1181 + if (!tsk) 1182 + return; 1183 + 1184 + /* Ensure that the task cannot go away */ 1185 + get_task_struct(tsk); 1186 + /* Now drop the RCU protection so the mutex can be locked */ 1187 + rcu_read_unlock(); 1188 + /* Wait on the expiry mutex */ 1189 + mutex_lock(&tsk->posix_cputimers_work.mutex); 1190 + /* Release it immediately again. */ 1191 + mutex_unlock(&tsk->posix_cputimers_work.mutex); 1192 + /* Drop the task reference. 
*/ 1193 + put_task_struct(tsk); 1194 + /* Relock RCU so the callsite is balanced */ 1195 + rcu_read_lock(); 1196 + } 1197 + 1198 + static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) 1199 + { 1200 + /* Ensure that timr->it.cpu.handling task cannot go away */ 1201 + rcu_read_lock(); 1202 + spin_unlock_irq(&timr->it_lock); 1203 + posix_cpu_timer_wait_running(timr); 1204 + rcu_read_unlock(); 1205 + /* @timr is on stack and is valid */ 1206 + spin_lock_irq(&timr->it_lock); 1167 1207 } 1168 1208 1169 1209 /* ··· 1221 1177 sizeof(p->posix_cputimers_work.work)); 1222 1178 init_task_work(&p->posix_cputimers_work.work, 1223 1179 posix_cpu_timers_work); 1180 + mutex_init(&p->posix_cputimers_work.mutex); 1224 1181 p->posix_cputimers_work.scheduled = false; 1225 1182 } 1226 1183 ··· 1298 1253 lockdep_posixtimer_enter(); 1299 1254 handle_posix_cpu_timers(tsk); 1300 1255 lockdep_posixtimer_exit(); 1256 + } 1257 + 1258 + static void posix_cpu_timer_wait_running(struct k_itimer *timr) 1259 + { 1260 + cpu_relax(); 1261 + } 1262 + 1263 + static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) 1264 + { 1265 + spin_unlock_irq(&timr->it_lock); 1266 + cpu_relax(); 1267 + spin_lock_irq(&timr->it_lock); 1301 1268 } 1302 1269 1303 1270 static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) ··· 1420 1363 */ 1421 1364 if (likely(cpu_firing >= 0)) 1422 1365 cpu_timer_fire(timer); 1366 + /* See posix_cpu_timer_wait_running() */ 1367 + rcu_assign_pointer(timer->it.cpu.handling, NULL); 1423 1368 spin_unlock(&timer->it_lock); 1424 1369 } 1425 1370 } ··· 1556 1497 expires = cpu_timer_getexpires(&timer.it.cpu); 1557 1498 error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); 1558 1499 if (!error) { 1559 - /* 1560 - * Timer is now unarmed, deletion can not fail. 1561 - */ 1500 + /* Timer is now unarmed, deletion can not fail. 
*/ 1562 1501 posix_cpu_timer_del(&timer); 1502 + } else { 1503 + while (error == TIMER_RETRY) { 1504 + posix_cpu_timer_wait_running_nsleep(&timer); 1505 + error = posix_cpu_timer_del(&timer); 1506 + } 1563 1507 } 1564 - spin_unlock_irq(&timer.it_lock); 1565 1508 1566 - while (error == TIMER_RETRY) { 1567 - /* 1568 - * We need to handle case when timer was or is in the 1569 - * middle of firing. In other cases we already freed 1570 - * resources. 1571 - */ 1572 - spin_lock_irq(&timer.it_lock); 1573 - error = posix_cpu_timer_del(&timer); 1574 - spin_unlock_irq(&timer.it_lock); 1575 - } 1509 + spin_unlock_irq(&timer.it_lock); 1576 1510 1577 1511 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { 1578 1512 /* ··· 1675 1623 .timer_del = posix_cpu_timer_del, 1676 1624 .timer_get = posix_cpu_timer_get, 1677 1625 .timer_rearm = posix_cpu_timer_rearm, 1626 + .timer_wait_running = posix_cpu_timer_wait_running, 1678 1627 }; 1679 1628 1680 1629 const struct k_clock clock_process = {
+4
kernel/time/posix-timers.c
··· 846 846 rcu_read_lock(); 847 847 unlock_timer(timer, *flags); 848 848 849 + /* 850 + * kc->timer_wait_running() might drop RCU lock. So @timer 851 + * cannot be touched anymore after the function returns! 852 + */ 849 853 if (!WARN_ON_ONCE(!kc->timer_wait_running)) 850 854 kc->timer_wait_running(timer); 851 855