Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

task_work_add: generic process-context callbacks

Provide a simple mechanism that allows running code in the (nonatomic)
context of an arbitrary task.

The caller does task_work_add(task, task_work) and this task executes
task_work->func() either from do_notify_resume() or from do_exit(). The
callback can rely on PF_EXITING to detect the latter case.

"struct task_work" can be embedded in another struct, still it has "void
*data" to handle the most common/simple case.

This allows us to kill the ->replacement_session_keyring hack, and
potentially this can have more users.

Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into
tracehook_notify_resume() and do_exit(). But at the same time we can
remove the "replacement_session_keyring != NULL" checks from
arch/*/signal.c and exit_creds().

Note: task_work_add/task_work_run abuses ->pi_lock. This is only because
this lock is already used by lookup_pi_state() to synchronize with
do_exit() setting PF_EXITING. Fortunately the scope of this lock in
task_work.c is really tiny, and the code is unlikely anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Gordeev <agordeev@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Smith <dsmith@redhat.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Oleg Nesterov and committed by
Al Viro
e73f8959 62366c88

+136 -2
+2
include/linux/sched.h
··· 1400 1400 int (*notifier)(void *priv); 1401 1401 void *notifier_data; 1402 1402 sigset_t *notifier_mask; 1403 + struct hlist_head task_works; 1404 + 1403 1405 struct audit_context *audit_context; 1404 1406 #ifdef CONFIG_AUDITSYSCALL 1405 1407 uid_t loginuid;
+33
include/linux/task_work.h
··· 1 + #ifndef _LINUX_TASK_WORK_H 2 + #define _LINUX_TASK_WORK_H 3 + 4 + #include <linux/list.h> 5 + #include <linux/sched.h> 6 + 7 + struct task_work; 8 + typedef void (*task_work_func_t)(struct task_work *); 9 + 10 + struct task_work { 11 + struct hlist_node hlist; 12 + task_work_func_t func; 13 + void *data; 14 + }; 15 + 16 + static inline void 17 + init_task_work(struct task_work *twork, task_work_func_t func, void *data) 18 + { 19 + twork->func = func; 20 + twork->data = data; 21 + } 22 + 23 + int task_work_add(struct task_struct *task, struct task_work *twork, bool); 24 + struct task_work *task_work_cancel(struct task_struct *, task_work_func_t); 25 + void task_work_run(void); 26 + 27 + static inline void exit_task_work(struct task_struct *task) 28 + { 29 + if (unlikely(!hlist_empty(&task->task_works))) 30 + task_work_run(); 31 + } 32 + 33 + #endif /* _LINUX_TASK_WORK_H */
+11
include/linux/tracehook.h
··· 49 49 #include <linux/sched.h> 50 50 #include <linux/ptrace.h> 51 51 #include <linux/security.h> 52 + #include <linux/task_work.h> 52 53 struct linux_binprm; 53 54 54 55 /* ··· 165 164 */ 166 165 static inline void set_notify_resume(struct task_struct *task) 167 166 { 167 + #ifdef TIF_NOTIFY_RESUME 168 168 if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) 169 169 kick_process(task); 170 + #endif 170 171 } 171 172 172 173 /** ··· 188 185 { 189 186 if (current->replacement_session_keyring) 190 187 key_replace_session_keyring(); 188 + /* 189 + * The caller just cleared TIF_NOTIFY_RESUME. This barrier 190 + * pairs with task_work_add()->set_notify_resume() after 191 + * hlist_add_head(task->task_works); 192 + */ 193 + smp_mb__after_clear_bit(); 194 + if (unlikely(!hlist_empty(&current->task_works))) 195 + task_work_run(); 191 196 } 192 197 193 198 #endif /* <linux/tracehook.h> */
+1 -1
kernel/Makefile
··· 5 5 obj-y = fork.o exec_domain.o panic.o printk.o \ 6 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 7 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 8 - signal.o sys.o kmod.o workqueue.o pid.o \ 8 + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 9 9 rcupdate.o extable.o params.o posix-timers.o \ 10 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 11 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
+4 -1
kernel/exit.c
··· 946 946 exit_signals(tsk); /* sets PF_EXITING */ 947 947 /* 948 948 * tsk->flags are checked in the futex code to protect against 949 - * an exiting task cleaning up the robust pi futexes. 949 + * an exiting task cleaning up the robust pi futexes, and in 950 + * task_work_add() to avoid the race with exit_task_work(). 950 951 */ 951 952 smp_mb(); 952 953 raw_spin_unlock_wait(&tsk->pi_lock); 954 + 955 + exit_task_work(tsk); 953 956 954 957 exit_irq_thread(); 955 958
+1
kernel/fork.c
··· 1411 1411 */ 1412 1412 p->group_leader = p; 1413 1413 INIT_LIST_HEAD(&p->thread_group); 1414 + INIT_HLIST_HEAD(&p->task_works); 1414 1415 1415 1416 /* Now that the task is set up, run cgroup callbacks if 1416 1417 * necessary. We need to run them before the task is visible
+84
kernel/task_work.c
··· 1 + #include <linux/spinlock.h> 2 + #include <linux/task_work.h> 3 + #include <linux/tracehook.h> 4 + 5 + int 6 + task_work_add(struct task_struct *task, struct task_work *twork, bool notify) 7 + { 8 + unsigned long flags; 9 + int err = -ESRCH; 10 + 11 + #ifndef TIF_NOTIFY_RESUME 12 + if (notify) 13 + return -ENOTSUPP; 14 + #endif 15 + /* 16 + * We must not insert the new work if the task has already passed 17 + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() 18 + * and check PF_EXITING under pi_lock. 19 + */ 20 + raw_spin_lock_irqsave(&task->pi_lock, flags); 21 + if (likely(!(task->flags & PF_EXITING))) { 22 + hlist_add_head(&twork->hlist, &task->task_works); 23 + err = 0; 24 + } 25 + raw_spin_unlock_irqrestore(&task->pi_lock, flags); 26 + 27 + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ 28 + if (likely(!err) && notify) 29 + set_notify_resume(task); 30 + return err; 31 + } 32 + 33 + struct task_work * 34 + task_work_cancel(struct task_struct *task, task_work_func_t func) 35 + { 36 + unsigned long flags; 37 + struct task_work *twork; 38 + struct hlist_node *pos; 39 + 40 + raw_spin_lock_irqsave(&task->pi_lock, flags); 41 + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { 42 + if (twork->func == func) { 43 + hlist_del(&twork->hlist); 44 + goto found; 45 + } 46 + } 47 + twork = NULL; 48 + found: 49 + raw_spin_unlock_irqrestore(&task->pi_lock, flags); 50 + 51 + return twork; 52 + } 53 + 54 + void task_work_run(void) 55 + { 56 + struct task_struct *task = current; 57 + struct hlist_head task_works; 58 + struct hlist_node *pos; 59 + 60 + raw_spin_lock_irq(&task->pi_lock); 61 + hlist_move_list(&task->task_works, &task_works); 62 + raw_spin_unlock_irq(&task->pi_lock); 63 + 64 + if (unlikely(hlist_empty(&task_works))) 65 + return; 66 + /* 67 + * We use hlist to save the space in task_struct, but we want fifo. 68 + * Find the last entry, the list should be short, then process them 69 + * in reverse order. 
70 + */ 71 + for (pos = task_works.first; pos->next; pos = pos->next) 72 + ; 73 + 74 + for (;;) { 75 + struct hlist_node **pprev = pos->pprev; 76 + struct task_work *twork = container_of(pos, struct task_work, 77 + hlist); 78 + twork->func(twork); 79 + 80 + if (pprev == &task_works.first) 81 + break; 82 + pos = container_of(pprev, struct hlist_node, next); 83 + } 84 + }