Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

membarrier: Provide register expedited private command

This introduces a "register private expedited" membarrier command which
allows eventual removal of important memory barrier constraints on the
scheduler fast-paths. It changes how the "private expedited" membarrier
command (new to 4.14) is used from user-space.

This new command allows processes to register their intent to use the
private expedited command. This affects how the expedited private
command introduced in 4.14-rc is meant to be used, and should be merged
before 4.14 final.

Processes are now required to register before using
MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.

This fixes a problem that arose when designing requested extensions to
sys_membarrier() to allow JITs to efficiently flush old code from
instruction caches. Several potential algorithms are much less painful
if the user registers intent to use this functionality early on, for
example, before the process spawns the second thread. Registering at
this time removes the need to interrupt each and every thread in that
process at the first expedited sys_membarrier() system call.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mathieu Desnoyers and committed by
Linus Torvalds
a961e409 96f893ab

+66 -11
+1
fs/exec.c
··· 1802 1802 /* execve succeeded */ 1803 1803 current->fs->in_exec = 0; 1804 1804 current->in_execve = 0; 1805 + membarrier_execve(current); 1805 1806 acct_update_integrals(current); 1806 1807 task_numa_free(current); 1807 1808 free_bprm(bprm);
+3
include/linux/mm_types.h
··· 445 445 unsigned long flags; /* Must use atomic bitops to access the bits */ 446 446 447 447 struct core_state *core_state; /* coredumping support */ 448 + #ifdef CONFIG_MEMBARRIER 449 + atomic_t membarrier_state; 450 + #endif 448 451 #ifdef CONFIG_AIO 449 452 spinlock_t ioctx_lock; 450 453 struct kioctx_table __rcu *ioctx_table;
+16
include/linux/sched/mm.h
··· 211 211 current->flags = (current->flags & ~PF_MEMALLOC) | flags; 212 212 } 213 213 214 + #ifdef CONFIG_MEMBARRIER 215 + enum { 216 + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), 217 + MEMBARRIER_STATE_SWITCH_MM = (1U << 1), 218 + }; 219 + 220 + static inline void membarrier_execve(struct task_struct *t) 221 + { 222 + atomic_set(&t->mm->membarrier_state, 0); 223 + } 224 + #else 225 + static inline void membarrier_execve(struct task_struct *t) 226 + { 227 + } 228 + #endif 229 + 214 230 #endif /* _LINUX_SCHED_MM_H */
+16 -7
include/uapi/linux/membarrier.h
··· 52 52 * (non-running threads are de facto in such a 53 53 * state). This only covers threads from the 54 54 * same processes as the caller thread. This 55 - * command returns 0. The "expedited" commands 56 - * complete faster than the non-expedited ones, 57 - * they never block, but have the downside of 58 - * causing extra overhead. 55 + * command returns 0 on success. The 56 + * "expedited" commands complete faster than 57 + * the non-expedited ones, they never block, 58 + * but have the downside of causing extra 59 + * overhead. A process needs to register its 60 + * intent to use the private expedited command 61 + * prior to using it, otherwise this command 62 + * returns -EPERM. 63 + * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: 64 + * Register the process intent to use 65 + * MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always 66 + * returns 0. 59 67 * 60 68 * Command to be passed to the membarrier system call. The commands need to 61 69 * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to 62 70 * the value 0. 63 71 */ 64 72 enum membarrier_cmd { 65 - MEMBARRIER_CMD_QUERY = 0, 66 - MEMBARRIER_CMD_SHARED = (1 << 0), 73 + MEMBARRIER_CMD_QUERY = 0, 74 + MEMBARRIER_CMD_SHARED = (1 << 0), 67 75 /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ 68 76 /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ 69 - MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), 77 + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), 78 + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), 70 79 }; 71 80 72 81 #endif /* _UAPI_LINUX_MEMBARRIER_H */
+30 -4
kernel/sched/membarrier.c
··· 18 18 #include <linux/membarrier.h> 19 19 #include <linux/tick.h> 20 20 #include <linux/cpumask.h> 21 + #include <linux/atomic.h> 21 22 22 23 #include "sched.h" /* for cpu_rq(). */ 23 24 ··· 27 26 * except MEMBARRIER_CMD_QUERY. 28 27 */ 29 28 #define MEMBARRIER_CMD_BITMASK \ 30 - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) 29 + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 30 + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) 31 31 32 32 static void ipi_mb(void *info) 33 33 { 34 34 smp_mb(); /* IPIs should be serializing but paranoid. */ 35 35 } 36 36 37 - static void membarrier_private_expedited(void) 37 + static int membarrier_private_expedited(void) 38 38 { 39 39 int cpu; 40 40 bool fallback = false; 41 41 cpumask_var_t tmpmask; 42 42 43 + if (!(atomic_read(&current->mm->membarrier_state) 44 + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) 45 + return -EPERM; 46 + 43 47 if (num_online_cpus() == 1) 44 - return; 48 + return 0; 45 49 46 50 /* 47 51 * Matches memory barriers around rq->curr modification in ··· 100 94 * rq->curr modification in scheduler. 101 95 */ 102 96 smp_mb(); /* exit from system call is not a mb */ 97 + return 0; 98 + } 99 + 100 + static void membarrier_register_private_expedited(void) 101 + { 102 + struct task_struct *p = current; 103 + struct mm_struct *mm = p->mm; 104 + 105 + /* 106 + * We need to consider threads belonging to different thread 107 + * groups, which use the same mm. (CLONE_VM but not 108 + * CLONE_THREAD). 
109 + */ 110 + if (atomic_read(&mm->membarrier_state) 111 + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) 112 + return; 113 + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, 114 + &mm->membarrier_state); 103 115 } 104 116 105 117 /** ··· 168 144 synchronize_sched(); 169 145 return 0; 170 146 case MEMBARRIER_CMD_PRIVATE_EXPEDITED: 171 - membarrier_private_expedited(); 147 + return membarrier_private_expedited(); 148 + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: 149 + membarrier_register_private_expedited(); 172 150 return 0; 173 151 default: 174 152 return -EINVAL;