Merge tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
"Three small fixes/updates for the locking core code:

- Plug a task_struct reference leak in the percpu rwsem
  implementation.

- Document the refcount interaction with PID_MAX_LIMIT

- Improve the 'invalid wait context' data dump in lockdep so it
  contains all the information required to decode the problem"

* tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
locking/lockdep: Improve 'invalid wait context' splat
locking/refcount: Document interaction with PID_MAX_LIMIT
locking/percpu-rwsem: Fix a task_struct refcount

Changed files: +51 -26
include/linux/refcount.h  (+18 -5)
···
  * atomic operations, then the count will continue to edge closer to 0. If it
  * reaches a value of 1 before /any/ of the threads reset it to the saturated
  * value, then a concurrent refcount_dec_and_test() may erroneously free the
- * underlying object. Given the precise timing details involved with the
- * round-robin scheduling of each thread manipulating the refcount and the need
- * to hit the race multiple times in succession, there doesn't appear to be a
- * practical avenue of attack even if using refcount_add() operations with
- * larger increments.
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ * 0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
  *
  * Memory ordering
  * ===============
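
As a quick cross-check of the arithmetic in the new comment, the standalone snippet below reproduces it in userspace. The constant values are written out by hand (REFCOUNT_SATURATED taken as INT_MIN/2, which matches the 0x40000000 headroom quoted above, and PID_MAX_LIMIT as 0x400000) rather than pulled from kernel headers, so treat them as assumptions tied to this release; this is an illustration, not kernel code.

#include <limits.h>
#include <stdio.h>

/* Values copied by hand for illustration; not taken from kernel headers. */
#define REFCOUNT_SATURATED	((unsigned int)(INT_MIN / 2))	/* 0xC0000000 */
#define PID_MAX_LIMIT		(4 * 1024 * 1024)		/* 0x400000 */

int main(void)
{
	/* Headroom between the saturation point and the wrap back to 0. */
	unsigned long long headroom =
		(unsigned long long)UINT_MAX + 1 - REFCOUNT_SATURATED;

	/*
	 * How many nested references each of the PID_MAX_LIMIT possible
	 * tasks would have to hold at once to push a saturated refcount
	 * back out of the saturation range.
	 */
	printf("headroom = %#llx, per-task references needed = %llu\n",
	       headroom, headroom / PID_MAX_LIMIT);	/* 0x40000000, 256 */
	return 0;
}
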
kernel/locking/lockdep.c  (+31 -20)
···
 	return ret;
 }
 
+static inline short task_wait_context(struct task_struct *curr)
+{
+	/*
+	 * Set appropriate wait type for the context; for IRQs we have to take
+	 * into account force_irqthread as that is implied by PREEMPT_RT.
+	 */
+	if (curr->hardirq_context) {
+		/*
+		 * Check if force_irqthreads will run us threaded.
+		 */
+		if (curr->hardirq_threaded || curr->irq_config)
+			return LD_WAIT_CONFIG;
+
+		return LD_WAIT_SPIN;
+	} else if (curr->softirq_context) {
+		/*
+		 * Softirqs are always threaded.
+		 */
+		return LD_WAIT_CONFIG;
+	}
+
+	return LD_WAIT_MAX;
+}
+
 static int
 print_lock_invalid_wait_context(struct task_struct *curr,
 				struct held_lock *hlock)
 {
+	short curr_inner;
+
 	if (!debug_locks_off())
 		return 0;
 	if (debug_locks_silent)
···
 	print_lock(hlock);
 
 	pr_warn("other info that might help us debug this:\n");
+
+	curr_inner = task_wait_context(curr);
+	pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
 	lockdep_print_held_locks(curr);
 
 	pr_warn("stack backtrace:\n");
···
 	}
 	depth++;
 
-	/*
-	 * Set appropriate wait type for the context; for IRQs we have to take
-	 * into account force_irqthread as that is implied by PREEMPT_RT.
-	 */
-	if (curr->hardirq_context) {
-		/*
-		 * Check if force_irqthreads will run us threaded.
-		 */
-		if (curr->hardirq_threaded || curr->irq_config)
-			curr_inner = LD_WAIT_CONFIG;
-		else
-			curr_inner = LD_WAIT_SPIN;
-	} else if (curr->softirq_context) {
-		/*
-		 * Softirqs are always threaded.
-		 */
-		curr_inner = LD_WAIT_CONFIG;
-	} else {
-		curr_inner = LD_WAIT_MAX;
-	}
+	curr_inner = task_wait_context(curr);
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
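
To help decode such a splat, here is a deliberately simplified, userspace-only model of the rule being enforced; the enum values, names and the narrowing loop are stand-ins for illustration and do not match lockdep's real data structures. The idea it mirrors: the execution context caps which wait type may be acquired, every lock already held narrows that cap further, and the new context-{%d:%d} line reports the cap contributed by the context itself.

#include <stdio.h>

/* Stand-in wait types, ordered from strictest to most permissive. */
enum wait_type { WT_SPIN, WT_CONFIG, WT_MAX };

static const char *wt_name[] = { "spin", "config", "max" };

/* Cap imposed by the execution context alone (cf. task_wait_context()). */
static enum wait_type context_cap(int hardirq, int threaded, int softirq)
{
	if (hardirq)
		return threaded ? WT_CONFIG : WT_SPIN;
	if (softirq)
		return WT_CONFIG;	/* softirqs are always threaded */
	return WT_MAX;			/* plain task context allows anything */
}

int main(void)
{
	/* Example: a non-threaded hardirq handler holding a "spin" class lock. */
	enum wait_type ctx = context_cap(1, 0, 0);
	enum wait_type held[] = { WT_SPIN };	/* locks already held */
	enum wait_type next = WT_CONFIG;	/* lock being acquired */
	enum wait_type cap = ctx;

	for (unsigned int i = 0; i < sizeof(held) / sizeof(held[0]); i++)
		if (held[i] < cap)
			cap = held[i];		/* held locks narrow the cap */

	if (next > cap)
		printf("invalid wait context: want %s, allowed %s, context-{%d:%d}\n",
		       wt_name[next], wt_name[cap], ctx, ctx);
	return 0;
}

With the context line present in the dump, a report can be decoded without first working out whether the code ran in hardirq, softirq or plain task context.
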
kernel/locking/percpu-rwsem.c  (+2 -1)
···
 				      unsigned int mode, int wake_flags,
 				      void *key)
 {
-	struct task_struct *p = get_task_struct(wq_entry->private);
 	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
 	struct percpu_rw_semaphore *sem = key;
+	struct task_struct *p;
 
 	/* concurrent against percpu_down_write(), can get stolen */
 	if (!__percpu_rwsem_trylock(sem, reader))
 		return 1;
 
+	p = get_task_struct(wq_entry->private);
 	list_del_init(&wq_entry->entry);
 	smp_store_release(&wq_entry->private, NULL);
 
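
The shape of this fix is the classic "don't take a reference before a bail-out path that won't drop it" pattern. The before/after sketch below uses a made-up refcounted object and helper names (obj_get/obj_put/try_claim are hypothetical); only the ordering relative to the trylock mirrors the real change.

#include <stdbool.h>
#include <stdio.h>

struct obj { int refs; bool claimable; };

static void obj_get(struct obj *o) { o->refs++; }
static void obj_put(struct obj *o) { o->refs--; }

/* Stands in for __percpu_rwsem_trylock(): can fail when the wakeup is stolen. */
static bool try_claim(struct obj *o) { return o->claimable; }

/* Before the fix: the reference taken up front is leaked when try_claim() fails. */
static int wake_cb_leaky(struct obj *o)
{
	obj_get(o);		/* taken unconditionally ... */
	if (!try_claim(o))
		return 1;	/* ... but never dropped on this early return */
	obj_put(o);		/* the real code drops it later, after the wakeup */
	return 0;
}

/* After the fix: take the reference only once the early return is behind us. */
static int wake_cb_fixed(struct obj *o)
{
	if (!try_claim(o))
		return 1;	/* nothing to undo */
	obj_get(o);		/* now always paired with the later put */
	obj_put(o);
	return 0;
}

int main(void)
{
	struct obj a = { .refs = 1, .claimable = false };
	struct obj b = { .refs = 1, .claimable = false };

	wake_cb_leaky(&a);	/* leaves a stray reference behind */
	wake_cb_fixed(&b);
	printf("leaky: refs=%d, fixed: refs=%d\n", a.refs, b.refs);	/* 2 vs 1 */
	return 0;
}
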