Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
mm/mmap_lock.c at v6.16-rc2, 317 lines, 8.4 kB
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until refcnt is indicating that vma
	 * is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
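
/*
 * Illustrative sketch, not part of the file above: __vma_start_write() is
 * normally reached through the vma_start_write() wrapper, which callers
 * invoke while holding the mmap lock for writing, before modifying a VMA.
 * The surrounding code and variable names below are assumptions for
 * illustration only:
 *
 *	mmap_write_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma) {
 *		vma_start_write(vma);
 *		... modify the vma ...
 *	}
 *	mmap_write_unlock(mm);
 *
 * There is no per-VMA unlock for writers: the write lock is dropped for all
 * VMAs at once when mmap_write_unlock() advances mm->mm_lock_seq.
 */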
void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt back. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified, the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(vma->vm_mm != mm ||
		     address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
#endif /* CONFIG_PER_VMA_LOCK */
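
/*
 * Illustrative sketch, not part of the file above: lock_vma_under_rcu() is
 * meant for page fault handlers that attempt a per-VMA-locked fault before
 * falling back to the mmap lock. A rough caller shape, with the fallback
 * label, fault flags and permission checks assumed or omitted for
 * illustration:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *
 * When handle_mm_fault() returns VM_FAULT_RETRY or VM_FAULT_COMPLETED it has
 * already dropped the per-VMA lock, which is why vma_end_read() is skipped
 * in those cases.
 */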
#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */
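
/*
 * Illustrative sketch, not part of the file above: architectures that select
 * CONFIG_LOCK_MM_AND_FIND_VMA use lock_mm_and_find_vma() from their fault
 * handlers instead of open-coding mmap_read_lock() + find_vma() + stack
 * expansion. A rough caller shape, with names and error handling assumed for
 * illustration:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma)) {
 *		... report the bad address; the mmap lock is not held here ...
 *		return;
 *	}
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 *
 * On success the mmap lock is held for read (lock_mm_and_find_vma()
 * downgrades after a write-locked stack expansion), so the caller pairs it
 * with mmap_read_unlock(); on every failure path the lock has already been
 * released.
 */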