/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

/* Avoid a dependency loop by declaring here. */
extern int rcuwait_wake_up(struct rcuwait *w);

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>
#include <linux/sched/mm.h>

#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
        if (tracepoint_enabled(mmap_lock_start_locking))
                __mmap_lock_do_trace_start_locking(mm, write);
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
        if (tracepoint_enabled(mmap_lock_acquire_returned))
                __mmap_lock_do_trace_acquire_returned(mm, write, success);
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
        if (tracepoint_enabled(mmap_lock_released))
                __mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

static inline void mmap_assert_locked(const struct mm_struct *mm)
{
        rwsem_assert_held(&mm->mmap_lock);
}

static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
        rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK

static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
        seqcount_init(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
        do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
        ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
        do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        /*
         * Since mmap_lock is a sleeping lock, and waiting for it to become
         * unlocked is more or less equivalent to taking it ourselves, don't
         * bother with the speculative path if mmap_lock is already
         * write-locked; take the slow path, which takes the lock, instead.
         */
        return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
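
/*
 * Illustrative sketch, not part of the upstream header: a typical speculative
 * read section built on mmap_lock_speculate_try_begin()/_retry(). The
 * function name and the lockless_lookup callback are hypothetical and only
 * demonstrate the begin/retry pattern; a real user must additionally make
 * sure the data it reads locklessly cannot be freed underneath it (e.g. RCU).
 */
static inline bool mmap_lock_speculate_example(struct mm_struct *mm,
                                               bool (*lockless_lookup)(struct mm_struct *mm))
{
        unsigned int seq;
        bool ret;

        /* Bail out to a locked path if a writer already holds mmap_lock. */
        if (!mmap_lock_speculate_try_begin(mm, &seq))
                return false;

        ret = lockless_lookup(mm);

        /* Discard the result if a writer raced with the lockless section. */
        if (mmap_lock_speculate_retry(mm, seq))
                return false;

        return ret;
}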

static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        static struct lock_class_key lockdep_key;

        lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
#endif
        if (reset_refcnt)
                refcount_set(&vma->vm_refcnt, 0);
        vma->vm_lock_seq = UINT_MAX;
}

static inline bool is_vma_writer_only(int refcnt)
{
        /*
         * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
         * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
         * a detached vma happens only in vma_mark_detached() and is a rare
         * case, therefore most of the time there will be no unnecessary wakeup.
         */
        return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
}

static inline void vma_refcount_put(struct vm_area_struct *vma)
{
        /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
        struct mm_struct *mm = vma->vm_mm;
        int oldcnt;

        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
        if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {

                if (is_vma_writer_only(oldcnt - 1))
                        rcuwait_wake_up(&mm->vma_writer_wait);
        }
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * WARNING! The vma passed to this function cannot be used if the function
 * fails to lock it because in certain cases the RCU lock is dropped and then
 * reacquired. Once the RCU lock is dropped the vma can be concurrently freed.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        int oldcnt;

        /*
         * Check before locking. A race might cause a false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
                return NULL;

        /*
         * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
         * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT))) {
                /* return EAGAIN if vma got detached from under us */
                return oldcnt ? NULL : ERR_PTR(-EAGAIN);
        }

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

        /*
         * If the vma got attached to another mm from under us, that mm is not
         * stable and can be freed in the narrow window after vma->vm_refcnt
         * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
         * releasing vma->vm_refcnt.
         */
        if (unlikely(vma->vm_mm != mm)) {
                /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
                struct mm_struct *other_mm = vma->vm_mm;

                /*
                 * __mmdrop() is a heavy operation and we don't need RCU
                 * protection here. Release the RCU lock during these operations.
                 * We reinstate the RCU read lock as the caller expects it to
                 * be held when this function returns even on error.
                 */
                rcu_read_unlock();
                mmgrab(other_mm);
                vma_refcount_put(vma);
                mmdrop(other_mm);
                rcu_read_lock();
                return NULL;
        }

        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce a false locked
         * result. A false unlocked result is impossible because we modify and
         * check vma->vm_lock_seq under vma->vm_refcnt protection and
         * mm->mm_lock_seq modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                vma_refcount_put(vma);
                return NULL;
        }

        return vma;
}

/*
 * Use only while holding the mmap read lock, which guarantees that locking
 * will not fail (nobody can concurrently write-lock the vma). vma_start_read()
 * should not be used in such cases because it might fail due to mm_lock_seq
 * overflow. This functionality is used to obtain a vma read lock and then
 * drop the mmap read lock.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
        int oldcnt;

        mmap_assert_locked(vma->vm_mm);
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT)))
                return false;

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
        return true;
}

/*
 * Use only while holding the mmap read lock, which guarantees that locking
 * will not fail (nobody can concurrently write-lock the vma). vma_start_read()
 * should not be used in such cases because it might fail due to mm_lock_seq
 * overflow. This functionality is used to obtain a vma read lock and then
 * drop the mmap read lock.
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
        return vma_start_read_locked_nested(vma, 0);
}

static inline void vma_end_read(struct vm_area_struct *vma)
{
        vma_refcount_put(vma);
}
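
/*
 * Illustrative sketch, not part of the upstream header: attempting a lockless
 * read lock on a VMA found by an RCU-safe lookup, roughly mirroring what
 * lock_vma_under_rcu() (declared later in this header) does internally.
 * The function name and the lockless_vma_lookup callback are hypothetical,
 * and IS_ERR() is assumed to be available via linux/err.h. Most callers
 * should simply use lock_vma_under_rcu() instead of open-coding this.
 */
static inline struct vm_area_struct *
vma_start_read_example(struct mm_struct *mm, unsigned long addr,
                       struct vm_area_struct *(*lockless_vma_lookup)(struct mm_struct *,
                                                                     unsigned long))
{
        struct vm_area_struct *vma;

        rcu_read_lock();
        vma = lockless_vma_lookup(mm, addr);
        if (!vma)
                goto fail;

        /*
         * NULL or ERR_PTR(-EAGAIN): fall back to mmap_lock here.
         * (lock_vma_under_rcu() instead retries the lookup on -EAGAIN.)
         */
        vma = vma_start_read(mm, vma);
        if (!vma || IS_ERR(vma))
                goto fail;

        /*
         * The read lock pins the VMA, but it may have been reused for a
         * different range before we locked it, so revalidate the address.
         */
        if (addr < vma->vm_start || addr >= vma->vm_end) {
                vma_end_read(vma);
                goto fail;
        }

        rcu_read_unlock();
        return vma;        /* drop with vma_end_read() when done */

fail:
        rcu_read_unlock();
        return NULL;
}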

/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * The current task is holding mmap_write_lock, so both vma->vm_lock_seq
         * and mm->mm_lock_seq can't be concurrently modified.
         */
        *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
        return (vma->vm_lock_seq == *mm_lock_seq);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;

        __vma_start_write(vma, mm_lock_seq);
}
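
/*
 * Illustrative sketch, not part of the upstream header: the write-side
 * pattern for modifying a VMA. The function name is hypothetical and the
 * modification itself is elided, since the helpers that change VMA fields
 * live elsewhere; only the locking pattern is shown.
 */
static inline void vma_modify_example(struct vm_area_struct *vma)
{
        /* Writers must already hold mmap_lock in write mode. */
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * Exclude per-VMA-lock readers before touching the VMA. The VMA stays
         * write-locked until the next vma_end_write_all(), which the
         * mmap_lock implementation runs when the write lock is dropped or
         * downgraded.
         */
        vma_start_write(vma);

        /* ... modify vma fields here ... */
}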

static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
                      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}

static inline void vma_mark_attached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_detached(vma);
        refcount_set_release(&vma->vm_refcnt, 1);
}

void vma_mark_detached(struct vm_area_struct *vma);

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);
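
/*
 * Illustrative sketch, not part of the upstream header: handing off from the
 * coarse mmap read lock to a per-VMA read lock via vma_start_read_locked()
 * above. The function name is hypothetical. On success the caller may drop
 * the mmap read lock, keep using this VMA alone, and later release it with
 * vma_end_read().
 */
static inline struct vm_area_struct *
vma_read_lock_handoff_example(struct mm_struct *mm, struct vm_area_struct *vma)
{
        /* Under mmap_lock no writer can race with us... */
        mmap_assert_locked(mm);

        /* ...but the refcount-based lock can still refuse, so check it. */
        if (!vma_start_read_locked(vma))
                return NULL;

        /*
         * The VMA is now pinned by its own read lock; the caller can call
         * mmap_read_unlock(mm) and continue working on this VMA only.
         */
        return vma;
}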

/*
 * Locks the next vma pointed to by the iterator. Confirms the locked vma has
 * not been modified and will retry under mmap_lock protection if modification
 * was detected. Should be called from an RCU read section.
 * Returns either a valid locked VMA, NULL if there are no more VMAs, or
 * ERR_PTR(-EINTR) if the process was interrupted.
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *iter,
                                     unsigned long address);

#else /* CONFIG_PER_VMA_LOCK */

static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        return false;
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return true;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
        { return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
        { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
static inline void vma_assert_detached(struct vm_area_struct *vma) {}
static inline void vma_mark_attached(struct vm_area_struct *vma) {}
static inline void vma_mark_detached(struct vm_area_struct *vma) {}

static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                                        unsigned long address)
{
        return NULL;
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        mmap_assert_locked(vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

static inline void mmap_write_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write_nested(&mm->mmap_lock, subclass);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int ret;

        __mmap_lock_trace_start_locking(mm, true);
        ret = down_write_killable(&mm->mmap_lock);
        if (!ret)
                mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, ret == 0);
        return ret;
}

/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);
        mm_lock_seqcount_end(mm);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, true);
        vma_end_write_all(mm);
        up_write(&mm->mmap_lock);
}

static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        __mmap_lock_trace_acquire_returned(mm, false, true);
        vma_end_write_all(mm);
        downgrade_write(&mm->mmap_lock);
}

static inline void mmap_read_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, false);
        down_read(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, true);
}

static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int ret;

        __mmap_lock_trace_start_locking(mm, false);
        ret = down_read_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, ret == 0);
        return ret;
}

static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        bool ret;

        __mmap_lock_trace_start_locking(mm, false);
        ret = down_read_trylock(&mm->mmap_lock) != 0;
        __mmap_lock_trace_acquire_returned(mm, false, ret);
        return ret;
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read(&mm->mmap_lock);
}

DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
             mmap_read_lock(_T), mmap_read_unlock(_T))
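
/*
 * Illustrative sketch, not part of the upstream header: using the
 * mmap_read_lock guard defined above so the unlock happens automatically at
 * scope exit (see linux/cleanup.h). The function name is hypothetical.
 */
static inline unsigned long mmap_guard_example(struct mm_struct *mm)
{
        guard(mmap_read_lock)(mm);

        /* mmap_lock is read-locked here and released when the guard goes out of scope. */
        return mm->total_vm;
}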

static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read_non_owner(&mm->mmap_lock);
}

static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */