/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making a decision about reusing an
	 * anon_vma instead of forking a new one. See the comments in
	 * anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}

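/*
 * Illustrative sketch (not an API in this header, loosely modeled on what
 * mm/rmap.c does): a walker that needs the anon_vma to stay alive pins it via
 * the refcount described above and takes the root rwsem in read mode for the
 * duration of the walk:
 *
 *	anon_vma = folio_get_anon_vma(folio);	// declared further below
 *	if (!anon_vma)
 *		return;
 *	anon_vma_lock_read(anon_vma);
 *	// ... walk the interval tree in anon_vma->rb_root ...
 *	anon_vma_unlock_read(anon_vma);
 *	put_anon_vma(anon_vma);
 *
 * Writers that link or unlink anon_vma_chains take the same rwsem in write
 * mode via anon_vma_lock_write().
 */
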
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}

static inline void anon_vma_merge(struct vm_area_struct *vma,
				  struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	return folio->_mm_id[idx] & MM_ID_MASK;
}

static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	folio->_mm_id[idx] &= ~MM_ID_MASK;
	folio->_mm_id[idx] |= id;
}

static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
		int diff, mm_id_t mm_id)
{
	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
	VM_WARN_ON_ONCE(diff <= 0);
	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

	/*
	 * Make sure we can detect at least one complete PTE mapping of the
	 * folio in a single MM as "exclusively mapped". This is primarily
	 * a check on 32bit, where we currently reduce the size of the per-MM
	 * mapcount to a short.
	 */
	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] < 0);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] < 0);
	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
			test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
		int mapcount, struct vm_area_struct *vma)
{
	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
	folio->_mm_id_mapcount[0] = mapcount - 1;
	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * If a folio is mapped more than once into an MM on 32bit, we
	 * can in theory overflow the per-MM mapcount (although only for
	 * fairly large folios), turning it negative. In that case, just
	 * free up the slot and mark the folio "mapped shared", otherwise
	 * we might be in trouble when unmapping pages later.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
			folio->_mm_id_mapcount[0] = -1;
			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
			folio->_mm_id_mapcount[1] = -1;
			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 0, mm_id);
		folio->_mm_id_mapcount[0] = diff - 1;
		/* We might have other mappings already. */
		if (new_mapcount_val != diff - 1)
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 1, mm_id);
		folio->_mm_id_mapcount[1] = diff - 1;
		/* Slot 0 certainly has mappings as well. */
		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	}
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount

static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * There are valid corner cases where we might underflow a per-MM
	 * mapcount (some mappings added when no slot was free, some mappings
	 * added once a slot was free), so we always set it to -1 once we go
	 * negative.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] -= diff;
		if (folio->_mm_id_mapcount[0] >= 0)
			goto out;
		folio->_mm_id_mapcount[0] = -1;
		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] -= diff;
		if (folio->_mm_id_mapcount[1] >= 0)
			goto out;
		folio->_mm_id_mapcount[1] = -1;
		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
	}

	/*
	 * If one MM slot owns all mappings, the folio is mapped exclusively.
	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
	 * slots must be free (mapcount == -1), and we'll also mark it as
	 * exclusive.
	 */
	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
	    folio->_mm_id_mapcount[1] == new_mapcount_val)
		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(): we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_add(diff, &folio->_large_mapcount);
}

static inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}

static inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
	folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
	folio_sub_return_large_mapcount(folio, 1, vma)
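
/*
 * Worked example for the CONFIG_MM_ID accounting above (illustrative only):
 * a 16-page folio fully PTE-mapped by MM A has _large_mapcount == 15,
 * slot 0 == { A, 15 }, slot 1 free (MM_ID_DUMMY, -1) and the shared bit
 * clear, so it is considered "mapped exclusively". After fork() copies the
 * page tables into MM B, folio_add_return_large_mapcount() leaves
 * _large_mapcount == 31, fills slot 1 == { B, 15 } and sets
 * FOLIO_MM_IDS_SHARED_BIT ("maybe mapped shared"). Once B unmaps all 16 PTEs
 * again, folio_sub_return_large_mapcount() releases slot 1 and notices that
 * slot 0 owns all remaining mappings (15 == new _large_mapcount), so the
 * shared bit is cleared again.
 */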

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
	RMAP_LEVEL_PUD,
};

static inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum rmap_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case RMAP_LEVEL_PTE:
		break;
	case RMAP_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case RMAP_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		VM_WARN_ON_ONCE(true);
	}

	/*
	 * Anon folios must have an associated live anon_vma as long as they're
	 * mapped into userspace.
	 * Note that the atomic_read() mainly does two things:
	 *
	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
	 *    check that the associated anon_vma has not yet been freed (subject
	 *    to KASAN's usual limitations). This check will pass if the
	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
	 *    period hasn't passed since then.
	 * 2. If the anon_vma has not yet been freed, it checks that the
	 *    anon_vma still has a nonzero refcount (as opposed to being in the
	 *    middle of an RCU delay for getting freed).
	 */
	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
		unsigned long mapping = (unsigned long)folio->mapping;
		struct anon_vma *anon_vma;

		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

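/*
 * Illustrative sketch (loosely modeled on do_anonymous_page() in mm/memory.c,
 * error handling and details omitted): a fault handler that instantiates a
 * new, exclusively owned anonymous folio wires up the rmap before installing
 * the PTE:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
 *	...
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, mk_pte(&folio->page, vma->vm_page_prot));
 *
 * Unmap paths pair this with folio_remove_rmap_pte()/folio_remove_rmap_ptes().
 */
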
/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case RMAP_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
			do {
				atomic_inc(&page->_mapcount);
			} while (page++, --nr_pages > 0);
		}
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	}
}

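/*
 * Illustrative sketch: unlike the anon variants further below, duplicating
 * the rmap of a file/shared mapping at fork() time cannot fail, so a page
 * table copy path that copies "nr" PTEs of a folio into dst_vma can simply do:
 *
 *	folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
 */
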
/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process, don't
	 * allow duplicating the mappings; instead require that e.g. the
	 * subpage is copied immediately for the child, so that we always
	 * guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
	 */
	switch (level) {
	case RMAP_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeds, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, RMAP_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 RMAP_LEVEL_PTE);
}

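/*
 * Illustrative sketch (loosely modeled on the fork() PTE copy path in
 * mm/memory.c): the caller tries to duplicate the mapping and, if that fails
 * because the folio may be pinned, falls back to copying the page for the
 * child (see copy_present_page() in mm/memory.c); on success, the PTE is
 * installed read-only so that a later write fault triggers COW:
 *
 *	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma)))
 *		return copy_the_page_instead();		// hypothetical fallback
 *	pte = pte_mkold(pte_wrprotect(pte));
 *	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
 */
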
/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. It must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

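/*
 * Illustrative sketch (loosely modeled on try_to_migrate_one() in mm/rmap.c):
 * callers clear the PTE first, then try to mark the exclusive page possibly
 * shared, and back off by restoring the PTE if the folio may be pinned:
 *
 *	pteval = ptep_clear_flush(vma, address, pvmw.pte);
 *	if (folio_try_share_anon_rmap_pte(folio, subpage)) {
 *		set_pte_at(mm, address, pvmw.pte, pteval);	// back off
 *		page_vma_mapped_walk_done(&pvmw);
 *		break;
 *	}
 *	// safe to replace the PTE with a swap/migration entry now
 */
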
/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, vm_flags_t *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB ptes are set to the relevant page table entry without pte_map(). */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);

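/*
 * Illustrative sketch: the usual way to visit every page table entry that
 * maps a folio inside one VMA is
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		// pvmw.pte (or pvmw.pmd) points at the mapping of
 *		// pvmw.address; the corresponding page table lock
 *		// pvmw.ptl is held here
 *	}
 *
 * page_vma_mapped_walk() releases the lock and mapping itself before
 * returning false, so page_vma_mapped_walk_done() is only needed when
 * breaking out of the loop early.
 */
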
/*
 * Cleans the PTEs of shared mappings
 * (and since clean PTEs should also be read-only, write-protects them too).
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

enum rmp_flags {
	RMP_LOCKED		= 1 << 0,
	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: To control rmap traversal for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where the folio is mapped
 * done: for checking the traversal termination condition
 * anon_lock: for taking the anon_vma lock in an optimized way rather than
 *	      the default
 * invalid_vma: for skipping uninteresting vmas
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				   struct mem_cgroup *memcg,
				   vm_flags_t *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */