/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/srcu.h>

struct mmu_notifier;
struct mmu_notifier_ops;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough, no need to inspect changes to the CPU page table
 * (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). The user should soft dirty the page in the end callback to
 * make sure that anyone relying on soft dirtiness catches pages that might
 * be written through non CPU mappings.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
};

#ifdef CONFIG_MMU_NOTIFIER

/*
 * The mmu_notifier_mm structure is allocated and installed in
 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_mm {
        /* all mmu notifiers registered in this mm are queued in this list */
        struct hlist_head list;
        /* to serialize the list modifications and hlist_unhashed */
        spinlock_t lock;
};

#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

struct mmu_notifier_range {
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
        unsigned flags;
        enum mmu_notifier_event event;
};
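
/*
 * Illustrative sketch (editor's example, not part of the upstream header):
 * a typical caller fills a range with mmu_notifier_range_init() (declared
 * below) and brackets its page table update with the start/end notifiers.
 * vma, mm, start and end are placeholders:
 *
 *        struct mmu_notifier_range range;
 *
 *        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 *                        start, end);
 *        mmu_notifier_invalidate_range_start(&range);
 *        ... clear the CPU page table entries for [start, end) ...
 *        mmu_notifier_invalidate_range_end(&range);
 */
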
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, which would lead to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *mn, struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM test-and-clears the
         * young/accessed bitflag in the pte. This way the VM will
         * provide proper aging for accesses to the page through the
         * secondary MMUs and not only for the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the
         * page at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *mn,
                        struct mm_struct *mm,
                        unsigned long start, unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
                        unsigned long start, unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *mn, struct mm_struct *mm,
                        unsigned long address);

        /*
         * change_pte is called in cases where the pte mapping a page is
         * changed: for example, when ksm remaps a pte to point to a new
         * shared page.
         */
        void (*change_pte)(struct mmu_notifier *mn, struct mm_struct *mm,
                        unsigned long address, pte_t pte);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_sem and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start()/end() for the whole duration of the
         * invalidate_range_start()/end() critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed. If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If the blockable argument is set to false then the callback cannot
         * sleep and has to return -EAGAIN; 0 should be returned
         * otherwise. Please note that if invalidate_range_start approves
         * a non-blocking behavior then the same applies to
         * invalidate_range_end.
         */
        int (*invalidate_range_start)(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *range);

        /*
         * invalidate_range() is either called between
         * invalidate_range_start() and invalidate_range_end() when the
         * VM has to free pages that were unmapped, but before the
         * pages are actually freed, or outside of _start()/_end() when
         * a (remote) TLB flush is necessary.
         *
         * If invalidate_range() is used to manage a non-CPU TLB with
         * shared page-tables, it is not necessary to implement the
         * invalidate_range_start()/end() notifiers, as
         * invalidate_range() already catches the points in time when an
         * external TLB range needs to be flushed. For a more in-depth
         * discussion on this see Documentation/vm/mmu_notifier.rst
         *
         * Note that this function might be called with just a sub-range
         * of what was passed to invalidate_range_start()/end(), if
         * called between those functions.
         */
        void (*invalidate_range)(struct mmu_notifier *mn,
                        struct mm_struct *mm,
                        unsigned long start, unsigned long end);
};
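
/*
 * Illustrative sketch (editor's example, not part of the upstream header):
 * an invalidate_range_start() implementation that needs a sleeping lock
 * must honour the blockable flag documented above and return -EAGAIN when
 * it may not sleep. struct my_mirror, mirror_of() and my_mirror_unmap()
 * are hypothetical driver pieces:
 *
 *        static int my_invalidate_range_start(struct mmu_notifier *mn,
 *                        const struct mmu_notifier_range *range)
 *        {
 *                struct my_mirror *mirror = mirror_of(mn);
 *
 *                if (mmu_notifier_range_blockable(range))
 *                        mutex_lock(&mirror->lock);
 *                else if (!mutex_trylock(&mirror->lock))
 *                        return -EAGAIN;
 *
 *                my_mirror_unmap(mirror, range->start, range->end);
 *                mutex_unlock(&mirror->lock);
 *                return 0;
 *        }
 */
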
/*
 * The notifier chains are protected by mmap_sem and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_sem locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_sem is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        struct hlist_node hlist;
        const struct mmu_notifier_ops *ops;
};
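
/*
 * Illustrative sketch (editor's example): a driver embeds a struct
 * mmu_notifier in its own state, fills in only the callbacks it needs
 * (unimplemented hooks may be left NULL) and registers it against an mm,
 * typically current->mm. my_mirror and the my_* callbacks are hypothetical:
 *
 *        static const struct mmu_notifier_ops my_mirror_ops = {
 *                .release                = my_release,
 *                .invalidate_range_start = my_invalidate_range_start,
 *                .invalidate_range_end   = my_invalidate_range_end,
 *        };
 *
 *        mirror->mn.ops = &my_mirror_ops;
 *        ret = mmu_notifier_register(&mirror->mn, current->mm);
 */
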
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->mmu_notifier_mm);
}

extern int mmu_notifier_register(struct mmu_notifier *mn,
                struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *mn,
                struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
                struct mm_struct *mm);
extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
                struct mm_struct *mm);
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                unsigned long start, unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                unsigned long start, unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
                unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
                bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
}

static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (mm_has_notifiers(mm))
                __mmu_notifier_release(mm);
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        if (mm_has_notifiers(mm))
                return __mmu_notifier_clear_flush_young(mm, start, end);
        return 0;
}

static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        if (mm_has_notifiers(mm))
                return __mmu_notifier_clear_young(mm, start, end);
        return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
                unsigned long address)
{
        if (mm_has_notifiers(mm))
                return __mmu_notifier_test_young(mm, address);
        return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                unsigned long address, pte_t pte)
{
        if (mm_has_notifiers(mm))
                __mmu_notifier_change_pte(mm, address, pte);
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        if (mm_has_notifiers(range->mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        if (mm_has_notifiers(range->mm)) {
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                return __mmu_notifier_invalidate_range_start(range);
        }
        return 0;
}

static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mm_has_notifiers(range->mm))
                __mmu_notifier_invalidate_range_end(range, false);
}
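
/*
 * Illustrative sketch (editor's example): a caller that must not sleep uses
 * the _nonblock variant above; if any notifier refuses (-EAGAIN), the caller
 * gives up without tearing anything down and without calling the end
 * notifier, and retries later from a context that may block:
 *
 *        if (mmu_notifier_invalidate_range_start_nonblock(&range))
 *                return false;
 *        ... tear down page table entries ...
 *        mmu_notifier_invalidate_range_end(&range);
 */
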
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
        if (mm_has_notifiers(range->mm))
                __mmu_notifier_invalidate_range_end(range, true);
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        if (mm_has_notifiers(mm))
                __mmu_notifier_invalidate_range(mm, start, end);
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
        mm->mmu_notifier_mm = NULL;
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
        if (mm_has_notifiers(mm))
                __mmu_notifier_mm_destroy(mm);
}

static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                enum mmu_notifier_event event, unsigned flags,
                struct vm_area_struct *vma, struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        range->vma = vma;
        range->event = event;
        range->mm = mm;
        range->start = start;
        range->end = end;
        range->flags = flags;
}

#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
({ \
        int __young; \
        struct vm_area_struct *___vma = __vma; \
        unsigned long ___address = __address; \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep); \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
                        ___address, ___address + PAGE_SIZE); \
        __young; \
})

#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \
({ \
        int __young; \
        struct vm_area_struct *___vma = __vma; \
        unsigned long ___address = __address; \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
                        ___address, ___address + PMD_SIZE); \
        __young; \
})

#define ptep_clear_young_notify(__vma, __address, __ptep) \
({ \
        int __young; \
        struct vm_area_struct *___vma = __vma; \
        unsigned long ___address = __address; \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep); \
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
                        ___address + PAGE_SIZE); \
        __young; \
})

#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
({ \
        int __young; \
        struct vm_area_struct *___vma = __vma; \
        unsigned long ___address = __address; \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp); \
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
                        ___address + PMD_SIZE); \
        __young; \
})

#define ptep_clear_flush_notify(__vma, __address, __ptep) \
({ \
        unsigned long ___addr = __address & PAGE_MASK; \
        struct mm_struct *___mm = (__vma)->vm_mm; \
        pte_t ___pte; \
\
        ___pte = ptep_clear_flush(__vma, __address, __ptep); \
        mmu_notifier_invalidate_range(___mm, ___addr, \
                        ___addr + PAGE_SIZE); \
\
        ___pte; \
})

#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
({ \
        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
        struct mm_struct *___mm = (__vma)->vm_mm; \
        pmd_t ___pmd; \
\
        ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
        mmu_notifier_invalidate_range(___mm, ___haddr, \
                        ___haddr + HPAGE_PMD_SIZE); \
\
        ___pmd; \
})
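
/*
 * Illustrative sketch (editor's example): reclaim and aging code uses the
 * *_notify variants above so that accessed bits in any secondary MMUs are
 * folded into the result along with the CPU pte's accessed bit. In a
 * hypothetical rmap walk over a pte:
 *
 *        int referenced = 0;
 *
 *        if (ptep_clear_flush_young_notify(vma, address, pte))
 *                referenced++;
 */
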
#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \
({ \
        unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \
        struct mm_struct *___mm = (__vma)->vm_mm; \
        pud_t ___pud; \
\
        ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \
        mmu_notifier_invalidate_range(___mm, ___haddr, \
                        ___haddr + HPAGE_PUD_SIZE); \
\
        ___pud; \
})

/*
 * set_pte_at_notify() sets the pte _after_ running the notifier.
 * It is safe to start by updating the secondary MMUs, because the primary MMU
 * pte invalidate must have already happened with a ptep_clear_flush() before
 * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is
 * required when we change both the protection of the mapping from read-only
 * to read-write and the pfn (like during copy on write page faults).
 * Otherwise the old page would remain mapped read-only in the secondary MMUs
 * after the new page is already writable by some CPU through the primary MMU.
 */
#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
({ \
        struct mm_struct *___mm = __mm; \
        unsigned long ___address = __address; \
        pte_t ___pte = __pte; \
\
        mmu_notifier_change_pte(___mm, ___address, ___pte); \
        set_pte_at(___mm, ___address, __ptep, ___pte); \
})
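
/*
 * Illustrative sketch (editor's example): the copy-on-write case described
 * above first clears and invalidates the old pte (ptep_clear_flush_notify()
 * also tells the secondary MMUs via invalidate_range()) and only then
 * installs the new, writable pte with set_pte_at_notify(), all inside an
 * invalidate_range_start()/end() section. vma, address, ptep and new_pte
 * are placeholders:
 *
 *        ptep_clear_flush_notify(vma, address, ptep);
 *        set_pte_at_notify(vma->vm_mm, address, ptep, new_pte);
 */
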
extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
                void (*func)(struct rcu_head *rcu));

#else /* CONFIG_MMU_NOTIFIER */

struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                unsigned long start, unsigned long end)
{
        range->start = start;
        range->end = end;
}

#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) \
        _mmu_notifier_range_init(range, start, end)

static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
                unsigned long address)
{
        return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                unsigned long address, pte_t pte)
{
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
}

#define mmu_notifier_range_update_to_read_only(r) false

#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */