Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
mm/ksm.c at v2.6.38
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hash.h>
#include <linux/freezer.h>

#include <asm/tlbflush.h>
#include "internal.h"

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a red-black tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 */
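
/*
 * An illustrative sketch of that decision order, in rough pseudo-code
 * (hedged: this is orientation only, and simply names the helpers that
 * are defined later in this file; the authoritative flow is
 * cmp_and_merge_page() below):
 *
 *	kpage = stable_tree_search(page);
 *	if (kpage)
 *		try_to_merge_with_ksm_page(rmap_item, page, kpage);
 *	else if (calc_checksum(page) != rmap_item->oldchecksum)
 *		rmap_item->oldchecksum = calc_checksum(page);
 *	else if (unstable_tree_search_insert() finds an identical page)
 *		try_to_merge_two_pages() and stable_tree_insert();
 */
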
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page
 */
struct stable_node {
	struct rb_node node;
	struct hlist_head hlist;
	unsigned long kpfn;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	struct anon_vma *anon_vma;	/* when stable */
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
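
/*
 * A worked example of this flag packing (values assumed purely for
 * illustration): an rmap_item for virtual address 0x7f0000001000,
 * inserted into the unstable tree during full scan number 5, holds
 *
 *	address = 0x7f0000001000 | UNSTABLE_FLAG | (5 & SEQNR_MASK)
 *
 * The flags fit in the low 10 bits, so the page-aligned address is
 * always recovered as rmap_item->address & PAGE_MASK.
 */
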
/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

#define MM_SLOTS_HASH_SHIFT 10
#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_STOP;

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
	hlist_for_each_entry(mm_slot, node, bucket, link) {
		if (mm == mm_slot->mm)
			return mm_slot;
	}
	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
	mm_slot->mm = mm;
	hlist_add_head(&mm_slot->link, bucket);
}

static inline int in_stable_tree(struct rmap_item *rmap_item)
{
	return rmap_item->address & STABLE_FLAG;
}

static void hold_anon_vma(struct rmap_item *rmap_item,
			  struct anon_vma *anon_vma)
{
	rmap_item->anon_vma = anon_vma;
	get_anon_vma(anon_vma);
}

static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
{
	struct anon_vma *anon_vma = rmap_item->anon_vma;

	drop_anon_vma(anon_vma);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	ksm_drop_anon_vma(rmap_item);

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;
	break_ksm(vma, addr);
out:
	up_read(&mm->mmap_sem);
}

static struct page *page_trans_compound_anon(struct page *page)
{
	if (PageTransCompound(page)) {
		struct page *head = compound_trans_head(page);
		/*
		 * head may actually be split and freed from under
		 * us but it's ok here.
		 */
		if (PageAnon(head))
			return head;
	}
	return NULL;
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page) || page_trans_compound_anon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;
	struct hlist_node *hlist;

	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		ksm_drop_anon_vma(rmap_item);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	rb_erase(&stable_node->node, &root_stable_tree);
	free_stable_node(stable_node);
}

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 *
 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
 * but this is different - made simpler by ksm_thread_mutex being held, but
 * interesting for assuming that no other use of the struct page could ever
 * put our expected_mapping into page->mapping (or a field of the union which
 * coincides with page->mapping).  The RCU calls are not for KSM at all, but
 * to keep the page_count protocol described with page_cache_get_speculative.
 *
 * Note: it is possible that get_ksm_page() will return NULL one moment,
 * then page the next, if the page is in between page_freeze_refs() and
 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
 * is on its way to being freed; but it is an anomaly to bear in mind.
 */
static struct page *get_ksm_page(struct stable_node *stable_node)
{
	struct page *page;
	void *expected_mapping;

	page = pfn_to_page(stable_node->kpfn);
	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
	rcu_read_lock();
	if (page->mapping != expected_mapping)
		goto stale;
	if (!get_page_unless_zero(page))
		goto stale;
	if (page->mapping != expected_mapping) {
		put_page(page);
		goto stale;
	}
	rcu_read_unlock();
	return page;
stale:
	rcu_read_unlock();
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
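
/*
 * A hedged aside on the "key" checked above, based on how anonymous
 * page mappings are encoded in mm generally: an ordinary PageAnon page
 * keeps (void *)anon_vma + PAGE_MAPPING_ANON in page->mapping, while a
 * ksm page keeps (void *)stable_node + (PAGE_MAPPING_ANON |
 * PAGE_MAPPING_KSM) - which is exactly the expected_mapping that
 * get_ksm_page() computes, and compares both before and after taking
 * the speculative page reference.
 */
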
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node);
		if (!page)
			goto out;

		lock_page(page);
		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (stable_node->hlist.first)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		ksm_drop_anon_vma(rmap_item);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);

		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}
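
/*
 * A sketch of how the helpers above cooperate (an assumed summary,
 * mirroring cmp_and_merge_page() later in this file): the jhash2
 * checksum is never used to look pages up, only to detect change
 * between two full scans -
 *
 *	checksum = calc_checksum(page);
 *	if (rmap_item->oldchecksum != checksum)
 *		rmap_item->oldchecksum = checksum;	-- too volatile
 *	else
 *		consider the page for the unstable tree
 *
 * whereas tree ordering always uses the full memcmp_pages() comparison.
 */
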
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));
	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it
		 * doesn't take any lock, therefore the check that we are
		 * going to make with the pagecount against the mapcount is
		 * racy and O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check;
		 * this assures us that no O_DIRECT can happen after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	BUG_ON(pmd_trans_huge(*pmd));
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	page_remove_rmap(page);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

static int page_trans_compound_anon_split(struct page *page)
{
	int ret = 0;
	struct page *transhuge_head = page_trans_compound_anon(page);
	if (transhuge_head) {
		/* Get the reference on the head to split it. */
		if (get_page_unless_zero(transhuge_head)) {
			/*
			 * Recheck we got the reference while the head
			 * was still anonymous.
			 */
			if (PageAnon(transhuge_head))
				ret = split_huge_page(transhuge_head);
			else
				/*
				 * Retry later if split_huge_page ran
				 * from under us.
				 */
				ret = 1;
			put_page(transhuge_head);
		} else
			/* Retry later if split_huge_page ran from under us. */
			ret = 1;
	}
	return ret;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, rmap_item->address);
	if (!vma || vma->vm_start > rmap_item->address)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Must get reference to anon_vma while still holding mmap_sem */
	hold_anon_vma(rmap_item, vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	struct rb_node *node = root_stable_tree.rb_node;
	struct stable_node *stable_node;

	stable_node = page_stable_node(page);
	if (stable_node) {			/* ksm page forked */
		get_page(page);
		return page;
	}

	while (node) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(node, struct stable_node, node);
		tree_page = get_ksm_page(stable_node);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(page, tree_page);

		if (ret < 0) {
			put_page(tree_page);
			node = node->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			node = node->rb_right;
		} else
			return tree_page;
	}

	return NULL;
}

/*
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	struct rb_node **new = &root_stable_tree.rb_node;
	struct rb_node *parent = NULL;
	struct stable_node *stable_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	stable_node = alloc_stable_node();
	if (!stable_node)
		return NULL;

	rb_link_node(&stable_node->node, parent, new);
	rb_insert_color(&stable_node->node, &root_stable_tree);

	INIT_HLIST_HEAD(&stable_node->hlist);

	stable_node->kpfn = page_to_pfn(kpage);
	set_page_stable_node(kpage, stable_node);

	return stable_node;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)

{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (IS_ERR_OR_NULL(tree_page))
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
{
	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	remove_rmap_item_from_tree(rmap_item);

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage) {
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as a new node in the stable tree.
		 */
		if (kpage) {
			remove_rmap_item_from_tree(tree_rmap_item);

			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}
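
/*
 * A worked example of the unstable-tree path above, assuming pages A
 * and B are identical and stayed unchanged across two full scans:
 * scanning A finds B in the unstable tree; try_to_merge_two_pages()
 * write-protects A and upgrades it in place to a ksm page, then points
 * B's pte at A via replace_page().  A is then inserted into the stable
 * tree and both rmap_items hang off its new stable_node, leaving the
 * counters at pages_shared = 1 and pages_sharing = 1.
 */
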
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page) ||
			    page_trans_compound_anon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hlist_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
				 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
			return 0;		/* just ignore the advice */

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
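
/*
 * For orientation, a minimal userspace sketch of how this entry point
 * is reached (an assumed example, not part of this file): an anonymous
 * private mapping is registered with madvise(2), after which ksmd may
 * merge its identical pages.
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (madvise(p, len, MADV_MERGEABLE))
 *		perror("madvise");
 *
 * madvise() calls down to ksm_madvise() above, setting VM_MERGEABLE and
 * entering the mm into ksmd's scan list via __ksm_enter().
 */
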
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 */
	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

struct page *ksm_does_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct page *new_page;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		SetPageSwapBacked(new_page);
		__set_page_locked(new_page);

		if (page_evictable(new_page, vma))
			lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
		else
			add_page_to_unevictable_list(new_page);
	}

	return new_page;
}

int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
			unsigned long *vm_flags)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	struct hlist_node *hlist;
	unsigned int mapcount = page_mapcount(page);
	int referenced = 0;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return 0;
again:
	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock(anon_vma);
		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
				continue;

			referenced += page_referenced_one(page, vma,
				rmap_item->address, &mapcount, vm_flags);
			if (!search_new_forks || !mapcount)
				break;
		}
		anon_vma_unlock(anon_vma);
		if (!mapcount)
			goto out;
	}
	if (!search_new_forks++)
		goto again;
out:
	return referenced;
}

int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
	struct stable_node *stable_node;
	struct hlist_node *hlist;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return SWAP_FAIL;
again:
	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock(anon_vma);
		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			ret = try_to_unmap_one(page, vma,
					rmap_item->address, flags);
			if (ret != SWAP_AGAIN || !page_mapped(page)) {
				anon_vma_unlock(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

#ifdef CONFIG_MIGRATION
int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
		  struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct stable_node *stable_node;
	struct hlist_node *hlist;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock(anon_vma);
		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			ret = rmap_one(page, vma, rmap_item->address, arg);
			if (ret != SWAP_AGAIN) {
				anon_vma_unlock(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON(!PageLocked(oldpage));
	VM_BUG_ON(!PageLocked(newpage));
	VM_BUG_ON(newpage->mapping != oldpage->mapping);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
		stable_node->kpfn = page_to_pfn(newpage);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
						 unsigned long end_pfn)
{
	struct rb_node *node;

	for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
		struct stable_node *stable_node;

		stable_node = rb_entry(node, struct stable_node, node);
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			return stable_node;
	}
	return NULL;
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	struct stable_node *stable_node;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Keep it very simple for now: just lock out ksmd and
		 * MADV_UNMERGEABLE while any memory is going offline.
		 * mutex_lock_nested() is necessary because lockdep was alarmed
		 * that here we take ksm_thread_mutex inside notifier chain
		 * mutex, and later take notifier chain mutex inside
		 * ksm_thread_mutex to unlock it.  But that's safe because both
		 * are inside mem_hotplug_mutex.
		 */
		mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree.
		 */
		while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
					mn->start_pfn + mn->nr_pages)) != NULL)
			remove_node_from_stable_tree(stable_node);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_unlock(&ksm_thread_mutex);
		break;
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			current->flags |= PF_OOM_ORIGIN;
			err = unmerge_and_remove_all_rmap_items();
			current->flags &= ~PF_OOM_ORIGIN;
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/*
	 * Choose a high priority since the callback takes ksm_thread_mutex:
	 * later callbacks could only be taking locks which nest within that.
	 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)
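
/*
 * A hedged usage note on the sysfs interface registered above: the
 * attribute group is created under mm_kobj with name "ksm", so the
 * knobs appear as /sys/kernel/mm/ksm/.  A typical session (shell
 * commands shown as an assumed example, values illustrative only):
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs
 *	echo 1   > /sys/kernel/mm/ksm/run	-- KSM_RUN_MERGE
 *	cat /sys/kernel/mm/ksm/pages_shared
 *	echo 2   > /sys/kernel/mm/ksm/run	-- KSM_RUN_UNMERGE
 */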