Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
"22 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (22 commits)
epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT
radix-tree: fix oops after radix_tree_iter_retry
MAINTAINERS: trim the file triggers for ABI/API
dax: dirty inode only if required
thp: make deferred_split_scan() work again
mm: replace vma_lock_anon_vma with anon_vma_lock_read/write
ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
um: asm/page.h: remove the pte_high member from struct pte_t
mm, hugetlb: don't require CMA for runtime gigantic pages
mm/hugetlb: fix gigantic page initialization/allocation
mm: downgrade VM_BUG in isolate_lru_page() to warning
mempolicy: do not try to queue pages from !vma_migratable()
mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress
vmstat: make vmstat_update deferrable
mm, vmstat: make quiet_vmstat lighter
mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT
memblock: don't mark memblock_phys_mem_size() as __init
dump_stack: avoid potential deadlocks
mm: validate_mm browse_rb SMP race condition
m32r: fix build failure due to SMP and MMU
...

-2
MAINTAINERS
···
 
 ABI/API
 L:	linux-api@vger.kernel.org
-F:	Documentation/ABI/
 F:	include/linux/syscalls.h
-F:	include/uapi/
 F:	kernel/sys_ni.c
 
 ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
+1
arch/m32r/Kconfig
···
 
 config SMP
 	bool "Symmetric multi-processing support"
+	depends on MMU
 	---help---
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, say N. If you have a system with more
+9 -12
arch/um/include/asm/page.h
···
 
 #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
 
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
-#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
+#define pte_val(p) ((p).pte)
 
-#define pte_get_bits(pte, bits) ((pte).pte_low & (bits))
-#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits))
-#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits))
-#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \
-			      smp_wmb(); \
-			      (to).pte_low = (from).pte_low; })
-#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high)
-#define pte_set_val(pte, phys, prot) \
-	({ (pte).pte_high = (phys) >> 32; \
-	   (pte).pte_low = (phys) | pgprot_val(prot); })
+#define pte_get_bits(p, bits) ((p).pte & (bits))
+#define pte_set_bits(p, bits) ((p).pte |= (bits))
+#define pte_clear_bits(p, bits) ((p).pte &= ~(bits))
+#define pte_copy(to, from) ({ (to).pte = (from).pte; })
+#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
+#define pte_set_val(p, phys, prot) \
+	({ (p).pte = (phys) | pgprot_val(prot); })
 
 #define pmd_val(x)	((x).pmd)
 #define __pmd(x) ((pmd_t) { (x) } )
+2 -2
arch/x86/mm/hugetlbpage.c
···
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 static __init int gigantic_pages_init(void)
 {
-	/* With CMA we can allocate gigantic pages at runtime */
+	/* With compaction or CMA we can allocate gigantic pages at runtime */
 	if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
 		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 	return 0;
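With this change, runtime allocation of 1GB pages on x86-64 no longer depends on CONFIG_CMA; memory isolation plus compaction is enough. A hedged sketch of how such a request typically looks from user space; the sysfs path assumes the standard hugetlb layout and x86-64's 1048576kB gigantic page size, and is not part of the patch:

/* Ask the kernel to reserve two 1GB huge pages at run time by writing
 * to the hugetlb sysfs node (assumed path, standard layout). */
#include <stdio.h>

int main(void)
{
	const char *node =
		"/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages";
	FILE *f = fopen(node, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "2\n");	/* may reserve fewer if memory is fragmented */
	fclose(f);
	return 0;
}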
+7 -1
fs/block_dev.c
···
 	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
 
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	return dax_pfn_mkwrite(vma, vmf);
+}
+
 static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, unsigned int flags)
 {
···
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
 	.fault		= blkdev_dax_fault,
 	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_fault,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
 };
 
 static const struct vm_operations_struct blkdev_default_vm_ops = {
+2 -1
fs/dax.c
···
 	void *entry;
 
 	WARN_ON_ONCE(pmd_entry && !dirty);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	spin_lock_irq(&mapping->tree_lock);
 
+32 -6
fs/eventpoll.c
···
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
 
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
 
···
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq)) {
-		ewake = 1;
+		if ((epi->event.events & EPOLLEXCLUSIVE) &&
+					!((unsigned long)key & POLLFREE)) {
+			switch ((unsigned long)key & EPOLLINOUT_BITS) {
+			case POLLIN:
+				if (epi->event.events & POLLIN)
+					ewake = 1;
+				break;
+			case POLLOUT:
+				if (epi->event.events & POLLOUT)
+					ewake = 1;
+				break;
+			case 0:
+				ewake = 1;
+				break;
+			}
+		}
 		wake_up_locked(&ep->wq);
 	}
 	if (waitqueue_active(&ep->poll_wait))
···
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 	 * Also, we do not currently supported nested exclusive wakeups.
 	 */
-	if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
-		(op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
-		goto error_tgt_fput;
+	if (epds.events & EPOLLEXCLUSIVE) {
+		if (op == EPOLL_CTL_MOD)
+			goto error_tgt_fput;
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+			goto error_tgt_fput;
+	}
 
 	/*
 	 * At this point it is safe to assume that the "private_data" contains
···
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
-			epds.events |= POLLERR | POLLHUP;
-			error = ep_modify(ep, epi, &epds);
+			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+				epds.events |= POLLERR | POLLHUP;
+				error = ep_modify(ep, epi, &epds);
+			}
 		} else
 			error = -ENOENT;
 		break;
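The user-visible effect of the checks above: EPOLLEXCLUSIVE may only be requested at EPOLL_CTL_ADD time, and only together with EPOLLIN and/or EPOLLOUT (plus the EPOLLERR/EPOLLHUP, EPOLLWAKEUP and EPOLLET flag bits); other event bits, nesting on another epoll fd, or a later EPOLL_CTL_MOD are refused. A minimal user-space sketch of that behaviour, with a pipe standing in for a real shared listening socket and headers new enough to define EPOLLEXCLUSIVE assumed:

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int epfd = epoll_create1(0);
	struct epoll_event ev;

	if (epfd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}

	/* Allowed: EPOLLEXCLUSIVE together with EPOLLIN (optionally EPOLLET). */
	ev.events = EPOLLIN | EPOLLEXCLUSIVE;
	ev.data.fd = pipefd[0];
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev) == 0)
		puts("EPOLLIN | EPOLLEXCLUSIVE: accepted");

	/* Refused after this patch: EPOLLEXCLUSIVE with e.g. EPOLLONESHOT. */
	ev.events = EPOLLIN | EPOLLONESHOT | EPOLLEXCLUSIVE;
	ev.data.fd = pipefd[1];
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[1], &ev) < 0)
		perror("EPOLLONESHOT | EPOLLEXCLUSIVE");	/* expected to fail (EINVAL) */

	close(pipefd[0]);
	close(pipefd[1]);
	close(epfd);
	return 0;
}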
+2
fs/ocfs2/dlm/dlmrecovery.c
···
 						break;
 					}
 				}
+				dlm_lockres_clear_refmap_bit(dlm, res,
+						dead_node);
 				spin_unlock(&res->spinlock);
 				continue;
 			}
+3 -3
include/linux/gfp.h
···
 }
 #endif /* CONFIG_PM_SLEEP */
 
-#ifdef CONFIG_CMA
-
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      unsigned migratetype);
 extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
+#endif
 
+#ifdef CONFIG_CMA
 /* CMA stuff */
 extern void init_cma_reserved_pageblock(struct page *page);
-
 #endif
 
 #endif /* __LINUX_GFP_H__ */
+3 -3
include/linux/radix-tree.h
···
  * @iter:	pointer to radix tree iterator
  * Returns:	current chunk size
  */
-static __always_inline unsigned
+static __always_inline long
 radix_tree_chunk_size(struct radix_tree_iter *iter)
 {
 	return iter->next_index - iter->index;
···
 			return slot + offset + 1;
 		}
 	} else {
-		unsigned size = radix_tree_chunk_size(iter) - 1;
+		long size = radix_tree_chunk_size(iter);
 
-		while (size--) {
+		while (--size > 0) {
 			slot++;
 			iter->index++;
 			if (likely(*slot))
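The signedness change matters because the chunk size can legitimately be zero once radix_tree_iter_retry() has reset next_index back to index; with the old unsigned arithmetic the "- 1" then wraps around and the loop walks far past the end of the node, which is the reported oops. A stand-alone sketch of that wrap-around (plain user-space C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned chunk = 0;		/* models the state after radix_tree_iter_retry() */

	unsigned old_size = chunk - 1;	/* old code: wraps to UINT_MAX */
	long new_size = chunk;		/* patched code: stays 0 */
	long iterations = 0;

	printf("old loop would run %u times\n", old_size);

	while (--new_size > 0)		/* patched loop: body never runs for a zero chunk */
		iterations++;
	printf("new loop runs %ld times\n", iterations);
	return 0;
}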
-14
include/linux/rmap.h
···
 		__put_anon_vma(anon_vma);
 }
 
-static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = vma->anon_vma;
-	if (anon_vma)
-		down_write(&anon_vma->root->rwsem);
-}
-
-static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = vma->anon_vma;
-	if (anon_vma)
-		up_write(&anon_vma->root->rwsem);
-}
-
 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
 	down_write(&anon_vma->root->rwsem);
+4 -2
kernel/signal.c
···
 	current->saved_sigmask = current->blocked;
 	set_current_blocked(set);
 
-	__set_current_state(TASK_INTERRUPTIBLE);
-	schedule();
+	while (!signal_pending(current)) {
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+	}
 	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
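The retry loop restores the long-standing guarantee that sigsuspend() and friends only return to user space after a signal has actually been delivered, rather than on a spurious wakeup. A hedged user-space sketch of code that relies on that guarantee (standard POSIX pattern, not taken from the patch):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void handler(int sig)
{
	(void)sig;
	got_usr1 = 1;
}

int main(void)
{
	sigset_t block, old;

	signal(SIGUSR1, handler);
	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, &old);	/* close the check/sleep race */

	printf("waiting for SIGUSR1 (pid %d)\n", getpid());
	while (!got_usr1)
		sigsuspend(&old);		/* atomically unblock and sleep */

	printf("got it\n");
	return 0;
}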
+4 -3
lib/dump_stack.c
···
 
 asmlinkage __visible void dump_stack(void)
 {
+	unsigned long flags;
 	int was_locked;
 	int old;
 	int cpu;
···
 	 * Permit this cpu to perform nested stack dumps while serialising
 	 * against other CPUs
 	 */
-	preempt_disable();
-
 retry:
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 	old = atomic_cmpxchg(&dump_lock, -1, cpu);
 	if (old == -1) {
···
 	} else if (old == cpu) {
 		was_locked = 1;
 	} else {
+		local_irq_restore(flags);
 		cpu_relax();
 		goto retry;
 	}
···
 	if (!was_locked)
 		atomic_set(&dump_lock, -1);
 
-	preempt_enable();
+	local_irq_restore(flags);
 }
 #else
 asmlinkage __visible void dump_stack(void)
+5 -4
mm/Kconfig
···
 	bool
 
 config DEFERRED_STRUCT_PAGE_INIT
-	bool "Defer initialisation of struct pages to kswapd"
+	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	depends on MEMORY_HOTPLUG
···
 	  single thread. On very large machines this can take a considerable
 	  amount of time. If this option is set, large machines will bring up
 	  a subset of memmap at boot and then initialise the rest in parallel
-	  when kswapd starts. This has a potential performance impact on
-	  processes running early in the lifetime of the systemm until kswapd
-	  finishes the initialisation.
+	  by starting one-off "pgdatinitX" kernel thread for each node X. This
+	  has a potential performance impact on processes running early in the
+	  lifetime of the system until these kthreads finish the
+	  initialisation.
 
 config IDLE_PAGE_TRACKING
 	bool "Enable idle page tracking"
+1 -1
mm/backing-dev.c
···
 	 * here rather than calling cond_resched().
 	 */
 	if (current->flags & PF_WQ_WORKER)
-		schedule_timeout(1);
+		schedule_timeout_uninterruptible(1);
 	else
 		cond_resched();
 
+1 -1
mm/huge_memory.c
···
 
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_safe(pos, next, &list) {
+	list_for_each_safe(pos, next, &pgdata->split_queue) {
 		page = list_entry((void *)pos, struct page, mapping);
 		page = compound_head(page);
 		if (get_page_unless_zero(page)) {
+4 -3
mm/hugetlb.c
···
 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
 		nr_nodes--)
 
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
 					unsigned int order)
 {
···
 
 	set_page_private(page, 0);
 	page->mapping = NULL;
-	BUG_ON(page_count(page));
-	BUG_ON(page_mapcount(page));
+	VM_BUG_ON_PAGE(page_count(page), page);
+	VM_BUG_ON_PAGE(page_mapcount(page), page);
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
···
 		set_page_count(p, 0);
 		set_compound_head(p, page);
 	}
+	atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 /*
+1 -1
mm/memblock.c
···
  * Remaining API functions
  */
 
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
 {
 	return memblock.memory.total_size;
 }
+5 -9
mm/mempolicy.c
···
 			goto retry;
 		}
 
-		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, qp->pagelist, flags);
+		migrate_page_add(page, qp->pagelist, flags);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
···
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
-	if (vma->vm_flags & VM_PFNMAP)
+	if (!vma_migratable(vma))
 		return 1;
 
 	if (endvma > end)
···
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (vma_migratable(vma) &&
-			vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
 			change_prot_numa(vma, start, endvma);
 		return 1;
 	}
 
-	if ((flags & MPOL_MF_STRICT) ||
-	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-	     vma_migratable(vma)))
-		/* queue pages from current vma */
+	/* queue pages from current vma */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 		return 0;
 	return 1;
 }
+30 -32
mm/mmap.c
···
 }
 
 #ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
 	int i = 0, j, bug = 0;
 	struct rb_node *nd, *pn = NULL;
 	unsigned long prev = 0, pend = 0;
···
 				  vma->vm_start, vma->vm_end);
 			bug = 1;
 		}
+		spin_lock(&mm->page_table_lock);
 		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
 			pr_emerg("free gap %lx, correct %lx\n",
 			       vma->rb_subtree_gap,
 			       vma_compute_subtree_gap(vma));
 			bug = 1;
 		}
+		spin_unlock(&mm->page_table_lock);
 		i++;
 		pn = nd;
 		prev = vma->vm_start;
···
 	struct vm_area_struct *vma = mm->mmap;
 
 	while (vma) {
+		struct anon_vma *anon_vma = vma->anon_vma;
 		struct anon_vma_chain *avc;
 
-		vma_lock_anon_vma(vma);
-		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
-			anon_vma_interval_tree_verify(avc);
-		vma_unlock_anon_vma(vma);
+		if (anon_vma) {
+			anon_vma_lock_read(anon_vma);
+			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+				anon_vma_interval_tree_verify(avc);
+			anon_vma_unlock_read(anon_vma);
+		}
+
 		highest_address = vma->vm_end;
 		vma = vma->vm_next;
 		i++;
···
 			  mm->highest_vm_end, highest_address);
 		bug = 1;
 	}
-	i = browse_rb(&mm->mm_rb);
+	i = browse_rb(mm);
 	if (i != mm->map_count) {
 		if (i != -1)
 			pr_emerg("map_count %d rb %d\n", mm->map_count, i);
···
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	int error;
+	int error = 0;
 
 	if (!(vma->vm_flags & VM_GROWSUP))
 		return -EFAULT;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
+	/* Guard against wrapping around to address 0. */
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else
+		return -ENOMEM;
+
+	/* We must make sure the anon_vma is allocated. */
 	if (unlikely(anon_vma_prepare(vma)))
 		return -ENOMEM;
-	vma_lock_anon_vma(vma);
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode. We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
-	 * Also guard against wrapping around to address 0.
 	 */
-	if (address < PAGE_ALIGN(address+4))
-		address = PAGE_ALIGN(address+4);
-	else {
-		vma_unlock_anon_vma(vma);
-		return -ENOMEM;
-	}
-	error = 0;
+	anon_vma_lock_write(vma->anon_vma);
 
 	/* Somebody else might have raced and expanded it already */
 	if (address > vma->vm_end) {
···
 			 * updates, but we only hold a shared mmap_sem
 			 * lock here, so we need to protect against
 			 * concurrent vma expansions.
-			 * vma_lock_anon_vma() doesn't help here, as
+			 * anon_vma_lock_write() doesn't help here, as
 			 * we don't guarantee that all growable vmas
 			 * in a mm share the same root anon vma.
 			 * So, we reuse mm->page_table_lock to guard
···
 			}
 		}
 	}
-	vma_unlock_anon_vma(vma);
+	anon_vma_unlock_write(vma->anon_vma);
 	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(mm);
 	return error;
···
 	struct mm_struct *mm = vma->vm_mm;
 	int error;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
-	if (unlikely(anon_vma_prepare(vma)))
-		return -ENOMEM;
-
 	address &= PAGE_MASK;
 	error = security_mmap_addr(address);
 	if (error)
 		return error;
 
-	vma_lock_anon_vma(vma);
+	/* We must make sure the anon_vma is allocated. */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode. We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
 	 */
+	anon_vma_lock_write(vma->anon_vma);
 
 	/* Somebody else might have raced and expanded it already */
 	if (address < vma->vm_start) {
···
 			 * updates, but we only hold a shared mmap_sem
 			 * lock here, so we need to protect against
 			 * concurrent vma expansions.
-			 * vma_lock_anon_vma() doesn't help here, as
+			 * anon_vma_lock_write() doesn't help here, as
 			 * we don't guarantee that all growable vmas
 			 * in a mm share the same root anon vma.
 			 * So, we reuse mm->page_table_lock to guard
···
 			}
 		}
 	}
-	vma_unlock_anon_vma(vma);
+	anon_vma_unlock_write(vma->anon_vma);
 	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(mm);
 	return error;
+1 -1
mm/page_alloc.c
···
 	return !has_unmovable_pages(zone, page, 0, true);
 }
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
+1 -1
mm/vmscan.c
···
 	int ret = -EBUSY;
 
 	VM_BUG_ON_PAGE(!page_count(page), page);
-	VM_BUG_ON_PAGE(PageTail(page), page);
+	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
+47 -23
mm/vmstat.c
···
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
+		 * If we were marked on cpu_stat_off clear the flag
+		 * so that vmstat_shepherd doesn't schedule us again.
 		 */
-		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-			this_cpu_ptr(&vmstat_work),
-			round_jiffies_relative(sysctl_stat_interval));
+		if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+						cpu_stat_off)) {
+			queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+				this_cpu_ptr(&vmstat_work),
+				round_jiffies_relative(sysctl_stat_interval));
+		}
 	} else {
 		/*
 		 * We did not update any counters so the app may be in
···
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
-{
-	if (system_state != SYSTEM_RUNNING)
-		return;
-
-	do {
-		if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-			cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
-	} while (refresh_cpu_vm_stats(false));
-}
-
 /*
  * Check if the diffs for a certain cpu indicate that
  * an update is needed.
···
 	return false;
 }
 
+void quiet_vmstat(void)
+{
+	if (system_state != SYSTEM_RUNNING)
+		return;
+
+	/*
+	 * If we are already in hands of the shepherd then there
+	 * is nothing for us to do here.
+	 */
+	if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+		return;
+
+	if (!need_update(smp_processor_id()))
+		return;
+
+	/*
+	 * Just refresh counters and do not care about the pending delayed
+	 * vmstat_update. It doesn't fire that often to matter and canceling
+	 * it would be too expensive from this path.
+	 * vmstat_shepherd will take care about that for us.
+	 */
+	refresh_cpu_vm_stats(false);
+}
+
 
 /*
  * Shepherd worker thread that checks the
···
 
 	get_online_cpus();
 	/* Check processors whose vmstat worker threads have been disabled */
-	for_each_cpu(cpu, cpu_stat_off)
-		if (need_update(cpu) &&
-			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+	for_each_cpu(cpu, cpu_stat_off) {
+		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
-			queue_delayed_work_on(cpu, vmstat_wq,
-				&per_cpu(vmstat_work, cpu), 0);
-
+		if (need_update(cpu)) {
+			if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+				queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+		} else {
+			/*
+			 * Cancel the work if quiet_vmstat has put this
+			 * cpu on cpu_stat_off because the work item might
+			 * be still scheduled
+			 */
+			cancel_delayed_work(dw);
+		}
+	}
 	put_online_cpus();
 
 	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
-
 }
···
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);
 
 	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))