Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew Morton)

Merge more patches from Andrew Morton:
"The rest of MM. Plus one misc cleanup"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
mm/Kconfig: add MMU dependency for MIGRATION.
kernel: replace strict_strto*() with kstrto*()
mm, thp: count thp_fault_fallback anytime thp fault fails
thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
thp: do_huge_pmd_anonymous_page() cleanup
thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
mm: cleanup add_to_page_cache_locked()
thp: account anon transparent huge pages into NR_ANON_PAGES
truncate: drop 'oldsize' truncate_pagecache() parameter
mm: make lru_add_drain_all() selective
memcg: document cgroup dirty/writeback memory statistics
memcg: add per cgroup writeback pages accounting
memcg: check for proper lock held in mem_cgroup_update_page_stat
memcg: remove MEMCG_NR_FILE_MAPPED
memcg: reduce function dereference
memcg: avoid overflow caused by PAGE_ALIGN
memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
memcg: correct RESOURCE_MAX to ULLONG_MAX
mm: memcg: do not trap chargers with full callstack on OOM
mm: memcg: rework and document OOM waiting and wakeup
...

+972 -918
+2
Documentation/cgroups/memory.txt
··· 490 490 pgpgout - # of uncharging events to the memory cgroup. The uncharging 491 491 event happens each time a page is unaccounted from the cgroup. 492 492 swap - # of bytes of swap usage 493 + writeback - # of bytes of file/anon cache that are queued for syncing to 494 + disk. 493 495 inactive_anon - # of bytes of anonymous and swap cache memory on inactive 494 496 LRU list. 495 497 active_anon - # of bytes of anonymous and swap cache memory on active
+4 -3
arch/alpha/mm/fault.c
··· 89 89 const struct exception_table_entry *fixup; 90 90 int fault, si_code = SEGV_MAPERR; 91 91 siginfo_t info; 92 - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 93 - (cause > 0 ? FAULT_FLAG_WRITE : 0)); 92 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 94 93 95 94 /* As of EV6, a load into $31/$f31 is a prefetch, and never faults 96 95 (or is suppressed by the PALcode). Support that for older CPUs ··· 114 115 if (address >= TASK_SIZE) 115 116 goto vmalloc_fault; 116 117 #endif 117 - 118 + if (user_mode(regs)) 119 + flags |= FAULT_FLAG_USER; 118 120 retry: 119 121 down_read(&mm->mmap_sem); 120 122 vma = find_vma(mm, address); ··· 142 142 } else { 143 143 if (!(vma->vm_flags & VM_WRITE)) 144 144 goto bad_area; 145 + flags |= FAULT_FLAG_WRITE; 145 146 } 146 147 147 148 /* If for any reason at all we couldn't handle the fault,
+4 -7
arch/arc/mm/fault.c
··· 60 60 siginfo_t info; 61 61 int fault, ret; 62 62 int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ 63 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 64 - (write ? FAULT_FLAG_WRITE : 0); 63 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 65 64 66 65 /* 67 66 * We fault-in kernel-space virtual memory on-demand. The ··· 88 89 if (in_atomic() || !mm) 89 90 goto no_context; 90 91 92 + if (user_mode(regs)) 93 + flags |= FAULT_FLAG_USER; 91 94 retry: 92 95 down_read(&mm->mmap_sem); 93 96 vma = find_vma(mm, address); ··· 118 117 if (write) { 119 118 if (!(vma->vm_flags & VM_WRITE)) 120 119 goto bad_area; 120 + flags |= FAULT_FLAG_WRITE; 121 121 } else { 122 122 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 123 123 goto bad_area; 124 124 } 125 125 126 - survive: 127 126 /* 128 127 * If for any reason at all we couldn't handle the fault, 129 128 * make sure we exit gracefully rather than endlessly redo ··· 202 201 die("Oops", regs, address); 203 202 204 203 out_of_memory: 205 - if (is_global_init(tsk)) { 206 - yield(); 207 - goto survive; 208 - } 209 204 up_read(&mm->mmap_sem); 210 205 211 206 if (user_mode(regs)) {
+13 -10
arch/arm/mm/fault.c
··· 261 261 struct task_struct *tsk; 262 262 struct mm_struct *mm; 263 263 int fault, sig, code; 264 - int write = fsr & FSR_WRITE; 265 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 266 - (write ? FAULT_FLAG_WRITE : 0); 264 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 267 265 268 266 if (notify_page_fault(regs, fsr)) 269 267 return 0; ··· 279 281 */ 280 282 if (in_atomic() || !mm) 281 283 goto no_context; 284 + 285 + if (user_mode(regs)) 286 + flags |= FAULT_FLAG_USER; 287 + if (fsr & FSR_WRITE) 288 + flags |= FAULT_FLAG_WRITE; 282 289 283 290 /* 284 291 * As per x86, we may deadlock here. However, since the kernel only ··· 352 349 if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) 353 350 return 0; 354 351 352 + /* 353 + * If we are in kernel mode at this point, we 354 + * have no context to handle this fault with. 355 + */ 356 + if (!user_mode(regs)) 357 + goto no_context; 358 + 355 359 if (fault & VM_FAULT_OOM) { 356 360 /* 357 361 * We ran out of memory, call the OOM killer, and return to ··· 368 358 pagefault_out_of_memory(); 369 359 return 0; 370 360 } 371 - 372 - /* 373 - * If we are in kernel mode at this point, we 374 - * have no context to handle this fault with. 375 - */ 376 - if (!user_mode(regs)) 377 - goto no_context; 378 361 379 362 if (fault & VM_FAULT_SIGBUS) { 380 363 /*
+17 -14
arch/arm64/mm/fault.c
··· 199 199 unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; 200 200 unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 201 201 202 - if (esr & ESR_LNX_EXEC) { 203 - vm_flags = VM_EXEC; 204 - } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { 205 - vm_flags = VM_WRITE; 206 - mm_flags |= FAULT_FLAG_WRITE; 207 - } 208 - 209 202 tsk = current; 210 203 mm = tsk->mm; 211 204 ··· 212 219 */ 213 220 if (in_atomic() || !mm) 214 221 goto no_context; 222 + 223 + if (user_mode(regs)) 224 + mm_flags |= FAULT_FLAG_USER; 225 + 226 + if (esr & ESR_LNX_EXEC) { 227 + vm_flags = VM_EXEC; 228 + } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { 229 + vm_flags = VM_WRITE; 230 + mm_flags |= FAULT_FLAG_WRITE; 231 + } 215 232 216 233 /* 217 234 * As per x86, we may deadlock here. However, since the kernel only ··· 291 288 VM_FAULT_BADACCESS)))) 292 289 return 0; 293 290 291 + /* 292 + * If we are in kernel mode at this point, we have no context to 293 + * handle this fault with. 294 + */ 295 + if (!user_mode(regs)) 296 + goto no_context; 297 + 294 298 if (fault & VM_FAULT_OOM) { 295 299 /* 296 300 * We ran out of memory, call the OOM killer, and return to ··· 307 297 pagefault_out_of_memory(); 308 298 return 0; 309 299 } 310 - 311 - /* 312 - * If we are in kernel mode at this point, we have no context to 313 - * handle this fault with. 314 - */ 315 - if (!user_mode(regs)) 316 - goto no_context; 317 300 318 301 if (fault & VM_FAULT_SIGBUS) { 319 302 /*
+3 -1
arch/avr32/mm/fault.c
··· 86 86 87 87 local_irq_enable(); 88 88 89 + if (user_mode(regs)) 90 + flags |= FAULT_FLAG_USER; 89 91 retry: 90 92 down_read(&mm->mmap_sem); 91 93 ··· 230 228 */ 231 229 out_of_memory: 232 230 up_read(&mm->mmap_sem); 233 - pagefault_out_of_memory(); 234 231 if (!user_mode(regs)) 235 232 goto no_context; 233 + pagefault_out_of_memory(); 236 234 return; 237 235 238 236 do_sigbus:
+4 -2
arch/cris/mm/fault.c
··· 58 58 struct vm_area_struct * vma; 59 59 siginfo_t info; 60 60 int fault; 61 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 62 - ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0); 61 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 63 62 64 63 D(printk(KERN_DEBUG 65 64 "Page fault for %lX on %X at %lX, prot %d write %d\n", ··· 116 117 if (in_atomic() || !mm) 117 118 goto no_context; 118 119 120 + if (user_mode(regs)) 121 + flags |= FAULT_FLAG_USER; 119 122 retry: 120 123 down_read(&mm->mmap_sem); 121 124 vma = find_vma(mm, address); ··· 156 155 } else if (writeaccess == 1) { 157 156 if (!(vma->vm_flags & VM_WRITE)) 158 157 goto bad_area; 158 + flags |= FAULT_FLAG_WRITE; 159 159 } else { 160 160 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 161 161 goto bad_area;
+6 -4
arch/frv/mm/fault.c
··· 34 34 struct vm_area_struct *vma; 35 35 struct mm_struct *mm; 36 36 unsigned long _pme, lrai, lrad, fixup; 37 + unsigned long flags = 0; 37 38 siginfo_t info; 38 39 pgd_t *pge; 39 40 pud_t *pue; 40 41 pte_t *pte; 41 - int write; 42 42 int fault; 43 43 44 44 #if 0 ··· 80 80 */ 81 81 if (in_atomic() || !mm) 82 82 goto no_context; 83 + 84 + if (user_mode(__frame)) 85 + flags |= FAULT_FLAG_USER; 83 86 84 87 down_read(&mm->mmap_sem); 85 88 ··· 132 129 */ 133 130 good_area: 134 131 info.si_code = SEGV_ACCERR; 135 - write = 0; 136 132 switch (esr0 & ESR0_ATXC) { 137 133 default: 138 134 /* handle write to write protected page */ ··· 142 140 #endif 143 141 if (!(vma->vm_flags & VM_WRITE)) 144 142 goto bad_area; 145 - write = 1; 143 + flags |= FAULT_FLAG_WRITE; 146 144 break; 147 145 148 146 /* handle read from protected page */ ··· 164 162 * make sure we exit gracefully rather than endlessly redo 165 163 * the fault. 166 164 */ 167 - fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); 165 + fault = handle_mm_fault(mm, vma, ear0, flags); 168 166 if (unlikely(fault & VM_FAULT_ERROR)) { 169 167 if (fault & VM_FAULT_OOM) 170 168 goto out_of_memory;
+4 -2
arch/hexagon/mm/vm_fault.c
··· 53 53 int si_code = SEGV_MAPERR; 54 54 int fault; 55 55 const struct exception_table_entry *fixup; 56 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 57 - (cause > 0 ? FAULT_FLAG_WRITE : 0); 56 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 58 57 59 58 /* 60 59 * If we're in an interrupt or have no user context, ··· 64 65 65 66 local_irq_enable(); 66 67 68 + if (user_mode(regs)) 69 + flags |= FAULT_FLAG_USER; 67 70 retry: 68 71 down_read(&mm->mmap_sem); 69 72 vma = find_vma(mm, address); ··· 97 96 case FLT_STORE: 98 97 if (!(vma->vm_flags & VM_WRITE)) 99 98 goto bad_area; 99 + flags |= FAULT_FLAG_WRITE; 100 100 break; 101 101 } 102 102
+4 -2
arch/ia64/mm/fault.c
··· 90 90 mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) 91 91 | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); 92 92 93 - flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); 94 - 95 93 /* mmap_sem is performance critical.... */ 96 94 prefetchw(&mm->mmap_sem); 97 95 ··· 117 119 if (notify_page_fault(regs, TRAP_BRKPT)) 118 120 return; 119 121 122 + if (user_mode(regs)) 123 + flags |= FAULT_FLAG_USER; 124 + if (mask & VM_WRITE) 125 + flags |= FAULT_FLAG_WRITE; 120 126 retry: 121 127 down_read(&mm->mmap_sem); 122 128
+6 -4
arch/m32r/mm/fault.c
··· 78 78 struct mm_struct *mm; 79 79 struct vm_area_struct * vma; 80 80 unsigned long page, addr; 81 - int write; 81 + unsigned long flags = 0; 82 82 int fault; 83 83 siginfo_t info; 84 84 ··· 116 116 */ 117 117 if (in_atomic() || !mm) 118 118 goto bad_area_nosemaphore; 119 + 120 + if (error_code & ACE_USERMODE) 121 + flags |= FAULT_FLAG_USER; 119 122 120 123 /* When running in the kernel we expect faults to occur only to 121 124 * addresses in user space. All other faults represent errors in the ··· 169 166 */ 170 167 good_area: 171 168 info.si_code = SEGV_ACCERR; 172 - write = 0; 173 169 switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { 174 170 default: /* 3: write, present */ 175 171 /* fall through */ 176 172 case ACE_WRITE: /* write, not present */ 177 173 if (!(vma->vm_flags & VM_WRITE)) 178 174 goto bad_area; 179 - write++; 175 + flags |= FAULT_FLAG_WRITE; 180 176 break; 181 177 case ACE_PROTECTION: /* read, present */ 182 178 case 0: /* read, not present */ ··· 196 194 */ 197 195 addr = (address & PAGE_MASK); 198 196 set_thread_fault_code(error_code); 199 - fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); 197 + fault = handle_mm_fault(mm, vma, addr, flags); 200 198 if (unlikely(fault & VM_FAULT_ERROR)) { 201 199 if (fault & VM_FAULT_OOM) 202 200 goto out_of_memory;
+2
arch/m68k/mm/fault.c
··· 88 88 if (in_atomic() || !mm) 89 89 goto no_context; 90 90 91 + if (user_mode(regs)) 92 + flags |= FAULT_FLAG_USER; 91 93 retry: 92 94 down_read(&mm->mmap_sem); 93 95
+4 -2
arch/metag/mm/fault.c
··· 53 53 struct vm_area_struct *vma, *prev_vma; 54 54 siginfo_t info; 55 55 int fault; 56 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 57 - (write_access ? FAULT_FLAG_WRITE : 0); 56 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 58 57 59 58 tsk = current; 60 59 ··· 108 109 if (in_atomic() || !mm) 109 110 goto no_context; 110 111 112 + if (user_mode(regs)) 113 + flags |= FAULT_FLAG_USER; 111 114 retry: 112 115 down_read(&mm->mmap_sem); 113 116 ··· 122 121 if (write_access) { 123 122 if (!(vma->vm_flags & VM_WRITE)) 124 123 goto bad_area; 124 + flags |= FAULT_FLAG_WRITE; 125 125 } else { 126 126 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) 127 127 goto bad_area;
+5 -2
arch/microblaze/mm/fault.c
··· 92 92 int code = SEGV_MAPERR; 93 93 int is_write = error_code & ESR_S; 94 94 int fault; 95 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 96 - (is_write ? FAULT_FLAG_WRITE : 0); 95 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 97 96 98 97 regs->ear = address; 99 98 regs->esr = error_code; ··· 119 120 regs->r15, regs->msr); 120 121 die("Weird page fault", regs, SIGSEGV); 121 122 } 123 + 124 + if (user_mode(regs)) 125 + flags |= FAULT_FLAG_USER; 122 126 123 127 /* When running in the kernel we expect faults to occur only to 124 128 * addresses in user space. All other faults represent errors in the ··· 201 199 if (unlikely(is_write)) { 202 200 if (unlikely(!(vma->vm_flags & VM_WRITE))) 203 201 goto bad_area; 202 + flags |= FAULT_FLAG_WRITE; 204 203 /* a read */ 205 204 } else { 206 205 /* protection fault */
+6 -2
arch/mips/mm/fault.c
··· 42 42 const int field = sizeof(unsigned long) * 2; 43 43 siginfo_t info; 44 44 int fault; 45 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 46 - (write ? FAULT_FLAG_WRITE : 0); 45 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 47 46 48 47 #if 0 49 48 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), ··· 92 93 if (in_atomic() || !mm) 93 94 goto bad_area_nosemaphore; 94 95 96 + if (user_mode(regs)) 97 + flags |= FAULT_FLAG_USER; 95 98 retry: 96 99 down_read(&mm->mmap_sem); 97 100 vma = find_vma(mm, address); ··· 115 114 if (write) { 116 115 if (!(vma->vm_flags & VM_WRITE)) 117 116 goto bad_area; 117 + flags |= FAULT_FLAG_WRITE; 118 118 } else { 119 119 if (cpu_has_rixi) { 120 120 if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { ··· 243 241 * (which will retry the fault, or kill us if we got oom-killed). 244 242 */ 245 243 up_read(&mm->mmap_sem); 244 + if (!user_mode(regs)) 245 + goto no_context; 246 246 pagefault_out_of_memory(); 247 247 return; 248 248
+2
arch/mn10300/mm/fault.c
··· 171 171 if (in_atomic() || !mm) 172 172 goto no_context; 173 173 174 + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) 175 + flags |= FAULT_FLAG_USER; 174 176 retry: 175 177 down_read(&mm->mmap_sem); 176 178
+1
arch/openrisc/mm/fault.c
··· 86 86 if (user_mode(regs)) { 87 87 /* Exception was in userspace: reenable interrupts */ 88 88 local_irq_enable(); 89 + flags |= FAULT_FLAG_USER; 89 90 } else { 90 91 /* If exception was in a syscall, then IRQ's may have 91 92 * been enabled or disabled. If they were enabled,
+5 -2
arch/parisc/mm/fault.c
··· 180 180 if (in_atomic() || !mm) 181 181 goto no_context; 182 182 183 + if (user_mode(regs)) 184 + flags |= FAULT_FLAG_USER; 185 + if (acc_type & VM_WRITE) 186 + flags |= FAULT_FLAG_WRITE; 183 187 retry: 184 188 down_read(&mm->mmap_sem); 185 189 vma = find_vma_prev(mm, address, &prev_vma); ··· 207 203 * fault. 208 204 */ 209 205 210 - fault = handle_mm_fault(mm, vma, address, 211 - flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0)); 206 + fault = handle_mm_fault(mm, vma, address, flags); 212 207 213 208 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 214 209 return;
+4 -3
arch/powerpc/mm/fault.c
··· 223 223 is_write = error_code & ESR_DST; 224 224 #endif /* CONFIG_4xx || CONFIG_BOOKE */ 225 225 226 - if (is_write) 227 - flags |= FAULT_FLAG_WRITE; 228 - 229 226 #ifdef CONFIG_PPC_ICSWX 230 227 /* 231 228 * we need to do this early because this "data storage ··· 284 287 */ 285 288 if (user_mode(regs)) 286 289 store_update_sp = store_updates_sp(regs); 290 + 291 + if (user_mode(regs)) 292 + flags |= FAULT_FLAG_USER; 287 293 288 294 /* When running in the kernel we expect faults to occur only to 289 295 * addresses in user space. All other faults represent errors in the ··· 415 415 } else if (is_write) { 416 416 if (!(vma->vm_flags & VM_WRITE)) 417 417 goto bad_area; 418 + flags |= FAULT_FLAG_WRITE; 418 419 /* a read */ 419 420 } else { 420 421 /* protection fault */
+2
arch/s390/mm/fault.c
··· 302 302 address = trans_exc_code & __FAIL_ADDR_MASK; 303 303 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 304 304 flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 305 + if (user_mode(regs)) 306 + flags |= FAULT_FLAG_USER; 305 307 if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) 306 308 flags |= FAULT_FLAG_WRITE; 307 309 down_read(&mm->mmap_sem);
+6 -7
arch/score/mm/fault.c
··· 47 47 struct task_struct *tsk = current; 48 48 struct mm_struct *mm = tsk->mm; 49 49 const int field = sizeof(unsigned long) * 2; 50 + unsigned long flags = 0; 50 51 siginfo_t info; 51 52 int fault; 52 53 ··· 76 75 if (in_atomic() || !mm) 77 76 goto bad_area_nosemaphore; 78 77 78 + if (user_mode(regs)) 79 + flags |= FAULT_FLAG_USER; 80 + 79 81 down_read(&mm->mmap_sem); 80 82 vma = find_vma(mm, address); 81 83 if (!vma) ··· 99 95 if (write) { 100 96 if (!(vma->vm_flags & VM_WRITE)) 101 97 goto bad_area; 98 + flags |= FAULT_FLAG_WRITE; 102 99 } else { 103 100 if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) 104 101 goto bad_area; 105 102 } 106 103 107 - survive: 108 104 /* 109 105 * If for any reason at all we couldn't handle the fault, 110 106 * make sure we exit gracefully rather than endlessly redo 111 107 * the fault. 112 108 */ 113 - fault = handle_mm_fault(mm, vma, address, write); 109 + fault = handle_mm_fault(mm, vma, address, flags); 114 110 if (unlikely(fault & VM_FAULT_ERROR)) { 115 111 if (fault & VM_FAULT_OOM) 116 112 goto out_of_memory; ··· 171 167 */ 172 168 out_of_memory: 173 169 up_read(&mm->mmap_sem); 174 - if (is_global_init(tsk)) { 175 - yield(); 176 - down_read(&mm->mmap_sem); 177 - goto survive; 178 - } 179 170 if (!user_mode(regs)) 180 171 goto no_context; 181 172 pagefault_out_of_memory();
+6 -3
arch/sh/mm/fault.c
··· 400 400 struct mm_struct *mm; 401 401 struct vm_area_struct * vma; 402 402 int fault; 403 - int write = error_code & FAULT_CODE_WRITE; 404 - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 405 - (write ? FAULT_FLAG_WRITE : 0)); 403 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 406 404 407 405 tsk = current; 408 406 mm = tsk->mm; ··· 473 475 } 474 476 475 477 set_thread_fault_code(error_code); 478 + 479 + if (user_mode(regs)) 480 + flags |= FAULT_FLAG_USER; 481 + if (error_code & FAULT_CODE_WRITE) 482 + flags |= FAULT_FLAG_WRITE; 476 483 477 484 /* 478 485 * If for any reason at all we couldn't handle the fault,
+9 -3
arch/sparc/mm/fault_32.c
··· 177 177 unsigned long g2; 178 178 int from_user = !(regs->psr & PSR_PS); 179 179 int fault, code; 180 - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 181 - (write ? FAULT_FLAG_WRITE : 0)); 180 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 182 181 183 182 if (text_fault) 184 183 address = regs->pc; ··· 233 234 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 234 235 goto bad_area; 235 236 } 237 + 238 + if (from_user) 239 + flags |= FAULT_FLAG_USER; 240 + if (write) 241 + flags |= FAULT_FLAG_WRITE; 236 242 237 243 /* 238 244 * If for any reason at all we couldn't handle the fault, ··· 387 383 struct vm_area_struct *vma; 388 384 struct task_struct *tsk = current; 389 385 struct mm_struct *mm = tsk->mm; 386 + unsigned int flags = FAULT_FLAG_USER; 390 387 int code; 391 388 392 389 code = SEGV_MAPERR; ··· 407 402 if (write) { 408 403 if (!(vma->vm_flags & VM_WRITE)) 409 404 goto bad_area; 405 + flags |= FAULT_FLAG_WRITE; 410 406 } else { 411 407 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 412 408 goto bad_area; 413 409 } 414 - switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { 410 + switch (handle_mm_fault(mm, vma, address, flags)) { 415 411 case VM_FAULT_SIGBUS: 416 412 case VM_FAULT_OOM: 417 413 goto do_sigbus;
+4 -2
arch/sparc/mm/fault_64.c
··· 315 315 bad_kernel_pc(regs, address); 316 316 return; 317 317 } 318 - } 318 + } else 319 + flags |= FAULT_FLAG_USER; 319 320 320 321 /* 321 322 * If we're in an interrupt or have no user ··· 419 418 vma->vm_file != NULL) 420 419 set_thread_fault_code(fault_code | 421 420 FAULT_CODE_BLKCOMMIT); 421 + 422 + flags |= FAULT_FLAG_WRITE; 422 423 } else { 423 424 /* Allow reads even for write-only mappings */ 424 425 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 425 426 goto bad_area; 426 427 } 427 428 428 - flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); 429 429 fault = handle_mm_fault(mm, vma, address, flags); 430 430 431 431 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+5 -8
arch/tile/mm/fault.c
··· 280 280 if (!is_page_fault) 281 281 write = 1; 282 282 283 - flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 284 - (write ? FAULT_FLAG_WRITE : 0)); 283 + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 285 284 286 285 is_kernel_mode = !user_mode(regs); 287 286 ··· 364 365 goto bad_area_nosemaphore; 365 366 } 366 367 368 + if (!is_kernel_mode) 369 + flags |= FAULT_FLAG_USER; 370 + 367 371 /* 368 372 * When running in the kernel we expect faults to occur only to 369 373 * addresses in user space. All other faults represent errors in the ··· 427 425 #endif 428 426 if (!(vma->vm_flags & VM_WRITE)) 429 427 goto bad_area; 428 + flags |= FAULT_FLAG_WRITE; 430 429 } else { 431 430 if (!is_page_fault || !(vma->vm_flags & VM_READ)) 432 431 goto bad_area; 433 432 } 434 433 435 - survive: 436 434 /* 437 435 * If for any reason at all we couldn't handle the fault, 438 436 * make sure we exit gracefully rather than endlessly redo ··· 557 555 */ 558 556 out_of_memory: 559 557 up_read(&mm->mmap_sem); 560 - if (is_global_init(tsk)) { 561 - yield(); 562 - down_read(&mm->mmap_sem); 563 - goto survive; 564 - } 565 558 if (is_kernel_mode) 566 559 goto no_context; 567 560 pagefault_out_of_memory();
+14 -8
arch/um/kernel/trap.c
··· 30 30 pmd_t *pmd; 31 31 pte_t *pte; 32 32 int err = -EFAULT; 33 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 34 - (is_write ? FAULT_FLAG_WRITE : 0); 33 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 35 34 36 35 *code_out = SEGV_MAPERR; 37 36 ··· 41 42 if (in_atomic()) 42 43 goto out_nosemaphore; 43 44 45 + if (is_user) 46 + flags |= FAULT_FLAG_USER; 44 47 retry: 45 48 down_read(&mm->mmap_sem); 46 49 vma = find_vma(mm, address); ··· 59 58 60 59 good_area: 61 60 *code_out = SEGV_ACCERR; 62 - if (is_write && !(vma->vm_flags & VM_WRITE)) 63 - goto out; 64 - 65 - /* Don't require VM_READ|VM_EXEC for write faults! */ 66 - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) 67 - goto out; 61 + if (is_write) { 62 + if (!(vma->vm_flags & VM_WRITE)) 63 + goto out; 64 + flags |= FAULT_FLAG_WRITE; 65 + } else { 66 + /* Don't require VM_READ|VM_EXEC for write faults! */ 67 + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 68 + goto out; 69 + } 68 70 69 71 do { 70 72 int fault; ··· 128 124 * (which will retry the fault, or kill us if we got oom-killed). 129 125 */ 130 126 up_read(&mm->mmap_sem); 127 + if (!is_user) 128 + goto out_nosemaphore; 131 129 pagefault_out_of_memory(); 132 130 return 0; 133 131 }
+13 -9
arch/unicore32/mm/fault.c
··· 209 209 struct task_struct *tsk; 210 210 struct mm_struct *mm; 211 211 int fault, sig, code; 212 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 213 - ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0); 212 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 214 213 215 214 tsk = current; 216 215 mm = tsk->mm; ··· 220 221 */ 221 222 if (in_atomic() || !mm) 222 223 goto no_context; 224 + 225 + if (user_mode(regs)) 226 + flags |= FAULT_FLAG_USER; 227 + if (!(fsr ^ 0x12)) 228 + flags |= FAULT_FLAG_WRITE; 223 229 224 230 /* 225 231 * As per x86, we may deadlock here. However, since the kernel only ··· 282 278 (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) 283 279 return 0; 284 280 281 + /* 282 + * If we are in kernel mode at this point, we 283 + * have no context to handle this fault with. 284 + */ 285 + if (!user_mode(regs)) 286 + goto no_context; 287 + 285 288 if (fault & VM_FAULT_OOM) { 286 289 /* 287 290 * We ran out of memory, call the OOM killer, and return to ··· 298 287 pagefault_out_of_memory(); 299 288 return 0; 300 289 } 301 - 302 - /* 303 - * If we are in kernel mode at this point, we 304 - * have no context to handle this fault with. 305 - */ 306 - if (!user_mode(regs)) 307 - goto no_context; 308 290 309 291 if (fault & VM_FAULT_SIGBUS) { 310 292 /*
+22 -21
arch/x86/mm/fault.c
··· 842 842 force_sig_info_fault(SIGBUS, code, address, tsk, fault); 843 843 } 844 844 845 - static noinline int 845 + static noinline void 846 846 mm_fault_error(struct pt_regs *regs, unsigned long error_code, 847 847 unsigned long address, unsigned int fault) 848 848 { 849 - /* 850 - * Pagefault was interrupted by SIGKILL. We have no reason to 851 - * continue pagefault. 852 - */ 853 - if (fatal_signal_pending(current)) { 854 - if (!(fault & VM_FAULT_RETRY)) 855 - up_read(&current->mm->mmap_sem); 856 - if (!(error_code & PF_USER)) 857 - no_context(regs, error_code, address, 0, 0); 858 - return 1; 849 + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { 850 + up_read(&current->mm->mmap_sem); 851 + no_context(regs, error_code, address, 0, 0); 852 + return; 859 853 } 860 - if (!(fault & VM_FAULT_ERROR)) 861 - return 0; 862 854 863 855 if (fault & VM_FAULT_OOM) { 864 856 /* Kernel mode? Handle exceptions or die: */ ··· 858 866 up_read(&current->mm->mmap_sem); 859 867 no_context(regs, error_code, address, 860 868 SIGSEGV, SEGV_MAPERR); 861 - return 1; 869 + return; 862 870 } 863 871 864 872 up_read(&current->mm->mmap_sem); ··· 876 884 else 877 885 BUG(); 878 886 } 879 - return 1; 880 887 } 881 888 882 889 static int spurious_fault_check(unsigned long error_code, pte_t *pte) ··· 1002 1011 unsigned long address; 1003 1012 struct mm_struct *mm; 1004 1013 int fault; 1005 - int write = error_code & PF_WRITE; 1006 - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | 1007 - (write ? 
FAULT_FLAG_WRITE : 0); 1014 + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 1008 1015 1009 1016 tsk = current; 1010 1017 mm = tsk->mm; ··· 1072 1083 if (user_mode_vm(regs)) { 1073 1084 local_irq_enable(); 1074 1085 error_code |= PF_USER; 1086 + flags |= FAULT_FLAG_USER; 1075 1087 } else { 1076 1088 if (regs->flags & X86_EFLAGS_IF) 1077 1089 local_irq_enable(); ··· 1098 1108 bad_area_nosemaphore(regs, error_code, address); 1099 1109 return; 1100 1110 } 1111 + 1112 + if (error_code & PF_WRITE) 1113 + flags |= FAULT_FLAG_WRITE; 1101 1114 1102 1115 /* 1103 1116 * When running in the kernel we expect faults to occur only to ··· 1180 1187 */ 1181 1188 fault = handle_mm_fault(mm, vma, address, flags); 1182 1189 1183 - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { 1184 - if (mm_fault_error(regs, error_code, address, fault)) 1185 - return; 1190 + /* 1191 + * If we need to retry but a fatal signal is pending, handle the 1192 + * signal first. We do not need to release the mmap_sem because it 1193 + * would already be released in __lock_page_or_retry in mm/filemap.c. 1194 + */ 1195 + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) 1196 + return; 1197 + 1198 + if (unlikely(fault & VM_FAULT_ERROR)) { 1199 + mm_fault_error(regs, error_code, address, fault); 1200 + return; 1186 1201 } 1187 1202 1188 1203 /*
+2
arch/xtensa/mm/fault.c
··· 72 72 address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); 73 73 #endif 74 74 75 + if (user_mode(regs)) 76 + flags |= FAULT_FLAG_USER; 75 77 retry: 76 78 down_read(&mm->mmap_sem); 77 79 vma = find_vma(mm, address);
-6
drivers/base/node.c
··· 125 125 nid, K(node_page_state(nid, NR_WRITEBACK)), 126 126 nid, K(node_page_state(nid, NR_FILE_PAGES)), 127 127 nid, K(node_page_state(nid, NR_FILE_MAPPED)), 128 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 129 - nid, K(node_page_state(nid, NR_ANON_PAGES) 130 - + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * 131 - HPAGE_PMD_NR), 132 - #else 133 128 nid, K(node_page_state(nid, NR_ANON_PAGES)), 134 - #endif 135 129 nid, K(node_page_state(nid, NR_SHMEM)), 136 130 nid, node_page_state(nid, NR_KERNEL_STACK) * 137 131 THREAD_SIZE / 1024,
+1 -1
fs/adfs/inode.c
··· 50 50 struct inode *inode = mapping->host; 51 51 52 52 if (to > inode->i_size) 53 - truncate_pagecache(inode, to, inode->i_size); 53 + truncate_pagecache(inode, inode->i_size); 54 54 } 55 55 56 56 static int adfs_write_begin(struct file *file, struct address_space *mapping,
+1 -1
fs/affs/file.c
··· 406 406 struct inode *inode = mapping->host; 407 407 408 408 if (to > inode->i_size) { 409 - truncate_pagecache(inode, to, inode->i_size); 409 + truncate_pagecache(inode, inode->i_size); 410 410 affs_truncate(inode); 411 411 } 412 412 }
+1 -1
fs/bfs/file.c
··· 166 166 struct inode *inode = mapping->host; 167 167 168 168 if (to > inode->i_size) 169 - truncate_pagecache(inode, to, inode->i_size); 169 + truncate_pagecache(inode, inode->i_size); 170 170 } 171 171 172 172 static int bfs_write_begin(struct file *file, struct address_space *mapping,
+1 -3
fs/btrfs/free-space-cache.c
··· 221 221 struct btrfs_path *path, 222 222 struct inode *inode) 223 223 { 224 - loff_t oldsize; 225 224 int ret = 0; 226 225 227 - oldsize = i_size_read(inode); 228 226 btrfs_i_size_write(inode, 0); 229 - truncate_pagecache(inode, oldsize, 0); 227 + truncate_pagecache(inode, 0); 230 228 231 229 /* 232 230 * We don't need an orphan item because truncating the free space cache
+1 -1
fs/btrfs/inode.c
··· 4349 4349 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); 4350 4350 4351 4351 if (newsize > oldsize) { 4352 - truncate_pagecache(inode, oldsize, newsize); 4352 + truncate_pagecache(inode, newsize); 4353 4353 ret = btrfs_cont_expand(inode, oldsize, newsize); 4354 4354 if (ret) 4355 4355 return ret;
+1 -4
fs/cifs/inode.c
··· 1856 1856 1857 1857 static void cifs_setsize(struct inode *inode, loff_t offset) 1858 1858 { 1859 - loff_t oldsize; 1860 - 1861 1859 spin_lock(&inode->i_lock); 1862 - oldsize = inode->i_size; 1863 1860 i_size_write(inode, offset); 1864 1861 spin_unlock(&inode->i_lock); 1865 1862 1866 - truncate_pagecache(inode, oldsize, offset); 1863 + truncate_pagecache(inode, offset); 1867 1864 } 1868 1865 1869 1866 static int
+1 -1
fs/exofs/inode.c
··· 861 861 static void _write_failed(struct inode *inode, loff_t to) 862 862 { 863 863 if (to > inode->i_size) 864 - truncate_pagecache(inode, to, inode->i_size); 864 + truncate_pagecache(inode, inode->i_size); 865 865 } 866 866 867 867 int exofs_write_begin(struct file *file, struct address_space *mapping,
+1 -1
fs/ext2/inode.c
··· 58 58 struct inode *inode = mapping->host; 59 59 60 60 if (to > inode->i_size) { 61 - truncate_pagecache(inode, to, inode->i_size); 61 + truncate_pagecache(inode, inode->i_size); 62 62 ext2_truncate_blocks(inode, inode->i_size); 63 63 } 64 64 }
+1 -2
fs/ext4/inode.c
··· 4587 4587 4588 4588 if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 4589 4589 handle_t *handle; 4590 - loff_t oldsize = inode->i_size; 4591 4590 4592 4591 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4593 4592 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ··· 4649 4650 * Truncate pagecache after we've waited for commit 4650 4651 * in data=journal mode to make pages freeable. 4651 4652 */ 4652 - truncate_pagecache(inode, oldsize, inode->i_size); 4653 + truncate_pagecache(inode, inode->i_size); 4653 4654 } 4654 4655 /* 4655 4656 * We want to call ext4_truncate() even if attr->ia_size ==
+1 -1
fs/fat/inode.c
··· 147 147 struct inode *inode = mapping->host; 148 148 149 149 if (to > inode->i_size) { 150 - truncate_pagecache(inode, to, inode->i_size); 150 + truncate_pagecache(inode, inode->i_size); 151 151 fat_truncate_blocks(inode, inode->i_size); 152 152 } 153 153 }
+1 -1
fs/fuse/dir.c
··· 1678 1678 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1679 1679 */ 1680 1680 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1681 - truncate_pagecache(inode, oldsize, outarg.attr.size); 1681 + truncate_pagecache(inode, outarg.attr.size); 1682 1682 invalidate_inode_pages2(inode->i_mapping); 1683 1683 } 1684 1684
+1 -1
fs/fuse/inode.c
··· 218 218 bool inval = false; 219 219 220 220 if (oldsize != attr->size) { 221 - truncate_pagecache(inode, oldsize, attr->size); 221 + truncate_pagecache(inode, attr->size); 222 222 inval = true; 223 223 } else if (fc->auto_inval_data) { 224 224 struct timespec new_mtime = {
+2 -2
fs/gfs2/bmap.c
··· 1016 1016 chunk = oldsize - newsize; 1017 1017 if (chunk > max_chunk) 1018 1018 chunk = max_chunk; 1019 - truncate_pagecache(inode, oldsize, oldsize - chunk); 1019 + truncate_pagecache(inode, oldsize - chunk); 1020 1020 oldsize -= chunk; 1021 1021 gfs2_trans_end(sdp); 1022 1022 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); ··· 1067 1067 if (journaled) 1068 1068 error = gfs2_journaled_truncate(inode, oldsize, newsize); 1069 1069 else 1070 - truncate_pagecache(inode, oldsize, newsize); 1070 + truncate_pagecache(inode, newsize); 1071 1071 1072 1072 if (error) { 1073 1073 brelse(dibh);
+1 -1
fs/hfs/inode.c
··· 41 41 struct inode *inode = mapping->host; 42 42 43 43 if (to > inode->i_size) { 44 - truncate_pagecache(inode, to, inode->i_size); 44 + truncate_pagecache(inode, inode->i_size); 45 45 hfs_file_truncate(inode); 46 46 } 47 47 }
+1 -1
fs/hfsplus/inode.c
··· 36 36 struct inode *inode = mapping->host; 37 37 38 38 if (to > inode->i_size) { 39 - truncate_pagecache(inode, to, inode->i_size); 39 + truncate_pagecache(inode, inode->i_size); 40 40 hfsplus_file_truncate(inode); 41 41 } 42 42 }
+1 -1
fs/hpfs/file.c
··· 138 138 hpfs_lock(inode->i_sb); 139 139 140 140 if (to > inode->i_size) { 141 - truncate_pagecache(inode, to, inode->i_size); 141 + truncate_pagecache(inode, inode->i_size); 142 142 hpfs_truncate(inode); 143 143 } 144 144
+1 -1
fs/jfs/inode.c
··· 306 306 struct inode *inode = mapping->host; 307 307 308 308 if (to > inode->i_size) { 309 - truncate_pagecache(inode, to, inode->i_size); 309 + truncate_pagecache(inode, inode->i_size); 310 310 jfs_truncate(inode); 311 311 } 312 312 }
+1 -1
fs/minix/inode.c
··· 400 400 struct inode *inode = mapping->host; 401 401 402 402 if (to > inode->i_size) { 403 - truncate_pagecache(inode, to, inode->i_size); 403 + truncate_pagecache(inode, inode->i_size); 404 404 minix_truncate(inode); 405 405 } 406 406 }
+1 -3
fs/nfs/inode.c
··· 541 541 */ 542 542 static int nfs_vmtruncate(struct inode * inode, loff_t offset) 543 543 { 544 - loff_t oldsize; 545 544 int err; 546 545 547 546 err = inode_newsize_ok(inode, offset); ··· 548 549 goto out; 549 550 550 551 spin_lock(&inode->i_lock); 551 - oldsize = inode->i_size; 552 552 i_size_write(inode, offset); 553 553 spin_unlock(&inode->i_lock); 554 554 555 - truncate_pagecache(inode, oldsize, offset); 555 + truncate_pagecache(inode, offset); 556 556 out: 557 557 return err; 558 558 }
+1 -1
fs/nilfs2/inode.c
··· 254 254 struct inode *inode = mapping->host; 255 255 256 256 if (to > inode->i_size) { 257 - truncate_pagecache(inode, to, inode->i_size); 257 + truncate_pagecache(inode, inode->i_size); 258 258 nilfs_truncate(inode); 259 259 } 260 260 }
+1 -1
fs/ntfs/file.c
··· 1768 1768 struct inode *inode = mapping->host; 1769 1769 1770 1770 if (to > inode->i_size) { 1771 - truncate_pagecache(inode, to, inode->i_size); 1771 + truncate_pagecache(inode, inode->i_size); 1772 1772 ntfs_truncate_vfs(inode); 1773 1773 } 1774 1774 }
+1 -1
fs/omfs/file.c
··· 311 311 struct inode *inode = mapping->host; 312 312 313 313 if (to > inode->i_size) { 314 - truncate_pagecache(inode, to, inode->i_size); 314 + truncate_pagecache(inode, inode->i_size); 315 315 omfs_truncate(inode); 316 316 } 317 317 }
-6
fs/proc/meminfo.c
··· 132 132 K(i.freeswap), 133 133 K(global_page_state(NR_FILE_DIRTY)), 134 134 K(global_page_state(NR_WRITEBACK)), 135 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 136 - K(global_page_state(NR_ANON_PAGES) 137 - + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 138 - HPAGE_PMD_NR), 139 - #else 140 135 K(global_page_state(NR_ANON_PAGES)), 141 - #endif 142 136 K(global_page_state(NR_FILE_MAPPED)), 143 137 K(global_page_state(NR_SHMEM)), 144 138 K(global_page_state(NR_SLAB_RECLAIMABLE) +
+1 -1
fs/sysv/itree.c
··· 469 469 struct inode *inode = mapping->host; 470 470 471 471 if (to > inode->i_size) { 472 - truncate_pagecache(inode, to, inode->i_size); 472 + truncate_pagecache(inode, inode->i_size); 473 473 sysv_truncate(inode); 474 474 } 475 475 }
+1 -1
fs/udf/inode.c
··· 172 172 loff_t isize = inode->i_size; 173 173 174 174 if (to > isize) { 175 - truncate_pagecache(inode, to, isize); 175 + truncate_pagecache(inode, isize); 176 176 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 177 177 down_write(&iinfo->i_data_sem); 178 178 udf_clear_extent_cache(inode);
+1 -1
fs/ufs/inode.c
··· 531 531 struct inode *inode = mapping->host; 532 532 533 533 if (to > inode->i_size) 534 - truncate_pagecache(inode, to, inode->i_size); 534 + truncate_pagecache(inode, inode->i_size); 535 535 } 536 536 537 537 static int ufs_write_begin(struct file *file, struct address_space *mapping,
+2 -2
fs/xfs/xfs_aops.c
··· 1582 1582 unlock_page(page); 1583 1583 1584 1584 if (pos + len > i_size_read(inode)) 1585 - truncate_pagecache(inode, pos + len, i_size_read(inode)); 1585 + truncate_pagecache(inode, i_size_read(inode)); 1586 1586 1587 1587 page_cache_release(page); 1588 1588 page = NULL; ··· 1618 1618 loff_t to = pos + len; 1619 1619 1620 1620 if (to > isize) { 1621 - truncate_pagecache(inode, to, isize); 1621 + truncate_pagecache(inode, isize); 1622 1622 xfs_vm_kill_delalloc_range(inode, isize, to); 1623 1623 } 1624 1624 }
-3
include/linux/huge_mm.h
··· 96 96 pmd_t *dst_pmd, pmd_t *src_pmd, 97 97 struct vm_area_struct *vma, 98 98 unsigned long addr, unsigned long end); 99 - extern int handle_pte_fault(struct mm_struct *mm, 100 - struct vm_area_struct *vma, unsigned long address, 101 - pte_t *pte, pmd_t *pmd, unsigned int flags); 102 99 extern int split_huge_page_to_list(struct page *page, struct list_head *list); 103 100 static inline int split_huge_page(struct page *page) 104 101 {
+130 -18
include/linux/memcontrol.h
··· 30 30 struct mm_struct; 31 31 struct kmem_cache; 32 32 33 - /* Stats that can be updated by kernel. */ 34 - enum mem_cgroup_page_stat_item { 35 - MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ 33 + /* 34 + * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, 35 + * These two lists should keep in accord with each other. 36 + */ 37 + enum mem_cgroup_stat_index { 38 + /* 39 + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 40 + */ 41 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 42 + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 43 + MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ 44 + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 45 + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ 46 + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 47 + MEM_CGROUP_STAT_NSTATS, 36 48 }; 37 49 38 50 struct mem_cgroup_reclaim_cookie { ··· 52 40 int priority; 53 41 unsigned int generation; 54 42 }; 43 + 44 + enum mem_cgroup_filter_t { 45 + VISIT, /* visit current node */ 46 + SKIP, /* skip the current node and continue traversal */ 47 + SKIP_TREE, /* skip the whole subtree and continue traversal */ 48 + }; 49 + 50 + /* 51 + * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to 52 + * iterate through the hierarchy tree. Each tree element is checked by the 53 + * predicate before it is returned by the iterator. If a filter returns 54 + * SKIP or SKIP_TREE then the iterator code continues traversal (with the 55 + * next node down the hierarchy or the next node that doesn't belong under the 56 + * memcg's subtree). 
57 + */ 58 + typedef enum mem_cgroup_filter_t 59 + (*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); 55 60 56 61 #ifdef CONFIG_MEMCG 57 62 /* ··· 137 108 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 138 109 struct page *oldpage, struct page *newpage, bool migration_ok); 139 110 140 - struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 141 - struct mem_cgroup *, 142 - struct mem_cgroup_reclaim_cookie *); 111 + struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, 112 + struct mem_cgroup *prev, 113 + struct mem_cgroup_reclaim_cookie *reclaim, 114 + mem_cgroup_iter_filter cond); 115 + 116 + static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 117 + struct mem_cgroup *prev, 118 + struct mem_cgroup_reclaim_cookie *reclaim) 119 + { 120 + return mem_cgroup_iter_cond(root, prev, reclaim, NULL); 121 + } 122 + 143 123 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 144 124 145 125 /* ··· 162 124 struct task_struct *p); 163 125 extern void mem_cgroup_replace_page_cache(struct page *oldpage, 164 126 struct page *newpage); 127 + 128 + /** 129 + * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task 130 + * @new: true to enable, false to disable 131 + * 132 + * Toggle whether a failed memcg charge should invoke the OOM killer 133 + * or just return -ENOMEM. Returns the previous toggle state. 134 + * 135 + * NOTE: Any path that enables the OOM killer before charging must 136 + * call mem_cgroup_oom_synchronize() afterward to finalize the 137 + * OOM handling and clean up. 
138 + */ 139 + static inline bool mem_cgroup_toggle_oom(bool new) 140 + { 141 + bool old; 142 + 143 + old = current->memcg_oom.may_oom; 144 + current->memcg_oom.may_oom = new; 145 + 146 + return old; 147 + } 148 + 149 + static inline void mem_cgroup_enable_oom(void) 150 + { 151 + bool old = mem_cgroup_toggle_oom(true); 152 + 153 + WARN_ON(old == true); 154 + } 155 + 156 + static inline void mem_cgroup_disable_oom(void) 157 + { 158 + bool old = mem_cgroup_toggle_oom(false); 159 + 160 + WARN_ON(old == false); 161 + } 162 + 163 + static inline bool task_in_memcg_oom(struct task_struct *p) 164 + { 165 + return p->memcg_oom.in_memcg_oom; 166 + } 167 + 168 + bool mem_cgroup_oom_synchronize(void); 165 169 166 170 #ifdef CONFIG_MEMCG_SWAP 167 171 extern int do_swap_account; ··· 245 165 } 246 166 247 167 void mem_cgroup_update_page_stat(struct page *page, 248 - enum mem_cgroup_page_stat_item idx, 168 + enum mem_cgroup_stat_index idx, 249 169 int val); 250 170 251 171 static inline void mem_cgroup_inc_page_stat(struct page *page, 252 - enum mem_cgroup_page_stat_item idx) 172 + enum mem_cgroup_stat_index idx) 253 173 { 254 174 mem_cgroup_update_page_stat(page, idx, 1); 255 175 } 256 176 257 177 static inline void mem_cgroup_dec_page_stat(struct page *page, 258 - enum mem_cgroup_page_stat_item idx) 178 + enum mem_cgroup_stat_index idx) 259 179 { 260 180 mem_cgroup_update_page_stat(page, idx, -1); 261 181 } 262 182 263 - unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 264 - gfp_t gfp_mask, 265 - unsigned long *total_scanned); 183 + enum mem_cgroup_filter_t 184 + mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 185 + struct mem_cgroup *root); 266 186 267 187 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 268 188 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, ··· 376 296 struct page *oldpage, struct page *newpage, bool migration_ok) 377 297 { 378 298 } 299 + static inline struct mem_cgroup * 
300 + mem_cgroup_iter_cond(struct mem_cgroup *root, 301 + struct mem_cgroup *prev, 302 + struct mem_cgroup_reclaim_cookie *reclaim, 303 + mem_cgroup_iter_filter cond) 304 + { 305 + /* first call must return non-NULL, second return NULL */ 306 + return (struct mem_cgroup *)(unsigned long)!prev; 307 + } 379 308 380 309 static inline struct mem_cgroup * 381 310 mem_cgroup_iter(struct mem_cgroup *root, ··· 437 348 { 438 349 } 439 350 351 + static inline bool mem_cgroup_toggle_oom(bool new) 352 + { 353 + return false; 354 + } 355 + 356 + static inline void mem_cgroup_enable_oom(void) 357 + { 358 + } 359 + 360 + static inline void mem_cgroup_disable_oom(void) 361 + { 362 + } 363 + 364 + static inline bool task_in_memcg_oom(struct task_struct *p) 365 + { 366 + return false; 367 + } 368 + 369 + static inline bool mem_cgroup_oom_synchronize(void) 370 + { 371 + return false; 372 + } 373 + 440 374 static inline void mem_cgroup_inc_page_stat(struct page *page, 441 - enum mem_cgroup_page_stat_item idx) 375 + enum mem_cgroup_stat_index idx) 442 376 { 443 377 } 444 378 445 379 static inline void mem_cgroup_dec_page_stat(struct page *page, 446 - enum mem_cgroup_page_stat_item idx) 380 + enum mem_cgroup_stat_index idx) 447 381 { 448 382 } 449 383 450 384 static inline 451 - unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 452 - gfp_t gfp_mask, 453 - unsigned long *total_scanned) 385 + enum mem_cgroup_filter_t 386 + mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 387 + struct mem_cgroup *root) 454 388 { 455 - return 0; 389 + return VISIT; 456 390 } 457 391 458 392 static inline void mem_cgroup_split_huge_fixup(struct page *head)
+4 -2
include/linux/mm.h
··· 176 176 #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ 177 177 #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ 178 178 #define FAULT_FLAG_TRIED 0x40 /* second try */ 179 + #define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ 179 180 180 181 /* 181 182 * vm_fault is filled by the the pagefault handler and passed to the vma's ··· 877 876 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 878 877 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 879 878 #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 879 + #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ 880 880 881 881 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 882 882 883 883 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ 884 - VM_FAULT_HWPOISON_LARGE) 884 + VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) 885 885 886 886 /* Encode hstate index for a hwpoisoned large page */ 887 887 #define VM_FAULT_SET_HINDEX(x) ((x) << 12) ··· 986 984 unmap_mapping_range(mapping, holebegin, holelen, 0); 987 985 } 988 986 989 - extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); 987 + extern void truncate_pagecache(struct inode *inode, loff_t new); 990 988 extern void truncate_setsize(struct inode *inode, loff_t newsize); 991 989 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); 992 990 int truncate_inode_page(struct address_space *mapping, struct page *page);
+1 -1
include/linux/res_counter.h
··· 54 54 struct res_counter *parent; 55 55 }; 56 56 57 - #define RESOURCE_MAX (unsigned long long)LLONG_MAX 57 + #define RES_COUNTER_MAX ULLONG_MAX 58 58 59 59 /** 60 60 * Helpers to interact with userspace
+7
include/linux/sched.h
··· 1393 1393 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ 1394 1394 } memcg_batch; 1395 1395 unsigned int memcg_kmem_skip_account; 1396 + struct memcg_oom_info { 1397 + unsigned int may_oom:1; 1398 + unsigned int in_memcg_oom:1; 1399 + unsigned int oom_locked:1; 1400 + int wakeups; 1401 + struct mem_cgroup *wait_on_memcg; 1402 + } memcg_oom; 1396 1403 #endif 1397 1404 #ifdef CONFIG_UPROBES 1398 1405 struct uprobe_task *utask;
+1 -1
include/linux/swap.h
··· 280 280 extern void mark_page_accessed(struct page *); 281 281 extern void lru_add_drain(void); 282 282 extern void lru_add_drain_cpu(int cpu); 283 - extern int lru_add_drain_all(void); 283 + extern void lru_add_drain_all(void); 284 284 extern void rotate_reclaimable_page(struct page *page); 285 285 extern void deactivate_page(struct page *page); 286 286 extern void swap_setup(void);
+1 -1
kernel/gcov/fs.c
··· 74 74 { 75 75 unsigned long val; 76 76 77 - if (strict_strtoul(str, 0, &val)) { 77 + if (kstrtoul(str, 0, &val)) { 78 78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 79 79 return 0; 80 80 }
+1 -1
kernel/ksysfs.c
··· 113 113 unsigned long cnt; 114 114 int ret; 115 115 116 - if (strict_strtoul(buf, 0, &cnt)) 116 + if (kstrtoul(buf, 0, &cnt)) 117 117 return -EINVAL; 118 118 119 119 ret = crash_shrink_memory(cnt);
+7 -7
kernel/params.c
··· 253 253 EXPORT_SYMBOL(param_ops_##name) 254 254 255 255 256 - STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); 257 - STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 258 - STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); 259 - STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); 260 - STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); 261 - STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 262 - STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 256 + STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); 257 + STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); 258 + STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); 259 + STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); 260 + STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); 261 + STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); 262 + STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); 263 263 264 264 int param_set_charp(const char *val, const struct kernel_param *kp) 265 265 {
+16 -9
kernel/res_counter.c
··· 17 17 void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18 18 { 19 19 spin_lock_init(&counter->lock); 20 - counter->limit = RESOURCE_MAX; 21 - counter->soft_limit = RESOURCE_MAX; 20 + counter->limit = RES_COUNTER_MAX; 21 + counter->soft_limit = RES_COUNTER_MAX; 22 22 counter->parent = parent; 23 23 } 24 24 ··· 178 178 #endif 179 179 180 180 int res_counter_memparse_write_strategy(const char *buf, 181 - unsigned long long *res) 181 + unsigned long long *resp) 182 182 { 183 183 char *end; 184 + unsigned long long res; 184 185 185 - /* return RESOURCE_MAX(unlimited) if "-1" is specified */ 186 + /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 186 187 if (*buf == '-') { 187 - *res = simple_strtoull(buf + 1, &end, 10); 188 - if (*res != 1 || *end != '\0') 188 + res = simple_strtoull(buf + 1, &end, 10); 189 + if (res != 1 || *end != '\0') 189 190 return -EINVAL; 190 - *res = RESOURCE_MAX; 191 + *resp = RES_COUNTER_MAX; 191 192 return 0; 192 193 } 193 194 194 - *res = memparse(buf, &end); 195 + res = memparse(buf, &end); 195 196 if (*end != '\0') 196 197 return -EINVAL; 197 198 198 - *res = PAGE_ALIGN(*res); 199 + if (PAGE_ALIGN(res) >= res) 200 + res = PAGE_ALIGN(res); 201 + else 202 + res = RES_COUNTER_MAX; 203 + 204 + *resp = res; 205 + 199 206 return 0; 200 207 }
+2 -2
mm/Kconfig
··· 245 245 config MIGRATION 246 246 bool "Page migration" 247 247 def_bool y 248 - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA 248 + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU 249 249 help 250 250 Allows the migration of the physical location of pages of processes 251 251 while the virtual addresses are not changed. This is useful in ··· 480 480 481 481 config CMA 482 482 bool "Contiguous Memory Allocator" 483 - depends on HAVE_MEMBLOCK 483 + depends on HAVE_MEMBLOCK && MMU 484 484 select MIGRATION 485 485 select MEMORY_ISOLATION 486 486 help
+35 -24
mm/filemap.c
··· 467 467 error = mem_cgroup_cache_charge(page, current->mm, 468 468 gfp_mask & GFP_RECLAIM_MASK); 469 469 if (error) 470 - goto out; 470 + return error; 471 471 472 472 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 473 - if (error == 0) { 474 - page_cache_get(page); 475 - page->mapping = mapping; 476 - page->index = offset; 477 - 478 - spin_lock_irq(&mapping->tree_lock); 479 - error = radix_tree_insert(&mapping->page_tree, offset, page); 480 - if (likely(!error)) { 481 - mapping->nrpages++; 482 - __inc_zone_page_state(page, NR_FILE_PAGES); 483 - spin_unlock_irq(&mapping->tree_lock); 484 - trace_mm_filemap_add_to_page_cache(page); 485 - } else { 486 - page->mapping = NULL; 487 - /* Leave page->index set: truncation relies upon it */ 488 - spin_unlock_irq(&mapping->tree_lock); 489 - mem_cgroup_uncharge_cache_page(page); 490 - page_cache_release(page); 491 - } 492 - radix_tree_preload_end(); 493 - } else 473 + if (error) { 494 474 mem_cgroup_uncharge_cache_page(page); 495 - out: 475 + return error; 476 + } 477 + 478 + page_cache_get(page); 479 + page->mapping = mapping; 480 + page->index = offset; 481 + 482 + spin_lock_irq(&mapping->tree_lock); 483 + error = radix_tree_insert(&mapping->page_tree, offset, page); 484 + radix_tree_preload_end(); 485 + if (unlikely(error)) 486 + goto err_insert; 487 + mapping->nrpages++; 488 + __inc_zone_page_state(page, NR_FILE_PAGES); 489 + spin_unlock_irq(&mapping->tree_lock); 490 + trace_mm_filemap_add_to_page_cache(page); 491 + return 0; 492 + err_insert: 493 + page->mapping = NULL; 494 + /* Leave page->index set: truncation relies upon it */ 495 + spin_unlock_irq(&mapping->tree_lock); 496 + mem_cgroup_uncharge_cache_page(page); 497 + page_cache_release(page); 496 498 return error; 497 499 } 498 500 EXPORT_SYMBOL(add_to_page_cache_locked); ··· 1616 1614 struct inode *inode = mapping->host; 1617 1615 pgoff_t offset = vmf->pgoff; 1618 1616 struct page *page; 1617 + bool memcg_oom; 1619 1618 pgoff_t size; 1620 1619 int 
ret = 0; 1621 1620 ··· 1625 1622 return VM_FAULT_SIGBUS; 1626 1623 1627 1624 /* 1628 - * Do we have something in the page cache already? 1625 + * Do we have something in the page cache already? Either 1626 + * way, try readahead, but disable the memcg OOM killer for it 1627 + * as readahead is optional and no errors are propagated up 1628 + * the fault stack. The OOM killer is enabled while trying to 1629 + * instantiate the faulting page individually below. 1629 1630 */ 1630 1631 page = find_get_page(mapping, offset); 1631 1632 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { ··· 1637 1630 * We found the page, so try async readahead before 1638 1631 * waiting for the lock. 1639 1632 */ 1633 + memcg_oom = mem_cgroup_toggle_oom(false); 1640 1634 do_async_mmap_readahead(vma, ra, file, page, offset); 1635 + mem_cgroup_toggle_oom(memcg_oom); 1641 1636 } else if (!page) { 1642 1637 /* No page in the page cache at all */ 1638 + memcg_oom = mem_cgroup_toggle_oom(false); 1643 1639 do_sync_mmap_readahead(vma, ra, file, offset); 1640 + mem_cgroup_toggle_oom(memcg_oom); 1644 1641 count_vm_event(PGMAJFAULT); 1645 1642 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1646 1643 ret = VM_FAULT_MAJOR;
+56 -73
mm/huge_memory.c
··· 695 695 return pmd; 696 696 } 697 697 698 - static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) 698 + static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) 699 699 { 700 700 pmd_t entry; 701 - entry = mk_pmd(page, vma->vm_page_prot); 702 - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 701 + entry = mk_pmd(page, prot); 703 702 entry = pmd_mkhuge(entry); 704 703 return entry; 705 704 } ··· 731 732 pte_free(mm, pgtable); 732 733 } else { 733 734 pmd_t entry; 734 - entry = mk_huge_pmd(page, vma); 735 + entry = mk_huge_pmd(page, vma->vm_page_prot); 736 + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 735 737 page_add_new_anon_rmap(page, vma, haddr); 736 738 pgtable_trans_huge_deposit(mm, pmd, pgtable); 737 739 set_pmd_at(mm, haddr, pmd, entry); ··· 788 788 { 789 789 struct page *page; 790 790 unsigned long haddr = address & HPAGE_PMD_MASK; 791 - pte_t *pte; 792 791 793 - if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { 794 - if (unlikely(anon_vma_prepare(vma))) 792 + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 793 + return VM_FAULT_FALLBACK; 794 + if (unlikely(anon_vma_prepare(vma))) 795 + return VM_FAULT_OOM; 796 + if (unlikely(khugepaged_enter(vma))) 797 + return VM_FAULT_OOM; 798 + if (!(flags & FAULT_FLAG_WRITE) && 799 + transparent_hugepage_use_zero_page()) { 800 + pgtable_t pgtable; 801 + struct page *zero_page; 802 + bool set; 803 + pgtable = pte_alloc_one(mm, haddr); 804 + if (unlikely(!pgtable)) 795 805 return VM_FAULT_OOM; 796 - if (unlikely(khugepaged_enter(vma))) 797 - return VM_FAULT_OOM; 798 - if (!(flags & FAULT_FLAG_WRITE) && 799 - transparent_hugepage_use_zero_page()) { 800 - pgtable_t pgtable; 801 - struct page *zero_page; 802 - bool set; 803 - pgtable = pte_alloc_one(mm, haddr); 804 - if (unlikely(!pgtable)) 805 - return VM_FAULT_OOM; 806 - zero_page = get_huge_zero_page(); 807 - if (unlikely(!zero_page)) { 808 - pte_free(mm, pgtable); 809 - 
count_vm_event(THP_FAULT_FALLBACK); 810 - goto out; 811 - } 812 - spin_lock(&mm->page_table_lock); 813 - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 814 - zero_page); 815 - spin_unlock(&mm->page_table_lock); 816 - if (!set) { 817 - pte_free(mm, pgtable); 818 - put_huge_zero_page(); 819 - } 820 - return 0; 821 - } 822 - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 823 - vma, haddr, numa_node_id(), 0); 824 - if (unlikely(!page)) { 806 + zero_page = get_huge_zero_page(); 807 + if (unlikely(!zero_page)) { 808 + pte_free(mm, pgtable); 825 809 count_vm_event(THP_FAULT_FALLBACK); 826 - goto out; 810 + return VM_FAULT_FALLBACK; 827 811 } 828 - count_vm_event(THP_FAULT_ALLOC); 829 - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 830 - put_page(page); 831 - goto out; 812 + spin_lock(&mm->page_table_lock); 813 + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 814 + zero_page); 815 + spin_unlock(&mm->page_table_lock); 816 + if (!set) { 817 + pte_free(mm, pgtable); 818 + put_huge_zero_page(); 832 819 } 833 - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, 834 - page))) { 835 - mem_cgroup_uncharge_page(page); 836 - put_page(page); 837 - goto out; 838 - } 839 - 840 820 return 0; 841 821 } 842 - out: 843 - /* 844 - * Use __pte_alloc instead of pte_alloc_map, because we can't 845 - * run pte_offset_map on the pmd, if an huge pmd could 846 - * materialize from under us from a different thread. 847 - */ 848 - if (unlikely(pmd_none(*pmd)) && 849 - unlikely(__pte_alloc(mm, vma, pmd, address))) 850 - return VM_FAULT_OOM; 851 - /* if an huge pmd materialized from under us just retry later */ 852 - if (unlikely(pmd_trans_huge(*pmd))) 853 - return 0; 854 - /* 855 - * A regular pmd is established and it can't morph into a huge pmd 856 - * from under us anymore at this point because we hold the mmap_sem 857 - * read mode and khugepaged takes it in write mode. So now it's 858 - * safe to run pte_offset_map(). 
859 - */ 860 - pte = pte_offset_map(pmd, address); 861 - return handle_pte_fault(mm, vma, address, pte, pmd, flags); 822 + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 823 + vma, haddr, numa_node_id(), 0); 824 + if (unlikely(!page)) { 825 + count_vm_event(THP_FAULT_FALLBACK); 826 + return VM_FAULT_FALLBACK; 827 + } 828 + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 829 + put_page(page); 830 + count_vm_event(THP_FAULT_FALLBACK); 831 + return VM_FAULT_FALLBACK; 832 + } 833 + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 834 + mem_cgroup_uncharge_page(page); 835 + put_page(page); 836 + count_vm_event(THP_FAULT_FALLBACK); 837 + return VM_FAULT_FALLBACK; 838 + } 839 + 840 + count_vm_event(THP_FAULT_ALLOC); 841 + return 0; 862 842 } 863 843 864 844 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ··· 1150 1170 new_page = NULL; 1151 1171 1152 1172 if (unlikely(!new_page)) { 1153 - count_vm_event(THP_FAULT_FALLBACK); 1154 1173 if (is_huge_zero_pmd(orig_pmd)) { 1155 1174 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, 1156 1175 address, pmd, orig_pmd, haddr); ··· 1160 1181 split_huge_page(page); 1161 1182 put_page(page); 1162 1183 } 1184 + count_vm_event(THP_FAULT_FALLBACK); 1163 1185 goto out; 1164 1186 } 1165 - count_vm_event(THP_FAULT_ALLOC); 1166 1187 1167 1188 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1168 1189 put_page(new_page); ··· 1170 1191 split_huge_page(page); 1171 1192 put_page(page); 1172 1193 } 1194 + count_vm_event(THP_FAULT_FALLBACK); 1173 1195 ret |= VM_FAULT_OOM; 1174 1196 goto out; 1175 1197 } 1198 + 1199 + count_vm_event(THP_FAULT_ALLOC); 1176 1200 1177 1201 if (is_huge_zero_pmd(orig_pmd)) 1178 1202 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); ··· 1197 1215 goto out_mn; 1198 1216 } else { 1199 1217 pmd_t entry; 1200 - entry = mk_huge_pmd(new_page, vma); 1218 + entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1219 + entry = 
maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1201 1220 pmdp_clear_flush(vma, haddr, pmd); 1202 1221 page_add_new_anon_rmap(new_page, vma, haddr); 1203 1222 set_pmd_at(mm, haddr, pmd, entry); ··· 1649 1666 BUG_ON(atomic_read(&page->_count) <= 0); 1650 1667 1651 1668 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); 1652 - __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1653 1669 1654 1670 ClearPageCompound(page); 1655 1671 compound_unlock(page); ··· 2346 2364 __SetPageUptodate(new_page); 2347 2365 pgtable = pmd_pgtable(_pmd); 2348 2366 2349 - _pmd = mk_huge_pmd(new_page, vma); 2367 + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); 2368 + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2350 2369 2351 2370 /* 2352 2371 * spin_lock() below is not the equivalent of smp_wmb(), so
+336 -533
mm/memcontrol.c
··· 39 39 #include <linux/limits.h> 40 40 #include <linux/export.h> 41 41 #include <linux/mutex.h> 42 - #include <linux/rbtree.h> 43 42 #include <linux/slab.h> 44 43 #include <linux/swap.h> 45 44 #include <linux/swapops.h> ··· 84 85 #endif 85 86 86 87 87 - /* 88 - * Statistics for memory cgroup. 89 - */ 90 - enum mem_cgroup_stat_index { 91 - /* 92 - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 93 - */ 94 - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 95 - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 96 - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ 97 - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 98 - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 99 - MEM_CGROUP_STAT_NSTATS, 100 - }; 101 - 102 88 static const char * const mem_cgroup_stat_names[] = { 103 89 "cache", 104 90 "rss", 105 91 "rss_huge", 106 92 "mapped_file", 93 + "writeback", 107 94 "swap", 108 95 }; 109 96 ··· 160 175 161 176 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 162 177 163 - struct rb_node tree_node; /* RB tree node */ 164 - unsigned long long usage_in_excess;/* Set to the value by which */ 165 - /* the soft limit is exceeded*/ 166 - bool on_tree; 167 178 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 168 179 /* use container_of */ 169 180 }; ··· 167 186 struct mem_cgroup_per_node { 168 187 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 169 188 }; 170 - 171 - /* 172 - * Cgroups above their limits are maintained in a RB-Tree, independent of 173 - * their hierarchy representation 174 - */ 175 - 176 - struct mem_cgroup_tree_per_zone { 177 - struct rb_root rb_root; 178 - spinlock_t lock; 179 - }; 180 - 181 - struct mem_cgroup_tree_per_node { 182 - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 183 - }; 184 - 185 - struct mem_cgroup_tree { 186 - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 187 - }; 188 - 189 - static struct mem_cgroup_tree 
soft_limit_tree __read_mostly; 190 189 191 190 struct mem_cgroup_threshold { 192 191 struct eventfd_ctx *eventfd; ··· 241 280 242 281 bool oom_lock; 243 282 atomic_t under_oom; 283 + atomic_t oom_wakeups; 244 284 245 285 int swappiness; 246 286 /* OOM-Killer disable */ ··· 266 304 * Should we move charges of a task when a task is moved into this 267 305 * mem_cgroup ? And what type of charges should we move ? 268 306 */ 269 - unsigned long move_charge_at_immigrate; 307 + unsigned long move_charge_at_immigrate; 270 308 /* 271 309 * set > 0 if pages under this cgroup are moving to other cgroup. 272 310 */ ··· 303 341 atomic_t numainfo_events; 304 342 atomic_t numainfo_updating; 305 343 #endif 344 + /* 345 + * Protects soft_contributed transitions. 346 + * See mem_cgroup_update_soft_limit 347 + */ 348 + spinlock_t soft_lock; 349 + 350 + /* 351 + * If true then this group has increased parents' children_in_excess 352 + * when it got over the soft limit. 353 + * When a group falls below the soft limit, parents' children_in_excess 354 + * is decreased and soft_contributed changed to false. 355 + */ 356 + bool soft_contributed; 357 + 358 + /* Number of children that are in soft limit excess */ 359 + atomic_t children_in_excess; 306 360 307 361 struct mem_cgroup_per_node *nodeinfo[0]; 308 362 /* WARNING: nodeinfo must be the last member here */ ··· 422 444 * limit reclaim to prevent infinite loops, if they ever occur. 
423 445 */ 424 446 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 425 - #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 426 447 427 448 enum charge_type { 428 449 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, ··· 648 671 return mem_cgroup_zoneinfo(memcg, nid, zid); 649 672 } 650 673 651 - static struct mem_cgroup_tree_per_zone * 652 - soft_limit_tree_node_zone(int nid, int zid) 653 - { 654 - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 655 - } 656 - 657 - static struct mem_cgroup_tree_per_zone * 658 - soft_limit_tree_from_page(struct page *page) 659 - { 660 - int nid = page_to_nid(page); 661 - int zid = page_zonenum(page); 662 - 663 - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 664 - } 665 - 666 - static void 667 - __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 668 - struct mem_cgroup_per_zone *mz, 669 - struct mem_cgroup_tree_per_zone *mctz, 670 - unsigned long long new_usage_in_excess) 671 - { 672 - struct rb_node **p = &mctz->rb_root.rb_node; 673 - struct rb_node *parent = NULL; 674 - struct mem_cgroup_per_zone *mz_node; 675 - 676 - if (mz->on_tree) 677 - return; 678 - 679 - mz->usage_in_excess = new_usage_in_excess; 680 - if (!mz->usage_in_excess) 681 - return; 682 - while (*p) { 683 - parent = *p; 684 - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 685 - tree_node); 686 - if (mz->usage_in_excess < mz_node->usage_in_excess) 687 - p = &(*p)->rb_left; 688 - /* 689 - * We can't avoid mem cgroups that are over their soft 690 - * limit by the same amount 691 - */ 692 - else if (mz->usage_in_excess >= mz_node->usage_in_excess) 693 - p = &(*p)->rb_right; 694 - } 695 - rb_link_node(&mz->tree_node, parent, p); 696 - rb_insert_color(&mz->tree_node, &mctz->rb_root); 697 - mz->on_tree = true; 698 - } 699 - 700 - static void 701 - __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 702 - struct mem_cgroup_per_zone *mz, 703 - struct mem_cgroup_tree_per_zone *mctz) 704 - { 705 - if (!mz->on_tree) 706 - return; 707 - 
rb_erase(&mz->tree_node, &mctz->rb_root); 708 - mz->on_tree = false; 709 - } 710 - 711 - static void 712 - mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 713 - struct mem_cgroup_per_zone *mz, 714 - struct mem_cgroup_tree_per_zone *mctz) 715 - { 716 - spin_lock(&mctz->lock); 717 - __mem_cgroup_remove_exceeded(memcg, mz, mctz); 718 - spin_unlock(&mctz->lock); 719 - } 720 - 721 - 722 - static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 723 - { 724 - unsigned long long excess; 725 - struct mem_cgroup_per_zone *mz; 726 - struct mem_cgroup_tree_per_zone *mctz; 727 - int nid = page_to_nid(page); 728 - int zid = page_zonenum(page); 729 - mctz = soft_limit_tree_from_page(page); 730 - 731 - /* 732 - * Necessary to update all ancestors when hierarchy is used. 733 - * because their event counter is not touched. 734 - */ 735 - for (; memcg; memcg = parent_mem_cgroup(memcg)) { 736 - mz = mem_cgroup_zoneinfo(memcg, nid, zid); 737 - excess = res_counter_soft_limit_excess(&memcg->res); 738 - /* 739 - * We have to update the tree if mz is on RB-tree or 740 - * mem is over its softlimit. 741 - */ 742 - if (excess || mz->on_tree) { 743 - spin_lock(&mctz->lock); 744 - /* if on-tree, remove it */ 745 - if (mz->on_tree) 746 - __mem_cgroup_remove_exceeded(memcg, mz, mctz); 747 - /* 748 - * Insert again. mz->usage_in_excess will be updated. 749 - * If excess is 0, no tree ops. 
750 - */ 751 - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 752 - spin_unlock(&mctz->lock); 753 - } 754 - } 755 - } 756 - 757 - static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 758 - { 759 - int node, zone; 760 - struct mem_cgroup_per_zone *mz; 761 - struct mem_cgroup_tree_per_zone *mctz; 762 - 763 - for_each_node(node) { 764 - for (zone = 0; zone < MAX_NR_ZONES; zone++) { 765 - mz = mem_cgroup_zoneinfo(memcg, node, zone); 766 - mctz = soft_limit_tree_node_zone(node, zone); 767 - mem_cgroup_remove_exceeded(memcg, mz, mctz); 768 - } 769 - } 770 - } 771 - 772 - static struct mem_cgroup_per_zone * 773 - __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 774 - { 775 - struct rb_node *rightmost = NULL; 776 - struct mem_cgroup_per_zone *mz; 777 - 778 - retry: 779 - mz = NULL; 780 - rightmost = rb_last(&mctz->rb_root); 781 - if (!rightmost) 782 - goto done; /* Nothing to reclaim from */ 783 - 784 - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 785 - /* 786 - * Remove the node now but someone else can add it back, 787 - * we will to add it back at the end of reclaim to its correct 788 - * position in the tree. 789 - */ 790 - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 791 - if (!res_counter_soft_limit_excess(&mz->memcg->res) || 792 - !css_tryget(&mz->memcg->css)) 793 - goto retry; 794 - done: 795 - return mz; 796 - } 797 - 798 - static struct mem_cgroup_per_zone * 799 - mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 800 - { 801 - struct mem_cgroup_per_zone *mz; 802 - 803 - spin_lock(&mctz->lock); 804 - mz = __mem_cgroup_largest_soft_limit_node(mctz); 805 - spin_unlock(&mctz->lock); 806 - return mz; 807 - } 808 - 809 674 /* 810 675 * Implementation Note: reading percpu statistics for memcg. 
811 676 * ··· 822 1003 } 823 1004 824 1005 /* 1006 + * Called from rate-limited memcg_check_events when enough 1007 + * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure 1008 + * that all the parents up the hierarchy will be notified that this group 1009 + * is in excess or that it is not in excess anymore. memcg->soft_contributed 1010 + * makes the transition a single action whenever the state flips from one to 1011 + * the other. 1012 + */ 1013 + static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) 1014 + { 1015 + unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); 1016 + struct mem_cgroup *parent = memcg; 1017 + int delta = 0; 1018 + 1019 + spin_lock(&memcg->soft_lock); 1020 + if (excess) { 1021 + if (!memcg->soft_contributed) { 1022 + delta = 1; 1023 + memcg->soft_contributed = true; 1024 + } 1025 + } else { 1026 + if (memcg->soft_contributed) { 1027 + delta = -1; 1028 + memcg->soft_contributed = false; 1029 + } 1030 + } 1031 + 1032 + /* 1033 + * Necessary to update all ancestors when hierarchy is used 1034 + * because their event counter is not touched. 1035 + * We track children even outside the hierarchy for the root 1036 + * cgroup because tree walk starting at root should visit 1037 + * all cgroups and we want to prevent from pointless tree 1038 + * walk if no children are below the limit. 1039 + */ 1040 + while (delta && (parent = parent_mem_cgroup(parent))) 1041 + atomic_add(delta, &parent->children_in_excess); 1042 + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) 1043 + atomic_add(delta, &root_mem_cgroup->children_in_excess); 1044 + spin_unlock(&memcg->soft_lock); 1045 + } 1046 + 1047 + /* 825 1048 * Check events in order. 
826 1049 * 827 1050 */ ··· 886 1025 887 1026 mem_cgroup_threshold(memcg); 888 1027 if (unlikely(do_softlimit)) 889 - mem_cgroup_update_tree(memcg, page); 1028 + mem_cgroup_update_soft_limit(memcg); 890 1029 #if MAX_NUMNODES > 1 891 1030 if (unlikely(do_numainfo)) 892 1031 atomic_inc(&memcg->numainfo_events); ··· 929 1068 return memcg; 930 1069 } 931 1070 1071 + static enum mem_cgroup_filter_t 1072 + mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, 1073 + mem_cgroup_iter_filter cond) 1074 + { 1075 + if (!cond) 1076 + return VISIT; 1077 + return cond(memcg, root); 1078 + } 1079 + 932 1080 /* 933 1081 * Returns a next (in a pre-order walk) alive memcg (with elevated css 934 1082 * ref. count) or NULL if the whole root's subtree has been visited. ··· 945 1075 * helper function to be used by mem_cgroup_iter 946 1076 */ 947 1077 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 948 - struct mem_cgroup *last_visited) 1078 + struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) 949 1079 { 950 1080 struct cgroup_subsys_state *prev_css, *next_css; 951 1081 ··· 963 1093 if (next_css) { 964 1094 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 965 1095 966 - if (css_tryget(&mem->css)) 967 - return mem; 968 - else { 1096 + switch (mem_cgroup_filter(mem, root, cond)) { 1097 + case SKIP: 969 1098 prev_css = next_css; 970 1099 goto skip_node; 1100 + case SKIP_TREE: 1101 + if (mem == root) 1102 + return NULL; 1103 + /* 1104 + * css_rightmost_descendant is not an optimal way to 1105 + * skip through a subtree (especially for imbalanced 1106 + * trees leaning to right) but that's what we have right 1107 + * now. More effective solution would be traversing 1108 + * right-up for first non-NULL without calling 1109 + * css_next_descendant_pre afterwards. 
1110 + */ 1111 + prev_css = css_rightmost_descendant(next_css); 1112 + goto skip_node; 1113 + case VISIT: 1114 + if (css_tryget(&mem->css)) 1115 + return mem; 1116 + else { 1117 + prev_css = next_css; 1118 + goto skip_node; 1119 + } 1120 + break; 971 1121 } 972 1122 } 973 1123 ··· 1051 1161 * @root: hierarchy root 1052 1162 * @prev: previously returned memcg, NULL on first invocation 1053 1163 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1164 + * @cond: filter for visited nodes, NULL for no filter 1054 1165 * 1055 1166 * Returns references to children of the hierarchy below @root, or 1056 1167 * @root itself, or %NULL after a full round-trip. ··· 1064 1173 * divide up the memcgs in the hierarchy among all concurrent 1065 1174 * reclaimers operating on the same zone and priority. 1066 1175 */ 1067 - struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1176 + struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, 1068 1177 struct mem_cgroup *prev, 1069 - struct mem_cgroup_reclaim_cookie *reclaim) 1178 + struct mem_cgroup_reclaim_cookie *reclaim, 1179 + mem_cgroup_iter_filter cond) 1070 1180 { 1071 1181 struct mem_cgroup *memcg = NULL; 1072 1182 struct mem_cgroup *last_visited = NULL; 1073 1183 1074 - if (mem_cgroup_disabled()) 1075 - return NULL; 1184 + if (mem_cgroup_disabled()) { 1185 + /* first call must return non-NULL, second return NULL */ 1186 + return (struct mem_cgroup *)(unsigned long)!prev; 1187 + } 1076 1188 1077 1189 if (!root) 1078 1190 root = root_mem_cgroup; ··· 1086 1192 if (!root->use_hierarchy && root != root_mem_cgroup) { 1087 1193 if (prev) 1088 1194 goto out_css_put; 1089 - return root; 1195 + if (mem_cgroup_filter(root, root, cond) == VISIT) 1196 + return root; 1197 + return NULL; 1090 1198 } 1091 1199 1092 1200 rcu_read_lock(); ··· 1111 1215 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1112 1216 } 1113 1217 1114 - memcg = __mem_cgroup_iter_next(root, last_visited); 1218 + memcg = 
__mem_cgroup_iter_next(root, last_visited, cond); 1115 1219 1116 1220 if (reclaim) { 1117 1221 mem_cgroup_iter_update(iter, last_visited, memcg, seq); ··· 1122 1226 reclaim->generation = iter->generation; 1123 1227 } 1124 1228 1125 - if (prev && !memcg) 1229 + /* 1230 + * We have finished the whole tree walk or no group has been 1231 + * visited because filter told us to skip the root node. 1232 + */ 1233 + if (!memcg && (prev || (cond && !last_visited))) 1126 1234 goto out_unlock; 1127 1235 } 1128 1236 out_unlock: ··· 1767 1867 return total; 1768 1868 } 1769 1869 1870 + #if MAX_NUMNODES > 1 1770 1871 /** 1771 1872 * test_mem_cgroup_node_reclaimable 1772 1873 * @memcg: the target memcg ··· 1790 1889 return false; 1791 1890 1792 1891 } 1793 - #if MAX_NUMNODES > 1 1794 1892 1795 1893 /* 1796 1894 * Always updating the nodemask is not very good - even if we have an empty ··· 1857 1957 return node; 1858 1958 } 1859 1959 1860 - /* 1861 - * Check all nodes whether it contains reclaimable pages or not. 1862 - * For quick scan, we make use of scan_nodes. This will allow us to skip 1863 - * unused nodes. But scan_nodes is lazily updated and may not cotain 1864 - * enough new information. We need to do double check. 1865 - */ 1866 - static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1867 - { 1868 - int nid; 1869 - 1870 - /* 1871 - * quick check...making use of scan_node. 1872 - * We can skip unused nodes. 1873 - */ 1874 - if (!nodes_empty(memcg->scan_nodes)) { 1875 - for (nid = first_node(memcg->scan_nodes); 1876 - nid < MAX_NUMNODES; 1877 - nid = next_node(nid, memcg->scan_nodes)) { 1878 - 1879 - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1880 - return true; 1881 - } 1882 - } 1883 - /* 1884 - * Check rest of nodes. 
1885 - */ 1886 - for_each_node_state(nid, N_MEMORY) { 1887 - if (node_isset(nid, memcg->scan_nodes)) 1888 - continue; 1889 - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1890 - return true; 1891 - } 1892 - return false; 1893 - } 1894 - 1895 1960 #else 1896 1961 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1897 1962 { 1898 1963 return 0; 1899 1964 } 1900 1965 1901 - static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1902 - { 1903 - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1904 - } 1905 1966 #endif 1906 1967 1907 - static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1908 - struct zone *zone, 1909 - gfp_t gfp_mask, 1910 - unsigned long *total_scanned) 1968 + /* 1969 + * A group is eligible for the soft limit reclaim under the given root 1970 + * hierarchy if 1971 + * a) it is over its soft limit 1972 + * b) any parent up the hierarchy is over its soft limit 1973 + * 1974 + * If the given group doesn't have any children over the limit then it 1975 + * doesn't make any sense to iterate its subtree. 
1976 + */ 1977 + enum mem_cgroup_filter_t 1978 + mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 1979 + struct mem_cgroup *root) 1911 1980 { 1912 - struct mem_cgroup *victim = NULL; 1913 - int total = 0; 1914 - int loop = 0; 1915 - unsigned long excess; 1916 - unsigned long nr_scanned; 1917 - struct mem_cgroup_reclaim_cookie reclaim = { 1918 - .zone = zone, 1919 - .priority = 0, 1920 - }; 1981 + struct mem_cgroup *parent; 1921 1982 1922 - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1983 + if (!memcg) 1984 + memcg = root_mem_cgroup; 1985 + parent = memcg; 1923 1986 1924 - while (1) { 1925 - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1926 - if (!victim) { 1927 - loop++; 1928 - if (loop >= 2) { 1929 - /* 1930 - * If we have not been able to reclaim 1931 - * anything, it might because there are 1932 - * no reclaimable pages under this hierarchy 1933 - */ 1934 - if (!total) 1935 - break; 1936 - /* 1937 - * We want to do more targeted reclaim. 1938 - * excess >> 2 is not to excessive so as to 1939 - * reclaim too much, nor too less that we keep 1940 - * coming back to reclaim from this cgroup 1941 - */ 1942 - if (total >= (excess >> 2) || 1943 - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1944 - break; 1945 - } 1946 - continue; 1947 - } 1948 - if (!mem_cgroup_reclaimable(victim, false)) 1949 - continue; 1950 - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1951 - zone, &nr_scanned); 1952 - *total_scanned += nr_scanned; 1953 - if (!res_counter_soft_limit_excess(&root_memcg->res)) 1987 + if (res_counter_soft_limit_excess(&memcg->res)) 1988 + return VISIT; 1989 + 1990 + /* 1991 + * If any parent up to the root in the hierarchy is over its soft limit 1992 + * then we have to obey and reclaim from this group as well. 
1993 + */ 1994 + while ((parent = parent_mem_cgroup(parent))) { 1995 + if (res_counter_soft_limit_excess(&parent->res)) 1996 + return VISIT; 1997 + if (parent == root) 1954 1998 break; 1955 1999 } 1956 - mem_cgroup_iter_break(root_memcg, victim); 1957 - return total; 2000 + 2001 + if (!atomic_read(&memcg->children_in_excess)) 2002 + return SKIP_TREE; 2003 + return SKIP; 1958 2004 } 2005 + 2006 + static DEFINE_SPINLOCK(memcg_oom_lock); 1959 2007 1960 2008 /* 1961 2009 * Check OOM-Killer is already running under our hierarchy. 1962 2010 * If someone is running, return false. 1963 - * Has to be called with memcg_oom_lock 1964 2011 */ 1965 - static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 2012 + static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1966 2013 { 1967 2014 struct mem_cgroup *iter, *failed = NULL; 2015 + 2016 + spin_lock(&memcg_oom_lock); 1968 2017 1969 2018 for_each_mem_cgroup_tree(iter, memcg) { 1970 2019 if (iter->oom_lock) { ··· 1928 2079 iter->oom_lock = true; 1929 2080 } 1930 2081 1931 - if (!failed) 1932 - return true; 1933 - 1934 - /* 1935 - * OK, we failed to lock the whole subtree so we have to clean up 1936 - * what we set up to the failing subtree 1937 - */ 1938 - for_each_mem_cgroup_tree(iter, memcg) { 1939 - if (iter == failed) { 1940 - mem_cgroup_iter_break(memcg, iter); 1941 - break; 2082 + if (failed) { 2083 + /* 2084 + * OK, we failed to lock the whole subtree so we have 2085 + * to clean up what we set up to the failing subtree 2086 + */ 2087 + for_each_mem_cgroup_tree(iter, memcg) { 2088 + if (iter == failed) { 2089 + mem_cgroup_iter_break(memcg, iter); 2090 + break; 2091 + } 2092 + iter->oom_lock = false; 1942 2093 } 1943 - iter->oom_lock = false; 1944 2094 } 1945 - return false; 2095 + 2096 + spin_unlock(&memcg_oom_lock); 2097 + 2098 + return !failed; 1946 2099 } 1947 2100 1948 - /* 1949 - * Has to be called with memcg_oom_lock 1950 - */ 1951 - static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2101 + static 
void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1952 2102 { 1953 2103 struct mem_cgroup *iter; 1954 2104 2105 + spin_lock(&memcg_oom_lock); 1955 2106 for_each_mem_cgroup_tree(iter, memcg) 1956 2107 iter->oom_lock = false; 1957 - return 0; 2108 + spin_unlock(&memcg_oom_lock); 1958 2109 } 1959 2110 1960 2111 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) ··· 1978 2129 atomic_add_unless(&iter->under_oom, -1, 0); 1979 2130 } 1980 2131 1981 - static DEFINE_SPINLOCK(memcg_oom_lock); 1982 2132 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1983 2133 1984 2134 struct oom_wait_info { ··· 2007 2159 2008 2160 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2009 2161 { 2162 + atomic_inc(&memcg->oom_wakeups); 2010 2163 /* for filtering, pass "memcg" as argument. */ 2011 2164 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2012 2165 } ··· 2019 2170 } 2020 2171 2021 2172 /* 2022 - * try to call OOM killer. returns false if we should exit memory-reclaim loop. 2173 + * try to call OOM killer 2023 2174 */ 2024 - static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, 2025 - int order) 2175 + static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2176 + { 2177 + bool locked; 2178 + int wakeups; 2179 + 2180 + if (!current->memcg_oom.may_oom) 2181 + return; 2182 + 2183 + current->memcg_oom.in_memcg_oom = 1; 2184 + 2185 + /* 2186 + * As with any blocking lock, a contender needs to start 2187 + * listening for wakeups before attempting the trylock, 2188 + * otherwise it can miss the wakeup from the unlock and sleep 2189 + * indefinitely. This is just open-coded because our locking 2190 + * is so particular to memcg hierarchies. 
2191 + */ 2192 + wakeups = atomic_read(&memcg->oom_wakeups); 2193 + mem_cgroup_mark_under_oom(memcg); 2194 + 2195 + locked = mem_cgroup_oom_trylock(memcg); 2196 + 2197 + if (locked) 2198 + mem_cgroup_oom_notify(memcg); 2199 + 2200 + if (locked && !memcg->oom_kill_disable) { 2201 + mem_cgroup_unmark_under_oom(memcg); 2202 + mem_cgroup_out_of_memory(memcg, mask, order); 2203 + mem_cgroup_oom_unlock(memcg); 2204 + /* 2205 + * There is no guarantee that an OOM-lock contender 2206 + * sees the wakeups triggered by the OOM kill 2207 + * uncharges. Wake any sleepers explicitly. 2208 + */ 2209 + memcg_oom_recover(memcg); 2210 + } else { 2211 + /* 2212 + * A system call can just return -ENOMEM, but if this 2213 + * is a page fault and somebody else is handling the 2214 + * OOM already, we need to sleep on the OOM waitqueue 2215 + * for this memcg until the situation is resolved. 2216 + * Which can take some time because it might be 2217 + * handled by a userspace task. 2218 + * 2219 + * However, this is the charge context, which means 2220 + * that we may sit on a large call stack and hold 2221 + * various filesystem locks, the mmap_sem etc. and we 2222 + * don't want the OOM handler to deadlock on them 2223 + * while we sit here and wait. Store the current OOM 2224 + * context in the task_struct, then return -ENOMEM. 2225 + * At the end of the page fault handler, with the 2226 + * stack unwound, pagefault_out_of_memory() will check 2227 + * back with us by calling 2228 + * mem_cgroup_oom_synchronize(), possibly putting the 2229 + * task to sleep. 
2230 + */ 2231 + current->memcg_oom.oom_locked = locked; 2232 + current->memcg_oom.wakeups = wakeups; 2233 + css_get(&memcg->css); 2234 + current->memcg_oom.wait_on_memcg = memcg; 2235 + } 2236 + } 2237 + 2238 + /** 2239 + * mem_cgroup_oom_synchronize - complete memcg OOM handling 2240 + * 2241 + * This has to be called at the end of a page fault if the memcg 2242 + * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. 2243 + * 2244 + * Memcg supports userspace OOM handling, so failed allocations must 2245 + * sleep on a waitqueue until the userspace task resolves the 2246 + * situation. Sleeping directly in the charge context with all kinds 2247 + * of locks held is not a good idea, instead we remember an OOM state 2248 + * in the task and mem_cgroup_oom_synchronize() has to be called at 2249 + * the end of the page fault to put the task to sleep and clean up the 2250 + * OOM state. 2251 + * 2252 + * Returns %true if an ongoing memcg OOM situation was detected and 2253 + * finalized, %false otherwise. 2254 + */ 2255 + bool mem_cgroup_oom_synchronize(void) 2026 2256 { 2027 2257 struct oom_wait_info owait; 2028 - bool locked, need_to_kill; 2258 + struct mem_cgroup *memcg; 2259 + 2260 + /* OOM is global, do not handle */ 2261 + if (!current->memcg_oom.in_memcg_oom) 2262 + return false; 2263 + 2264 + /* 2265 + * We invoked the OOM killer but there is a chance that a kill 2266 + * did not free up any charges. Everybody else might already 2267 + * be sleeping, so restart the fault and keep the rampage 2268 + * going until some charges are released. 
2269 + */ 2270 + memcg = current->memcg_oom.wait_on_memcg; 2271 + if (!memcg) 2272 + goto out; 2273 + 2274 + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2275 + goto out_memcg; 2029 2276 2030 2277 owait.memcg = memcg; 2031 2278 owait.wait.flags = 0; 2032 2279 owait.wait.func = memcg_oom_wake_function; 2033 2280 owait.wait.private = current; 2034 2281 INIT_LIST_HEAD(&owait.wait.task_list); 2035 - need_to_kill = true; 2036 - mem_cgroup_mark_under_oom(memcg); 2037 2282 2038 - /* At first, try to OOM lock hierarchy under memcg.*/ 2039 - spin_lock(&memcg_oom_lock); 2040 - locked = mem_cgroup_oom_lock(memcg); 2041 - /* 2042 - * Even if signal_pending(), we can't quit charge() loop without 2043 - * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 2044 - * under OOM is always welcomed, use TASK_KILLABLE here. 2045 - */ 2046 2283 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2047 - if (!locked || memcg->oom_kill_disable) 2048 - need_to_kill = false; 2049 - if (locked) 2050 - mem_cgroup_oom_notify(memcg); 2051 - spin_unlock(&memcg_oom_lock); 2052 - 2053 - if (need_to_kill) { 2054 - finish_wait(&memcg_oom_waitq, &owait.wait); 2055 - mem_cgroup_out_of_memory(memcg, mask, order); 2056 - } else { 2284 + /* Only sleep if we didn't miss any wakeups since OOM */ 2285 + if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) 2057 2286 schedule(); 2058 - finish_wait(&memcg_oom_waitq, &owait.wait); 2059 - } 2060 - spin_lock(&memcg_oom_lock); 2061 - if (locked) 2062 - mem_cgroup_oom_unlock(memcg); 2063 - memcg_wakeup_oom(memcg); 2064 - spin_unlock(&memcg_oom_lock); 2065 - 2287 + finish_wait(&memcg_oom_waitq, &owait.wait); 2288 + out_memcg: 2066 2289 mem_cgroup_unmark_under_oom(memcg); 2067 - 2068 - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2069 - return false; 2070 - /* Give chance to dying process */ 2071 - schedule_timeout_uninterruptible(1); 2290 + if (current->memcg_oom.oom_locked) { 2291 + 
mem_cgroup_oom_unlock(memcg); 2292 + /* 2293 + * There is no guarantee that an OOM-lock contender 2294 + * sees the wakeups triggered by the OOM kill 2295 + * uncharges. Wake any sleepers explicitly. 2296 + */ 2297 + memcg_oom_recover(memcg); 2298 + } 2299 + css_put(&memcg->css); 2300 + current->memcg_oom.wait_on_memcg = NULL; 2301 + out: 2302 + current->memcg_oom.in_memcg_oom = 0; 2072 2303 return true; 2073 2304 } 2074 2305 ··· 2217 2288 } 2218 2289 2219 2290 void mem_cgroup_update_page_stat(struct page *page, 2220 - enum mem_cgroup_page_stat_item idx, int val) 2291 + enum mem_cgroup_stat_index idx, int val) 2221 2292 { 2222 2293 struct mem_cgroup *memcg; 2223 2294 struct page_cgroup *pc = lookup_page_cgroup(page); ··· 2226 2297 if (mem_cgroup_disabled()) 2227 2298 return; 2228 2299 2300 + VM_BUG_ON(!rcu_read_lock_held()); 2229 2301 memcg = pc->mem_cgroup; 2230 2302 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2231 2303 return; 2232 - 2233 - switch (idx) { 2234 - case MEMCG_NR_FILE_MAPPED: 2235 - idx = MEM_CGROUP_STAT_FILE_MAPPED; 2236 - break; 2237 - default: 2238 - BUG(); 2239 - } 2240 2304 2241 2305 this_cpu_add(memcg->stat->count[idx], val); 2242 2306 } ··· 2372 2450 flush_work(&stock->work); 2373 2451 } 2374 2452 out: 2375 - put_online_cpus(); 2453 + put_online_cpus(); 2376 2454 } 2377 2455 2378 2456 /* ··· 2454 2532 CHARGE_RETRY, /* need to retry but retry is not bad */ 2455 2533 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2456 2534 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. 
*/ 2457 - CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2458 2535 }; 2459 2536 2460 2537 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2461 2538 unsigned int nr_pages, unsigned int min_pages, 2462 - bool oom_check) 2539 + bool invoke_oom) 2463 2540 { 2464 2541 unsigned long csize = nr_pages * PAGE_SIZE; 2465 2542 struct mem_cgroup *mem_over_limit; ··· 2515 2594 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2516 2595 return CHARGE_RETRY; 2517 2596 2518 - /* If we don't need to call oom-killer at el, return immediately */ 2519 - if (!oom_check) 2520 - return CHARGE_NOMEM; 2521 - /* check OOM */ 2522 - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2523 - return CHARGE_OOM_DIE; 2597 + if (invoke_oom) 2598 + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); 2524 2599 2525 - return CHARGE_RETRY; 2600 + return CHARGE_NOMEM; 2526 2601 } 2527 2602 2528 2603 /* ··· 2621 2704 } 2622 2705 2623 2706 do { 2624 - bool oom_check; 2707 + bool invoke_oom = oom && !nr_oom_retries; 2625 2708 2626 2709 /* If killed, bypass charge */ 2627 2710 if (fatal_signal_pending(current)) { ··· 2629 2712 goto bypass; 2630 2713 } 2631 2714 2632 - oom_check = false; 2633 - if (oom && !nr_oom_retries) { 2634 - oom_check = true; 2635 - nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2636 - } 2637 - 2638 - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, 2639 - oom_check); 2715 + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2716 + nr_pages, invoke_oom); 2640 2717 switch (ret) { 2641 2718 case CHARGE_OK: 2642 2719 break; ··· 2643 2732 css_put(&memcg->css); 2644 2733 goto nomem; 2645 2734 case CHARGE_NOMEM: /* OOM routine works */ 2646 - if (!oom) { 2735 + if (!oom || invoke_oom) { 2647 2736 css_put(&memcg->css); 2648 2737 goto nomem; 2649 2738 } 2650 - /* If oom, we never return -ENOMEM */ 2651 2739 nr_oom_retries--; 2652 2740 break; 2653 - case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2654 - 
css_put(&memcg->css); 2655 - goto bypass; 2656 2741 } 2657 2742 } while (ret != CHARGE_OK); 2658 2743 ··· 2789 2882 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2790 2883 * before USED bit, we need memory barrier here. 2791 2884 * See mem_cgroup_add_lru_list(), etc. 2792 - */ 2885 + */ 2793 2886 smp_wmb(); 2794 2887 SetPageCgroupUsed(pc); 2795 2888 ··· 2812 2905 unlock_page_cgroup(pc); 2813 2906 2814 2907 /* 2815 - * "charge_statistics" updated event counter. Then, check it. 2816 - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2817 - * if they exceeds softlimit. 2908 + * "charge_statistics" updated event counter. 2818 2909 */ 2819 2910 memcg_check_events(memcg, page); 2820 2911 } ··· 3531 3626 * the page allocator. Therefore, the following sequence when backed by 3532 3627 * the SLUB allocator: 3533 3628 * 3534 - * memcg_stop_kmem_account(); 3535 - * kmalloc(<large_number>) 3536 - * memcg_resume_kmem_account(); 3629 + * memcg_stop_kmem_account(); 3630 + * kmalloc(<large_number>) 3631 + * memcg_resume_kmem_account(); 3537 3632 * 3538 3633 * would effectively ignore the fact that we should skip accounting, 3539 3634 * since it will drive us directly to this function without passing ··· 3655 3750 } 3656 3751 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3657 3752 3753 + static inline 3754 + void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, 3755 + struct mem_cgroup *to, 3756 + unsigned int nr_pages, 3757 + enum mem_cgroup_stat_index idx) 3758 + { 3759 + /* Update stat data for mem_cgroup */ 3760 + preempt_disable(); 3761 + WARN_ON_ONCE(from->stat->count[idx] < nr_pages); 3762 + __this_cpu_add(from->stat->count[idx], -nr_pages); 3763 + __this_cpu_add(to->stat->count[idx], nr_pages); 3764 + preempt_enable(); 3765 + } 3766 + 3658 3767 /** 3659 3768 * mem_cgroup_move_account - move account of the page 3660 3769 * @page: the page ··· 3714 3795 3715 3796 move_lock_mem_cgroup(from, &flags); 3716 3797 3717 - if (!anon && 
page_mapped(page)) { 3718 - /* Update mapped_file data for mem_cgroup */ 3719 - preempt_disable(); 3720 - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3721 - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3722 - preempt_enable(); 3723 - } 3798 + if (!anon && page_mapped(page)) 3799 + mem_cgroup_move_account_page_stat(from, to, nr_pages, 3800 + MEM_CGROUP_STAT_FILE_MAPPED); 3801 + 3802 + if (PageWriteback(page)) 3803 + mem_cgroup_move_account_page_stat(from, to, nr_pages, 3804 + MEM_CGROUP_STAT_WRITEBACK); 3805 + 3724 3806 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3725 3807 3726 3808 /* caller should have done css_get */ ··· 4577 4657 MEM_CGROUP_RECLAIM_SHRINK); 4578 4658 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4579 4659 /* Usage is reduced ? */ 4580 - if (curusage >= oldusage) 4660 + if (curusage >= oldusage) 4581 4661 retry_count--; 4582 4662 else 4583 4663 oldusage = curusage; ··· 4598 4678 int enlarge = 0; 4599 4679 4600 4680 /* see mem_cgroup_resize_res_limit */ 4601 - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4681 + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4602 4682 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4603 4683 while (retry_count) { 4604 4684 if (signal_pending(current)) { ··· 4645 4725 if (!ret && enlarge) 4646 4726 memcg_oom_recover(memcg); 4647 4727 return ret; 4648 - } 4649 - 4650 - unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4651 - gfp_t gfp_mask, 4652 - unsigned long *total_scanned) 4653 - { 4654 - unsigned long nr_reclaimed = 0; 4655 - struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4656 - unsigned long reclaimed; 4657 - int loop = 0; 4658 - struct mem_cgroup_tree_per_zone *mctz; 4659 - unsigned long long excess; 4660 - unsigned long nr_scanned; 4661 - 4662 - if (order > 0) 4663 - return 0; 4664 - 4665 - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4666 - /* 4667 - * This loop can run a 
while, specially if mem_cgroup's continuously 4668 - * keep exceeding their soft limit and putting the system under 4669 - * pressure 4670 - */ 4671 - do { 4672 - if (next_mz) 4673 - mz = next_mz; 4674 - else 4675 - mz = mem_cgroup_largest_soft_limit_node(mctz); 4676 - if (!mz) 4677 - break; 4678 - 4679 - nr_scanned = 0; 4680 - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4681 - gfp_mask, &nr_scanned); 4682 - nr_reclaimed += reclaimed; 4683 - *total_scanned += nr_scanned; 4684 - spin_lock(&mctz->lock); 4685 - 4686 - /* 4687 - * If we failed to reclaim anything from this memory cgroup 4688 - * it is time to move on to the next cgroup 4689 - */ 4690 - next_mz = NULL; 4691 - if (!reclaimed) { 4692 - do { 4693 - /* 4694 - * Loop until we find yet another one. 4695 - * 4696 - * By the time we get the soft_limit lock 4697 - * again, someone might have aded the 4698 - * group back on the RB tree. Iterate to 4699 - * make sure we get a different mem. 4700 - * mem_cgroup_largest_soft_limit_node returns 4701 - * NULL if no other cgroup is present on 4702 - * the tree 4703 - */ 4704 - next_mz = 4705 - __mem_cgroup_largest_soft_limit_node(mctz); 4706 - if (next_mz == mz) 4707 - css_put(&next_mz->memcg->css); 4708 - else /* next_mz == NULL or other memcg */ 4709 - break; 4710 - } while (1); 4711 - } 4712 - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4713 - excess = res_counter_soft_limit_excess(&mz->memcg->res); 4714 - /* 4715 - * One school of thought says that we should not add 4716 - * back the node to the tree if reclaim returns 0. 4717 - * But our reclaim could return 0, simply because due 4718 - * to priority we are exposing a smaller subset of 4719 - * memory to reclaim from. Consider this as a longer 4720 - * term TODO. 
4721 - */ 4722 - /* If excess == 0, no tree ops */ 4723 - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4724 - spin_unlock(&mctz->lock); 4725 - css_put(&mz->memcg->css); 4726 - loop++; 4727 - /* 4728 - * Could not reclaim anything and there are no more 4729 - * mem cgroups to try or we seem to be looping without 4730 - * reclaiming anything. 4731 - */ 4732 - if (!nr_reclaimed && 4733 - (next_mz == NULL || 4734 - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 4735 - break; 4736 - } while (!nr_reclaimed); 4737 - if (next_mz) 4738 - css_put(&next_mz->memcg->css); 4739 - return nr_reclaimed; 4740 4728 } 4741 4729 4742 4730 /** ··· 4818 4990 unsigned int event) 4819 4991 { 4820 4992 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4821 - int ret; 4822 4993 4823 4994 if (mem_cgroup_is_root(memcg)) 4824 4995 return -EINVAL; 4825 - css_get(&memcg->css); 4826 - ret = mem_cgroup_force_empty(memcg); 4827 - css_put(&memcg->css); 4828 - 4829 - return ret; 4996 + return mem_cgroup_force_empty(memcg); 4830 4997 } 4831 - 4832 4998 4833 4999 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4834 5000 struct cftype *cft) ··· 4961 5139 */ 4962 5140 mutex_lock(&memcg_create_mutex); 4963 5141 mutex_lock(&set_limit_mutex); 4964 - if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 5142 + if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 4965 5143 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 4966 5144 ret = -EBUSY; 4967 5145 goto out; ··· 4971 5149 4972 5150 ret = memcg_update_cache_sizes(memcg); 4973 5151 if (ret) { 4974 - res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5152 + res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 4975 5153 goto out; 4976 5154 } 4977 5155 static_key_slow_inc(&memcg_kmem_enabled_key); ··· 5911 6089 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5912 6090 mz = &pn->zoneinfo[zone]; 5913 6091 lruvec_init(&mz->lruvec); 5914 - mz->usage_in_excess = 0; 5915 - mz->on_tree = false; 5916 
6092 mz->memcg = memcg; 5917 6093 } 5918 6094 memcg->nodeinfo[node] = pn; ··· 5966 6146 int node; 5967 6147 size_t size = memcg_size(); 5968 6148 5969 - mem_cgroup_remove_from_trees(memcg); 5970 6149 free_css_id(&mem_cgroup_subsys, &memcg->css); 5971 6150 5972 6151 for_each_node(node) ··· 6002 6183 } 6003 6184 EXPORT_SYMBOL(parent_mem_cgroup); 6004 6185 6005 - static void __init mem_cgroup_soft_limit_tree_init(void) 6006 - { 6007 - struct mem_cgroup_tree_per_node *rtpn; 6008 - struct mem_cgroup_tree_per_zone *rtpz; 6009 - int tmp, node, zone; 6010 - 6011 - for_each_node(node) { 6012 - tmp = node; 6013 - if (!node_state(node, N_NORMAL_MEMORY)) 6014 - tmp = -1; 6015 - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6016 - BUG_ON(!rtpn); 6017 - 6018 - soft_limit_tree.rb_tree_per_node[node] = rtpn; 6019 - 6020 - for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6021 - rtpz = &rtpn->rb_tree_per_zone[zone]; 6022 - rtpz->rb_root = RB_ROOT; 6023 - spin_lock_init(&rtpz->lock); 6024 - } 6025 - } 6026 - } 6027 - 6028 6186 static struct cgroup_subsys_state * __ref 6029 6187 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6030 6188 { ··· 6031 6235 mutex_init(&memcg->thresholds_lock); 6032 6236 spin_lock_init(&memcg->move_lock); 6033 6237 vmpressure_init(&memcg->vmpressure); 6238 + spin_lock_init(&memcg->soft_lock); 6034 6239 6035 6240 return &memcg->css; 6036 6241 ··· 6109 6312 6110 6313 mem_cgroup_invalidate_reclaim_iterators(memcg); 6111 6314 mem_cgroup_reparent_charges(memcg); 6315 + if (memcg->soft_contributed) { 6316 + while ((memcg = parent_mem_cgroup(memcg))) 6317 + atomic_dec(&memcg->children_in_excess); 6318 + 6319 + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) 6320 + atomic_dec(&root_mem_cgroup->children_in_excess); 6321 + } 6112 6322 mem_cgroup_destroy_all_caches(memcg); 6113 6323 vmpressure_cleanup(&memcg->vmpressure); 6114 6324 } ··· 6790 6986 { 6791 6987 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6792 6988 
enable_swap_cgroup(); 6793 - mem_cgroup_soft_limit_tree_init(); 6794 6989 memcg_stock_init(); 6795 6990 return 0; 6796 6991 }
+39 -13
mm/memory.c
··· 3695 3695 * but allow concurrent faults), and pte mapped but not yet locked. 3696 3696 * We return with mmap_sem still held, but pte unmapped and unlocked. 3697 3697 */ 3698 - int handle_pte_fault(struct mm_struct *mm, 3698 + static int handle_pte_fault(struct mm_struct *mm, 3699 3699 struct vm_area_struct *vma, unsigned long address, 3700 3700 pte_t *pte, pmd_t *pmd, unsigned int flags) 3701 3701 { ··· 3754 3754 /* 3755 3755 * By the time we get here, we already hold the mm semaphore 3756 3756 */ 3757 - int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3758 - unsigned long address, unsigned int flags) 3757 + static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3758 + unsigned long address, unsigned int flags) 3759 3759 { 3760 3760 pgd_t *pgd; 3761 3761 pud_t *pud; 3762 3762 pmd_t *pmd; 3763 3763 pte_t *pte; 3764 - 3765 - __set_current_state(TASK_RUNNING); 3766 - 3767 - count_vm_event(PGFAULT); 3768 - mem_cgroup_count_vm_event(mm, PGFAULT); 3769 - 3770 - /* do counter updates before entering really critical section. 
*/ 3771 - check_sync_rss_stat(current); 3772 3764 3773 3765 if (unlikely(is_vm_hugetlb_page(vma))) 3774 3766 return hugetlb_fault(mm, vma, address, flags); ··· 3774 3782 if (!pmd) 3775 3783 return VM_FAULT_OOM; 3776 3784 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3785 + int ret = VM_FAULT_FALLBACK; 3777 3786 if (!vma->vm_ops) 3778 - return do_huge_pmd_anonymous_page(mm, vma, address, 3779 - pmd, flags); 3787 + ret = do_huge_pmd_anonymous_page(mm, vma, address, 3788 + pmd, flags); 3789 + if (!(ret & VM_FAULT_FALLBACK)) 3790 + return ret; 3780 3791 } else { 3781 3792 pmd_t orig_pmd = *pmd; 3782 3793 int ret; ··· 3843 3848 pte = pte_offset_map(pmd, address); 3844 3849 3845 3850 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3851 + } 3852 + 3853 + int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3854 + unsigned long address, unsigned int flags) 3855 + { 3856 + int ret; 3857 + 3858 + __set_current_state(TASK_RUNNING); 3859 + 3860 + count_vm_event(PGFAULT); 3861 + mem_cgroup_count_vm_event(mm, PGFAULT); 3862 + 3863 + /* do counter updates before entering really critical section. */ 3864 + check_sync_rss_stat(current); 3865 + 3866 + /* 3867 + * Enable the memcg OOM handling for faults triggered in user 3868 + * space. Kernel faults are handled more gracefully. 3869 + */ 3870 + if (flags & FAULT_FLAG_USER) 3871 + mem_cgroup_enable_oom(); 3872 + 3873 + ret = __handle_mm_fault(mm, vma, address, flags); 3874 + 3875 + if (flags & FAULT_FLAG_USER) 3876 + mem_cgroup_disable_oom(); 3877 + 3878 + if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) 3879 + mem_cgroup_oom_synchronize(); 3880 + 3881 + return ret; 3846 3882 } 3847 3883 3848 3884 #ifndef __PAGETABLE_PUD_FOLDED
+5 -2
mm/oom_kill.c
··· 678 678 */ 679 679 void pagefault_out_of_memory(void) 680 680 { 681 - struct zonelist *zonelist = node_zonelist(first_online_node, 682 - GFP_KERNEL); 681 + struct zonelist *zonelist; 683 682 683 + if (mem_cgroup_oom_synchronize()) 684 + return; 685 + 686 + zonelist = node_zonelist(first_online_node, GFP_KERNEL); 684 687 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { 685 688 out_of_memory(NULL, 0, 0, NULL, false); 686 689 clear_zonelist_oom(zonelist, GFP_KERNEL);
+15
mm/page-writeback.c
··· 2143 2143 2144 2144 /* 2145 2145 * Helper function for set_page_writeback family. 2146 + * 2147 + * The caller must hold mem_cgroup_begin/end_update_page_stat() lock 2148 + * while calling this function. 2149 + * See test_set_page_writeback for example. 2150 + * 2146 2151 * NOTE: Unlike account_page_dirtied this does not rely on being atomic 2147 2152 * wrt interrupts. 2148 2153 */ 2149 2154 void account_page_writeback(struct page *page) 2150 2155 { 2156 + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2151 2157 inc_zone_page_state(page, NR_WRITEBACK); 2152 2158 } 2153 2159 EXPORT_SYMBOL(account_page_writeback); ··· 2370 2364 { 2371 2365 struct address_space *mapping = page_mapping(page); 2372 2366 int ret; 2367 + bool locked; 2368 + unsigned long memcg_flags; 2373 2369 2370 + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2374 2371 if (mapping) { 2375 2372 struct backing_dev_info *bdi = mapping->backing_dev_info; 2376 2373 unsigned long flags; ··· 2394 2385 ret = TestClearPageWriteback(page); 2395 2386 } 2396 2387 if (ret) { 2388 + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2397 2389 dec_zone_page_state(page, NR_WRITEBACK); 2398 2390 inc_zone_page_state(page, NR_WRITTEN); 2399 2391 } 2392 + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2400 2393 return ret; 2401 2394 } 2402 2395 ··· 2406 2395 { 2407 2396 struct address_space *mapping = page_mapping(page); 2408 2397 int ret; 2398 + bool locked; 2399 + unsigned long memcg_flags; 2409 2400 2401 + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2410 2402 if (mapping) { 2411 2403 struct backing_dev_info *bdi = mapping->backing_dev_info; 2412 2404 unsigned long flags; ··· 2436 2422 } 2437 2423 if (!ret) 2438 2424 account_page_writeback(page); 2425 + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2439 2426 return ret; 2440 2427 2441 2428 }
+11 -11
mm/rmap.c
··· 1052 1052 { 1053 1053 int first = atomic_inc_and_test(&page->_mapcount); 1054 1054 if (first) { 1055 - if (!PageTransHuge(page)) 1056 - __inc_zone_page_state(page, NR_ANON_PAGES); 1057 - else 1055 + if (PageTransHuge(page)) 1058 1056 __inc_zone_page_state(page, 1059 1057 NR_ANON_TRANSPARENT_HUGEPAGES); 1058 + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1059 + hpage_nr_pages(page)); 1060 1060 } 1061 1061 if (unlikely(PageKsm(page))) 1062 1062 return; ··· 1085 1085 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1086 1086 SetPageSwapBacked(page); 1087 1087 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1088 - if (!PageTransHuge(page)) 1089 - __inc_zone_page_state(page, NR_ANON_PAGES); 1090 - else 1088 + if (PageTransHuge(page)) 1091 1089 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1090 + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1091 + hpage_nr_pages(page)); 1092 1092 __page_set_anon_rmap(page, vma, address, 1); 1093 1093 if (!mlocked_vma_newpage(vma, page)) { 1094 1094 SetPageActive(page); ··· 1111 1111 mem_cgroup_begin_update_page_stat(page, &locked, &flags); 1112 1112 if (atomic_inc_and_test(&page->_mapcount)) { 1113 1113 __inc_zone_page_state(page, NR_FILE_MAPPED); 1114 - mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1114 + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1115 1115 } 1116 1116 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1117 1117 } ··· 1148 1148 goto out; 1149 1149 if (anon) { 1150 1150 mem_cgroup_uncharge_page(page); 1151 - if (!PageTransHuge(page)) 1152 - __dec_zone_page_state(page, NR_ANON_PAGES); 1153 - else 1151 + if (PageTransHuge(page)) 1154 1152 __dec_zone_page_state(page, 1155 1153 NR_ANON_TRANSPARENT_HUGEPAGES); 1154 + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1155 + -hpage_nr_pages(page)); 1156 1156 } else { 1157 1157 __dec_zone_page_state(page, NR_FILE_MAPPED); 1158 - mem_cgroup_dec_page_stat(page, 
MEMCG_NR_FILE_MAPPED); 1158 + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1159 1159 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1160 1160 } 1161 1161 if (unlikely(PageMlocked(page)))
+39 -5
mm/swap.c
··· 432 432 pagevec_lru_move_fn(pvec, __activate_page, NULL); 433 433 } 434 434 435 + static bool need_activate_page_drain(int cpu) 436 + { 437 + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; 438 + } 439 + 435 440 void activate_page(struct page *page) 436 441 { 437 442 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { ··· 452 447 #else 453 448 static inline void activate_page_drain(int cpu) 454 449 { 450 + } 451 + 452 + static bool need_activate_page_drain(int cpu) 453 + { 454 + return false; 455 455 } 456 456 457 457 void activate_page(struct page *page) ··· 711 701 lru_add_drain(); 712 702 } 713 703 714 - /* 715 - * Returns 0 for success 716 - */ 717 - int lru_add_drain_all(void) 704 + static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); 705 + 706 + void lru_add_drain_all(void) 718 707 { 719 - return schedule_on_each_cpu(lru_add_drain_per_cpu); 708 + static DEFINE_MUTEX(lock); 709 + static struct cpumask has_work; 710 + int cpu; 711 + 712 + mutex_lock(&lock); 713 + get_online_cpus(); 714 + cpumask_clear(&has_work); 715 + 716 + for_each_online_cpu(cpu) { 717 + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); 718 + 719 + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 720 + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 721 + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 722 + need_activate_page_drain(cpu)) { 723 + INIT_WORK(work, lru_add_drain_per_cpu); 724 + schedule_work_on(cpu, work); 725 + cpumask_set_cpu(cpu, &has_work); 726 + } 727 + } 728 + 729 + for_each_cpu(cpu, &has_work) 730 + flush_work(&per_cpu(lru_add_drain_work, cpu)); 731 + 732 + put_online_cpus(); 733 + mutex_unlock(&lock); 720 734 } 721 735 722 736 /*
+2 -7
mm/truncate.c
··· 567 567 /** 568 568 * truncate_pagecache - unmap and remove pagecache that has been truncated 569 569 * @inode: inode 570 - * @oldsize: old file size 571 570 * @newsize: new file size 572 571 * 573 572 * inode's new i_size must already be written before truncate_pagecache ··· 579 580 * situations such as writepage being called for a page that has already 580 581 * had its underlying blocks deallocated. 581 582 */ 582 - void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) 583 + void truncate_pagecache(struct inode *inode, loff_t newsize) 583 584 { 584 585 struct address_space *mapping = inode->i_mapping; 585 586 loff_t holebegin = round_up(newsize, PAGE_SIZE); ··· 613 614 */ 614 615 void truncate_setsize(struct inode *inode, loff_t newsize) 615 616 { 616 - loff_t oldsize; 617 - 618 - oldsize = inode->i_size; 619 617 i_size_write(inode, newsize); 620 - 621 - truncate_pagecache(inode, oldsize, newsize); 618 + truncate_pagecache(inode, newsize); 622 619 } 623 620 EXPORT_SYMBOL(truncate_setsize); 624 621
+52 -31
mm/vmscan.c
··· 139 139 { 140 140 return !sc->target_mem_cgroup; 141 141 } 142 + 143 + static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) 144 + { 145 + struct mem_cgroup *root = sc->target_mem_cgroup; 146 + return !mem_cgroup_disabled() && 147 + mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; 148 + } 142 149 #else 143 150 static bool global_reclaim(struct scan_control *sc) 144 151 { 145 152 return true; 153 + } 154 + 155 + static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) 156 + { 157 + return false; 146 158 } 147 159 #endif 148 160 ··· 2176 2164 } 2177 2165 } 2178 2166 2179 - static void shrink_zone(struct zone *zone, struct scan_control *sc) 2167 + static int 2168 + __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) 2180 2169 { 2181 2170 unsigned long nr_reclaimed, nr_scanned; 2171 + int groups_scanned = 0; 2182 2172 2183 2173 do { 2184 2174 struct mem_cgroup *root = sc->target_mem_cgroup; ··· 2188 2174 .zone = zone, 2189 2175 .priority = sc->priority, 2190 2176 }; 2191 - struct mem_cgroup *memcg; 2177 + struct mem_cgroup *memcg = NULL; 2178 + mem_cgroup_iter_filter filter = (soft_reclaim) ? 
2179 + mem_cgroup_soft_reclaim_eligible : NULL; 2192 2180 2193 2181 nr_reclaimed = sc->nr_reclaimed; 2194 2182 nr_scanned = sc->nr_scanned; 2195 2183 2196 - memcg = mem_cgroup_iter(root, NULL, &reclaim); 2197 - do { 2184 + while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { 2198 2185 struct lruvec *lruvec; 2199 2186 2187 + groups_scanned++; 2200 2188 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2201 2189 2202 2190 shrink_lruvec(lruvec, sc); ··· 2218 2202 mem_cgroup_iter_break(root, memcg); 2219 2203 break; 2220 2204 } 2221 - memcg = mem_cgroup_iter(root, memcg, &reclaim); 2222 - } while (memcg); 2205 + } 2223 2206 2224 2207 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2225 2208 sc->nr_scanned - nr_scanned, ··· 2226 2211 2227 2212 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2228 2213 sc->nr_scanned - nr_scanned, sc)); 2214 + 2215 + return groups_scanned; 2216 + } 2217 + 2218 + 2219 + static void shrink_zone(struct zone *zone, struct scan_control *sc) 2220 + { 2221 + bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); 2222 + unsigned long nr_scanned = sc->nr_scanned; 2223 + int scanned_groups; 2224 + 2225 + scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); 2226 + /* 2227 + * memcg iterator might race with other reclaimer or start from 2228 + * an incomplete tree walk so the tree walk in __shrink_zone 2229 + * might have missed groups that are above the soft limit. Try 2230 + * another loop to catch up with others. Do it just once to 2231 + * prevent reclaim latencies when other reclaimers always 2232 + * preempt this one. 
2233 + */ 2234 + if (do_soft_reclaim && !scanned_groups) 2235 + __shrink_zone(zone, sc, do_soft_reclaim); 2236 + 2237 + /* 2238 + * No group is over the soft limit or those that are do not have 2239 + * pages in the zone we are reclaiming so we have to reclaim everybody 2240 + */ 2241 + if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { 2242 + __shrink_zone(zone, sc, false); 2243 + return; 2244 + } 2229 2245 } 2230 2246 2231 2247 /* Returns true if compaction should go ahead for a high-order request */ ··· 2320 2274 { 2321 2275 struct zoneref *z; 2322 2276 struct zone *zone; 2323 - unsigned long nr_soft_reclaimed; 2324 - unsigned long nr_soft_scanned; 2325 2277 bool aborted_reclaim = false; 2326 2278 2327 2279 /* ··· 2359 2315 continue; 2360 2316 } 2361 2317 } 2362 - /* 2363 - * This steals pages from memory cgroups over softlimit 2364 - * and returns the number of reclaimed pages and 2365 - * scanned pages. This works for global memory pressure 2366 - * and balancing, not for a memcg's limit. 2367 - */ 2368 - nr_soft_scanned = 0; 2369 - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2370 - sc->order, sc->gfp_mask, 2371 - &nr_soft_scanned); 2372 - sc->nr_reclaimed += nr_soft_reclaimed; 2373 - sc->nr_scanned += nr_soft_scanned; 2374 2318 /* need some check for avoid more shrink_zone() */ 2375 2319 } 2376 2320 ··· 2952 2920 { 2953 2921 int i; 2954 2922 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2955 - unsigned long nr_soft_reclaimed; 2956 - unsigned long nr_soft_scanned; 2957 2923 struct scan_control sc = { 2958 2924 .gfp_mask = GFP_KERNEL, 2959 2925 .priority = DEF_PRIORITY, ··· 3065 3035 continue; 3066 3036 3067 3037 sc.nr_scanned = 0; 3068 - 3069 - nr_soft_scanned = 0; 3070 - /* 3071 - * Call soft limit reclaim before calling shrink_zone. 
3072 - */ 3073 - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 3074 - order, sc.gfp_mask, 3075 - &nr_soft_scanned); 3076 - sc.nr_reclaimed += nr_soft_reclaimed; 3077 3038 3078 3039 /* 3079 3040 * There should be no need to raise the scanning
+5 -5
net/ipv4/tcp_memcontrol.c
··· 87 87 if (!cg_proto) 88 88 return -EINVAL; 89 89 90 - if (val > RESOURCE_MAX) 91 - val = RESOURCE_MAX; 90 + if (val > RES_COUNTER_MAX) 91 + val = RES_COUNTER_MAX; 92 92 93 93 tcp = tcp_from_cgproto(cg_proto); 94 94 ··· 101 101 tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, 102 102 net->ipv4.sysctl_tcp_mem[i]); 103 103 104 - if (val == RESOURCE_MAX) 104 + if (val == RES_COUNTER_MAX) 105 105 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); 106 - else if (val != RESOURCE_MAX) { 106 + else if (val != RES_COUNTER_MAX) { 107 107 /* 108 108 * The active bit needs to be written after the static_key 109 109 * update. This is what guarantees that the socket activation ··· 187 187 188 188 switch (cft->private) { 189 189 case RES_LIMIT: 190 - val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); 190 + val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); 191 191 break; 192 192 case RES_USAGE: 193 193 val = tcp_read_usage(memcg);